In [1]:
import pandas as pd

# Locations of the three Excel workbooks: training data, unlabeled test data,
# and the test data with its solution labels
train_path, test_path, test_sol_path = (
    "train_data.xlsx",
    "test_data.xlsx",
    "test_data_sol.xlsx",
)

# Read each workbook into its own DataFrame (same order as the paths above)
train_df, test_df, test_sol_df = (
    pd.read_excel(workbook) for workbook in (train_path, test_path, test_sol_path)
)

# Keep a short preview (first rows) of every frame for display
train_head, test_head, test_sol_head = (
    frame.head() for frame in (train_df, test_df, test_sol_df)
)

# Last expression of the cell: shows the three previews as a tuple
train_head, test_head, test_sol_head


(   ID                             TITLE     GENRE  \
 0   1      Oscar et la dame rose (2009)     drama   
 1   2                      Cupid (1997)  thriller   
 2   3  Young, Wild and Wonderful (1980)     adult   
 3   4             The Secret Sin (1915)     drama   
 4   5            The Unrecovered (2007)     drama   
 
                                          DESCRIPTION  
 0  Listening in to a conversation between his doc...  
 1  A brother and sister with a past incestuous re...  
 2  As the bus empties the students for their fiel...  
 3  To help their unemployed father make ends meet...  
 4  The film's title refers not only to the un-rec...  ,
    ID                        TITLE  \
 0   1         Edgar's Lunch (1998)   
 1   2     La guerra de papá (1977)   
 2   3  Off the Beaten Track (2010)   
 3   4       Meu Amigo Hindu (2015)   
 4   5            Er nu zhai (1955)   
 
                                          DESCRIPTION  
 0  L.R. Brane loves his life - his car, his 

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Training split: description text is the model input, genre is the target
X_train_text, y_train = train_df['DESCRIPTION'], train_df['GENRE']

# Test split: descriptions come from the test sheet, while the true genre
# labels are read from the separate solution sheet
X_test_text, y_test = test_df['DESCRIPTION'], test_sol_df['GENRE']




In [9]:
# Re-extract the description columns, replacing missing values with an empty
# string so that no NaN is passed on to later steps
X_train_text, X_test_text = (
    frame['DESCRIPTION'].fillna("") for frame in (train_df, test_df)
)


In [10]:
# TF-IDF features: the vectorizer is fitted on the training descriptions only
# (English stop words removed, at most 5000 features) and then applied
# unchanged to the test descriptions
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train_text)
X_test_vec = vectorizer.transform(X_test_text)

# Integer-encode the genre labels; the encoder is fitted on the training
# labels and reused for the test labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)





In [11]:
# Train a Logistic Regression model on the TF-IDF features
# (the solver is allowed up to 1000 iterations)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_train_encoded)

# Predict genres for the test data, then map the integer class ids back to
# the original genre names
y_pred_encoded = clf.predict(X_test_vec)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Evaluation against the true genre names.
# Some genres are never predicted, which makes their precision undefined.
# zero_division=0 reports those metrics as 0.0 explicitly instead of letting
# scikit-learn fall back to 0.0 while emitting an UndefinedMetricWarning for
# each averaged metric.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

# One row per genre (plus the summary rows), rounded for readability
report_df = pd.DataFrame(report).transpose()
report_df = report_df.round(3)


print(accuracy)
# Print the full table
print(report_df)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0.5769372693726937
              precision  recall  f1-score    support
action            0.492   0.266     0.345   1314.000
adult             0.612   0.222     0.326    590.000
adventure         0.640   0.163     0.259    775.000
animation         0.491   0.056     0.101    498.000
biography         0.000   0.000     0.000    264.000
comedy            0.525   0.561     0.542   7446.000
crime             0.350   0.028     0.051    505.000
documentary       0.655   0.855     0.742  13096.000
drama             0.532   0.780     0.632  13612.000
family            0.512   0.079     0.137    783.000
fantasy           0.636   0.043     0.081    322.000
game-show         0.905   0.492     0.638    193.000
history           0.000   0.000     0.000    243.000
horror            0.641   0.568     0.603   2204.000
music             0.678   0.438     0.532    731.000
musical           0.300   0.011     0.021    276.000
mystery           0.429   0.009     0.018    318.000
news              0.667   0

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

# Drop rows with a missing description (and, for training, a missing genre)
train_df_clean = train_df.dropna(subset=["DESCRIPTION", "GENRE"])
test_df_clean = test_df.dropna(subset=["DESCRIPTION"])
# Keep only the solution rows whose index labels survived the test-set filter.
# NOTE(review): this relies on test_df and test_sol_df sharing the same row
# index (row i of one describes row i of the other) — confirm.
test_solution_df_clean = test_sol_df.loc[test_df_clean.index]
assert len(test_solution_df_clean) == len(test_df_clean), (
    "test descriptions and solution labels are misaligned"
)

# Re-extract cleaned data
X_train_texts = train_df_clean["DESCRIPTION"]
y_train_labels = train_df_clean["GENRE"]

X_test_texts = test_df_clean["DESCRIPTION"]
y_test_true = test_solution_df_clean["GENRE"]

# Convert genres to numerical labels (encoder fitted on the training labels)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train_labels)
y_test = label_encoder.transform(y_test_true)

# Convert text descriptions into TF-IDF vectors; the vectorizer is fitted on
# the training texts only and reused for the test texts
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = vectorizer.fit_transform(X_train_texts)
X_test = vectorizer.transform(X_test_texts)

# Train the SVM model.
# random_state is fixed so that repeated runs give the same model even when
# the solver shuffles the data internally.
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = svm_model.predict(X_test)
y_pred_labels = label_encoder.inverse_transform(y_pred)

# Use output_dict=True to get a structured dictionary; target_names maps the
# integer class ids back to genre names in the report rows.
# zero_division=0 reports undefined precision/recall as 0.0 without warnings.
report_dict = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
    output_dict=True,
    zero_division=0,
)

# Convert the dictionary to a DataFrame (one row per genre plus summary rows)
report_df = pd.DataFrame(report_dict).transpose()

# Optional: round for better readability
report_df = report_df.round(3)

# Print accuracy and classification report DataFrame
print("Accuracy:", accuracy_score(y_test, y_pred))
print(report_df)



Accuracy: 0.5657380073800738
              precision  recall  f1-score    support
action            0.379   0.301     0.336   1314.000
adult             0.522   0.368     0.431    590.000
adventure         0.375   0.192     0.254    775.000
animation         0.306   0.143     0.195    498.000
biography         0.083   0.008     0.014    264.000
comedy            0.522   0.546     0.534   7446.000
crime             0.175   0.059     0.089    505.000
documentary       0.685   0.814     0.744  13096.000
drama             0.558   0.706     0.623  13612.000
family            0.328   0.134     0.190    783.000
fantasy           0.280   0.093     0.140    322.000
game-show         0.789   0.601     0.682    193.000
history           0.129   0.016     0.029    243.000
horror            0.583   0.588     0.586   2204.000
music             0.589   0.491     0.536    731.000
musical           0.250   0.058     0.094    276.000
mystery           0.172   0.050     0.078    318.000
news             