In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Read the training split. The raw file has no header row and separates its
# fields with the multi-character token ' ::: ', so the column names are
# supplied here and pandas' python parsing engine is requested explicitly.
train_data = pd.read_csv(
    'train_data.txt',
    sep=' ::: ',
    engine='python',
    header=None,
    names=['ID', 'Title', 'Genre', 'Description'],
)
train_data.head()

Unnamed: 0,ID,Title,Genre,Description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


In [3]:
# Read the test solution file, which carries the true genre for every test
# row. It is parsed the same way as the training file: no header row, fields
# split on ' ::: ', python parsing engine.
test_solution = pd.read_csv(
    'test_data_solution.txt',
    sep=' ::: ',
    engine='python',
    header=None,
    names=['ID', 'Title', 'Genre', 'Description'],
)
test_solution.head()

Unnamed: 0,ID,Title,Genre,Description
0,1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),drama,Before he was known internationally as a marti...


In [4]:
# TF-IDF featuriser: English stop words are dropped and the vocabulary is
# capped at 5000 features.
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# The vocabulary and IDF weights are learned from the training descriptions
# only; the fitted vectoriser is then reused, without refitting, on the test
# descriptions.
X_train_tfidf = tfidf.fit_transform(train_data['Description'])
X_test_tfidf = tfidf.transform(test_solution['Description'])

In [5]:
# Multiclass logistic regression over the TF-IDF features.
#
# `multi_class` is intentionally NOT passed: the parameter is deprecated since
# scikit-learn 1.5 (passing it emits a FutureWarning and it is scheduled for
# removal). With the lbfgs solver and a multiclass target, the multinomial
# formulation is what the library already uses, so dropping the argument
# keeps the fitted model the same while staying compatible with newer releases.
#
# max_iter=1000 gives the solver a generous iteration budget on this
# high-dimensional sparse input.
model = LogisticRegression(max_iter=1000, solver='lbfgs')

# Fit on the training features with the genre column as the target. Left as
# the cell's last expression so the notebook shows the fitted estimator.
model.fit(X_train_tfidf, train_data['Genre'])

In [6]:
# Ground-truth genres, taken directly from the solution file.
y_true = test_solution['Genre']

# Genre predicted by the fitted model for each test description.
y_pred = model.predict(X_test_tfidf)

In [7]:
# Per-class precision / recall / F1. zero_division=0 makes classes that are
# never predicted report 0 instead of triggering an undefined-metric warning.
report = classification_report(y_true, y_pred, zero_division=0)

# Overall fraction of test rows whose predicted genre matches the true one.
accuracy = accuracy_score(y_true, y_pred)

# Emit the summary as three newline-separated pieces in a single call.
print(
    f"Accuracy: {accuracy}",
    "Classification Report:",
    report,
    sep="\n",
)

Accuracy: 0.5838007380073801
Classification Report:
              precision    recall  f1-score   support

      action       0.48      0.29      0.36      1314
       adult       0.60      0.24      0.34       590
   adventure       0.58      0.17      0.26       775
   animation       0.52      0.07      0.12       498
   biography       0.00      0.00      0.00       264
      comedy       0.53      0.58      0.55      7446
       crime       0.36      0.04      0.07       505
 documentary       0.67      0.85      0.75     13096
       drama       0.54      0.77      0.64     13612
      family       0.49      0.09      0.15       783
     fantasy       0.56      0.06      0.10       322
   game-show       0.92      0.51      0.65       193
     history       0.00      0.00      0.00       243
      horror       0.64      0.57      0.61      2204
       music       0.67      0.45      0.54       731
     musical       0.33      0.02      0.04       276
     mystery       0.36      