In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

train_data = pd.read_csv("train_data.txt", sep=":::", engine="python", names=["ID", "TITLE", "GENRE", "DESCRIPTION"])
test_data = pd.read_csv("test_data.txt", sep=":::", engine="python", names=["ID", "TITLE", "DESCRIPTION"])

X = train_data["DESCRIPTION"].fillna("")
y = train_data["GENRE"].str.strip()

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 3), stop_words="english")
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)
X_test_tfidf = tfidf.transform(test_data["DESCRIPTION"].fillna(""))

unique_classes = y_train.unique()
class_weights = compute_class_weight(class_weight="balanced", classes=unique_classes, y=y_train)
class_weights_dict = {cls: weight for cls, weight in zip(unique_classes, class_weights)}

lr = LogisticRegression(max_iter=5000, class_weight=class_weights_dict)
param_grid = {"C": [0.01, 0.1, 1, 10], "solver": ["lbfgs", "liblinear"]}

grid_search = GridSearchCV(lr, param_grid, scoring="accuracy", cv=3, n_jobs=-1)
grid_search.fit(X_train_tfidf, y_train)

best_lr = grid_search.best_estimator_

y_val_pred = best_lr.predict(X_val_tfidf)
accuracy = accuracy_score(y_val, y_val_pred)
report = classification_report(y_val, y_val_pred, target_names=unique_classes)

print(f"Validation Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)

test_data["PREDICTED_GENRE"] = best_lr.predict(X_test_tfidf)
test_data[["ID", "TITLE", "PREDICTED_GENRE"]].to_csv("predicted_genres.csv", index=False)
print("Predictions saved to 'predicted_genres.csv'.")


Validation Accuracy: 0.55
Classification Report:
              precision    recall  f1-score   support

       drama       0.33      0.46      0.38       263
      comedy       0.44      0.56      0.49       118
   biography       0.25      0.29      0.27       155
     romance       0.19      0.23      0.21       100
 documentary       0.00      0.00      0.00        53
        news       0.57      0.56      0.56      1490
       short       0.16      0.17      0.16       101
      horror       0.75      0.74      0.75      2619
      family       0.65      0.54      0.59      2723
    thriller       0.22      0.27      0.24       157
     western       0.08      0.08      0.08        65
       adult       0.75      0.62      0.68        39
     musical       0.17      0.14      0.16        49
   game-show       0.59      0.69      0.64       441
       crime       0.51      0.63      0.56       146
   talk-show       0.19      0.16      0.17        55
     fantasy       0.06      0.0