In [15]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [17]:
# Load data
# NOTE: read_data_safely is defined in a later cell (In [19]); on a fresh
# kernel that cell must be run before this one, or the call raises NameError.
df = read_data_safely('train_data.txt')
print(f"Loaded {len(df)} movies")

# Clean text
def clean(text):
    """Normalise a plot string for vectorisation.

    Lowercases the input, replaces every character that is not an ASCII
    letter ``a-z`` or whitespace with a space, then collapses whitespace
    runs to single spaces and trims both ends.
    """
    letters_only = re.sub(r'[^a-z\s]', ' ', text.lower())
    # Splitting on whitespace and re-joining both collapses runs and trims.
    return ' '.join(letters_only.split())

# Add the normalised plot as a new 'text' column (element-wise clean()).
df['text'] = df['plot'].map(clean)

Success with encoding: utf-8
Loaded 54214 movies


In [19]:
def read_data_safely(path, encodings=('utf-8', 'windows-1252', 'latin-1')):
    """Read a ``' ::: '``-separated movie file into a DataFrame.

    Each usable line has four fields, ``<id> ::: <title> ::: <genre> ::: <plot>``.
    The first field is discarded and the genre is lowercased. Lines with
    fewer than four fields are skipped.

    Parameters
    ----------
    path : str or path-like
        File to read.
    encodings : sequence of str, optional
        Encodings to try, in order; the first one that decodes the whole
        file wins. ``latin-1`` maps every possible byte, so it never fails
        and must come last; any candidate listed after it is unreachable.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``genre``, ``plot`` (present even when no line
        could be parsed).

    Raises
    ------
    ValueError
        If none of ``encodings`` can decode the file.
    """
    for encoding in encodings:
        try:
            with open(path, 'r', encoding=encoding) as f:
                lines = f.readlines()
        except UnicodeDecodeError:
            continue
        print(f"Success with encoding: {encoding}")
        break
    else:
        raise ValueError("Could not read the file with any encoding")

    movies = []
    for line in lines:
        # maxsplit=3 keeps a plot that itself contains ' ::: ' intact instead
        # of silently dropping the whole record.
        parts = line.strip().split(' ::: ', 3)
        if len(parts) == 4:
            _, title, genre, plot = parts
            movies.append({'title': title, 'genre': genre.lower(), 'plot': plot})
    # Fix the columns so an empty result still exposes 'title'/'genre'/'plot'.
    return pd.DataFrame(movies, columns=['title', 'genre', 'plot'])

In [21]:
# Encode genre strings as integer class ids; le.classes_ keeps the
# id -> genre-name mapping used when printing the report below.
le = LabelEncoder()
y = le.fit_transform(df['genre'])

# Hold out 20% of the rows for evaluation, stratified on the genre id.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=42, stratify=y
)

print("Training Logistic Regression + TF-IDF (this is the best model for this task)...")

model = make_pipeline(
    TfidfVectorizer(
        max_features=15000,
        ngram_range=(1, 3),          # unigrams + bigrams + trigrams
        stop_words='english',
        sublinear_tf=True,
        lowercase=True
    ),
    LogisticRegression(
        C=2.0,
        class_weight='balanced',     # helps with drama/comedy imbalance
        max_iter=1000,
        n_jobs=-1,
        random_state=42
    )
)

# Training
model.fit(X_train, y_train)

# Evaluation on the held-out split
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print("\n" + "="*70)
print(f"LOGISTIC REGRESSION ACCURACY: {acc:.4f} → {acc*100:.2f}%")
print("="*70)

# Pass `labels` explicitly so target_names always lines up with the encoded
# ids, even if some genre is missing from both y_test and y_pred.
# zero_division=0 reports 0 for a metric with an empty denominator instead of
# emitting a warning.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
    digits=4,
    zero_division=0,
))

Training Logistic Regression + TF-IDF (this is the best model for this task)...

LOGISTIC REGRESSION ACCURACY: 0.5191 → 51.91%
              precision    recall  f1-score   support

      action     0.3374    0.5247    0.4107       263
       adult     0.4531    0.7373    0.5613       118
   adventure     0.2245    0.3548    0.2750       155
   animation     0.1812    0.2500    0.2101       100
   biography     0.0290    0.0377    0.0328        53
      comedy     0.6003    0.5181    0.5562      1490
       crime     0.1264    0.2277    0.1625       101
 documentary     0.7895    0.6415    0.7078      2619
       drama     0.7025    0.4414    0.5422      2723
      family     0.1901    0.3185    0.2381       157
     fantasy     0.1235    0.1538    0.1370        65
   game-show     0.7714    0.6923    0.7297        39
     history     0.1226    0.2653    0.1677        49
      horror     0.5859    0.6961    0.6363       441
       music     0.4098    0.7466    0.5291       146
     mus

#### The current model is only really reliable for documentary and horror (game-show also scores well, but on just 39 test samples). For everything else, it’s barely better than guessing or always saying “drama”.
#### 51.91% accuracy with classic TF-IDF + Logistic Regression on 27 genres is actually completely normal; plot text alone is simply not enough to reliably distinguish 27 fine-grained genres.
#### Macro-average F1 = 0.379 → if all genres were equally important, the model would be terrible.
#### Weighted-average F1 = 0.537 → pulled up by the high-support genres (documentary, drama, comedy), which dominate the test set, plus a few smaller genres that work well, such as horror.