In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
# CSV file handed to load_data(); a relative path, so it is resolved against
# the notebook's working directory.
path = "imdb_movies.csv"

In [2]:
def load_data(path):
    """Read the CSV at ``path`` and separate rows by whether ``genre`` is set.

    Parameters
    ----------
    path : anything accepted by ``pandas.read_csv``.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        First the rows that have a ``genre`` value, then the rows whose
        ``genre`` is missing. Both keep the index of the frame that was read.
    """
    frame = pd.read_csv(path)
    genre_missing = frame["genre"].isna()
    without_genre = frame[genre_missing]
    # Remove the genre-less rows by index label to obtain the labelled part.
    with_genre = frame.drop(index=without_genre.index)
    return with_genre, without_genre
# Rows with a genre become the labelled set; rows without one are kept apart.
labelled_data,unlabeled_data = load_data(path)

In [3]:
def label_preprocess(lst):
    """Normalise a list of raw genre strings.

    Each entry is lower-cased, has every non-breaking space (U+00A0) removed,
    and is stripped of surrounding whitespace so that ``" Action"`` and
    ``"action"`` map to the same label. Entries that are empty after cleaning
    (for example from a trailing comma) are dropped.

    Parameters
    ----------
    lst : iterable of str
        Raw genre names belonging to one record.

    Returns
    -------
    list of str
        Cleaned genre names in their original order.
    """
    out = []
    for raw in lst:
        label = raw.lower().replace("\u00A0", "").strip()
        # Skip blanks so an empty string never becomes a class of its own.
        if label:
            out.append(label)
    return out
# Split every comma-separated genre string into a list, then clean each entry.
# The column must still contain the raw strings when this runs.
genre_lists = labelled_data["genre"].str.split(",")
labelled_data["genre"] = genre_lists.apply(label_preprocess)

In [5]:
# Filled on first use so the stop-word list and lemmatizer are built once
# instead of once per document.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached (stop-word set, lemmatizer) pair, creating it if needed."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
        _TEXT_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _TEXT_RESOURCES["stop_words"], _TEXT_RESOURCES["lemmatizer"]


def preprocess_text(x):
    """Clean one piece of free text for vectorisation.

    Steps: replace every character that is neither a word character nor
    whitespace with a space, lower-case, tokenise, drop English stop words,
    lemmatise, and join the remaining tokens with single spaces.

    Parameters
    ----------
    x : str
        Raw text. Any non-string value (``None``, ``NaN`` from a missing
        cell, ...) is treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``""`` when ``x`` is not a string.
    """
    # Missing cells arrive as float NaN, which re.sub cannot handle.
    if not isinstance(x, str):
        return ""

    stop_words, lemmatizer = _get_text_resources()
    cleaned = re.sub(r"[^\w\s]", " ", x).lower()
    kept = (word for word in word_tokenize(cleaned) if word not in stop_words)
    return " ".join(lemmatizer.lemmatize(word) for word in kept)
# Overwrites the overview column with its cleaned form, so the raw text is no
# longer available in labelled_data after this line.
labelled_data["overview"] = labelled_data["overview"].apply(preprocess_text)

In [None]:
# x: overview text column (model input); y: genre column (targets).
x, y = labelled_data["overview"], labelled_data["genre"]

In [None]:
# TF-IDF over unigrams and bigrams with the vocabulary capped at 50,000 terms.
tfid = TfidfVectorizer(max_features=50000, ngram_range=(1, 2))

# Fitted on every row of x, i.e. before any train/test split is made.
x_transformed = tfid.fit_transform(x)

In [None]:
# Split the raw text first, then fit the vectorizer on the training part only.
# Fitting TF-IDF on the full corpus lets test documents shape the vocabulary
# and IDF weights, which leaks test information into the features.
# Note: this re-fits `tfid`, so from here on it reflects the training split.
x_text_train, x_text_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)
x_train = tfid.fit_transform(x_text_train)
x_test = tfid.transform(x_text_test)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
# Learn the genre -> column mapping from the training labels only and reuse it
# for the test labels. Calling fit_transform on y_test would refit the
# binarizer, so the test matrix could end up with different or reordered
# columns than the one the classifier is trained on.
# Genres that occur only in the test split are ignored by transform (with a
# warning).
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)

# Base binary classifier that is wrapped one-vs-rest in the next cell.
lr = LogisticRegression(C = 1,n_jobs=-1,max_iter=1000,class_weight="balanced")


In [None]:


# One-vs-rest wrapper around `lr`, trained on the binarised genre matrix.
ovr = OneVsRestClassifier(lr).fit(x_train, y_train)

# Hard 0/1 predictions for the test rows.
y_pred = ovr.predict(x_test)

In [None]:
from sklearn.metrics import f1_score
# Report F1 under both averaging schemes for the current y_pred.
for heading, averaging in (("Micro-F1:", "micro"), ("Macro-F1:", "macro")):
    print(heading, f1_score(y_test, y_pred, average=averaging))

In [None]:
from sklearn.metrics import classification_report
# Text report comparing y_test with y_pred; rows are labelled with the genre
# names held in mlb.classes_.
print(classification_report(y_test, y_pred, target_names=mlb.classes_))

In [None]:
# Probability output of the fitted one-vs-rest model for the test rows; the
# threshold-selection cells below index it by genre column.
y_proba = ovr.predict_proba(x_test)

In [None]:
from sklearn.metrics import precision_recall_curve

# For every genre, pick the probability cut-off that maximises
# precision * recall and remember it under the genre name. Previously each
# iteration overwrote the result, so only the last genre's value survived.
thresholds_by_genre = {}
for i, genre in enumerate(mlb.classes_):
    precision, recall, thresholds = precision_recall_curve(y_test[:, i], y_proba[:, i])
    # precision/recall carry one trailing point that has no threshold; drop it
    # so the arg-max index is always a valid index into `thresholds`.
    score = (precision * recall)[:-1]
    # `best_threshold` keeps its old meaning: after the loop it holds the
    # value for the last genre in mlb.classes_.
    best_threshold = thresholds[score.argmax()]
    thresholds_by_genre[genre] = best_threshold

In [None]:
# Displays the value left in `best_threshold` by the loop above, i.e. the
# threshold computed for the last genre in mlb.classes_ only.
best_threshold

In [None]:
import numpy as np
from sklearn.metrics import classification_report, f1_score, precision_recall_curve

# Inputs: y_test is the multi-label binary matrix and y_proba the matching
# per-label probabilities from predict_proba, both (n_samples, n_labels).
# NOTE(review): the cut-offs are selected and then scored on the same test
# split, so the F1 values printed below are likely optimistic - consider
# tuning on a separate validation split.
n_labels = y_test.shape[1]
# Start every label at 0.5; it stays there when no usable curve exists.
best_thresholds = np.full(n_labels, 0.5, dtype=float)

for col in range(n_labels):
    prec, rec, cutoffs = precision_recall_curve(y_test[:, col], y_proba[:, col])
    # The final precision/recall point has no threshold, so it is dropped to
    # keep the F1 values index-aligned with `cutoffs`. The small epsilon
    # avoids dividing by zero when precision and recall are both 0.
    f1_at_cutoff = (2 * prec * rec / (prec + rec + 1e-12))[:-1]

    degenerate = cutoffs.size == 0 or np.all(np.isnan(f1_at_cutoff))
    if not degenerate:
        best_thresholds[col] = cutoffs[np.nanargmax(f1_at_cutoff)]

print("Per-class thresholds:", best_thresholds)  # one threshold per genre

# Broadcasting compares each probability column with its own threshold.
# This replaces the earlier y_pred with the thresholded predictions.
y_pred = (y_proba >= best_thresholds).astype(int)

# Score the thresholded predictions.
for heading, mode in (("Micro-F1:", "micro"), ("Macro-F1:", "macro")):
    print(heading, f1_score(y_test, y_pred, average=mode))
# For a per-genre breakdown (requires `mlb`):
# print(classification_report(y_test, y_pred, target_names=mlb.classes_))