# FILMCEPTION

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load datasets (replace paths with your actual files)
train_df = pd.read_csv("train_genre_dataset.tsv", sep="\t")
test_df = pd.read_csv("test_genre_dataset.tsv", sep="\t")

# Check columns
print(train_df.columns)

Index(['wiki_id', 'summary', 'Absurdism', 'Acid western', 'Action',
       'Action Comedy', 'Action Thrillers', 'Action/Adventure',
       'Addiction Drama', 'Adult',
       ...
       'Werewolf fiction', 'Western', 'Whodunit', 'Women in prison films',
       'Workplace Comedy', 'World History', 'World cinema', 'Wuxia', 'Z movie',
       'Zombie Film'],
      dtype='object', length=365)


In [2]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove special chars
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

train_df['processed_summary'] = train_df['summary'].apply(preprocess_text)
test_df['processed_summary'] = test_df['summary'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))

# Fit on training data
X_train = tfidf.fit_transform(train_df['summary'])
X_test = tfidf.transform(test_df['summary'])

# Get genre labels (all columns except 'wiki_id' and 'summary')
genre_columns = train_df.columns[2:-1]  # Exclude 'wiki_id', 'summary', 'processed_summary'
y_train = train_df[genre_columns]
y_test = test_df[genre_columns]

In [4]:
from lightgbm import LGBMClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report

lgbm = LGBMClassifier(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=7,
    random_state=42,
    class_weight='balanced',  # Helps with imbalanced genres
    feature_name='auto',  # Explicitly handle feature names
    verbose=-1  # Silence LightGBM logs
)

# Wrap in MultiOutputClassifier for multi-label
model = MultiOutputClassifier(lgbm, n_jobs=-1)  # n_jobs=-1 uses all CPU cores

# Train
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

In [5]:
print(classification_report(y_test, y_pred, target_names=genre_columns, zero_division=0))

                                          precision    recall  f1-score   support

                               Absurdism       0.00      0.00      0.00        16
                            Acid western       0.00      0.00      0.00         3
                                  Action       0.40      0.67      0.51      1181
                           Action Comedy       0.04      0.04      0.04        26
                        Action Thrillers       0.09      0.31      0.14        86
                        Action/Adventure       0.29      0.65      0.41       733
                         Addiction Drama       0.11      0.12      0.12         8
                                   Adult       0.03      0.08      0.04        25
                               Adventure       0.30      0.63      0.41       669
                        Adventure Comedy       0.07      0.11      0.09        28
                  Airplanes and airports       0.00      0.00      0.00         9
               

In [6]:
from sklearn.metrics import hamming_loss, f1_score, precision_score, recall_score, accuracy_score

# Calculate additional metrics
micro_precision = precision_score(y_test, y_pred, average='micro')
micro_recall = recall_score(y_test, y_pred, average='micro')
macro_precision = precision_score(y_test, y_pred, average='macro')
macro_recall = recall_score(y_test, y_pred, average='macro')

print("\n=== Detailed Performance Metrics ===")
print("Micro F1:", f1_score(y_test, y_pred, average='micro'))
print("Macro F1:", f1_score(y_test, y_pred, average='macro'))
print(f"Micro-average Precision: {micro_precision:.4f}")
print(f"Micro-average Recall: {micro_recall:.4f}")
print(f"Macro-average Precision: {macro_precision:.4f}")
print(f"Macro-average Recall: {macro_recall:.4f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Hamming Loss: {hamming_loss(y_test, y_pred):.4f}")

# Get top 10 genres by frequency
genre_counts = y_test.sum().sort_values(ascending=False).head(10)
print("\nTop 10 Genres by Frequency:")
print(genre_counts)

# Sample predictions vs actual for inspection
sample_idx = 0
print("\nSample Prediction Inspection:")
print(f"Summary: {test_df['summary'].iloc[sample_idx][:100]}...")
actual_genres = genre_columns[y_test.iloc[sample_idx].astype(bool)]
predicted_genres = genre_columns[y_pred[sample_idx].astype(bool)]
print(f"Actual Genres: {', '.join(actual_genres)}")
print(f"Predicted Genres: {', '.join(predicted_genres)}")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



=== Detailed Performance Metrics ===
Micro F1: 0.33474924636886816


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Macro F1: 0.09513143282412972
Micro-average Precision: 0.2381
Micro-average Recall: 0.5637
Macro-average Precision: 0.0806
Macro-average Recall: 0.1545
Accuracy: 0.0006
Hamming Loss: 0.0224

Top 10 Genres by Frequency:
Drama              3840
Comedy             2089
Romance Film       1321
Thriller           1297
Action             1181
World cinema       1011
Crime Fiction       901
Horror              829
Indie               762
Black-and-white     758
dtype: int64

Sample Prediction Inspection:
Summary: j p tannen former professional golfer residing california estranged three child live new york mother...
Actual Genres: Drama, Family Drama, Indie
Predicted Genres: Coming of age, Drama, Family Drama


In [11]:
# ============== ADDED VISUALIZATIONS ============== #
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix

# 1. Genre Frequency Distribution (Top 20)
plt.figure(figsize=(12,6))
y_test.sum().sort_values(ascending=False).head(20).plot(kind='bar')
plt.title("Top 20 Most Frequent Genres in Test Data")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('genre_distribution.png')
plt.close()

# 2. Micro/Macro Metrics Comparison
metrics = {
    'Micro': [f1_score(y_test, y_pred, average='micro'),
              precision_score(y_test, y_pred, average='micro'),
              recall_score(y_test, y_pred, average='micro')],
    'Macro': [f1_score(y_test, y_pred, average='macro'),
              precision_score(y_test, y_pred, average='macro'),
              recall_score(y_test, y_pred, average='macro')]
}

plt.figure(figsize=(10,5))
sns.heatmap(pd.DataFrame(metrics, index=['F1', 'Precision', 'Recall']), 
            annot=True, fmt=".2f", cmap="Blues")
plt.title("Micro vs Macro Averages")
plt.savefig('micro_macro_comparison.png')
plt.close()

# 3. Confusion Matrix for Top Genre
top_genre = y_test.sum().idxmax()
top_idx = list(y_test.columns).index(top_genre)
cm = confusion_matrix(y_test.iloc[:, top_idx], y_pred[:, top_idx])

plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Reds', 
            xticklabels=['Not '+top_genre, top_genre],
            yticklabels=['Not '+top_genre, top_genre])
plt.title(f"Confusion Matrix for '{top_genre}'")
plt.savefig('top_genre_confusion_matrix.png')
plt.close()

# 4. Hamming Loss Breakdown by Genre
genre_hamming = (y_test != y_pred).mean()
plt.figure(figsize=(12,6))
genre_hamming.sort_values(ascending=False).head(15).plot(kind='bar')
plt.title("Genres with Highest Hamming Loss (Error Rate)")
plt.ylabel("Error Rate")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('hamming_loss_by_genre.png')
plt.close()

print("\nVisualizations saved as:")
print("- genre_distribution.png")
print("- micro_macro_comparison.png")
print("- top_genre_confusion_matrix.png")
print("- hamming_loss_by_genre.png")
# ============== END VISUALIZATIONS ============== #

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Visualizations saved as:
- genre_distribution.png
- micro_macro_comparison.png
- top_genre_confusion_matrix.png
- hamming_loss_by_genre.png


In [7]:
import joblib

joblib.dump(model, 'lightgbm_genre_classifier.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [8]:
def predict_genres(summary):
    # Load model & vectorizer
    model = joblib.load('lightgbm_genre_classifier.pkl')
    tfidf = joblib.load('tfidf_vectorizer.pkl')
    
    # Preprocess
    processed_summary = preprocess_text(summary)
    print(processed_summary)
    X_new = tfidf.transform([processed_summary])
    
    # Predict
    preds = model.predict(X_new)
    genres = genre_columns[preds[0].astype(bool)]  # Get predicted genres
    return list(genres)

# Example usage
new_summary = "A young boy discovers he has magical powers and must save the world from an evil wizard."
predicted_genres = predict_genres(new_summary)
print("Predicted Genres:", predicted_genres)

young boy discovers magical power must save world evil wizard
Predicted Genres: ['Animation', 'Comedy film', 'Family Film', 'Fantasy', 'Fantasy Adventure', 'Science Fiction', 'Short Film', 'Stop motion', 'Superhero movie', 'Television movie']


In [9]:
import json

# Save genre columns (needed for prediction)
with open('genre_columns.json', 'w') as f:
    json.dump(list(genre_columns), f)