In [22]:
import pandas as pd
import numpy as np
import json
import nltk
nltk.download('punkt')

import re
import csv
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import pickle


from sklearn.metrics import accuracy_score, f1_score


import warnings
warnings.filterwarnings('ignore')


# Load the dataset, parsing only the two columns this script uses instead of
# reading every column of the CSV and discarding the rest afterwards.
data = pd.read_csv('movies_metadata.csv', usecols=['overview', 'genres'])

# Extract required columns in a fixed order; the explicit copy makes `data`
# an independent frame rather than a slice of the frame that was read.
data = data[['overview', 'genres']].copy()

# Clean the data
def parse(x):
    """Return the list of genre names stored in one raw ``genres`` cell.

    Parameters
    ----------
    x : str
        Text of a Python literal: a list of dicts that each have a
        ``'name'`` key, e.g. ``"[{'id': 18, 'name': 'Drama'}]"``.
        Any non-string value (such as a missing cell) is treated as
        "no genres".

    Returns
    -------
    list
        The ``'name'`` value of every dict, in their original order.

    Raises
    ------
    ValueError, SyntaxError
        If ``x`` is a string that is not a valid Python literal.
    KeyError
        If an entry has no ``'name'`` key.
    """
    # Local import keeps this function self-contained.
    import ast

    # A missing cell is not a string; report it as having no genres so the
    # caller's "at least one genre" filter can drop the row.
    if not isinstance(x, str):
        return []

    # literal_eval only accepts literals (lists, dicts, strings, numbers...),
    # so cell content can never execute code the way eval() would allow.
    return [dictionary['name'] for dictionary in ast.literal_eval(x)]

# Turn each raw genres cell into a list of genre names.
data['target'] = data['genres'].apply(parse)

# Keep only the rows that carry at least one genre ...
has_genre = data['target'].apply(len) > 0
data = data[has_genre]

# ... and that actually have an overview text to learn from.
data = data.dropna(subset=['overview'])

# Text cleaning and stopword removal
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that normalizes raw text.

    Each document has apostrophes removed, every non-letter character
    replaced by a space, whitespace collapsed, letters lower-cased and
    English stopwords dropped.
    """

    # Compiled once at class creation instead of on every clean_text call.
    _APOSTROPHE_RE = re.compile(r"'")
    _NON_LETTER_RE = re.compile(r"[^a-zA-Z]")

    def __init__(self):
        # The stopword list is a separate NLTK corpus; fetch it on first use
        # if it is not installed rather than failing with LookupError.
        try:
            words = nltk.corpus.stopwords.words('english')
        except LookupError:
            nltk.download('stopwords')
            words = nltk.corpus.stopwords.words('english')
        self.stopwords = set(words)

    def fit(self, X, y=None):
        """Do nothing (there is nothing to learn) and return ``self``."""
        return self

    def transform(self, X):
        """Clean every document in ``X``.

        Parameters
        ----------
        X : pandas.Series or iterable of str
            The documents to clean.

        Returns
        -------
        pandas.Series or list
            A Series (same index) when ``X`` is a Series; otherwise a list
            with one cleaned string per input document.
        """
        if isinstance(X, pd.Series):
            return X.apply(self.clean_text)
        # Plain lists/tuples/arrays of strings (e.g. at inference time)
        # have no .apply; clean them element by element.
        return [self.clean_text(text) for text in X]

    def clean_text(self, text):
        """Return ``text`` normalized to lower-case letters without stopwords."""
        text = self._APOSTROPHE_RE.sub("", text)
        text = self._NON_LETTER_RE.sub(" ", text)
        # split()/join collapses runs of whitespace and trims both ends.
        text = ' '.join(text.split())
        text = text.lower()
        return self.remove_stopwords(text)

    def remove_stopwords(self, text):
        """Tokenize ``text`` with NLTK and re-join the non-stopword tokens."""
        tokens = nltk.word_tokenize(text)
        tokens_cleaned = [token for token in tokens if token.lower() not in self.stopwords]
        return ' '.join(tokens_cleaned)

# Features are the overview texts, labels are the parsed genre lists.
overview, genres = data['overview'], data['target']

# Hold out half of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    overview,
    genres,
    test_size=0.5,
    random_state=42,
)

# Encode the per-movie genre lists as a binary indicator matrix
# (one column per genre seen in the training labels).
mlb = MultiLabelBinarizer()
y_train_binarized = mlb.fit_transform(y_train)

# Text cleaning -> TF-IDF features -> one logistic regression per genre.
pipeline = Pipeline(
    steps=[
        ('cleaner', TextPreprocessor()),
        ('tfidf', TfidfVectorizer(max_df=0.8, max_features=10000)),
        ('model', OneVsRestClassifier(LogisticRegression())),
    ]
)

# Fit every stage of the pipeline on the training split.
pipeline.fit(X_train, y_train_binarized)


# Save the pipeline and MultiLabelBinarizer. The `with` blocks guarantee each
# file is flushed and closed even if pickling raises.
with open('model_pipeline.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)
with open('mlb.pkl', 'wb') as mlb_file:
    pickle.dump(mlb, mlb_file)


# Reload the binarizer from disk (checking that the saved artifact is usable)
# and encode the test labels with it.
with open("mlb.pkl", 'rb') as mlb_file:
    mlb_new = pickle.load(mlb_file)
y_test_binarized = mlb_new.transform(y_test)


# A genre is predicted whenever its probability reaches this threshold.
t = 0.3 # threshold value

# Per-genre probabilities for the held-out overviews, then thresholded to 0/1.
y_pred_prob = pipeline.predict_proba(X_test)
y_pred_new = (y_pred_prob >= t).astype(int)

# For multi-label indicator input, accuracy_score is the exact-match (subset)
# accuracy; the micro-averaged F1 pools the decisions over all genres.
accuracy = accuracy_score(y_test_binarized, y_pred_new)
f1 = f1_score(
    y_test_binarized,
    y_pred_new,
    average='micro',
)

print("Accuracy:", accuracy)
print("F1-score:", f1)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mohamednoordeenalaudeen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Accuracy: 0.16206766206766207
F1-score: 0.5600685836094441
