In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk
import re
import json
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

credits_df = pd.read_csv('/content/tmdb_5000_credits.csv')
movies_df = pd.read_csv('/content/tmdb_5000_movies.csv')

# Merge datasets on 'id' or 'movie_id' columns
data = movies_df.merge(credits_df, left_on='id', right_on='movie_id')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
# Selecting the relevant columns
data = data[['overview', 'genres', 'budget', 'runtime', 'vote_average']]

# Convert the genres JSON format to a list of genre names
def extract_genres(genres):
    genres_list = json.loads(genres.replace("'", '"'))
    return [genre['name'] for genre in genres_list]

# Apply the function to the genres column
data['genres'] = data['genres'].apply(extract_genres)

# Remove rows with empty genres or missing overview
data = data[(data['genres'].str.len() > 0) & (data['overview'].notnull())]


In [13]:
# MultiLabelBinarizer to convert genres into multi-hot encoded format
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(data['genres'])

# Convert genre labels into a DataFrame and concatenate with the main DataFrame
genre_df = pd.DataFrame(y, columns=mlb.classes_)
data = pd.concat([data.reset_index(drop=True), genre_df], axis=1)


In [14]:

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to preprocess the overview text
def preprocess_text(text):
    if isinstance(text, str):  # Check if the text is valid
        text = text.lower()
        text = re.sub(r'\W', ' ', text)  # Remove punctuation
        tokens = text.split()
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        return ' '.join(tokens)
    else:
        return ''

# Apply the preprocessing function to the overview column
data['processed_overview'] = data['overview'].apply(preprocess_text)


In [15]:
# TF-IDF vectorizer for the processed overview text
tfidf = TfidfVectorizer(max_features=5000)
X_text = tfidf.fit_transform(data['processed_overview'])

# Extract and process additional numerical features
X_numeric = data[['budget', 'runtime', 'vote_average']].fillna(0).values

# Combine text features and numerical features
from scipy.sparse import hstack
X = hstack([X_text, X_numeric])


In [16]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [17]:
from sklearn.multiclass import OneVsRestClassifier

# Initialize Logistic Regression with OneVsRestClassifier for multi-label classification
model = OneVsRestClassifier(LogisticRegression(max_iter=100, solver='saga'))
model.fit(X_train, y_train)




In [18]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=mlb.classes_))


Accuracy: 0.030366492146596858
Classification Report:
                  precision    recall  f1-score   support

         Action       0.00      0.00      0.00       242
      Adventure       0.00      0.00      0.00       156
      Animation       0.00      0.00      0.00        39
         Comedy       0.00      0.00      0.00       331
          Crime       0.00      0.00      0.00       160
    Documentary       0.00      0.00      0.00        15
          Drama       0.54      0.25      0.35       445
         Family       0.00      0.00      0.00        97
        Fantasy       0.00      0.00      0.00        94
        Foreign       0.00      0.00      0.00         8
        History       0.00      0.00      0.00        34
         Horror       0.00      0.00      0.00       108
          Music       0.00      0.00      0.00        39
        Mystery       0.00      0.00      0.00        84
        Romance       0.00      0.00      0.00       184
Science Fiction       0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
import pickle

# Save the trained model
with open('genre_prediction_model.pkl', 'wb') as file:
    pickle.dump(model, file)

# Save the TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf, file)

# Save the MultiLabelBinarizer for genre labels
with open('mlb_binarizer.pkl', 'wb') as file:
    pickle.dump(mlb, file)


In [20]:
# Loading the saved model, TF-IDF vectorizer, and MultiLabelBinarizer
with open('genre_prediction_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)
with open('tfidf_vectorizer.pkl', 'rb') as file:
    loaded_tfidf = pickle.load(file)
with open('mlb_binarizer.pkl', 'rb') as file:
    loaded_mlb = pickle.load(file)

# Example
new_overview = "A young hero embarks on an epic journey to save the world from a powerful villain."

# Preprocess the new overview text
processed_overview = preprocess_text(new_overview)
X_new_text = loaded_tfidf.transform([processed_overview])

# Use dummy values for additional numerical features if needed
X_new = hstack([X_new_text, np.array([[0, 0, 0]])])  # Replace 0s with actual values if available

predicted_genres = loaded_model.predict(X_new)

# Transform the predicted labels back to genre names
predicted_genre_names = loaded_mlb.inverse_transform(predicted_genres)
print("Predicted genres:", predicted_genre_names)


Predicted genres: [('Drama',)]
