In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestRegressor
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import pickle

#### Download NLTK data

In [2]:

nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Harshit
[nltk_data]     Tomar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Load your dataset

In [3]:
df = pd.read_csv('data.csv')

#### Handle missing values

In [4]:

df['vote_average'].fillna(df['vote_average'].mean(), inplace=True)
df['vote_count'].fillna(df['vote_count'].mean(), inplace=True)
df['genres'].fillna('', inplace=True)
df['cast'].fillna('', inplace=True)
df['crew'].fillna('', inplace=True)
df['overview'].fillna('', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['vote_average'].fillna(df['vote_average'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['vote_count'].fillna(df['vote_count'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate o

#### Apply stemming to the 'overview' column

In [5]:

stemmer = PorterStemmer()
df['overview'] = df['overview'].apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x)]))

#### Normalize numeric columns

In [6]:

scaler = MinMaxScaler()
df[['popularity', 'vote_average', 'vote_count']] = scaler.fit_transform(df[['popularity', 'vote_average', 'vote_count']])

#### Encode genres using MultiLabelBinarizer

In [7]:

mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(df['genres'].apply(lambda x: x.split(',')))

#### Vectorize cast and crew using CountVectorizer

In [8]:


cast_vectorizer = CountVectorizer(stop_words='english')
cast_encoded = cast_vectorizer.fit_transform(df['cast'])

crew_vectorizer = CountVectorizer(stop_words='english')
crew_encoded = crew_vectorizer.fit_transform(df['crew'])

#### TF-IDF for 'overview'

In [9]:

tfidf = TfidfVectorizer(stop_words='english')
overview_tfidf = tfidf.fit_transform(df['overview'])

#### Combine all features into a single matrix

In [10]:
from scipy.sparse import hstack
combined_features = hstack([genres_encoded, df[['popularity', 'vote_average', 'vote_count']], overview_tfidf, cast_encoded, crew_encoded])

#### Perform SVD to reduce dimensions

In [11]:

svd = TruncatedSVD(n_components=1000)
reduced_features = svd.fit_transform(combined_features)

#### K-Nearest Neighbors model

In [12]:

knn = NearestNeighbors(metric='cosine')
knn.fit(reduced_features)

#### Random Forest model

In [13]:

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(reduced_features, df.index)

#### Save dictionaries and models using pickle

In [14]:

models_and_data = {
    'id_to_title': df.set_index('movie_id')['title'].to_dict(),
    'title_to_id': df.set_index('title')['movie_id'].to_dict(),
    'id_to_genres': df.set_index('movie_id')['genres'].to_dict(),
    'id_to_overview': df.set_index('movie_id')['overview'].to_dict(),
    'id_to_cast': df.set_index('movie_id')['cast'].to_dict(),
    'reduced_features': reduced_features,
    'knn_model': knn,
    'rf_model': rf
}

with open('models_and_data.pkl', 'wb') as f:
    pickle.dump(models_and_data, f)