In [1]:
import ast

import joblib
import numpy as np
import pandas as pd
from scipy.sparse import hstack , csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the cleaned, combined IMDB table; the bare name on the last line
# makes the notebook render the (truncated) frame as the cell output.
csv_path = r"D:\Coding\Code\Python\ML\Data\IMDB\Cleaned Data\FinalCombined.csv"
combined_df = pd.read_csv(csv_path)
combined_df

Unnamed: 0.1,Unnamed: 0,tconst,nconst,directors,genres,popularity_score
0,0,tt0000005,nm0443482,nm0005690,['short'],6.202478
1,1,tt0000005,nm0653042,nm0005690,['short'],6.202478
2,2,tt0000007,nm0179163,nm0005690,"['short', ' sport']",5.317792
3,3,tt0000007,nm0179163,nm0374658,"['short', ' sport']",5.317792
4,4,tt0000007,nm0183947,nm0005690,"['short', ' sport']",5.317792
...,...,...,...,...,...,...
10962033,10962033,tt9916840,nm1052583,nm0996406,"['adventure', ' animation', ' comedy']",6.905679
10962034,10962034,tt9916840,nm1052583,nm0996406,"['adventure', ' animation', ' comedy']",6.905679
10962035,10962035,tt9916840,nm2676923,nm0996406,"['adventure', ' animation', ' comedy']",6.905679
10962036,10962036,tt9916840,nm2676923,nm0996406,"['adventure', ' animation', ' comedy']",6.905679


In [3]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10962038 entries, 0 to 10962037
Data columns (total 6 columns):
 #   Column            Dtype  
---  ------            -----  
 0   Unnamed: 0        int64  
 1   tconst            object 
 2   nconst            object 
 3   directors         object 
 4   genres            object 
 5   popularity_score  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 501.8+ MB
None


In [4]:
# Preprocess data
def parse_genres(raw_genres):
    """Turn a stringified genre list into a list of clean genre tokens.

    Parameters
    ----------
    raw_genres : str
        Text such as "['short', ' sport']" (the form stored in the
        ``genres`` column), or "[]" for rows without genres.

    Returns
    -------
    list of str
        Genre names with surrounding whitespace removed and empty
        entries dropped.

    Raises
    ------
    ValueError, SyntaxError
        If ``raw_genres`` is not a valid Python literal.
    """
    # ast.literal_eval only accepts literals, so text read from the CSV can
    # never execute code the way eval() could.
    parsed = ast.literal_eval(raw_genres)
    # Entries after the first carry a leading space (e.g. ' sport'); without
    # stripping, 'sport' and ' sport' would become two different features.
    return [genre.strip() for genre in parsed if genre.strip()]


# Handle genres (multi-label categorical)
# token_pattern=None: the default pattern is unused once a tokenizer is
# supplied, and leaving it set makes scikit-learn emit a warning.
genre_vectorizer = CountVectorizer(tokenizer=parse_genres, token_pattern=None)
genre_matrix = genre_vectorizer.fit_transform(combined_df['genres'].fillna('[]'))

# Handle actors (nconst) and directors as text features
actor_vectorizer = CountVectorizer()
actor_matrix = actor_vectorizer.fit_transform(combined_df['nconst'].fillna(''))

director_vectorizer = CountVectorizer()
director_matrix = director_vectorizer.fit_transform(combined_df['directors'].fillna(''))



In [5]:
# Min-max scale popularity_score and write the scaled values back into
# the same column of combined_df.
scaler = MinMaxScaler()
popularity_column = combined_df[['popularity_score']]  # 2-D, single column
combined_df['popularity_score'] = scaler.fit_transform(popularity_column)

In [6]:
# Wrap the popularity scores as an (n_rows, 1) sparse column.
popularity_values = combined_df['popularity_score'].values
popularity_matrix = csr_matrix(popularity_values[:, np.newaxis])

In [7]:
# Combine features for similarity computation.
# Each feature group is multiplied by its weight before the groups are
# stacked side by side into one sparse CSR matrix.
genre_weight = 0.4
actor_weight = 0.35
director_weight = 0.2
popularity_weight = 0.05

weighted_blocks = [
    matrix * weight
    for matrix, weight in (
        (genre_matrix, genre_weight),
        (actor_matrix, actor_weight),
        (director_matrix, director_weight),
        (popularity_matrix, popularity_weight),
    )
]
feature_matrix = hstack(weighted_blocks).tocsr()

In [8]:
# Persist the feature matrix together with combined_df as a single tuple
# so external code can load both from one file.
model_artifact = (feature_matrix, combined_df)
joblib.dump(model_artifact, "movie_feature_model.pkl")
print("Feature matrix and data saved.")

Feature matrix and data saved.
