In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import OneHotEncoder
from collections import Counter
import tensorflow as tf






[nltk_data] Downloading package punkt to /Users/paolochan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
## Step 1: Gathering data

# Movies: load the catalogue (adjust the path if the CSV lives elsewhere).
df_movies = pd.read_csv("imdb_top_1000.csv")

# Synthetic user data: one uniform draw in [0, 1) per row, rounded to two
# decimals.  Seeding the legacy global generator keeps the draws repeatable.
np.random.seed(12)
titles = df_movies['Series_Title']
draws = (np.round(np.random.uniform(0.0, 1.0), 2) for _ in titles)
# A repeated title overwrites its earlier draw, so duplicate titles share a rating.
user_ratings = dict(zip(titles, draws))

# Attach each movie's rating by looking its title up in the mapping.
df_movies['User_Rating'] = titles.map(user_ratings)

In [None]:
## Step 2: Preprocessing data

## Cleaning
# Discard the columns that are not used as features.
unused_columns = ["Gross", "Poster_Link", "Certificate", "No_of_Votes", "Meta_score", "Runtime"]
df_movies = df_movies.drop(columns=unused_columns)

# Normalise the text columns: lower-case and trim surrounding whitespace.
for text_column in ("Series_Title", "Genre"):
    df_movies[text_column] = df_movies[text_column].str.lower().str.strip()

# Turn the comma-separated genre string into a list of genre names.
df_movies['Genre_List'] = df_movies['Genre'].str.split(', ')

## Merge the actors into one list
# Each movie gets a four-element list of its stars, lower-cased and trimmed.
# str() is applied first, so a missing value ends up as the text "nan".
star_columns = ["Star1", "Star2", "Star3", "Star4"]
df_movies["Stars"] = pd.Series(
    [
        [str(name).lower().strip() for name in billed]
        for billed in df_movies[star_columns].itertuples(index=False, name=None)
    ],
    index=df_movies.index,
)
# Count actor occurrences across every movie's star list.
actor_counts = Counter(actor for sublist in df_movies['Stars'] for actor in sublist)

# "Top" actors are those credited at least MIN_ACTOR_APPEARANCES times.
MIN_ACTOR_APPEARANCES = 3
top_actors = [actor for actor, count in actor_counts.items() if count >= MIN_ACTOR_APPEARANCES]
# Set view of top_actors so each membership check is a hash lookup rather
# than a scan of the whole list.
top_actor_set = frozenset(top_actors)


def filter_top_actors(stars_list):
    """Return the actors in ``stars_list`` that are top actors, order preserved."""
    return [actor for actor in stars_list if actor in top_actor_set]


# Keep only top actors in each movie
df_movies['Stars'] = df_movies['Stars'].apply(filter_top_actors)

# Normalize IMDB rating: min-max scale it with MinMaxScaler's default settings.
scaler = MinMaxScaler()
imdb_scaled = scaler.fit_transform(df_movies[['IMDB_Rating']])
# fit_transform returns a single-column 2-D array; flatten it for the column.
df_movies['Nrating'] = imdb_scaled.ravel()


# Encoding

# Genres: multi-hot encode each movie's genre list, one column per genre,
# with rows labelled by movie title.
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(df_movies['Genre_List'])
genre_encoded = pd.DataFrame(
    genre_matrix,
    index=df_movies['Series_Title'],
    columns=mlb.classes_,
)

# Encoding Director
# Keep the 25 most frequent directors; every other director becomes 'Other'.
top_25_directors = df_movies['Director'].value_counts().nlargest(25).index
df_movies['Director'] = df_movies['Director'].where(df_movies['Director'].isin(top_25_directors), 'Other')

# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed the old
# keyword in 1.4, where `sparse=False` raises TypeError.  Try the current
# spelling first and fall back for older releases so the encoder stays dense.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

director_encoded = pd.DataFrame(
    ohe.fit_transform(df_movies[['Director']]),
    columns=ohe.get_feature_names_out(['Director']),
    index=df_movies['Series_Title'],
)

# Actors: multi-hot encode each movie's 'Stars' list, one column per actor,
# with rows labelled by movie title.
mlba = MultiLabelBinarizer()
actor_matrix = mlba.fit_transform(df_movies['Stars'])
actor_encoded = pd.DataFrame(
    actor_matrix,
    index=df_movies['Series_Title'],
    columns=mlba.classes_,
)



# Tokenizing and embedding of overview
# A missing overview is treated as empty text, i.e. an empty token list.
df_movies['tokens'] = df_movies['Overview'].fillna('').apply(word_tokenize)

# Train with an explicit seed on a single worker thread.  gensim documents
# that multi-threaded training is not reproducible (thread-scheduling jitter),
# which would make the embedding features differ from run to run.
# NOTE: reproducibility across interpreter launches may additionally require
# PYTHONHASHSEED to be fixed -- see the gensim Word2Vec documentation.
w2v_model = Word2Vec(
    sentences=df_movies["tokens"].tolist(),
    vector_size=100,
    window=5,
    min_count=2,
    workers=1,
    seed=12,
)


def average_vector(tokens):
    """Return the mean Word2Vec vector of ``tokens``.

    Tokens that are not in the model vocabulary are skipped.  If none of the
    tokens is in the vocabulary (including an empty token list), a zero vector
    of length ``w2v_model.vector_size`` is returned instead.
    """
    vectors = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(w2v_model.vector_size)


df_movies['overview_vector'] = df_movies['tokens'].apply(average_vector)

# Assemble the feature matrix: one row per movie, feature groups side by side.
feature_groups = [
    genre_encoded.to_numpy(),
    actor_encoded.to_numpy(),
    director_encoded.to_numpy(),
    # Stack the per-movie overview vectors into one 2-D array.
    np.vstack(df_movies['overview_vector'].tolist()),
    df_movies[['Nrating']].to_numpy(),
]
X = np.hstack(feature_groups)

# Target: the per-movie user rating.
y = df_movies['User_Rating'].to_numpy()





In [None]:


# Hold out 20% of the movies for testing; the fixed random_state keeps the
# partition the same on every run.
split = train_test_split(X, y, random_state=12, test_size=0.2)
X_train, X_test, y_train, y_test = split


In [None]:
# Using Keras to make a neural network

# `model` was not defined before this line, so `model.sequential` raised
# NameError.  Create an empty Keras Sequential model instead; layers can be
# added to it afterwards.
model = tf.keras.Sequential()