In [17]:
import sys
import os
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Add the project root to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))


In [18]:
from src import data_loader

# Load the cleaned data
df = data_loader.load_and_preprocess_data()

# Preview the result
df.head(10)


Unnamed: 0,userId,movieId,rating,date,title,genres,year
0,1,1,4.0,2000-07-30,Toy Story (1995),Adventure,1995
1,1,1,4.0,2000-07-30,Toy Story (1995),Animation,1995
2,1,1,4.0,2000-07-30,Toy Story (1995),Children,1995
3,1,1,4.0,2000-07-30,Toy Story (1995),Comedy,1995
4,1,1,4.0,2000-07-30,Toy Story (1995),Fantasy,1995
5,1,3,4.0,2000-07-30,Grumpier Old Men (1995),Comedy,1995
6,1,3,4.0,2000-07-30,Grumpier Old Men (1995),Romance,1995
7,1,6,4.0,2000-07-30,Heat (1995),Action,1995
8,1,6,4.0,2000-07-30,Heat (1995),Crime,1995
9,1,6,4.0,2000-07-30,Heat (1995),Thriller,1995


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 274480 entries, 0 to 274479
Data columns (total 7 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   274480 non-null  int64  
 1   movieId  274480 non-null  int64  
 2   rating   274480 non-null  float64
 3   date     274480 non-null  object 
 4   title    274480 non-null  object 
 5   genres   274480 non-null  object 
 6   year     274453 non-null  object 
dtypes: float64(1), int64(2), object(4)
memory usage: 14.7+ MB


In [20]:
df['userId'] = df['userId'].astype('int32')
df['movieId'] = df['movieId'].astype('int32')
df['rating'] = df['rating'].astype('float32')

df['date'] = pd.to_datetime(df['date'], errors='coerce')  # handles bad dates as NaT
df['title'] = df['title'].astype('string')
df['genres'] = df['genres'].astype('category')

df['year'] = pd.to_numeric(df['year'], errors='coerce').astype('Int16')  # or .astype('string') if needed


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 274480 entries, 0 to 274479
Data columns (total 7 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   userId   274480 non-null  int32         
 1   movieId  274480 non-null  int32         
 2   rating   274480 non-null  float32       
 3   date     274480 non-null  datetime64[ns]
 4   title    274480 non-null  string        
 5   genres   274480 non-null  category      
 6   year     274453 non-null  Int16         
dtypes: Int16(1), category(1), datetime64[ns](1), float32(1), int32(2), string(1)
memory usage: 8.4 MB


In [22]:
df

Unnamed: 0,userId,movieId,rating,date,title,genres,year
0,1,1,4.0,2000-07-30,Toy Story (1995),Adventure,1995
1,1,1,4.0,2000-07-30,Toy Story (1995),Animation,1995
2,1,1,4.0,2000-07-30,Toy Story (1995),Children,1995
3,1,1,4.0,2000-07-30,Toy Story (1995),Comedy,1995
4,1,1,4.0,2000-07-30,Toy Story (1995),Fantasy,1995
...,...,...,...,...,...,...,...
274475,610,168252,5.0,2017-05-03,Logan (2017),Sci-Fi,2017
274476,610,170875,3.0,2017-05-03,The Fate of the Furious (2017),Action,2017
274477,610,170875,3.0,2017-05-03,The Fate of the Furious (2017),Crime,2017
274478,610,170875,3.0,2017-05-03,The Fate of the Furious (2017),Drama,2017


In [23]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np



df['genres'] = df['genres'].str.replace('|', ' ', regex=False)
df['title'] = df['title'].str.replace(r'\\s*\\(\\d{4}\\)\\s*$', '', regex=True)
id_to_title = pd.Series(df.title.values, index=df.index)

tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['genres'])

model_components = {
    'vectorizer': tfidf_vectorizer,
    'tfidf_matrix': tfidf_matrix,
    'id_to_title': id_to_title
}

model_filename = "genre_to_title_model.pkl"
with open(model_filename, 'wb') as f:
    pickle.dump(model_components, f)

print(f"Model components saved to {model_filename}")


# --- Example of Loading and Using the Model ---
with open(model_filename, 'rb') as f:
    loaded_model = pickle.load(f)

print(f"\\nModel loaded from {model_filename}")

def predict_title_from_genres(genres_list):
    vectorizer = loaded_model['vectorizer']
    movie_matrix = loaded_model['tfidf_matrix']
    id_to_title_map = loaded_model['id_to_title']

    input_string = ' '.join(genres_list)
    input_vector = vectorizer.transform([input_string])
    
    cosine_similarities = cosine_similarity(input_vector, movie_matrix).flatten()
    
    most_similar_movie_index = np.argmax(cosine_similarities)
    
    predicted_title = id_to_title_map[most_similar_movie_index]
    
    return predicted_title

input_genres = ['Action', 'Thriller']
predicted_title = predict_title_from_genres(input_genres)
print(f"\\nInput Genres: {input_genres}")
print(f"Predicted Title: {predicted_title}")

input_genres_2 = ['Adventure', 'Comedy', 'Animation']
predicted_title_2 = predict_title_from_genres(input_genres_2)
print(f"\\nInput Genres: {input_genres_2}")
print(f"Predicted Title: {predicted_title_2}")

Model components saved to genre_to_title_model.pkl
\nModel loaded from genre_to_title_model.pkl
\nInput Genres: ['Action', 'Thriller']
Predicted Title: Heat (1995)
\nInput Genres: ['Adventure', 'Comedy', 'Animation']
Predicted Title: Toy Story (1995)


In [27]:
column_values = df['genres']
column_values.value_counts()

genres
Drama                 41928
Comedy                39053
Action                30635
Thriller              26452
Adventure             24161
Romance               18124
Sci-Fi                17243
Crime                 16681
Fantasy               11834
Children               9208
Mystery                7674
Horror                 7291
Animation              6988
War                    4859
IMAX                   4145
Musical                4138
Western                1930
Documentary            1219
Film-Noir               870
(no genres listed)       47
Name: count, dtype: int64