# Import Libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Load Dataset

In [2]:
movie_df=pd.read_csv("movies.csv")
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


# Data Preprocessing

In [3]:
movie_df['title']=movie_df['title'].apply(lambda t: t.lower())
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,toy story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,jumanji (1995),Adventure|Children|Fantasy
2,3,grumpier old men (1995),Comedy|Romance
3,4,waiting to exhale (1995),Comedy|Drama|Romance
4,5,father of the bride part ii (1995),Comedy


In [4]:
# Checking for null values
movie_df.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [5]:
# Text Vectorization and Similarity Matrix
vectorizer=TfidfVectorizer(stop_words='english')
vectors=vectorizer.fit_transform(movie_df['genres']).toarray()
sim_matrix=cosine_similarity(vectors)
sim_matrix.shape

(9742, 9742)

In [6]:
# Create a DataFrame similarity matrix
simi_cnt_df=pd.DataFrame(sim_matrix,index=movie_df['title'],
                     columns=movie_df['title'])
simi_cnt_df.head()

title,toy story (1995),jumanji (1995),grumpier old men (1995),waiting to exhale (1995),father of the bride part ii (1995),heat (1995),sabrina (1995),tom and huck (1995),sudden death (1995),goldeneye (1995),...,gintama: the movie (2010),anohana: the flower we saw that day - the movie (2013),silver spoon (2014),love live! the school idol movie (2015),jon stewart has left the building (2015),black butler: book of the atlantic (2017),no game no life: zero (2017),flint (2017),bungo stray dogs: dead apple (2018),andrew dice clay: dice rules (1991)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
toy story (1995),1.0,0.813578,0.152769,0.135135,0.267586,0.0,0.152769,0.654698,0.0,0.262413,...,0.360397,0.465621,0.196578,0.516225,0.0,0.680258,0.755891,0.0,0.421037,0.267586
jumanji (1995),0.813578,1.0,0.0,0.0,0.0,0.0,0.0,0.804715,0.0,0.322542,...,0.0,0.0,0.0,0.0,0.0,0.341376,0.379331,0.0,0.0,0.0
grumpier old men (1995),0.152769,0.0,1.0,0.884571,0.570915,0.0,1.0,0.0,0.0,0.0,...,0.162848,0.0,0.419413,0.0,0.0,0.181883,0.202105,0.0,0.0,0.570915
waiting to exhale (1995),0.135135,0.0,0.884571,1.0,0.505015,0.0,0.884571,0.0,0.0,0.0,...,0.144051,0.201391,0.68744,0.0,0.0,0.160888,0.178776,0.466405,0.0,0.505015
father of the bride part ii (1995),0.267586,0.0,0.570915,0.505015,1.0,0.0,0.570915,0.0,0.0,0.0,...,0.28524,0.0,0.734632,0.0,0.0,0.318581,0.354002,0.0,0.0,1.0


In [7]:
# Content-Based Recommendation Function
def content_recommend(title):
    title=title.lower()
    if title not in simi_cnt_df.index:
        return "Movie is not in our dataset"
    sim_score=simi_cnt_df[title]
    sim_score=sim_score.sort_values(ascending=False)[1:11]
    df= pd.DataFrame({
        'movie':sim_score.index,
        'content_score':sim_score.values
    })
    return df

In [8]:
# Inference
Movie='toy story (1995)'

content_recommend(Movie)

Unnamed: 0,movie,content_score
0,toy story 2 (1999),1.0
1,"tale of despereaux, the (2008)",1.0
2,asterix and the vikings (astérix et les viking...,1.0
3,shrek the third (2007),1.0
4,turbo (2013),1.0
5,"monsters, inc. (2001)",1.0
6,the good dinosaur (2015),1.0
7,antz (1998),1.0
8,"emperor's new groove, the (2000)",1.0
9,moana (2016),1.0


# Load Dataset for Ratings

In [9]:
rating_df=pd.read_csv('ratings.csv')
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


# Data Preprocessing

In [10]:
rating_df.drop(columns=['timestamp'])

rate_m_df=pd.merge(movie_df,rating_df,on='movieId')[['title','userId','rating']]
rate_m_df.head()

Unnamed: 0,title,userId,rating
0,toy story (1995),1,4.0
1,toy story (1995),5,4.0
2,toy story (1995),7,4.5
3,toy story (1995),15,2.5
4,toy story (1995),17,4.5


In [11]:
# Pivot table for user-item matrix
rating_per_user=pd.pivot_table(data=rate_m_df,index='title',
                               columns='userId',values='rating').fillna(0)
rating_per_user.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
"'burbs, the (1989)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'hellboy': the seeds of creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'night mother (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'round midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Similarity matrix based on user ratings
rating_matrix=cosine_similarity(rating_per_user.values)
rating_matrix.shape

(9719, 9719)

In [13]:
# DataFrame for user-based similarity matrix
simi_colb_df=pd.DataFrame(rating_matrix,index=rating_per_user.index,
                          columns=rating_per_user.index)
simi_colb_df.head()

title,'71 (2014),"'burbs, the (1989)",'hellboy': the seeds of creation (2004),'night mother (1986),'round midnight (1986),'salem's lot (2004),'til there was you (1997),'tis the season for love (2015),(500) days of summer (2009),*batteries not included (1987),...,zookeeper (2011),zoolander (2001),zoolander 2 (2016),zoom (2006),zoom (2015),zootopia (2016),zulu (1964),zulu (2013),¡three amigos! (1986),à nous la liberté (freedom for us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141653,0.0,...,0.0,0.149201,0.0,0.0,0.0,0.178042,0.0,0.0,0.0,0.0
"'burbs, the (1989)",0.0,1.0,0.0,0.071429,0.176777,0.0,0.0,0.0,0.099735,0.248271,...,0.0,0.091918,0.0,0.0,0.0,0.038152,0.025986,0.0,0.372876,0.0
'hellboy': the seeds of creation (2004),0.0,0.0,1.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'night mother (1986),0.0,0.071429,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.363803,0.0,0.180009,0.0
'round midnight (1986),0.0,0.176777,0.707107,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Collaborative Filtering Recommendation Function
def collaborative_recommend(title):
    title=title.lower()
    if title not in simi_colb_df.index:
        return "Movie is not in our dataset"
    simi_vector=simi_colb_df[title]
    simi_scores=simi_vector.sort_values(ascending=False)[1:11]
    df=pd.DataFrame({
        'movie':simi_scores.index,
        'collab_score':simi_scores.values
    })
    return df
    

In [15]:
# Inference
movie="""toy story (1995)"""

collaborative_recommend(movie)

Unnamed: 0,movie,collab_score
0,toy story 2 (1999),0.572601
1,jurassic park (1993),0.565637
2,independence day (a.k.a. id4) (1996),0.564262
3,star wars: episode iv - a new hope (1977),0.557388
4,forrest gump (1994),0.547096
5,"lion king, the (1994)",0.541145
6,star wars: episode vi - return of the jedi (1983),0.541089
7,mission: impossible (1996),0.538913
8,groundhog day (1993),0.534169
9,back to the future (1985),0.530381


In [16]:
# Hybrid Recommendation Function combining both content-based and collaborative filtering

def hybrid_recommendation(title):
    content_df=content_recommend(title)
    collab_df=collaborative_recommend(title)
    combined=pd.merge(content_df,collab_df,on='movie',how='outer').fillna(0)
    scaler = MinMaxScaler()
    combined[['content_scaled', 'collab_scaled']] = scaler.fit_transform(
        combined[['content_score', 'collab_score']]
    )
    
    # Weighted combination (example: 30% content, 70% collab)
    combined['final_score'] = 0.3 * combined['content_scaled']+ 0.7 * combined['collab_scaled']
    
    # Sort top 10
    final_recommendation=combined.sort_values('final_score',
                                              ascending=False)[['movie',
                                                                'final_score']][:10]
                                              
    return final_recommendation.reset_index(drop=True)
    

In [17]:
# Inference
movie="""toy story (1995)"""
hybrid_recommendation(movie)

Unnamed: 0,movie,final_score
0,toy story 2 (1999),1.0
1,jurassic park (1993),0.691486
2,independence day (a.k.a. id4) (1996),0.689805
3,star wars: episode iv - a new hope (1977),0.681402
4,forrest gump (1994),0.66882
5,"lion king, the (1994)",0.661545
6,star wars: episode vi - return of the jedi (1983),0.661477
7,mission: impossible (1996),0.658816
8,groundhog day (1993),0.653017
9,back to the future (1985),0.648387
