# Movie Recommender System


In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the tag-genome scores; the file has three columns: movieId, tagId, relevance.
scores = pd.read_csv('genome-scores.csv')
scores.info()  # prints column dtypes and the in-memory size of the frame
scores.head()  # last expression of the cell, so it is rendered as the cell output

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18472128 entries, 0 to 18472127
Data columns (total 3 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   tagId      int64  
 2   relevance  float64
dtypes: float64(1), int64(2)
memory usage: 422.8 MB


Unnamed: 0,movieId,tagId,relevance
0,1,1,0.032
1,1,2,0.02225
2,1,3,0.07
3,1,4,0.059
4,1,5,0.123


In [3]:
# Load the tag-genome vocabulary (columns: tagId, tag).
# Bound to `genome_tags` rather than `tags`: a later cell runs
# `tags = pd.read_csv('tags.csv')`, which would otherwise silently replace this frame.
genome_tags = pd.read_csv('genome-tags.csv')
genome_tags.info()
genome_tags.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1128 entries, 0 to 1127
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tagId   1128 non-null   int64 
 1   tag     1128 non-null   object
dtypes: int64(1), object(1)
memory usage: 17.8+ KB


Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [4]:
# Load the id cross-reference table (columns: movieId, imdbId, tmdbId).
links = pd.read_csv('links.csv')
links.info()  # non-null counts reveal any missing ids per column
links.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86537 entries, 0 to 86536
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  86537 non-null  int64  
 1   imdbId   86537 non-null  int64  
 2   tmdbId   86411 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 2.0 MB


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [5]:
# Load the free-text tags that users attached to movies
# (columns: userId, movieId, tag, timestamp).
tags = pd.read_csv('tags.csv')
tags.info()
tags.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2328315 entries, 0 to 2328314
Data columns (total 4 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   userId     int64 
 1   movieId    int64 
 2   tag        object
 3   timestamp  int64 
dtypes: int64(3), object(1)
memory usage: 71.1+ MB


Unnamed: 0,userId,movieId,tag,timestamp
0,10,260,good vs evil,1430666558
1,10,260,Harrison Ford,1430666505
2,10,260,sci-fi,1430666538
3,14,1221,Al Pacino,1311600756
4,14,1221,mafia,1311600746


In [6]:
# Load the movie catalogue (columns: movieId, title, genres).
# `movies` is the frame that search() and find_similar_movies() look titles up in.
movies = pd.read_csv('movies.csv')
movies.info()
movies.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86537 entries, 0 to 86536
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  86537 non-null  int64 
 1   title    86537 non-null  object
 2   genres   86537 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.0+ MB


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
import re
import unicodedata

def clean_title(title):
    """Normalise a movie title so it can be used for text search.

    Accented letters are first folded to their ASCII base letter
    (e.g. "é" -> "e"), so such titles keep the spelling a user would type
    instead of losing the letter altogether. Every character that is not an
    ASCII letter, a digit or a space is then removed.

    Args:
        title: Raw title string, e.g. "Toy Story (1995)".

    Returns:
        The cleaned title, e.g. "Toy Story 1995".
    """
    # NFKD splits "é" into "e" plus a combining accent; the regex below then
    # drops the accent together with punctuation and any other symbol.
    decomposed = unicodedata.normalize('NFKD', title)
    return re.sub(r'[^a-zA-Z0-9 ]', '', decomposed)

In [8]:
# Add a search-friendly copy of every title as a new `clean_title` column on `movies`
# (this mutates `movies` in place).
movies["clean_title"] = movies["title"].apply(clean_title)
display(movies.head())

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


Turn the cleaned titles into numeric TF-IDF vectors for the search engine

In [9]:

from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1, 2): index single words as well as pairs of adjacent words.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
# Learn the vocabulary from the cleaned titles and vectorise them; search() later maps
# row positions of `tfidf` back to `movies` with .iloc, so both must keep the same row order.
tfidf = vectorizer.fit_transform(movies['clean_title'])

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

def search(title, top_n=5):
    """Return the movies whose titles best match a free-text query.

    Args:
        title: Query text. It is passed through clean_title() before being
            vectorised, the same cleaning that was applied to the indexed titles.
        top_n: Maximum number of matches to return (default 5).

    Returns:
        The matching rows of `movies`, ordered from most to least similar.
    """
    title = clean_title(title)
    title_tfidf = vectorizer.transform([title])  # query as a TF-IDF row vector
    similarities = cosine_similarity(title_tfidf, tfidf).flatten()

    # Never ask for more rows than exist: argpartition raises on an out-of-range kth.
    k = min(top_n, similarities.size)
    if k <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the k best scores end up in the last k
    # slots, in arbitrary order, so rank those k explicitly by descending score.
    top = np.argpartition(similarities, -k)[-k:]
    top = top[np.argsort(similarities[top])[::-1]]
    return movies.iloc[top]


Widget for searching

In [11]:
from ipywidgets import widgets
from IPython.display import display

# Text box the user types a movie title into.
movie_search = widgets.Text(
    description='Search:',
    placeholder='Enter movie title'
)
# Output area that on_search_change() renders the search results into.
movie_list = widgets.Output()

def on_search_change(data):
    """Refresh the results area whenever the search box content changes.

    Args:
        data: Change notification passed by the widget; the new text is read
            from its 'new' entry.
    """
    with movie_list:
        # Always wipe the previous results first.
        movie_list.clear_output()
        query = data['new']
        # Queries of five characters or fewer are ignored.
        if len(query) <= 5:
            return
        display(search(query))

# Call on_search_change every time the text box's `value` changes.
movie_search.observe(on_search_change, names='value')

# Show the text box followed by the results area.
display(movie_search,movie_list)

Text(value='', description='Search:', placeholder='Enter movie title')

Output()

Implement recommendations based on other users' ratings (what else do the people who like this movie like?)

In [12]:
# Load every user rating (columns: userId, movieId, rating, timestamp).
# This is by far the largest table in the notebook (about 1 GB in memory per .info()).
ratings = pd.read_csv('ratings.csv')
ratings.info()
ratings.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33832162 entries, 0 to 33832161
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 1.0 GB


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119


Find the users who also liked the movie we like

In [13]:
# Movie to build recommendations for; movieId 1 is "Toy Story (1995)" in movies.csv.
movieId = 1

In [14]:
# "Similar users": everyone who rated the chosen movie strictly above 4 (e.g. 4.5 or 5.0).
# .unique() yields each userId once.
similar_users = ratings[(ratings["movieId"]==movieId) & (ratings["rating"]>4)]["userId"].unique()
similar_users

array([     2,     12,     24, ..., 330947, 330951, 330955])

In [15]:
# Every movie that a similar user rated above 4: one movieId entry per (user, movie) like.
# The chosen movie itself is included, since all similar users liked it.
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"]>4)]["movieId"]
similar_user_recs

62             1
67            17
69            21
72            34
73            36
            ... 
33830278     786
33830279     788
33830280     802
33830281     805
33830282    1073
Name: movieId, Length: 1789422, dtype: int64

In [16]:
# For each movie, the number of similar users who liked it (display only, nothing is rebound).
similar_user_recs.value_counts()

movieId
1         25417
318       10782
260        9789
356        9099
296        8794
          ...  
284297        1
282967        1
282727        1
279054        1
141558        1
Name: count, Length: 24580, dtype: int64

In [17]:
# Keep only the movies liked by more than 10% of the similar users (narrows the candidates down).
# NOTE: this cell rebinds `similar_user_recs` from the raw movieId Series to a share-per-movie
# Series, so it is not idempotent; re-run the cell that builds `similar_user_recs` before re-running it.
similar_user_recs = similar_user_recs.value_counts()  # likes per movie among similar users
similar_user_recs = similar_user_recs / len(similar_users)  # convert counts to a fraction of similar users
similar_user_recs = similar_user_recs[similar_user_recs > 0.1]
similar_user_recs

movieId
1         1.000000
318       0.424204
260       0.385136
356       0.357989
296       0.345989
            ...   
1208      0.104182
1387      0.103435
3996      0.102294
5418      0.101940
134853    0.101271
Name: count, Length: 106, dtype: float64

In [18]:
# All likes (rating above 4) that ANY user gave to one of the shortlisted movies;
# this is the baseline the similar-user shares are compared against.
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"]>4)]
all_users

Unnamed: 0,userId,movieId,rating,timestamp
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119
7,1,1036,5.0,1225735626
12,1,1210,4.5,1225735210
14,1,1291,5.0,1225734809
...,...,...,...,...
33831754,330974,4963,5.0,1457563122
33831755,330974,4993,4.5,1457563097
33831759,330974,5952,4.5,1457563120
33831765,330974,7153,4.5,1457563106


It is likely that some of the recommended movies are simply popular with everyone and not really related to the movie we searched for. To check this, we compare, for each candidate, the percentage of similar users who liked it with the percentage of all users who liked it. If the two percentages are close, the movie is just broadly popular and is not a great recommendation. We want movies with a big differential between how often they are liked by people who share our taste and how often they are liked by everybody in general.

In [19]:
# Share of the general audience that liked each shortlisted movie.
# The denominator is the number of distinct users who liked at least one shortlisted movie.
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())  # fraction, not count
all_users_recs


movieId
318       0.314060
296       0.235714
2571      0.219088
356       0.203730
2959      0.192585
            ...   
1073      0.037133
134853    0.036077
1387      0.035993
1148      0.034668
78499     0.030072
Name: count, Length: 106, dtype: float64

In [20]:
# Put both shares side by side, aligned on movieId:
# "similar" = share among similar users, "all" = share among the general audience.
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
rec_percentages.columns = ["similar", "all"]
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.101681
318,0.424204,0.314060
260,0.385136,0.186964
356,0.357989,0.203730
296,0.345989,0.235714
...,...,...
1208,0.104182,0.064868
1387,0.103435,0.035993
3996,0.102294,0.048086
5418,0.101940,0.051507


In [21]:
# score = similar share / overall share. A high score means the movie is liked far more
# often by similar users than by users in general, i.e. it is specific to this taste
# rather than merely popular.
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
rec_percentages = rec_percentages.sort_values("score", ascending=False)  # best candidates first
rec_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.000000,0.101681,9.834678
3114,0.267262,0.042281,6.321030
78499,0.158516,0.030072,5.271241
4886,0.234843,0.060076,3.909116
6377,0.223905,0.059460,3.765670
...,...,...,...
858,0.259787,0.182183,1.425963
318,0.424204,0.314060,1.350710
2959,0.255852,0.192585,1.328519
79132,0.178699,0.135129,1.322432


In [22]:
# Attach title and genres to the ten best-scoring movies (index of rec_percentages = movieId).
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.101681,9.834678,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.267262,0.042281,6.32103,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
14815,0.158516,0.030072,5.271241,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4781,0.234843,0.060076,3.909116,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
6259,0.223905,0.05946,3.76567,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
580,0.197978,0.052843,3.746544,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
359,0.242751,0.067141,3.615561,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994
8248,0.204706,0.057687,3.548532,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
587,0.161231,0.04601,3.504261,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
1047,0.12771,0.037133,3.439276,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971


Let's put everything into a single function

In [23]:
def find_similar_movies(movieId, min_rating=4, min_share=0.1, top_n=10):
    """Recommend movies that fans of `movieId` like unusually often.

    "Liking" a movie means rating it strictly above `min_rating`. Users who
    liked `movieId` are the "similar users". Each candidate movie is scored as

        score = (share of similar users who liked it)
                / (share of the general audience who liked it)

    so broadly popular movies are pushed down and movies specific to this
    taste are pushed up. `movieId` itself is part of the result, because
    every similar user liked it.

    Reads the notebook-level `ratings` and `movies` frames.

    Args:
        movieId: Id of the movie to base the recommendations on.
        min_rating: A rating counts as a like only if it is strictly greater
            than this value (default 4).
        min_share: Only movies liked by more than this fraction of the similar
            users are considered (default 0.1).
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        DataFrame with the columns movieId, title, genres and score, ordered
        by descending score. It is empty when nobody liked `movieId` or no
        movie passes `min_share`.
    """
    result_columns = ["movieId", "title", "genres", "score"]

    # The "liked" mask over the full ratings table is needed three times; build it once.
    liked = ratings["rating"] > min_rating

    # Users who liked the movie we are searching for.
    similar_users = ratings.loc[liked & (ratings["movieId"] == movieId), "userId"].unique()
    if len(similar_users) == 0:
        # Nobody liked this movie: there is nothing to base a recommendation on.
        return pd.DataFrame(columns=result_columns)

    # Share of the similar users who liked each movie, restricted to the well-liked ones.
    similar_counts = ratings.loc[liked & ratings["userId"].isin(similar_users), "movieId"].value_counts()
    similar_share = similar_counts / len(similar_users)
    similar_share = similar_share[similar_share > min_share]
    if similar_share.empty:
        return pd.DataFrame(columns=result_columns)

    # Share of the general audience who liked each shortlisted movie. The audience is
    # every user who liked at least one shortlisted movie.
    all_likes = ratings.loc[liked & ratings["movieId"].isin(similar_share.index), ["userId", "movieId"]]
    all_share = all_likes["movieId"].value_counts() / all_likes["userId"].nunique()

    # Align both shares on movieId and compare them.
    rec_percentages = pd.concat([similar_share, all_share], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # Keep the best candidates and attach their titles and genres.
    best = rec_percentages.head(top_n)
    return best.merge(movies, left_index=True, right_on="movieId")[result_columns]

In [24]:
# Recommendations for movieId 1, "Toy Story (1995)".
find_similar_movies(1)

Unnamed: 0,movieId,title,genres,score
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,9.834678
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,6.32103
14815,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,5.271241
4781,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,3.909116
6259,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,3.76567
580,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,3.746544
359,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,3.615561
8248,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,3.548532
587,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,3.504261
1047,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,3.439276
