In [1]:
import pandas as pd
# Movie metadata: one row per movie with movieId, title and pipe-separated genres.
movies = pd.read_csv("Data/movies.csv")
# User ratings: userId, movieId, rating, timestamp.
ratings = pd.read_csv("Data/ratings.csv")
# Last expression of the cell: rendered as a (truncated) table.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [2]:
# Cleaning movie titles using Python's regular expressions.

import re

def clean_title(title):
    """Return `title` with everything except ASCII letters, digits and spaces removed.

    Punctuation such as parentheses, commas, colons and hyphens is dropped
    without inserting a replacement, so "Ant-Man (2015)" becomes "AntMan 2015".
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [3]:
# Add a punctuation-free copy of every title as a new column (mutates `movies` in place).
movies["clean_title"] = movies["title"].apply(clean_title)

In [4]:
# Display the frame again to check the new clean_title column.
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,Black Butler Book of the Atlantic 2017
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,No Game No Life Zero 2017
9739,193585,Flint (2017),Drama,Flint 2017
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,Bungo Stray Dogs Dead Apple 2018


In [5]:
# Building the search engine

# Creating a TF-IDF matrix

from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1,2): index single words as well as two-word sequences of each title.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Learn the vocabulary from the cleaned titles and vectorise them; `tfidf` has
# one row per row of `movies`, in the same order (search() relies on this).
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [6]:
#Creating a search function

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the rows of `movies` whose titles best match `title`, best match first.

    The query is cleaned with `clean_title`, vectorised with the fitted
    module-level `vectorizer`, and compared against every row of `tfidf`
    by cosine similarity.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.
    top_n : int, default 5
        Maximum number of rows to return. Fewer rows are returned when
        `movies` has fewer than `top_n` rows.

    Returns
    -------
    pandas.DataFrame
        Up to `top_n` rows of `movies`, ordered by decreasing similarity;
        ties keep the original row order of `movies`.
    """
    query_vec = vectorizer.transform([clean_title(title)])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    # A partial sort (np.argpartition) only guarantees WHICH indices are in the
    # top group, not their order, so reversing its output does not put the best
    # match first. A full stable sort on the negated scores does, and it also
    # copes with a corpus smaller than `top_n`.
    best_first = np.argsort(-similarity, kind="stable")[:top_n]
    return movies.iloc[best_first]

In [7]:
# Building an interactive search box with Jupyter

import ipywidgets as widgets
from IPython.display import display

# Text box the user types a movie title into, pre-filled with an example.
movie_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
# Output area that shows the search results underneath the text box.
movie_list = widgets.Output()


def on_type(data):
    """Refresh `movie_list` whenever the text box value changes.

    `data` is the change notification delivered by `observe`; its "new"
    entry is read as the current text. Searches run only for text longer
    than 5 characters; shorter text just clears the output.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        display(search(typed))


# Call on_type on every change of the box's `value`.
movie_input.observe(on_type, names='value')

display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [8]:
# Example movie used to prototype the recommendation steps in the next cells
# (movieId 89745 is "Avengers, The (2012)" in movies.csv).
movie_id = 89745

# The steps are first worked out at top level with this fixed movie_id; they are
# wrapped into find_similar_movies(movie_id) in a later cell.
# Row(s) of `movies` for the chosen movie.
# NOTE(review): `movie` is not referenced again in the cells visible here.
movie = movies[movies["movieId"] == movie_id]

In [9]:
# NOTE(review): the same file is already read into `ratings` in the first cell;
# this second read rebinds the name to an identical frame.
ratings = pd.read_csv("Data/ratings.csv") # Dataset used to build the recommendation system
# Preview the first five rows.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [10]:
# Check column types: ids and timestamp are integers, rating is a float.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [11]:
# Users who rated the chosen movie above 4: the "similar" users
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] > 4),
    "userId",
].unique()
similar_users

array([ 52,  68, 154, 184, 210, 211, 249, 291, 308, 344, 377, 380, 382,
       393, 489, 509, 511, 522, 525, 550, 561, 573, 582, 586, 601, 610],
      dtype=int64)

In [12]:
# movieId of every rating above 4 given by one of the similar users
# (one entry per rating row, so popular movies repeat).
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
    "movieId",
]
similar_user_recs

7784         318
7785         356
7786         364
7787         588
7788         733
           ...  
100821    160527
100829    164179
100832    168248
100833    168250
100834    168252
Name: movieId, Length: 2253, dtype: int64

In [13]:
# Share of the similar users who liked each movie. The counts are taken straight
# from `ratings` instead of from the previous value of `similar_user_recs`, so
# re-running this cell cannot feed already-normalised shares back into
# value_counts(); the result is the same on every run.
similar_user_recs = (
    ratings.loc[
        ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
        "movieId",
    ].value_counts()
    / len(similar_users)
)

# Keep only movies liked by more than 10% of the similar users.
similar_user_recs = similar_user_recs[similar_user_recs > .10]

similar_user_recs

89745     1.000000
2571      0.615385
79132     0.615385
112852    0.576923
58559     0.576923
            ...   
79091     0.115385
2115      0.115385
1617      0.115385
5903      0.115385
3753      0.115385
Name: movieId, Length: 241, dtype: float64

In [14]:
# Ratings above 4, from any user, for the candidate movies found above
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] > 4)
]
all_users

Unnamed: 0,userId,movieId,rating,timestamp
3,1,47,5.0,964983815
4,1,50,5.0,964982931
10,1,163,5.0,964983650
15,1,260,5.0,964981680
25,1,457,5.0,964981909
...,...,...,...,...
100743,610,122920,5.0,1493845626
100780,610,139385,4.5,1493846777
100814,610,158238,5.0,1479545219
100829,610,164179,5.0,1493845631


In [15]:
# Share of those users (anyone who liked at least one candidate) who liked each movie
all_user_recs = (
    all_users["movieId"]
    .value_counts()
    .div(len(all_users["userId"].unique()))
)

all_user_recs

318       0.353765
296       0.292469
356       0.271454
2571      0.262697
2959      0.227671
            ...   
86377     0.007005
158238    0.007005
88744     0.007005
93721     0.005254
112623    0.005254
Name: movieId, Length: 241, dtype: float64

In [16]:
# Put both shares side by side, aligned on movieId, as input for the score
rec_percentages = pd.concat(
    [similar_user_recs, all_user_recs],
    axis=1,
).set_axis(["similar", "all"], axis=1)

rec_percentages

Unnamed: 0,similar,all
1,0.153846,0.113835
6,0.115385,0.056042
10,0.153846,0.035026
32,0.153846,0.099825
47,0.192308,0.141856
...,...,...
158238,0.115385,0.007005
164179,0.153846,0.014011
166528,0.230769,0.019264
168252,0.153846,0.022767


In [17]:
# score = how much more often similar users like a movie than users overall;
# highest score first.
rec_percentages = rec_percentages.assign(
    score=rec_percentages["similar"] / rec_percentages["all"]
).sort_values("score", ascending=False)

# Attach title and genres to the ten best-scoring movies.
pd.merge(rec_percentages.head(10), movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
8469,0.115385,0.005254,21.961538,112623,Dawn of the Planet of the Apes (2014),Sci-Fi,Dawn of the Planet of the Apes 2014
8696,0.192308,0.008757,21.961538,122920,Captain America: Civil War (2016),Action|Sci-Fi|Thriller,Captain America Civil War 2016
7850,0.115385,0.005254,21.961538,93721,Jiro Dreams of Sushi (2011),Documentary,Jiro Dreams of Sushi 2011
8301,0.192308,0.008757,21.961538,106642,"Day of the Doctor, The (2013)",Adventure|Drama|Sci-Fi,Day of the Doctor The 2013
7693,1.0,0.045534,21.961538,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,Avengers The 2012
8151,0.230769,0.012259,18.824176,102125,Iron Man 3 (2013),Action|Sci-Fi|Thriller|IMAX,Iron Man 3 2013
8451,0.192308,0.010508,18.301282,112175,How to Train Your Dragon 2 (2014),Action|Adventure|Animation,How to Train Your Dragon 2 2014
8689,0.192308,0.010508,18.301282,122900,Ant-Man (2015),Action|Adventure|Sci-Fi,AntMan 2015
8395,0.307692,0.017513,17.569231,110102,Captain America: The Winter Soldier (2014),Action|Adventure|Sci-Fi|IMAX,Captain America The Winter Soldier 2014
8686,0.153846,0.008757,17.569231,122892,Avengers: Age of Ultron (2015),Action|Adventure|Sci-Fi,Avengers Age of Ultron 2015


In [18]:
# Building a recommendation function
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies that fans of `movie_id` like unusually often.

    Reads the module-level `ratings` and `movies` frames. A rating counts as
    a "like" when it is strictly greater than `min_rating`.

    Steps:
      1. "Similar users" are the users who liked `movie_id`.
      2. For every movie, compute the share of similar users who liked it and
         keep movies whose share is strictly greater than `min_share`.
      3. For those movies, compute the share of likes among all users who
         liked at least one of them.
      4. score = similar share / overall share; return the `top_n` highest.

    The queried movie itself normally appears in the result, because every
    similar user liked it (its similar share is 1.0).

    Parameters
    ----------
    movie_id : int
        movieId of the movie to find recommendations for.
    min_rating : float, default 4
        Ratings strictly above this value count as likes.
    min_share : float, default 0.10
        Minimum share of similar users (exclusive) for a movie to be kept.
    top_n : int, default 10
        Maximum number of recommendations returned.

    Returns
    -------
    pandas.DataFrame
        Columns "score", "title", "genres", ordered by decreasing score.
        Empty when nobody liked `movie_id` or no movie passes `min_share`.
    """
    result_columns = ["score", "title", "genres"]

    # Filter once; every later step only looks at liked rows.
    liked = ratings[ratings["rating"] > min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nobody liked this movie: nothing to recommend, and the share below
        # would otherwise be a division by zero.
        return pd.DataFrame(columns=result_columns)

    similar_user_recs = (
        liked.loc[liked["userId"].isin(similar_users), "movieId"].value_counts()
        / len(similar_users)
    )
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]
    if similar_user_recs.empty:
        return pd.DataFrame(columns=result_columns)

    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / all_users["userId"].nunique()

    # Align both shares on movieId.
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    return rec_percentages.head(top_n).merge(
        movies, left_index=True, right_on="movieId"
    )[result_columns]

In [19]:
# Creating an interactive recommendation widget
# Text box the user types a movie title into, pre-filled with an example.
movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
# Output area that shows the recommendations underneath the text box.
recommendation_list = widgets.Output()


# Deliberately NOT named `on_type`: that name already belongs to the search-box
# callback defined in an earlier cell, and redefining it here would silently
# shadow it at module level.
def on_recommendation_type(data):
    """Refresh `recommendation_list` whenever the text box value changes.

    `data` is the change notification delivered by `observe`; its "new"
    entry is read as the current text. For text longer than 5 characters
    the first row returned by `search` is treated as the intended movie and
    its recommendations are displayed; otherwise the output is just cleared.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) <= 5:
            return
        results = search(title)
        if results.empty:
            # Nothing to look up; leave the output cleared instead of
            # raising IndexError on results.iloc[0].
            return
        # Relies on search() returning the best match as its first row.
        movie_id = results.iloc[0]["movieId"]
        display(find_similar_movies(movie_id))


# Call the handler on every change of the box's `value`.
movie_name_input.observe(on_recommendation_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [20]:
# Free-text tags users attached to movies: userId, movieId, tag, timestamp.
# NOTE(review): `tags` is not used by any cell visible here.
tags = pd.read_csv("Data/tags.csv")
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [21]:
# Mapping from movieId to external identifiers (imdbId, tmdbId).
# NOTE(review): `links` is not used by any cell visible here.
links = pd.read_csv("Data/links.csv")

links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
