In [1]:
import pandas as pd

# Load the movie catalogue; the preview below shows movieId, title and genres columns.
movies = pd.read_csv("movies.csv")

In [2]:
# Preview the first rows of the movie table.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
import re

def clean_title(title: str) -> str:
    """Strip every character that is not an ASCII letter, digit, or space.

    Punctuation such as parentheses, colons, apostrophes and hyphens is
    removed outright (not replaced by a space), so "Half-Blood" becomes
    "HalfBlood". Letter case and existing spaces are left untouched.
    """
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [4]:
# Add a punctuation-free copy of each title; this column is what the TF-IDF index is fitted on.
movies["clean_title"] = movies["title"].apply(clean_title)

In [5]:
# Confirm the new clean_title column sits alongside the original title.
movies.head()

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1,2) makes the vocabulary cover single words and adjacent word pairs.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Fit on the cleaned titles; search() later indexes movies by row position in this matrix.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose titles best match ``title``, best match first.

    The query is cleaned with ``clean_title``, projected into the fitted
    TF-IDF space, and compared against every indexed title by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text query.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies`` ordered by descending similarity,
        so ``.iloc[0]`` is the closest match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # argpartition raises if asked for more elements than exist, so cap the
    # request at the catalogue size.
    top_n = min(top_n, len(similarity))
    if top_n <= 0:
        return movies.iloc[0:0]

    # argpartition only guarantees that the last top_n positions hold the
    # largest scores; their relative order is undefined. Sort that small
    # slice explicitly so the result really is ranked best-first.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies.iloc[ranked]

In [9]:
import ipywidgets as widgets
from IPython.display import display

# Text box pre-filled with a sample query; changes to its value drive the title search.
movie_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
# Output area that the search results are rendered into.
movie_list = widgets.Output()

def on_type(data):
    """Refresh the result area whenever the text box value changes.

    ``data`` is the change payload handed over by ``observe``; only its
    ``"new"`` entry (the current text) is read. Previous output is always
    cleared, and a search is shown only for queries longer than one
    character.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 1:
            return
        display(search(query))

# Call on_type whenever the widget's "value" trait changes.
movie_input.observe(on_type, names='value')


# Render the text box followed by the results area.
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [12]:
# Load user ratings; the preview below shows userId, movieId, rating and timestamp columns.
ratings = pd.read_csv("ratings.csv")

In [13]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [14]:
# Check the column dtypes before filtering and comparing on them.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [15]:
# Seed movie for the walkthrough; the merged table further down resolves 4896 to
# "Harry Potter and the Sorcerer's Stone".
movie_id = 4896

In [17]:
# Users who rated the seed movie above 4 are treated as like-minded viewers.
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] > 4),
    "userId",
].unique()
similar_users

array([     2,     20,    117, ..., 162508, 162524, 162538], dtype=int64)

In [18]:
# Every movie those like-minded users also rated above 4 (one entry per rating row).
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
    "movieId",
]
similar_user_recs

72             110
74             151
76             260
79             318
80             333
             ...  
24999756     81845
24999761     93510
24999762     93988
24999769    102993
24999776    116797
Name: movieId, Length: 675001, dtype: int64

In [19]:
# Share of like-minded users who liked each movie.
# NOTE: this rebinds similar_user_recs, so re-run the previous cell before
# re-running this one; otherwise value_counts() is applied to the shares.
similar_user_recs = similar_user_recs.value_counts().div(len(similar_users))

# Keep only movies liked by more than 10% of those users.
similar_user_recs = similar_user_recs.loc[similar_user_recs.gt(.10)]

In [20]:
# Ratings above 4, from any user, for the shortlisted movies.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] > 4)
]

In [21]:
# Share of the users in all_users who liked each shortlisted movie.
all_user_recs = all_users["movieId"].value_counts().div(all_users["userId"].unique().size)

In [22]:
# Line the two share series up by movie id, one labelled column each.
rec_percentages = pd.concat(
    [similar_user_recs, all_user_recs],
    axis=1,
    keys=["similar", "all"],
)

In [23]:
# Inspect both share columns side by side (indexed by movie id).
rec_percentages

Unnamed: 0,similar,all
1,0.276447,0.124484
32,0.102564,0.100096
47,0.187614,0.144186
50,0.189856,0.200119
110,0.192378,0.160556
...,...,...
119145,0.101723,0.020931
122886,0.119938,0.026926
122904,0.136612,0.034962
134130,0.151885,0.044751


In [24]:
# A ratio above 1 means the movie is liked more often among like-minded users
# than among the wider pool counted in "all".
rec_percentages["score"] = rec_percentages["similar"].div(rec_percentages["all"])


In [25]:
# Put the highest score first.
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [26]:
# Attach titles and genres to the ten highest-scoring movie ids (the index holds movieId).
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")


Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
4790,1.0,0.04717,21.200084,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,Adventure|Children|Fantasy,Harry Potter and the Sorcerers Stone aka Harry...
5704,0.582738,0.036641,15.903887,5816,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy,Harry Potter and the Chamber of Secrets 2002
11700,0.408155,0.029748,13.720472,54001,Harry Potter and the Order of the Phoenix (2007),Adventure|Drama|Fantasy|IMAX,Harry Potter and the Order of the Phoenix 2007
10408,0.506515,0.037785,13.405336,40815,Harry Potter and the Goblet of Fire (2005),Adventure|Fantasy|Thriller|IMAX,Harry Potter and the Goblet of Fire 2005
13512,0.408435,0.032491,12.570839,69844,Harry Potter and the Half-Blood Prince (2009),Adventure|Fantasy|Mystery|Romance|IMAX,Harry Potter and the HalfBlood Prince 2009
7742,0.582878,0.048326,12.061317,8368,Harry Potter and the Prisoner of Azkaban (2004),Adventure|Fantasy|IMAX,Harry Potter and the Prisoner of Azkaban 2004
15538,0.394283,0.034097,11.563682,81834,Harry Potter and the Deathly Hallows: Part 1 (...,Action|Adventure|Fantasy|IMAX,Harry Potter and the Deathly Hallows Part 1 2010
16718,0.385316,0.035471,10.862722,88125,Harry Potter and the Deathly Hallows: Part 2 (...,Action|Adventure|Drama|Fantasy|Mystery|IMAX,Harry Potter and the Deathly Hallows Part 2 2011
10450,0.13409,0.014553,9.213661,41566,"Chronicles of Narnia: The Lion, the Witch and ...",Adventure|Children|Fantasy,Chronicles of Narnia The Lion the Witch and th...
11606,0.124002,0.014864,8.342407,53125,Pirates of the Caribbean: At World's End (2007),Action|Adventure|Comedy|Fantasy,Pirates of the Caribbean At Worlds End 2007


In [27]:
def find_similar_movies(movie_id, rating_threshold=4, min_share=0.10, top_n=10):
    """Recommend movies favoured by the users who liked ``movie_id``.

    A rating counts as a "like" when it is strictly greater than
    ``rating_threshold``. Movies liked by more than ``min_share`` of the
    users who liked the seed movie are shortlisted, and each is scored as

        share of like-minded users who liked it
        ---------------------------------------
        share of all shortlist-likers who liked it

    so a score above 1 marks a movie that is over-represented among
    like-minded users.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the seed movie.
    rating_threshold : float, default 4
        Ratings strictly above this value count as likes.
    min_share : float, default 0.10
        Minimum fraction of like-minded users that must have liked a movie
        for it to be shortlisted (strictly greater than).
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, highest score first.
        Empty when no user liked ``movie_id``.
    """
    # Built once: the same "liked" test is applied to the ratings table in
    # each of the three filters below.
    liked = ratings["rating"] > rating_threshold

    similar_users = ratings.loc[liked & (ratings["movieId"] == movie_id), "userId"].unique()
    if len(similar_users) == 0:
        # Nobody liked this movie, so there is nothing to base a
        # recommendation on; return an explicit empty result instead of
        # dividing by zero further down.
        return pd.DataFrame(columns=["score", "title", "genres"])

    similar_user_recs = ratings.loc[liked & ratings["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    all_users = ratings[liked & ratings["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # The index holds movieId, so join it against the catalogue's movieId column.
    top = rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")
    return top[["score", "title", "genres"]]

In [28]:
import ipywidgets as widgets
from IPython.display import display

# Text box pre-filled with a sample title; changes to its value drive the recommendations.
movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
# Output area that the recommendation table is rendered into.
recommendation_list = widgets.Output()

def on_type(data):
    """Show recommendations for the movie that best matches the typed title.

    ``data`` is the change payload handed over by ``observe``; only its
    ``"new"`` entry is read. Previous output is always cleared. For queries
    longer than five characters, the first row returned by ``search`` is
    taken as the intended movie and its recommendations are displayed.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        best_match = search(query).iloc[0]
        display(find_similar_movies(best_match["movieId"]))

# Call on_type whenever the widget's "value" trait changes.
movie_name_input.observe(on_type, names='value')

# Render the text box followed by the recommendation area.
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()