In [2]:
import ipywidgets as widgets
from IPython.display import display

# Render a throwaway slider; presumably a check that ipywidgets displays in this environment.
slider = widgets.IntSlider(value=50, min=0, max=100, step=1, description='Test Slider:')
display(slider)


IntSlider(value=50, description='Test Slider:')

In [3]:
import pandas as pd

In [4]:
# Load the movie catalogue; the rendered frame shows movieId, title and genres columns.
movies = pd.read_csv("movies.csv")
# Bare last expression so the notebook renders the frame.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [5]:
import re

def clean(title):
    """Return ``title`` with every character other than ASCII letters, digits and spaces removed."""
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [6]:
# Add a punctuation-free copy of each title; the TF-IDF vectorizer is fitted on this column.
movies['clean_title'] = movies["title"].apply(clean)
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over the cleaned titles, using single words and two-word sequences (ngram_range=(1,2)).
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# Fitted matrix of title vectors; search() compares query vectors against it.
tfid = vectorizer.fit_transform(movies['clean_title'])

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the ``top_n`` movies whose cleaned titles best match ``title``.

    The query is cleaned with ``clean`` and vectorized with the fitted
    ``vectorizer``; cosine similarity against ``tfid`` scores every movie.

    Parameters
    ----------
    title : str
        Free-text movie title, e.g. as typed into the search box.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` ordered from most to least similar, so
        ``.iloc[0]`` is the best match.
    """
    title = clean(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfid).flatten()

    # Clamp so argpartition never receives a kth beyond the array bounds.
    k = min(top_n, similarity.size)
    if k <= 0:
        return movies.iloc[:0]

    # argpartition only guarantees that the last k positions hold the k
    # largest scores, in no particular order, so rank them explicitly.
    candidates = np.argpartition(similarity, -k)[-k:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies.iloc[ranked]

In [9]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the title query, pre-filled with 'Toy Story'.
movie_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
# Output area that on_type() clears and redraws with the search results.
movie_list = widgets.Output()

def on_type(data):
    """Redraw ``movie_list`` with search results for the newly typed title.

    ``data`` is the change notification passed by ``observe``; its ``"new"``
    entry is the current text. Text of five characters or fewer only clears
    the output area.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        display(search(typed))

# Call on_type whenever the text box's value changes.
movie_input.observe(on_type, names='value')


display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [10]:
# Load the ratings table, then print its schema, row count and memory footprint.
ratings = pd.read_csv('ratings.csv')
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 762.9 MB


In [11]:
# Column dtypes only; ratings.info() in the previous cell reports the same information.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [12]:
# Movie to build recommendations for; movieId 150 is "Apollo 13 (1995)" per the merged table further down.
movie_id = 150

In [13]:
# Users who rated the chosen movie above 4; these are treated as the "similar" users.
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()

In [14]:
similar_users  # IDs of the users who liked the same movie as us

array([     5,     13,     22, ..., 162528, 162529, 162531],
      shape=(13678,))

In [15]:
# movieId of every rating above 4 given by the similar users (one entry per such rating).
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

In [16]:
similar_user_recs  # movies that the similar users also rated above 4

1154         32
1155         36
1157         47
1158         50
1164        141
           ... 
24998412    318
24998413    319
24998415    349
24998416    431
24998418    590
Name: movieId, Length: 840505, dtype: int64

In [17]:
# Turn raw counts into the share of similar users who rated each movie above 4.
# NOTE: this rebinds similar_user_recs from the movieId Series to the share Series,
# so re-running this cell without first re-running the previous one gives different values.
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

# Keep only movies liked by more than 10% of the similar users.
similar_user_recs = similar_user_recs[similar_user_recs > .1]

In [18]:
# Shares indexed by movieId; the chosen movie itself is 1.0 because every similar user liked it.
similar_user_recs

movieId
150     1.000000
318     0.532461
356     0.479164
527     0.380099
593     0.373154
          ...   
1089    0.101404
6539    0.101111
500     0.100526
1208    0.100453
1721    0.100453
Name: count, Length: 84, dtype: float64

In [19]:
# Ratings above 4, from any user, for the candidate movies kept in similar_user_recs.
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [20]:
# Inspect the baseline rows: one row per rating above 4 on a candidate movie.
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
79,2,318,5.0,1141417181
81,2,349,4.5,1141417045
...,...,...,...,...
25000020,162541,2959,5.0,1240953488
25000057,162541,4993,5.0,1240952610
25000058,162541,4995,5.0,1240951903
25000065,162541,5952,5.0,1240952617


In [21]:
# Per candidate movie: number of ratings above 4 divided by the number of distinct users in all_users.
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [22]:
# Baseline share for each candidate movie, indexed by movieId.
all_user_recs

movieId
318     0.347998
296     0.289480
2571    0.248153
356     0.239237
593     0.229722
          ...   
539     0.032909
62      0.031555
500     0.031286
349     0.030996
161     0.029562
Name: count, Length: 84, dtype: float64

In [23]:
# Put the two share Series side by side, aligned on movieId.
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
# "similar" = share among similar users, "all" = share among the baseline users.
rec_percentages.columns = ["similar", "all"]

In [24]:
# Inspect the side-by-side shares before scoring.
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
150,1.000000,0.092107
318,0.532461,0.347998
356,0.479164,0.239237
527,0.380099,0.218840
593,0.373154,0.229722
...,...,...
1089,0.101404,0.102477
6539,0.101111,0.078154
500,0.100526,0.031286
1208,0.100453,0.083905


In [25]:
# Ratio of the two shares: above 1 means similar users like the movie more often than the baseline does.
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
# Highest ratio first.
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [26]:
# Inspect the scored candidates, best score first.
rec_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
150,1.000000,0.092107,10.856924
161,0.143003,0.029562,4.837390
349,0.131306,0.030996,4.236158
590,0.248428,0.071272,3.485622
508,0.116318,0.034545,3.367128
...,...,...,...
4226,0.131086,0.134181,0.976938
2858,0.166106,0.170463,0.974438
58559,0.122313,0.148942,0.821215
79132,0.106960,0.133602,0.800589


In [27]:
# Attach title and genres to the ten highest-scoring movies; the chosen movie itself ranks first.
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
148,1.0,0.092107,10.856924,150,Apollo 13 (1995),Adventure|Drama|IMAX,Apollo 13 1995
159,0.143003,0.029562,4.83739,161,Crimson Tide (1995),Drama|Thriller|War,Crimson Tide 1995
344,0.131306,0.030996,4.236158,349,Clear and Present Danger (1994),Action|Crime|Drama|Thriller,Clear and Present Danger 1994
582,0.248428,0.071272,3.485622,590,Dances with Wolves (1990),Adventure|Drama|Western,Dances with Wolves 1990
503,0.116318,0.034545,3.367128,508,Philadelphia (1993),Drama,Philadelphia 1993
61,0.10484,0.031555,3.322413,62,Mr. Holland's Opus (1995),Drama,Mr Hollands Opus 1995
534,0.108642,0.032909,3.301287,539,Sleepless in Seattle (1993),Comedy|Drama|Romance,Sleepless in Seattle 1993
375,0.143808,0.04406,3.26388,380,True Lies (1994),Action|Adventure|Comedy|Romance|Thriller,True Lies 1994
372,0.132695,0.041003,3.236216,377,Speed (1994),Action|Romance|Thriller,Speed 1994
452,0.320807,0.09944,3.226124,457,"Fugitive, The (1993)",Thriller,Fugitive The 1993


In [28]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies liked disproportionately by users who liked ``movie_id``.

    "Liked" means a rating strictly above ``min_rating``. Users who liked
    ``movie_id`` form the similar group. Each movie that more than
    ``min_share`` of that group liked becomes a candidate, and is scored as

        share of similar users who liked it / share of baseline users who liked it

    where the baseline is every user who liked at least one candidate.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base recommendations on.
    min_rating : float, default 4
        A rating must be strictly greater than this to count as a like.
    min_share : float, default 0.10
        Candidates must be liked by more than this fraction of similar users.
    top_n : int, default 10
        Number of highest-scoring movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres`` for the ``top_n``
        candidates, highest score first. The queried movie can appear in the
        result, since every similar user liked it.
    """
    # Evaluate the rating filter once and reuse it for all three selections.
    liked = ratings["rating"] > min_rating

    similar_users = ratings.loc[liked & (ratings["movieId"] == movie_id), "userId"].unique()
    similar_user_recs = ratings.loc[liked & ratings["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Baseline: how often the same candidates are liked across all users.
    all_users = ratings.loc[liked & ratings["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [30]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the title whose recommendations should be shown, pre-filled with 'Toy Story'.
movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
# Output area that on_type() clears and redraws with the recommendation table.
recommendation_list = widgets.Output()

def on_type(data):
    """Redraw ``recommendation_list`` with recommendations for the typed title.

    ``data`` is the change notification passed by ``observe``; its ``"new"``
    entry is the current text. Text of five characters or fewer only clears
    the output area. Otherwise the first row returned by ``search`` is taken
    as the intended movie and passed to ``find_similar_movies``.

    Note: this rebinds the notebook-level name ``on_type`` that the earlier
    search cell also defines.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        best_match = search(typed).iloc[0]
        display(find_similar_movies(best_match["movieId"]))

# Recompute the recommendations whenever the text box's value changes.
movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()