In [None]:
import pandas as pd

In [10]:
movies = pd.read_csv('/content/movies.csv')

In [11]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [12]:
import re

def clean_title(title):
    title = re.sub("[^a-zA-Z0-9 ]", "", title)
    return title

In [13]:
movies["clean_title"] = movies["title"].apply(clean_title)

In [14]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf = vectorizer.fit_transform(movies["clean_title"])

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies.iloc[indices].iloc[::-1]

    return results

In [17]:
# pip install ipywidgets
#jupyter labextension install @jupyter-widgets/jupyterlab-manager

In [18]:
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            display(search(title))

movie_input.observe(on_type, names='value')


display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [22]:
Text (value='Toy Story', description='Movie Title:')
Output()

NameError: ignored

In [20]:
movie_id = 89745

#def find_similar_movies(movie_id):
movie = movies[movies["movieId"] == movie_id]

In [23]:
ratings = pd.read_csv('/content/ratings.csv')

In [24]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
209842,1481,586,2.5,1357831772
209843,1481,587,3.0,1358430669
209844,1481,588,1.0,1357831561
209845,1481,589,3.5,1357831524


In [25]:
ratings. dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [26]:
movie_id = 1

In [27]:
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 4)]["userId"].unique()

In [28]:
similar_users

array([   3,    5,    8,   12,   13,   36,   43,   50,   51,   57,   64,
         75,   77,   82,   86,   90,   93,   95,   96,   98,  109,  110,
        111,  120,  125,  127,  132,  143,  147,  152,  158,  160,  162,
        166,  167,  171,  175,  186,  188,  200,  211,  216,  217,  221,
        227,  229,  230,  233,  235,  236,  249,  256,  257,  259,  261,
        265,  297,  298,  302,  304,  312,  323,  329,  340,  350,  354,
        355,  358,  359,  364,  368,  369,  371,  372,  381,  386,  392,
        396,  402,  405,  409,  411,  414,  421,  422,  424,  428,  435,
        436,  437,  439,  446,  447,  449,  459,  468,  469,  477,  484,
        495,  497,  502,  508,  513,  519,  531,  537,  540,  541,  543,
        548,  551,  553,  561,  567,  572,  573,  580,  581,  582,  592,
        593,  597,  601,  606,  607,  609,  611,  623,  624,  626,  627,
        628,  631,  636,  638,  644,  649,  653,  654,  670,  676,  680,
        683,  686,  694,  695,  697,  700,  702,  7

In [29]:
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

In [30]:
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

similar_user_recs = similar_user_recs[similar_user_recs > .10]

In [31]:
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [32]:
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [33]:
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [34]:
rec_percentages

Unnamed: 0,similar,all
1,0.525223,0.128727
318,0.409496,0.341091
260,0.359050,0.210909
296,0.332344,0.285818
593,0.320475,0.237091
...,...,...
3147,0.103858,0.078545
1219,0.103858,0.053818
1225,0.103858,0.058182
1259,0.100890,0.044364


In [35]:
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [36]:
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [37]:
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,0.525223,0.128727,4.080119,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.151335,0.048727,3.105762,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
1255,0.11276,0.042909,2.627873,1288,This Is Spinal Tap (1984),Comedy,This Is Spinal Tap 1984
4780,0.163205,0.068364,2.387303,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
1120,0.103858,0.044364,2.341052,1148,Wallace & Gromit: The Wrong Trousers (1993),Animation|Children|Comedy|Crime,Wallace Gromit The Wrong Trousers 1993
1047,0.109792,0.048,2.287339,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
1226,0.10089,0.044364,2.274165,1259,Stand by Me (1986),Adventure|Drama,Stand by Me 1986
8246,0.142433,0.064727,2.200513,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
898,0.130564,0.059636,2.189332,919,"Wizard of Oz, The (1939)",Adventure|Children|Fantasy|Musical,Wizard of Oz The 1939
1273,0.109792,0.050909,2.156634,1307,When Harry Met Sally... (1989),Comedy|Romance,When Harry Met Sally 1989


In [38]:
def find_similar_movies(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [39]:
import ipywidgets as widgets
from IPython.display import display

movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()