# Movie Recommendation

## 1. Data Collection

In [2]:
import pandas as pd
# Load the movie catalogue (movieId, title, genres) from the working directory.
movies = pd.read_csv("movies.csv")

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
# Load the per-user ratings (userId, movieId, rating, timestamp).
ratings = pd.read_csv("ratings.csv")

In [5]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


## 2. Data Preprocessing

In [6]:
import re

In [7]:
def clean_title(title):
    """Strip every character that is not an ASCII letter, digit or space.

    Punctuation such as parentheses, commas and colons is removed outright
    (not replaced by a space), so ``"Toy Story (1995)"`` becomes
    ``"Toy Story 1995"``.
    """
    disallowed = "[^a-zA-Z0-9 ]"
    cleaned = re.sub(disallowed, "", title)
    return cleaned
    

In [8]:
# Add a punctuation-free copy of each title for text matching.
movies["clean_title"] = movies["title"].map(clean_title)

In [9]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,Black Butler Book of the Atlantic 2017
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,No Game No Life Zero 2017
9739,193585,Flint (2017),Drama,Flint 2017
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,Bungo Stray Dogs Dead Apple 2018


## 3. Title Search (TF-IDF)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1,2): index single words and adjacent word pairs from each title.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# One TF-IDF row per movie, in the same row order as `movies`.
tfidf = vectorizer.fit_transform(movies["clean_title"])


In [11]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Clean and vectorise one hand-picked query exactly like the catalogue
# titles, then score it against every movie row.
title = clean_title("Toy Story 1995")
query_vec = vectorizer.transform([title])
similarity = cosine_similarity(query_vec, tfidf).flatten()

In [12]:
similarity

array([1.        , 0.09169835, 0.05977384, ..., 0.        , 0.        ,
       0.        ])

In [13]:
query_vec

<1x33421 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [14]:
# Row positions of the five best matches, weakest first.
# np.argpartition only guarantees *which* rows are the top five; their order
# inside the partition is undefined, so reversing them later could put the
# wrong title first. A full argsort is cheap at this size and makes the
# ascending order well-defined.
indices = np.argsort(similarity)[-5:]

In [15]:
indices

array([3595,  256, 2355, 7355,    0], dtype=int64)

In [16]:
# iloc, not loc: `indices` are row positions in the similarity array
# (which follows the row order of `movies`), not movieId values.
results = movies.iloc[indices]

In [17]:
results

Unnamed: 0,movieId,title,genres,clean_title
3595,4929,"Toy, The (1982)",Comedy,Toy The 1982
256,295,"Pyromaniac's Love Story, A (1995)",Comedy|Romance,Pyromaniacs Love Story A 1995
2355,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
7355,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995


In [18]:
# Flip the positions so the rows come out in the opposite order to `indices`.
results = movies.iloc[indices[::-1]]
results

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
7355,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
2355,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
256,295,"Pyromaniac's Love Story, A (1995)",Comedy|Romance,Pyromaniacs Love Story A 1995
3595,4929,"Toy, The (1982)",Comedy,Toy The 1982


In [19]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose cleaned titles best match ``title``.

    The query is cleaned with ``clean_title``, vectorised with the fitted
    ``vectorizer`` and compared to every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.
    top_n : int, default 5
        Maximum number of rows to return. Values larger than the catalogue
        are clamped; values of 0 or less give an empty frame.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` ordered from most to least similar, so row 0 is
        the best match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # argpartition raises if asked for more elements than exist, and a kth of
    # -0 would select everything, so clamp and short-circuit first.
    top_n = max(0, min(int(top_n), similarity.size))
    if top_n == 0:
        return movies.iloc[[]]

    # argpartition only separates the top_n rows from the rest; the order
    # inside that partition is undefined. Sort the candidates by position so
    # ties resolve deterministically, then rank them by similarity.
    candidates = np.sort(np.argpartition(similarity, -top_n)[-top_n:])
    ranking = np.argsort(-similarity[candidates], kind="stable")
    return movies.iloc[candidates[ranking]]

In [20]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the query and an output area for the matching titles.
movie_input = widgets.Text(value="", description='Movie Title:', disabled=False)
movie_list = widgets.Output()


def on_type(data):
    """Refresh the match list whenever the text box value changes.

    Nothing is shown until more than five characters have been typed.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        display(search(typed))


movie_input.observe(on_type, names='value')

display(movie_input, movie_list)

Text(value='', description='Movie Title:')

Output()

In [21]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [22]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

## 4. Top Recommendations

In [23]:
# movieId 1 is "Toy Story (1995)" in the movies table loaded earlier.
movie_id = 1

In [24]:
# Users who rated the chosen movie 4 or higher.
# NOTE(review): this filter is `>= 4`, while the later cells and
# find_similar_movies use `> 4` — confirm which threshold is intended.
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 4)]["userId"].unique()

In [25]:
similar_users

array([  1,   5,   7,  17,  19,  31,  40,  43,  45,  46,  57,  63,  64,
        66,  71,  73,  78,  86,  91,  96,  98, 103, 107, 121, 124, 135,
       137, 141, 145, 151, 156, 159, 160, 161, 166, 169, 171, 177, 178,
       179, 182, 185, 186, 191, 201, 202, 206, 217, 220, 229, 234, 239,
       240, 247, 249, 252, 254, 263, 264, 269, 270, 273, 274, 275, 276,
       277, 280, 282, 288, 290, 291, 292, 304, 307, 328, 330, 332, 336,
       337, 339, 341, 347, 350, 353, 357, 359, 364, 367, 378, 380, 382,
       385, 389, 396, 399, 411, 414, 420, 422, 434, 436, 438, 443, 448,
       451, 453, 456, 460, 468, 469, 470, 471, 474, 476, 477, 483, 484,
       488, 492, 500, 504, 509, 514, 517, 524, 525, 533, 534, 550, 555,
       559, 561, 562, 570, 572, 573, 579, 584, 587, 590, 596, 597, 601,
       603, 605, 607, 610], dtype=int64)

In [26]:
# Every movie the similar users rated above 4 (one entry per rating).
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
    "movieId",
]

In [27]:
similar_user_recs

3             47
4             50
6            101
8            151
9            157
           ...  
100821    160527
100829    164179
100832    168248
100833    168250
100834    168252
Name: movieId, Length: 8836, dtype: int64

In [28]:
similar_user_recs.value_counts()

318       71
1         65
356       53
296       52
1196      49
          ..
6059       1
2717       1
2471       1
2375       1
160527     1
Name: movieId, Length: 2442, dtype: int64

In [29]:
# Turn raw like-counts into the share of similar users who liked each movie.
# NOTE(review): this reassigns similar_user_recs from itself, so re-running
# this cell without first re-running the cell that builds it gives a
# different result.
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

# Keep only movies liked by more than 10% of the similar users.
similar_user_recs = similar_user_recs[similar_user_recs > .10]

In [30]:
similar_user_recs

318     0.482993
1       0.442177
356     0.360544
296     0.353741
1196    0.333333
          ...   
5618    0.102041
2918    0.102041
1246    0.102041
1307    0.102041
923     0.102041
Name: movieId, Length: 106, dtype: float64

In [31]:
# Ratings above 4, from any user, for the candidate movies found above.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] > 4)
]

In [32]:
all_users

Unnamed: 0,userId,movieId,rating,timestamp
3,1,47,5.0,964983815
4,1,50,5.0,964982931
15,1,260,5.0,964981680
25,1,457,5.0,964981909
28,1,527,5.0,964984002
...,...,...,...,...
100125,610,32587,4.5,1493844885
100217,610,48516,5.0,1479542152
100310,610,58559,4.5,1493844688
100326,610,60069,4.5,1493844866


In [33]:
# Share of all users (those who liked any candidate) who liked each candidate.
all_user_recs = all_users["movieId"].value_counts() / all_users["userId"].nunique()

In [34]:
all_user_recs

318      0.357522
296      0.295575
356      0.274336
2571     0.265487
2959     0.230088
           ...   
953      0.046018
1307     0.046018
1079     0.044248
899      0.038938
78499    0.037168
Name: movieId, Length: 106, dtype: float64

In [35]:
# Line the two share series up by movieId, one column each.
rec_percentages = pd.concat(
    [similar_user_recs, all_user_recs],
    axis=1,
    keys=["similar", "all"],
)

In [36]:
rec_percentages

Unnamed: 0,similar,all
318,0.482993,0.357522
1,0.442177,0.115044
356,0.360544,0.274336
296,0.353741,0.295575
1196,0.333333,0.191150
...,...,...
5618,0.102041,0.074336
2918,0.102041,0.056637
1246,0.102041,0.063717
1307,0.102041,0.046018


In [37]:
# A score above 1 means similar users like the movie more often than users overall.
rec_percentages["score"] = rec_percentages["similar"].div(rec_percentages["all"])

In [38]:
# Highest score first.
rec_percentages = rec_percentages.sort_values(by="score", ascending=False)

In [39]:
rec_percentages

Unnamed: 0,similar,all,score
1,0.442177,0.115044,3.843537
3114,0.183673,0.058407,3.144712
78499,0.108844,0.037168,2.928409
899,0.102041,0.038938,2.620594
1079,0.115646,0.044248,2.613605
...,...,...,...
4973,0.122449,0.104425,1.172605
2959,0.265306,0.230088,1.153061
2571,0.299320,0.265487,1.127438
79132,0.129252,0.122124,1.058365


In [40]:
# rec_percentages is indexed by movieId, so join its index to movies.movieId
# to attach titles and genres to the ten best-scoring rows.
rec_percentages.iloc[:10].merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,0.442177,0.115044,3.843537,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
2355,0.183673,0.058407,3.144712,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
7355,0.108844,0.037168,2.928409,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
681,0.102041,0.038938,2.620594,899,Singin' in the Rain (1952),Comedy|Musical|Romance,Singin in the Rain 1952
819,0.115646,0.044248,2.613605,1079,"Fish Called Wanda, A (1988)",Comedy|Crime,Fish Called Wanda A 1988
2038,0.136054,0.053097,2.562358,2716,Ghostbusters (a.k.a. Ghost Busters) (1984),Action|Comedy|Sci-Fi,Ghostbusters aka Ghost Busters 1984
733,0.115646,0.046018,2.513082,953,It's a Wonderful Life (1946),Children|Drama|Fantasy|Romance,Its a Wonderful Life 1946
964,0.210884,0.086726,2.431626,1265,Groundhog Day (1993),Comedy|Fantasy|Romance,Groundhog Day 1993
32,0.122449,0.053097,2.306122,34,Babe (1995),Children|Drama,Babe 1995
4360,0.197279,0.086726,2.274747,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003


## 5. Building a Recommendation Function

In [41]:
def find_similar_movies(movie_id, min_rating=4, min_share=.10, top_n=10):
    """Recommend movies favoured by the users who liked ``movie_id``.

    A rating counts as a "like" when it is strictly greater than
    ``min_rating``. Users who liked ``movie_id`` form the similar group.
    Every movie liked by more than ``min_share`` of that group is a
    candidate and is scored as::

        score = (share of similar users who liked it)
              / (share of all users who liked any candidate and liked it)

    A score above 1 means the similar group likes the movie more often than
    users in general.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    min_rating : float, default 4
        Ratings must be strictly above this value to count as a like.
    min_share : float, default 0.10
        Minimum fraction (exclusive) of similar users who must like a movie
        for it to become a candidate.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, highest score first.
        Empty when no user liked ``movie_id``.
    """
    # Every later step only looks at liked ratings, so filter once.
    liked = ratings[ratings["rating"] > min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nobody to compare against: skip the zero-user division and the
        # merge, and hand back an empty table with the usual columns.
        return pd.DataFrame(columns=["score", "title", "genres"])

    similar_user_recs = (
        liked.loc[liked["userId"].isin(similar_users), "movieId"].value_counts()
        / len(similar_users)
    )
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / all_users["userId"].nunique()

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    # rec_percentages is indexed by movieId; join on it to pick up titles.
    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [42]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the query and an output area for the recommendations.
movie_name_input = widgets.Text(value="", description='Movie Title:', disabled=False)
recommendation_list = widgets.Output()


def on_type(data):
    """Show recommendations for the first search hit as the user types.

    Nothing is shown until more than five characters have been typed. The
    first row returned by ``search`` is used as the movie to recommend from.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        first_hit = search(typed).iloc[0]
        display(find_similar_movies(first_hit["movieId"]))


movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='', description='Movie Title:')

Output()