In [3]:
import os
import unicodedata

import pandas as pd

# Directory holding the MovieLens 25M CSV files. Set the ML25M_DIR environment
# variable to read from another location; the fallback is the original path,
# so the notebook behaves exactly as before when the variable is not set.
DATA_DIR = os.environ.get("ML25M_DIR", "/Users/nikhilkamerkar/Downloads/ml-25m")

movie = pd.read_csv(os.path.join(DATA_DIR, "movies.csv"))

In [14]:
movie


Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [5]:
movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [6]:
movie.describe()

Unnamed: 0,movieId
count,62423.0
mean,122220.387646
std,63264.744844
min,1.0
25%,82146.5
50%,138022.0
75%,173222.0
max,209171.0


### Create a function that cleans the movie titles by removing punctuation such as "(" or "-" with a regular expression

In [12]:
import re

def clean_title(title):
    """Reduce a movie title to ASCII letters, digits and spaces.

    Accented letters are first folded to their base letter (for example
    "é" becomes "e"), then every character outside ``[a-zA-Z0-9 ]`` is
    removed. Without the folding step an accented letter would simply be
    deleted ("Léon" -> "Lon"), so a query typed without accents could never
    match such a title.

    Args:
        title: Raw title string, e.g. ``"Toy Story (1995)"``.

    Returns:
        The cleaned string, e.g. ``"Toy Story 1995"``. Pure-ASCII input gives
        the same result as stripping with the regular expression alone.
    """
    # NFKD splits an accented letter into base letter + combining mark; the
    # regex then drops the mark (and all other punctuation) but keeps the base.
    decomposed = unicodedata.normalize("NFKD", title)
    return re.sub("[^a-zA-Z0-9 ]", "", decomposed)

In [13]:
# Store a punctuation-free copy of every title next to the original; this is
# the text the search index is built from.
movie["clean_title"] = movie["title"].map(clean_title)

### Create the TF-IDF matrix, which vectorizes the cleaned titles so that they become machine readable

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF vectorizer; ngram_range=(1, 2) is passed so that both single
# words and two-word sequences from each title are used as features.
vectorizer = TfidfVectorizer(ngram_range = (1,2))

# Fit on the cleaned titles and transform them in one step; `tfidf` is the
# matrix that search() later compares query vectors against.
tfidf = vectorizer.fit_transform(movie["clean_title"])

### Creating a function to search a movie title with cosine similarity 
### Here we use `np.argpartition` to pick the 5 highest similarity scores without sorting the whole similarity array

In [39]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose cleaned titles are most similar to ``title``.

    The query is cleaned with ``clean_title``, vectorized with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` by cosine
    similarity.

    Args:
        title: Free-text title typed by the user, e.g. ``"Toy Story (1995)"``.
        top_n: Maximum number of matches to return. Defaults to 5.

    Returns:
        Rows of ``movie`` ordered from most to least similar, so ``.iloc[0]``
        is the best match. Holds at most ``top_n`` rows (fewer when ``movie``
        has fewer rows).
    """
    query = clean_title(title)
    query_vector = vectorizer.transform([query])
    similarity = cosine_similarity(query_vector, tfidf).flatten()

    # Never ask for more rows than exist; argpartition raises if kth is out of
    # bounds, and a count of 0 would slice the whole array with [-0:].
    top_n = max(0, min(top_n, similarity.size))
    if top_n == 0:
        return movie.iloc[0:0]

    # argpartition only guarantees that the last `top_n` positions hold the
    # largest scores; their order relative to each other is undefined. Sort
    # just those candidates so the result is genuinely ranked best-first.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movie.iloc[ranked]

In [40]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the title to look up. The label is wider than the default
# description column, so let that column size itself to the text instead of
# having the label cut off.
movie_title = widgets.Text(
    description="Enter Movie Name",
    style={"description_width": "initial"},
    disabled=False,
)

# Output area that the search results are rendered into.
movie_list_display = widgets.Output()

def enter_movie(data):
    """Observer callback: show the search matches for the text just typed.

    Args:
        data: Change notification passed in by ``observe``; only its
            ``"new"`` entry (the current text) is read.
    """
    with movie_list_display:
        # Replace the previous results rather than appending to them.
        movie_list_display.clear_output()
        title = data["new"]
        # Only search once more than 5 characters have been typed.
        if len(title) > 5:
            display(search(title))


# Call enter_movie every time the text box's `value` changes.
movie_title.observe(enter_movie, names='value')
# Show the input box followed by the area its results are written to.
display(movie_title, movie_list_display)

Text(value='', description='Enter Movie Name')

Output()

### Now we will build the recommender system

In [41]:
# ratings.csv lives in the same MovieLens directory as movies.csv. Set the
# ML25M_DIR environment variable to read from another location; the fallback
# is the original path.
ratings = pd.read_csv(
    os.path.join(
        os.environ.get("ML25M_DIR", "/Users/nikhilkamerkar/Downloads/ml-25m"),
        "ratings.csv",
    )
)
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 762.9 MB


In [45]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

### Find the users who rated this movie above 4, then collect the other movies those users also rated above 4, and finally keep only the movies that more than 10 percent of those users rated above 4

In [47]:
# Worked example: movieId 1 is Toy Story (1995).
movie_id = 1

# Ids of the users who gave this movie more than 4 stars.
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] > 4),
    "userId",
].unique()
similar_users

array([    36,     75,     86, ..., 162518, 162519, 162530])

In [58]:
# Every movie that one of those users rated above 4 (one entry per rating).
similar_user_viewed = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
    "movieId",
]

In [59]:
# Turn the per-movie counts into the share of similar users who liked each
# movie, and keep only the movies liked by more than 10% of them.
similar_user_viewed = (
    similar_user_viewed
    .value_counts()
    .div(len(similar_users))
    .loc[lambda share: share > .1]
)
similar_user_viewed

1       1.000000
318     0.414556
260     0.404561
356     0.347253
296     0.342663
          ...   
1259    0.102991
7361    0.101881
1206    0.101362
1307    0.101066
1208    0.100918
Name: movieId, Length: 92, dtype: float64

In [63]:
# All above-4 ratings, from any user, for the movies short-listed above.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_viewed.index) & (ratings["rating"] > 4)
]

# Share of those users who liked each movie. No 10% cut-off is applied here.
all_user_viewed = all_users["movieId"].value_counts().div(
    len(all_users["userId"].unique())
)
all_user_viewed

318      0.345282
296      0.287220
2571     0.246217
356      0.237370
593      0.227930
           ...   
1387     0.047886
1307     0.046195
745      0.037362
78499    0.035445
2355     0.025316
Name: movieId, Length: 92, dtype: float64

### Concatenate the two series and compute, for each movie, the ratio of its share among similar users to its share among all users

In [64]:
# Put the two shares side by side, one row per movieId, naming the columns
# directly in the concat call.
percentages = pd.concat(
    [similar_user_viewed, all_user_viewed],
    axis=1,
    keys=["similar", "all"],
)

In [65]:
percentages

Unnamed: 0,similar,all
1,1.000000,0.125844
318,0.414556,0.345282
260,0.404561,0.224195
356,0.347253,0.237370
296,0.342663,0.287220
...,...,...
1259,0.102991,0.049349
7361,0.101881,0.105172
1206,0.101362,0.087500
1307,0.101066,0.046195


In [67]:
# score > 1 means the movie is liked more often by the similar users than by
# users in general; rank the movies by that ratio, highest first.
percentages = (
    percentages
    .assign(score=percentages["similar"] / percentages["all"])
    .sort_values("score", ascending=False)
)
percentages

Unnamed: 0,similar,all,score
1,1.000000,0.125844,7.946323
3114,0.295498,0.054186,5.453383
2355,0.124685,0.025316,4.925186
78499,0.138161,0.035445,3.897906
588,0.233674,0.068117,3.430480
...,...,...,...
58559,0.160743,0.147779,1.087725
79132,0.129424,0.132559,0.976349
7361,0.101881,0.105172,0.968704
2959,0.205020,0.218656,0.937638


### Take the top 10 rows and merge them with the movie DataFrame, matching the index of `percentages` against the `movieId` column

In [69]:
# Attach title and genres to the ten best-scoring movies; the index of
# `percentages` holds the movieId values being matched.
pd.merge(percentages.head(10), movie, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.125844,7.946323,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.295498,0.054186,5.453383,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.124685,0.025316,4.925186,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.138161,0.035445,3.897906,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
580,0.233674,0.068117,3.43048,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
587,0.198949,0.060514,3.287671,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
33,0.158226,0.052696,3.002602,34,Babe (1995),Children|Drama,Babe 1995
4780,0.210647,0.071444,2.94841,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
1047,0.143418,0.049202,2.914882,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
729,0.108322,0.037362,2.899227,745,Wallace & Gromit: A Close Shave (1995),Animation|Children|Comedy,Wallace Gromit A Close Shave 1995


### Creating a function for the recommender engine

In [74]:
def find_similar_movies(movie_id, ratings_df=None, movies_df=None,
                        min_rating=4, min_share=0.1, top_n=10):
    """Recommend movies liked by the users who liked ``movie_id``.

    Steps:
      1. "Similar users" are the users who rated ``movie_id`` above
         ``min_rating``.
      2. For every movie, compute the share of similar users who rated it
         above ``min_rating``; keep the movies whose share exceeds
         ``min_share``.
      3. For those movies, compute the same share among all users who rated
         at least one of them above ``min_rating``.
      4. ``score = similar / all``; the ``top_n`` highest scores are returned.

    Args:
        movie_id: ``movieId`` of the movie to base the recommendations on.
        ratings_df: Frame with ``userId``, ``movieId`` and ``rating`` columns.
            Defaults to the notebook-level ``ratings`` frame.
        movies_df: Frame with a ``movieId`` column plus descriptive columns.
            Defaults to the notebook-level ``movie`` frame.
        min_rating: A rating counts as a "like" when strictly above this value.
        min_share: Minimum share (exclusive) of similar users that must like
            a movie for it to be considered.
        top_n: Number of recommendations to return.

    Returns:
        DataFrame with ``similar``, ``all`` and ``score`` columns followed by
        the columns of ``movies_df``, ordered by descending ``score``. Empty
        when nobody rated ``movie_id`` above ``min_rating``.
    """
    if ratings_df is None:
        ratings_df = ratings
    if movies_df is None:
        movies_df = movie

    # Every later step only looks at "liked" ratings, so filter the large
    # ratings table once and keep just the two columns that are needed.
    liked = ratings_df.loc[ratings_df["rating"] > min_rating, ["userId", "movieId"]]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nothing to base a recommendation on; also avoids dividing by zero.
        return pd.DataFrame(columns=["similar", "all", "score", *movies_df.columns])

    similar_user_viewed = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_viewed = similar_user_viewed.value_counts() / len(similar_users)
    similar_user_viewed = similar_user_viewed[similar_user_viewed > min_share]

    all_users = liked[liked["movieId"].isin(similar_user_viewed.index)]
    all_user_viewed = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    percentages = pd.concat([similar_user_viewed, all_user_viewed], axis=1)
    percentages.columns = ["similar", "all"]

    percentages["score"] = percentages["similar"] / percentages["all"]
    percentages = percentages.sort_values("score", ascending=False)

    # The index of `percentages` holds movieId values, hence left_index=True.
    return percentages.head(top_n).merge(movies_df, left_index=True, right_on="movieId")

In [75]:
# Text box for the movie to base recommendations on, pre-filled with an
# example. The label is wider than the default description column, so let
# that column size itself to the text instead of having the label cut off.
movie_name_input = widgets.Text(
    value="Toy Story",
    description="Enter Movie Name",
    style={"description_width": "initial"},
    disabled=False,
)

# Output area that the recommendations are rendered into.
recommended_list = widgets.Output()

def enter_movie(data):
    """Observer callback: show recommendations for the text just typed.

    The text is looked up with ``search``; the first row of the result is
    taken as the intended movie and passed to ``find_similar_movies``.

    Args:
        data: Change notification passed in by ``observe``; only its
            ``"new"`` entry (the current text) is read.
    """
    with recommended_list:
        # Always start from an empty output area.
        recommended_list.clear_output()
        query = data["new"]
        # Nothing further happens until more than 5 characters are typed.
        if len(query) <= 5:
            return
        first_match = search(query).iloc[0]
        display(find_similar_movies(first_match["movieId"]))


# Call enter_movie every time the text box's `value` changes.
movie_name_input.observe(enter_movie, names='value')
# Show the input box followed by the area the recommendations are written to.
display(movie_name_input, recommended_list)



Text(value='Toy Story', description='Enter Movie Name')

Output()