In [1]:
import pandas as pd

# Load the MovieLens movie catalogue; the path is relative to the notebook's working directory.
movies = pd.read_csv(filepath_or_buffer='ml-25m/movies.csv')

In [2]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [3]:
import re

#Normalise a title so that only the name and year remain, with no special characters
def clean_title(title):
    """Return ``title`` with every character other than a-z, A-Z, 0-9 or space removed."""
    # Negated character class: anything that is NOT an ASCII letter, digit or space
    disallowed = "[^a-zA-Z0-9 ]"
    return re.sub(pattern=disallowed, repl="", string=title)

# The name matters most when no year is given; otherwise the year is a strong matching signal

In [4]:
#Clean title with no special characters (the year digits are kept)
movies['clean_title'] = movies['title'].apply(clean_title)
#Clean title with no year.
#Strip a trailing "(YYYY)" from the raw title *before* cleaning instead of slicing off
#the last four characters: the slice left a trailing space behind and would cut real
#letters off any title that does not end in a year.
movies['clean_title_noY'] = (
    movies['title']
    .str.replace(r"\s*\(\d{4}\)\s*$", "", regex=True)
    .apply(clean_title)
    .str.strip()
)

In [5]:
movies

Unnamed: 0,movieId,title,genres,clean_title,clean_title_noY
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995,Toy Story
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995,Jumanji
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995,Grumpier Old Men
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995,Waiting to Exhale
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995,Father of the Bride Part II
...,...,...,...,...,...
62418,209157,We (2018),Drama,We 2018,We
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001,Window of the Soul
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018,Bad Poems
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001,A Girl Thing


In [6]:
#Create a term frequency matrix 
from sklearn.feature_extraction.text import TfidfVectorizer

#Check for unigrams and bigrams (one word and two word sequences).
#Each matrix gets its own vectorizer: calling fit_transform twice on a single
#vectorizer replaces its vocabulary, so the first matrix could no longer be compared
#with queries produced by vectorizer.transform (different feature space).
vectorizer_year = TfidfVectorizer(ngram_range=(1,2))
tfidf_name_year = vectorizer_year.fit_transform(movies['clean_title'])

#`vectorizer` stays fitted on the year-less titles, which is what search() queries against
vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf_name = vectorizer.fit_transform(movies['clean_title_noY'])

In [7]:

#name = 'Toy Story 2010'

#q = vectorizer.transform([name])
#print(q.shape)
#print('-----')
#print(tfidf_name_year.shape)
#
#print(sim)
#print(movies['title'][14814])

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

#This is our search function: it ranks every movie by the cosine similarity between
#its TF-IDF title vector and the TF-IDF vector of the input
def search(title):
    """Return the (up to) five rows of ``movies`` whose titles best match ``title``.

    The query is cleaned with ``clean_title``, vectorised with ``vectorizer`` and
    compared against ``tfidf_name`` by cosine similarity.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.

    Returns
    -------
    pandas.DataFrame
        Matching rows of ``movies``, ordered from most to least similar.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    #Only the year-less matrix is used: ``vectorizer`` is fitted on the year-less
    #titles, so its queries are not comparable with ``tfidf_name_year``.
    similarity = cosine_similarity(query_vec, tfidf_name).flatten()

    #Never ask for more results than there are candidates
    top_n = min(5, similarity.size)
    if top_n == 0:
        return movies.iloc[[]]

    #argpartition only guarantees that the last top_n positions hold the top_n largest
    #scores, in arbitrary order, so they must be sorted explicitly
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    #order the candidates so the best match is in position 0 and scores descend
    indeces = candidates[np.argsort(similarity[candidates])[::-1]]
    #locate the movies corresponding to the indeces
    return movies.iloc[indeces]
        

In [9]:
import ipywidgets as widgets
from IPython.display import display

#output widget: area the search results are rendered into
movie_list = widgets.Output()

#input widget: text box pre-filled with an example query
movie_input = widgets.Text(description="Movie Title:", value="Toy Story", disabled=False)

def on_type(data):
    """Refresh ``movie_list`` with search results for the newly typed text.

    ``data`` is the change notification passed by the widget; its ``"new"`` entry
    holds the current text. Queries shorter than five characters only clear the output.
    """
    with movie_list:
        #start from an empty output area on every change
        movie_list.clear_output()
        query = data["new"]
        if len(query) < 5:
            return
        #display the matching titles inside the output widget
        display(search(query))
        
#Run on_type every time the text box's value changes
movie_input.observe(handler=on_type, names="value")

#Show the text box with the results area underneath it
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [10]:
#Load the user ratings; they are used to recommend similar movies
ratings = pd.read_csv(filepath_or_buffer='ml-25m/ratings.csv')

ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [11]:

movie_id = 1
#Find people that have seen the movie we are interested in and rated it above 4 stars
#(ratings come in half-star steps, so this means 4.5 or 5)
#unique() ensures each user only appears once in the list
similar_users = ratings.loc[
    ratings['movieId'].eq(movie_id) & ratings['rating'].gt(4),
    'userId',
].unique()

In [12]:
similar_users

array([    36,     75,     86, ..., 162527, 162530, 162533], dtype=int64)

In [13]:
#Every movie those users rated above 4 stars (one entry per user/movie rating)
similar_user_recommendations = ratings.loc[
    ratings['userId'].isin(similar_users) & ratings['rating'].gt(4),
    'movieId',
]
similar_user_recommendations

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
24998854    60069
24998861    67997
24998876    78499
24998884    81591
24998888    88129
Name: movieId, Length: 1358326, dtype: int64

In [14]:
#Share of the similar users who liked each movie, keeping only the movies
#that more than 10% of this user base likes
similar_user_recommendations = (
    similar_user_recommendations
    .value_counts()
    .div(len(similar_users))
    .loc[lambda share: share > 0.1]
)

similar_user_recommendations

1        1.000000
318      0.445607
260      0.403770
356      0.370215
296      0.367295
           ...   
953      0.103053
551      0.101195
1222     0.100876
745      0.100345
48780    0.100186
Name: movieId, Length: 113, dtype: float64

In [15]:
#Need to define a niche, i.e. ensure these movies are not simply liked by everyone but
#liked disproportionately within our subgroup. Despite its name, all_users holds rating
#rows: every above-4-star rating, by any user, of one of the candidate movies.
all_users = ratings.loc[
    ratings['movieId'].isin(similar_user_recommendations.index) & ratings['rating'].gt(4)
]

In [16]:
#Share of all users (those present in all_users) who liked each candidate movie
all_user_recommendations = all_users['movieId'].value_counts().div(all_users['userId'].unique().size)

In [17]:
all_user_recommendations

318      0.342220
296      0.284674
2571     0.244033
356      0.235266
593      0.225909
           ...   
551      0.040918
50872    0.039111
745      0.037031
78499    0.035131
2355     0.025091
Name: movieId, Length: 113, dtype: float64

In [18]:
#Put the two shares side by side, aligned on movieId
recommendation_percentages = pd.concat(
    [similar_user_recommendations, all_user_recommendations],
    axis=1,
).set_axis(["similar", "all"], axis=1)

In [19]:
recommendation_percentages

Unnamed: 0,similar,all
1,1.000000,0.124728
318,0.445607,0.342220
260,0.403770,0.222207
356,0.370215,0.235266
296,0.367295,0.284674
...,...,...
953,0.103053,0.045792
551,0.101195,0.040918
1222,0.100876,0.066877
745,0.100345,0.037031


In [20]:
#score: how much more popular a movie is among the similar users than among everyone; highest first
recommendation_percentages = (
    recommendation_percentages
    .assign(score=lambda pct: pct['similar'] / pct['all'])
    .sort_values('score', ascending=False)
)
recommendation_percentages

Unnamed: 0,similar,all,score
1,1.000000,0.124728,8.017414
3114,0.280648,0.053706,5.225654
2355,0.110539,0.025091,4.405452
78499,0.152960,0.035131,4.354038
4886,0.235147,0.070811,3.320783
...,...,...,...
2858,0.216724,0.167634,1.292845
296,0.367295,0.284674,1.290232
79132,0.166817,0.131384,1.269693
4973,0.142501,0.112405,1.267747


In [21]:
#Attach title and genres to the ten best-scoring movies; the left frame's index holds the movieId
pd.merge(recommendation_percentages.head(10), movies, left_index=True, right_on='movieId')

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title,clean_title_noY
0,1.0,0.124728,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995,Toy Story
3021,0.280648,0.053706,5.225654,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999,Toy Story 2
2264,0.110539,0.025091,4.405452,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998,Bugs Life A
14813,0.15296,0.035131,4.354038,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010,Toy Story 3
4780,0.235147,0.070811,3.320783,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001,Monsters Inc
580,0.216618,0.067513,3.208539,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992,Aladdin
6258,0.228139,0.072268,3.156862,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003,Finding Nemo
587,0.1794,0.059977,2.99115,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991,Beauty and the Beast
8246,0.203504,0.068453,2.972889,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004,Incredibles The
359,0.253411,0.085764,2.954762,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994,Lion King The


In [22]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.1, top_n=10):
    """Recommend movies liked disproportionately by users who liked ``movie_id``.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    min_rating : float, default 4
        A rating counts as a "like" only when it is strictly greater than this value.
    min_share : float, default 0.1
        A movie is a candidate only when more than this fraction of the similar
        users liked it.
    top_n : int, default 10
        Maximum number of recommendations returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, best score first. ``score`` is the
        share of similar users who liked the movie divided by the share of all users
        (who liked any candidate) who liked it. The frame is empty when nobody liked
        ``movie_id`` or no movie passes ``min_share``.
    """
    #Filter the ratings table once instead of repeating the rating test in every step
    liked = ratings[ratings['rating'] > min_rating]

    #Users who liked the movie we are interested in
    similar_users = liked.loc[liked['movieId'] == movie_id, 'userId'].unique()
    if len(similar_users) == 0:
        #Nobody liked it: nothing to base a recommendation on (also avoids dividing by zero)
        return pd.DataFrame(columns=['score', 'title', 'genres'])

    #Share of those users who liked each other movie
    similar_user_recommendations = liked.loc[liked['userId'].isin(similar_users), 'movieId']
    similar_user_recommendations = similar_user_recommendations.value_counts() / len(similar_users)
    similar_user_recommendations = similar_user_recommendations[similar_user_recommendations > min_share]
    if similar_user_recommendations.empty:
        return pd.DataFrame(columns=['score', 'title', 'genres'])

    #Share of all users (who liked at least one candidate) who liked each candidate
    all_users = liked[liked['movieId'].isin(similar_user_recommendations.index)]
    all_user_recommendations = all_users['movieId'].value_counts() / len(all_users['userId'].unique())

    recommendation_percentages = pd.concat([similar_user_recommendations, all_user_recommendations], axis=1)
    recommendation_percentages.columns = ["similar", "all"]

    #A high score means the movie is liked far more within the subgroup than in general
    recommendation_percentages['score'] = recommendation_percentages['similar'] / recommendation_percentages['all']
    recommendation_percentages = recommendation_percentages.sort_values('score', ascending=False)

    #The index holds the movieId, so join it against the movies table to get titles and genres
    return recommendation_percentages.head(top_n).merge(movies, left_index=True, right_on='movieId')[['score','title','genres']]

In [23]:
#Building the widget: an output area for the recommendations and a text box for the title
recommendation_list = widgets.Output()
movie_name_input = widgets.Text(description='Movie Title:', value='Toy Story', disabled=False)

def on_type(data):
    """Refresh ``recommendation_list`` with recommendations for the newly typed title.

    ``data`` is the change notification passed by the widget; its ``'new'`` entry
    holds the current text. Queries shorter than five characters only clear the output.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        typed = data['new']
        if len(typed) < 5:
            return
        #The first search row is treated as the movie the user most likely means
        best_match = search(typed).iloc[0]
        display(find_similar_movies(best_match['movieId']))
            
#Run on_type every time the text box's value changes, then show both widgets
movie_name_input.observe(handler=on_type, names="value")
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()