In [4]:
##Import all necessary libraries
#Get the Movie Titles

import pandas as pd
movies = pd.read_csv("movies.csv")

In [5]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [6]:
#Get the Movie Titles
import re
def clean_title(title):
   return re.sub("^a-zA-Z0-9]","",title)

In [7]:
#using pandas apply method to call the function
#takes The title colum and goes through each item in the colum and then pass them to the clean_title function
movies["clean_title"] = movies["title"].apply(clean_title)

In [8]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995)
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji (1995)
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II (1995)
...,...,...,...,...
62418,209157,We (2018),Drama,We (2018)
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul (2001)
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems (2018)
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing (2001)


In [9]:
#using python machine learning libary 
#turning titles into numbers
#using vectorizer to turn the set of titles to a set of number (matrix)

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf = vectorizer.fit_transform(movies["clean_title"])

In [10]:
pip install -U scikit-learn scipy matplotlib

Note: you may need to restart the kernel to use updated packages.


In [90]:
#creating a search engine 
#computing the similarity between a term we enter


from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies.iloc[indices].iloc[::-1]
    
    return results

In [49]:
#constructing the interactive search box
#using display function to show diffrent as outputs


import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)

#displaying the movie output
#Searches the data set and puts in the output

movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            display(search(title))

movie_input.observe(on_type, names='value')

#displaying both of the output

display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [12]:
#reading the ratings.csv file

ratings = pd.read_csv("ratings.csv")

In [13]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [14]:
#data type of ratings

ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [15]:
movie_id = 1

In [16]:
#finding users who likes the same movie
#displaying only unique userid

similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()

In [17]:
similar_users

array([    36,     75,     86, ..., 162527, 162530, 162533], dtype=int64)

In [25]:
#other movies that the user liked
#any movie that the user rated 4 star


similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

In [26]:
similar_user_recs

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
24998854    60069
24998861    67997
24998876    78499
24998884    81591
24998888    88129
Name: movieId, Length: 1358326, dtype: int64

In [28]:
# 10 percent of the users that liked the same movie
# value_counts counts how many times each movie appears in the data set

similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

similar_user_recs = similar_user_recs[similar_user_recs > .1]

In [29]:
similar_user_recs

Series([], Name: movieId, dtype: float64)

In [30]:
#finding user that rated a movie that is in the set of recomdations
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [31]:
#finding what percentage all users recommended the movies
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [32]:
all_users_recs

Series([], Name: movieId, dtype: float64)

In [35]:
#comparing the percentages using the panda concat method to combine them together 


rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
rec_percentages.columns = ["similar","all"]

In [34]:
rec_percentages

Unnamed: 0,similar,all


In [36]:
#creating a score which divedes by each other

rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [38]:
#sorting the recommendations using pandas sort value method
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [39]:
rec_percentages

Unnamed: 0,similar,all,score


In [50]:
#acquiring the top 10 reccomdations and then merge it with the movie data 


rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title


In [42]:
#main reccomendation function
#finding users similar to the movie entered 
#finding all of the users and their recommendations
#creating the score and sorting it
#returns the merged score

def find_similar_movies(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [48]:
#interactive reccommendation widget


#the input widget
import ipywidgets as widgets
from IPython.display import display

movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()