In [225]:
import pandas as pd

# Load the movie table from movies.csv (path relative to the working directory).
# Later cells read its "title" and "movieId" columns.
movies = pd.read_csv("movies.csv")

In [226]:
import re

def clean_title(title):
    """Return ``title`` with every character other than ASCII letters, digits and spaces removed."""
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [227]:
# Store a punctuation-free copy of every title; the TF-IDF index is fitted on this column.
movies["clean_title"] = movies["title"].map(clean_title)

In [228]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Index both single words and two-word sequences (ngram_range=(1, 2)).
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Learn the vocabulary from the cleaned titles and build one TF-IDF row per movie.
tfidf = vectorizer.fit_transform(movies["clean_title"])


In [229]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, n_results=5):
    """Return the movies whose titles best match ``title``, best match first.

    The query is normalised with ``clean_title``, transformed with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text title typed by the user.
    n_results : int, default 5
        Maximum number of rows to return; capped at the number of indexed titles.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` ordered by decreasing similarity to the query.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never request more rows than exist: argpartition raises on an out-of-range kth.
    n_results = max(0, min(n_results, similarity.size))
    if n_results == 0:
        return movies.iloc[[]]

    # argpartition only isolates the top-n block and leaves it unordered, so a
    # plain reversal does not rank it. Sort that small block by score explicitly
    # so that row 0 really is the closest match.
    top = np.argpartition(similarity, -n_results)[-n_results:]
    ranked = top[np.argsort(similarity[top])[::-1]]
    return movies.iloc[ranked]

In [230]:
import ipywidgets as widgets
from IPython.display import display

# Text box the user types a title into.
movie_input = widgets.Text(
    value="",
    description="Movie Title: ",
)

# Output area that the search results are rendered into.
movie_list = widgets.Output()

def on_type(data):
    """Refresh ``movie_list`` with search results for the text in ``data["new"]``.

    The output area is cleared on every call; a search is only run when the
    text is longer than five characters.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        display(search(typed))
            
# Call on_type whenever the text box's "value" changes, then show both widgets.
movie_input.observe(on_type, names="value")
display(movie_input, movie_list)

Text(value='', description='Movie Title: ')

Output()

In [231]:
# Load the ratings table from ratings.csv (path relative to the working directory).
# The cells below use its userId, movieId and rating columns.
ratings = pd.read_csv("ratings.csv")

# Show the frame as the cell output.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [234]:
# Movie whose audience is used as the reference group.
movie_id = 62422

# Distinct users who gave that movie a rating above 4.
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] > 4),
    "userId",
].unique()

similar_users


array([], dtype=int64)

In [235]:
# Ids of every movie that one of the similar users rated above 4 (one entry per rating).
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
    "movieId",
]

In [236]:
# Inspect the movie ids that the similar users rated above 4.
similar_user_recs

Series([], Name: movieId, dtype: int64)

In [237]:
# Turn the raw ids into, per movie, the fraction of similar users who rated it above 4.
similar_user_recs = similar_user_recs.value_counts() / similar_users.size

# Keep only the movies whose fraction is above 0.10.
similar_user_recs = similar_user_recs.loc[similar_user_recs > .10]

In [238]:
# Inspect the per-movie fractions that passed the 0.10 cut-off.
similar_user_recs

Series([], Name: count, dtype: float64)

In [239]:
# Every rating above 4, from any user, for one of the candidate movies.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] > 4)
]

# Per candidate movie: count of those ratings divided by the number of distinct users in all_users.
all_user_recs = all_users["movieId"].value_counts() / all_users["userId"].unique().size

In [240]:
# Inspect, per candidate movie, the fraction computed over all users in all_users.
all_user_recs

Series([], Name: count, dtype: float64)

In [241]:
# Put the two per-movie fractions side by side, aligned on the movie id index.
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

# Score = fraction among similar users relative to the fraction among all users.
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

# Highest score first.
rec_percentages = rec_percentages.sort_values(by="score", ascending=False)

rec_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [242]:
# Attach the movie columns to the ten highest-scoring rows by joining on movieId.
rec_percentages.head(10).merge(movies,on = "movieId")

Unnamed: 0,movieId,similar,all,score,title,genres,clean_title


In [232]:
def find_similar_movies(movie_id, min_rating=4, min_share=.10, top_n=10):
    """Recommend movies liked by the users who liked ``movie_id``.

    Users who rated ``movie_id`` above ``min_rating`` form the "similar" group.
    A candidate is any movie that more than ``min_share`` of that group also
    rated above ``min_rating``. Each candidate is scored as its share among the
    similar group divided by its share among all users who rated any candidate
    above ``min_rating``.

    Parameters
    ----------
    movie_id : int
        Id of the reference movie, matched against ``ratings["movieId"]``.
    min_rating : float, default 4
        A rating counts as a "like" only when strictly greater than this value.
    min_share : float, default 0.10
        Minimum fraction (exclusive) of similar users that must like a candidate.
    top_n : int, default 10
        Number of highest-scoring candidates to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres`` for the best candidates.
        The frame is empty when nobody rated ``movie_id`` above ``min_rating``.
    """
    # Filter the ratings once; every step below only ever looks at "liked" rows,
    # so there is no need to re-scan the full frame for each mask.
    liked = ratings[ratings["rating"] > min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]

    # Share of the similar group that liked each movie, thresholded.
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Share of each candidate among everyone who liked at least one candidate.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # A high ratio means the movie is liked disproportionately by the similar group.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(top_n).merge(movies, on="movieId")[["score", "title", "genres"]]

In [233]:
# Text box for the title the recommendations are based on.
movie_name_input = widgets.Text(
    value="",
    description="Movie Title: ",
)

# Output area that the recommendation table is rendered into.
recommendation_list = widgets.Output()

def on_type(data):
    """Refresh ``recommendation_list`` with recommendations for the text in ``data["new"]``.

    The output area is cleared on every call. When the text is longer than five
    characters, the first row returned by ``search`` supplies the movie id
    passed to ``find_similar_movies``.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        matches = search(typed)
        first_match_id = matches.iloc[0]["movieId"]
        display(find_similar_movies(first_match_id))

# Call on_type whenever the text box's "value" changes.
movie_name_input.observe(on_type, names="value")

# Show the input box with the recommendation area beneath it.
display(movie_name_input, recommendation_list)

Text(value='', description='Movie Title: ')

Output()