In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import numpy as np
import ipywidgets as widgets
from IPython.display import display

# Load the two CSV tables used throughout the notebook: `movies` (titles that
# the search cells index) and `ratings` (per-user ratings used for recommendations).
movies, ratings = pd.read_csv("movies.csv"), pd.read_csv("ratings.csv")

In [5]:
def clean_title(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed."""
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [6]:
# Add a "clean_title" column holding each movie title after clean_title() has
# stripped its punctuation and other special characters.
movies["clean_title"] = movies["title"].map(clean_title)

In [7]:
# Fit a TF-IDF model on the cleaned titles, using single words and two-word
# phrases as terms, and keep the resulting matrix for similarity lookups.
title_corpus = movies["clean_title"]
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(title_corpus)

In [8]:
def search(input, top_n=5):
    """Return the movies whose cleaned titles best match a free-text query.

    Parameters
    ----------
    input : str
        Query text. The name shadows the builtin but is kept so that existing
        keyword callers keep working.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies``, ordered from highest to lowest
        cosine similarity between the query and each title's TF-IDF vector.
    """
    # Clean the query exactly like the indexed titles were cleaned, so that
    # punctuation in the query cannot make it tokenize differently.
    query_vec = vectorizer.transform([clean_title(input)])
    # One similarity score per row of `movies`.
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist; argpartition raises otherwise.
    top_n = min(top_n, len(similarity))
    if top_n <= 0:
        return movies.iloc[0:0]

    # argpartition only guarantees that the last `top_n` positions hold the
    # `top_n` largest scores, in no particular order, so rank them explicitly.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies.iloc[ranked]

In [9]:
# Text box for the title query, plus an output area that shows the matches.
movie_input = widgets.Text(value="Toy Story", description="Movie Title:", disabled=False)
movie_list = widgets.Output()


def on_type(data):
    """Redraw ``movie_list`` with search results whenever the text value changes."""
    with movie_list:
        movie_list.clear_output()
        typed = data['new']
        # Only queries longer than 5 characters are searched; shorter ones
        # just leave the output area cleared.
        if len(typed) <= 5:
            return
        display(search(typed))


# Fire the handler on every change of the text box's value, then show both widgets.
movie_input.observe(on_type, names='value')
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [13]:
# Pick one movie id to walk through the recommendation steps by hand and
# pull its row out of the movies table.
movie_id = 1
movie = movies.loc[movies["movieId"].eq(movie_id)]

In [18]:
# "Liked" means rated strictly above 4. This is the same cutoff the all_users
# cell and find_similar_movies() apply; using >= 4 here would compute the
# "similar" share and the "all" share under different definitions.
liked_ratings = ratings[ratings["rating"] > 4]

# Users who liked the chosen movie ...
similar_users = liked_ratings.loc[liked_ratings["movieId"] == movie_id, "userId"].unique()
# ... and the ids of every movie those users liked (one entry per rating).
similar_user_recs = liked_ratings.loc[liked_ratings["userId"].isin(similar_users), "movieId"]

In [19]:
# Turn the liked-movie ids into, per movie, the fraction of similar users who
# liked it, and keep only movies liked by more than 10% of them.
similar_user_recs = (
    similar_user_recs.value_counts()
    .div(len(similar_users))
    .loc[lambda share: share > .1]
)

In [22]:
# Every rating above 4 given to one of the candidate movies, by any user.
is_candidate_movie = ratings["movieId"].isin(similar_user_recs.index)
is_liked = ratings["rating"] > 4
all_users = ratings[is_candidate_movie & is_liked]

In [23]:
# Per candidate movie, the fraction of all users in `all_users` who liked it.
n_all_users = len(all_users["userId"].unique())
all_user_recs = all_users["movieId"].value_counts() / n_all_users

In [26]:
# Put the two shares side by side, aligned on movie id:
# "similar" = share among similar users, "all" = share among all users.
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1).set_axis(
    ["similar", "all"], axis="columns"
)

In [14]:
# Score = how much more popular a movie is among similar users than among all
# users; the highest ratios come first.
# NOTE(review): the next cell repeats this exact computation and this cell's
# execution count (In [14]) is out of sequence, so one of the two is likely stale.
rec_percentages = (
    rec_percentages
    .assign(score=rec_percentages["similar"] / rec_percentages["all"])
    .sort_values("score", ascending=False)
)

In [28]:
# Ratio of the share among similar users to the share among all users,
# then order the table from the highest ratio to the lowest.
rec_percentages["score"] = rec_percentages["similar"].div(rec_percentages["all"])
rec_percentages = rec_percentages.sort_values(by="score", ascending=False)

In [34]:
#rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

In [36]:
def find_similar_movies(movie_id, min_rating=4, min_share=.10, top_n=10,
                        ratings_df=None, movies_df=None):
    """Recommend movies liked by the users who also liked ``movie_id``.

    A rating counts as a "like" when it is strictly greater than
    ``min_rating``. Each candidate movie is scored as the share of similar
    users who liked it divided by the share of all users (who liked at least
    one candidate) who liked it.

    Parameters
    ----------
    movie_id : int
        Value to match against the ``movieId`` column.
    min_rating : float, default 4
        Ratings must be strictly above this value to count as a like.
    min_share : float, default 0.10
        A candidate must be liked by more than this fraction of similar users.
    top_n : int, default 10
        Number of highest-scoring candidates to return.
    ratings_df, movies_df : pandas.DataFrame, optional
        Tables to use instead of the notebook-level ``ratings`` and ``movies``.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, highest score first.
        Empty when nobody liked ``movie_id``.
    """
    # Fall back to the notebook-level tables when none are passed explicitly.
    if ratings_df is None:
        ratings_df = ratings
    if movies_df is None:
        movies_df = movies

    # Every step below only looks at likes, so filter once.
    liked = ratings_df[ratings_df["rating"] > min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nobody liked this movie: nothing to recommend, and the share below
        # would otherwise be divided by zero users.
        return pd.DataFrame(columns=["score", "title", "genres"])

    # Share of similar users who liked each movie, keeping the common ones only.
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Share of all users (with a like on any candidate) who liked each candidate.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # A ratio above 1 means the movie is liked more often by similar users
    # than by users in general.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(top_n).merge(movies_df, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [42]:
# Text box for a movie title, plus an output area that shows recommendations
# for the first movie the title search returns.
movie_name_input = widgets.Text(
    value='Toy Story',
    description='Similar Titles:',
    disabled=False
)
recommendation_list = widgets.Output()


# Named distinctly from the search box's `on_type` handler defined earlier in
# the notebook, so this cell no longer silently replaces that function.
def on_similar_title_type(data):
    """Redraw ``recommendation_list`` whenever the text value changes.

    ``data`` is the change dictionary passed by ``observe``; its ``"new"``
    entry is the text currently in the box.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        # Only titles longer than 5 characters trigger a lookup.
        if len(title) > 5:
            results = search(title)
            if results.empty:
                # No rows to pick a movie from; leave the output cleared.
                return
            # Recommendations are based on the first row of the search results.
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))


movie_name_input.observe(on_similar_title_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Similar Titles:')

Output()