# User Based Collaborative Filtering Recommender System

### Relevant Libraries

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the movie metadata; the preview below shows movieId, title and genres columns.
movies = pd.read_csv("data/movies.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# (rows, columns) of the movies table.
movies.shape

(62423, 3)

### Clean data

In [4]:
def clean_title(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed.

    Punctuation such as parentheses, commas and apostrophes is dropped, so
    "Toy Story (1995)" becomes "Toy Story 1995".
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

# Store a cleaned copy of every title in a new "clean_title" column.
movies["clean_title"] = movies["title"].map(clean_title)

In [5]:
# Check the new clean_title column against the original title.
movies.head()

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


### Search engine

In [6]:
# Build the search index: a TF-IDF matrix over the cleaned titles.
# ngram_range=(1,2) indexes single words and two-word sequences.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# One row per movie, in the same order as the rows of `movies`.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [7]:
def search(title):
    """Return the movies whose cleaned titles best match a free-text query.

    The query is cleaned with ``clean_title``, vectorized with the fitted
    ``vectorizer`` and compared with every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Search text typed by the user.

    Returns
    -------
    pandas.DataFrame
        Up to five rows of ``movies``, ordered from most to least similar,
        so ``.iloc[0]`` is the best match.
    """
    title = clean_title(title)
    # Convert the search term into the same vector space as the indexed titles.
    query_vec = vectorizer.transform([title])

    # Similarity between the search term and every indexed title.
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # np.argpartition only selects *which* entries are the largest; it does not
    # order them, and it raises when there are fewer than five entries. A stable
    # sort on the negated scores gives a true best-first ranking for any size,
    # with ties resolved by original row order.
    top_indices = np.argsort(-similarity, kind="stable")[:5]

    # Rows of `tfidf` line up positionally with rows of `movies`.
    return movies.iloc[top_indices]

### Interactive search box

In [8]:
import ipywidgets as widgets
from IPython.display import display
# Text box the user types a movie title into.
movie_input = widgets.Text(
    description="Movie Title:",
    disabled=False
)

# Output area that holds the most recent search results.
movie_list = widgets.Output()

def on_type(data):
    """Refresh the search results when the text box value changes.

    ``data`` is the change dictionary handed over by ``observe``; its
    ``"new"`` entry is read as the current text.
    """
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        # Inputs of five characters or fewer are ignored.
        if len(title) > 5:
            display(search(title))

# Only changes to the widget's "value" trait trigger the handler.
movie_input.observe(on_type, names="value")

display(movie_input, movie_list)

Text(value='', description='Movie Title:')

Output()

### Recommend similar movies

In [9]:
# Read in the ratings data (userId, movieId, rating and timestamp columns, per the preview below).
ratings = pd.read_csv("data/ratings.csv")

In [10]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [11]:
# (rows, columns) of the ratings table.
ratings.shape

(25000095, 4)

In [12]:
# Column dtypes; rating is a float because values such as 3.5 occur.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [13]:
# Example movie used for the walkthrough below: movieId 1 is "Toy Story (1995)" in the movies preview.
movie_id = 1

In [14]:
# Users who rated the chosen movie above 4 (i.e. 4.5 or 5 if ratings come in half-star steps up to 5).
# They stand in for people whose taste is probably similar to yours.
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] > 4),
    "userId",
].unique()

In [15]:
# Array of userIds that rated the chosen movie above 4.
similar_users

array([    36,     75,     86, ..., 162527, 162530, 162533], dtype=int64)

In [16]:
# Every movie the similar users rated above 4 (one entry per user/movie rating).
similar_users_movies = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
    "movieId",
]

In [17]:
# movieIds rated above 4 by the similar users, indexed by row position in `ratings`.
similar_users_movies

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
24998854    60069
24998861    67997
24998876    78499
24998884    81591
24998888    88129
Name: movieId, Length: 1358326, dtype: int64

In [18]:
# Fraction of similar users that rated each movie above 4,
# keeping only movies liked by more than 10 percent of them.
similar_users_score = (
    similar_users_movies.value_counts()
    .div(len(similar_users))
    .loc[lambda share: share > .1]
)

In [19]:
# Candidate movies (index = movieId) with the fraction of similar users who liked each.
similar_users_score

1        1.000000
318      0.445607
260      0.403770
356      0.370215
296      0.367295
           ...   
953      0.103053
551      0.101195
1222     0.100876
745      0.100345
48780    0.100186
Name: movieId, Length: 113, dtype: float64

In [20]:
# How often each candidate movie is rated above 4 across the whole ratings table.
# The denominator counts the users who rated at least one candidate above 4.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_users_score.index) & (ratings["rating"] > 4)
]
all_users_score = all_users["movieId"].value_counts().div(all_users["userId"].unique().size)

In [21]:
# Fraction of users (among those who liked at least one candidate) that liked each candidate movie.
all_users_score

318      0.342220
296      0.284674
2571     0.244033
356      0.235266
593      0.225909
           ...   
551      0.040918
50872    0.039111
745      0.037031
78499    0.035131
2355     0.025091
Name: movieId, Length: 113, dtype: float64

#### Recommendation score

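The recommendation score is the ratio of a movie's share among similar users to its share among all users. A high ratio means the movie is liked disproportionately often by users with similar taste, which makes it a better recommendation than a movie that is simply popular with everyone.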

In [22]:
# Put both shares side by side, aligned on movieId, under readable column names.
recommended_movies_scores = pd.concat(
    [similar_users_score, all_users_score],
    axis=1,
).set_axis(["similar", "all"], axis=1)

In [23]:
# Preview: "similar" = share among similar users, "all" = share in the comparison population.
recommended_movies_scores.head()

Unnamed: 0,similar,all
1,1.0,0.124728
318,0.445607,0.34222
260,0.40377,0.222207
356,0.370215,0.235266
296,0.367295,0.284674


In [24]:
# Ratio of the two shares: values above 1 mean similar users like the movie more often than users overall.
recommended_movies_scores["recommendation_score"] = recommended_movies_scores["similar"].div(
    recommended_movies_scores["all"]
)

In [25]:
# Highest recommendation score first.
recommended_movies_scores = recommended_movies_scores.sort_values(
    by="recommendation_score",
    ascending=False,
)

In [26]:
# Candidate movies ranked by recommendation score (index = movieId).
recommended_movies_scores

Unnamed: 0,similar,all,recommendation_score
1,1.000000,0.124728,8.017414
3114,0.280648,0.053706,5.225654
2355,0.110539,0.025091,4.405452
78499,0.152960,0.035131,4.354038
4886,0.235147,0.070811,3.320783
...,...,...,...
2858,0.216724,0.167634,1.292845
296,0.367295,0.284674,1.290232
79132,0.166817,0.131384,1.269693
4973,0.142501,0.112405,1.267747


#### Get the top 10 recommendations

In [27]:
# Attach titles and genres to the ten best-scoring movieIds.
pd.merge(
    recommended_movies_scores.head(10),
    movies,
    left_index=True,
    right_on="movieId",
)

Unnamed: 0,similar,all,recommendation_score,movieId,title,genres,clean_title
0,1.0,0.124728,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.280648,0.053706,5.225654,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.110539,0.025091,4.405452,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.15296,0.035131,4.354038,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,0.235147,0.070811,3.320783,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
580,0.216618,0.067513,3.208539,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
6258,0.228139,0.072268,3.156862,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,0.1794,0.059977,2.99115,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,0.203504,0.068453,2.972889,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
359,0.253411,0.085764,2.954762,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


### Recommendation function

In [28]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.1, top_n=10):
    """Recommend movies liked disproportionately often by fans of ``movie_id``.

    A rating counts as a "like" when it is strictly greater than
    ``min_rating``. Users who liked ``movie_id`` are the "similar users".
    Each movie they liked is scored by the ratio of its share among similar
    users to its share among the comparison population.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base recommendations on.
    min_rating : float, default 4
        A rating must be strictly above this value to count as a like.
    min_share : float, default 0.1
        A movie is kept only if more than this fraction of similar users
        liked it.
    top_n : int, default 10
        Maximum number of recommendations returned.

    Returns
    -------
    pandas.DataFrame
        At most ``top_n`` rows with columns ``similar``, ``all`` and ``score``
        followed by the columns of ``movies``, sorted by ``score`` descending.
        The queried movie itself is not filtered out.
    """
    # The like mask spans the whole ratings table, so build it once and reuse it.
    liked = ratings["rating"] > min_rating

    # Users who liked the queried movie.
    similar_users = ratings.loc[(ratings["movieId"] == movie_id) & liked, "userId"].unique()

    # Every movie those users liked (one entry per user/movie rating).
    similar_users_movies = ratings.loc[ratings["userId"].isin(similar_users) & liked, "movieId"]

    # Fraction of similar users that liked each movie; drop rarely liked ones.
    similar_users_score = similar_users_movies.value_counts() / len(similar_users)
    similar_users_score = similar_users_score[similar_users_score > min_share]

    # How common each candidate is in general. The denominator is the number of
    # users who liked at least one candidate, not every user in `ratings`.
    all_users = ratings[ratings["movieId"].isin(similar_users_score.index) & liked]
    all_users_score = all_users["movieId"].value_counts() / all_users["userId"].nunique()

    # Align both shares on movieId.
    recommended_movies_scores = pd.concat([similar_users_score, all_users_score], axis=1)
    recommended_movies_scores.columns = ["similar", "all"]

    # A ratio above 1 means similar users like the movie more often than users overall.
    recommended_movies_scores["score"] = recommended_movies_scores["similar"] / recommended_movies_scores["all"]
    recommended_movies_scores = recommended_movies_scores.sort_values("score", ascending=False)

    # Attach title and genre information for the best-scoring movieIds.
    return recommended_movies_scores.head(top_n).merge(movies, left_index=True, right_on="movieId")

In [29]:
# Text box for the movie to base recommendations on.
# NOTE(review): the handler below is registered for value *changes*, so the
# pre-filled text presumably shows no recommendations until it is edited.
movie_name_input = widgets.Text(
    value="Fast and Furious",
    description="Movie Title:",  # was misspelled "descriptions", so no label was set
    disabled=False
)

# Output area that holds the most recent recommendations.
recommendation_list = widgets.Output()

def on_type(data):
    """Show recommendations for the movie that best matches the typed title.

    ``data`` is the change dictionary handed over by ``observe``; its
    ``"new"`` entry is read as the current text.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        # Inputs of five characters or fewer are ignored.
        if len(title) > 5:
            results = search(title)
            # Nothing to recommend from if the search returned no rows.
            if results.empty:
                return
            # The first row returned by search is used as the matched movie.
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

# Only changes to the widget's "value" trait trigger the handler.
movie_name_input.observe(on_type, names="value")

display(movie_name_input, recommendation_list)

Text(value='Fast and Furious')

Output()