# Content Based Recommender System

In [1]:
# Libraries
from math import sqrt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Download data (run once)
#!wget -O moviedataset.zip https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/moviedataset.zip
#!unzip -o -j moviedataset.zip 

In [3]:
# Read the two CSV files of the dataset: the movie catalogue and the user ratings
MOVIES_CSV, RATINGS_CSV = 'movies.csv', 'ratings.csv'
movies_df = pd.read_csv(MOVIES_CSV)
ratings_df = pd.read_csv(RATINGS_CSV)

In [4]:
print(movies_df.shape)
print(ratings_df.shape)

(34208, 3)
(22884377, 4)


In [5]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Clean the title column and add a year column.
# The year is taken from the "(YYYY)" tag only, so digits that belong to the
# title itself (e.g. "2001: A Space Odyssey (1968)") are not mistaken for it.
movies_df['year'] = movies_df.title.str.extract(r'\((\d{4})\)', expand=False)
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the "(YYYY)" tag in the title.
movies_df['title'] = movies_df.title.str.replace(r'(\(\d{4}\))', '', regex=True)

In [7]:
movies_df.title[0][-1].isspace()

True

In [8]:
# Removing the "(YYYY)" tag leaves whitespace at the end of the title.
# Trim it with the vectorized string accessor, which skips missing titles
# instead of raising on them like a per-row lambda would.
movies_df['title'] = movies_df['title'].str.strip()
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [9]:
# Turn the pipe-delimited genre string of every movie into a list of genre names
movies_df['genres'] = movies_df['genres'].str.split('|')
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [10]:
# New DataFrame with one indicator column per genre (one-hot encoding).
# The (row, column) position of every genre occurrence is collected first and
# written into a NumPy matrix in a single step. This replaces the scalar
# `.at` writes inside `iterrows()`, which are very slow on a catalogue this size.
genre_positions = {}  # genre name -> column position, in order of first appearance
row_idx, col_idx = [], []
for row_no, genres in enumerate(movies_df['genres']):
    for genre in genres:
        row_idx.append(row_no)
        col_idx.append(genre_positions.setdefault(genre, len(genre_positions)))

genre_flags = np.zeros((len(movies_df), len(genre_positions)))
genre_flags[np.asarray(row_idx, dtype=np.intp), np.asarray(col_idx, dtype=np.intp)] = 1.0

movies_with_genres_df = pd.concat(
    [movies_df, pd.DataFrame(genre_flags, index=movies_df.index, columns=list(genre_positions))],
    axis=1,
)

# fill any remaining NaN with 0's
movies_with_genres_df = movies_with_genres_df.fillna(0)
movies_with_genres_df.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Lets see rating
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [12]:
# We don't need the timestamp column.
# Use the `columns=` keyword: the positional axis form `drop('timestamp', 1)`
# is deprecated and no longer accepted by pandas 2.x.
ratings_df = ratings_df.drop(columns='timestamp')
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


In [17]:
ratings_df.rating.min()

0.5

In [26]:
# Ratings given by a hypothetical user
user_input = [
    {'title': 'Toy Story', 'rating': 4.5},
    {'title': 'Pocahontas', 'rating': 2},
    {'title': 'Jumanji', 'rating': 3},
    {'title': 'Balto', 'rating': 4},
    {'title': 'Pulp Fiction', 'rating': 5},
]
input_movies = pd.DataFrame(data=user_input)
input_movies

Unnamed: 0,title,rating
0,Toy Story,4.5
1,Pocahontas,2.0
2,Jumanji,3.0
3,Balto,4.0
4,Pulp Fiction,5.0


In [27]:
# Attach the catalogue movieId to every title the user rated.
# pd.merge joins on the column both frames share, which here is `title`.
input_id = movies_df[movies_df['title'].isin(input_movies['title'].tolist())]
input_movies = pd.merge(input_id, input_movies)
# Drop the columns that are not needed from here on. The `columns=` keyword is
# used because the positional axis form `drop(..., 1)` is rejected by pandas 2.x.
input_movies = input_movies.drop(columns=['genres', 'year'])
input_movies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,4.5
1,2,Jumanji,3.0
2,13,Balto,4.0
3,48,Pocahontas,2.0
4,296,Pulp Fiction,5.0


In [28]:
# Keep only the one-hot encoded rows of the movies the user has rated
rated_ids = input_movies['movieId'].tolist()
user_movies = movies_with_genres_df.loc[movies_with_genres_df['movieId'].isin(rated_ids)]
user_movies

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,13,Balto,"[Adventure, Animation, Children]",1995,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47,48,Pocahontas,"[Animation, Children, Drama, Musical, Romance]",1995,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
293,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Only the genre indicator matrix is needed, so reset the index first and
# then drop the descriptive columns. The `columns=` keyword replaces the
# positional axis form `drop(..., 1)`, which pandas 2.x no longer accepts.
user_movies = user_movies.reset_index(drop=True)
user_genre_table = user_movies.drop(columns=['movieId', 'title', 'genres', 'year'])
user_genre_table

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
input_movies['rating']

0    4.5
1    3.0
2    4.0
3    2.0
4    5.0
Name: rating, dtype: float64

In [32]:
print(user_genre_table.shape)
print(user_genre_table.transpose().shape)
print(input_movies['rating'].shape)

(5, 20)
(20, 5)
(5,)


In [33]:
# Learn the user's preferences from the rated movies: the transposed genre
# matrix dotted with the rating vector gives, for every genre, the sum of the
# ratings of the rated movies that belong to it.
user_profile = user_genre_table.T.dot(input_movies['rating'])
user_profile

Adventure             11.5
Animation             10.5
Children              13.5
Comedy                 9.5
Fantasy                7.5
Romance                2.0
Drama                  7.0
Action                 0.0
Crime                  5.0
Thriller               5.0
Horror                 0.0
Mystery                0.0
Sci-Fi                 0.0
IMAX                   0.0
Documentary            0.0
War                    0.0
Musical                2.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

In [34]:
# user_profile now holds one weight per genre (the user's profile); with it
# we can recommend new movies that satisfy the user's preferences.
#
# Next, get the genres of every movie, indexed by movieId
# (the movieId column itself is kept for now).
genre_table = movies_with_genres_df.set_index('movieId', drop=False)
genre_table

Unnamed: 0_level_0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151697,151697,Grand Slam,[Thriller],1967,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
151701,151701,Bloodmoney,[(no genres listed)],2010,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
151703,151703,The Butterfly Circus,[Drama],2009,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
151709,151709,Zero,"[Drama, Sci-Fi]",2015,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# Drop the descriptive columns so only the genre indicators remain.
# The `columns=` keyword replaces the positional axis form `drop(..., 1)`,
# which pandas 2.x no longer accepts.
genre_table = genre_table.drop(columns=['movieId', 'title', 'genres', 'year'])
genre_table

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151697,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
151701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
151703,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
151709,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
print(user_profile.shape)
print(genre_table.shape)

(20,)
(34208, 20)


In [42]:
# With the user's profile and the genre table of every movie, compute the
# weighted average of each movie: multiply its genre indicators by the profile
# weights, add them up and divide by the total weight. The movies with the
# highest score are the ones that best satisfy the profile.
recommendation_table_df = (
    genre_table.mul(user_profile, axis='columns')
    .sum(axis=1)
    .div(user_profile.sum())
)
recommendation_table_df.head()

movieId
1    0.714286
2    0.442177
3    0.156463
4    0.251701
5    0.129252
dtype: float64

In [50]:
# The result above is a score of how closely each movie matches the user's tastes.
# Now sort it in descending order so the top 20 can be read off the head.
recommendation_table_df = recommendation_table_df.sort_values(ascending=False)
recommendation_table_df.head()

movieId
26093    0.863946
2987     0.782313
56152    0.768707
1907     0.761905
84637    0.741497
dtype: float64

In [51]:
recommendation_table_df.head(20).keys()

Int64Index([ 26093,   2987,  56152,   1907,  84637,   4306,   8444,  92348,
            108540,  53121,  33463, 136361,  26340, 108932, 103755,  32031,
             36397,  51939,  78637,  78499],
           dtype='int64', name='movieId')

In [52]:
# The final recommendation table: catalogue rows of the 20 best-scoring movies
top_ids = recommendation_table_df.head(20).index
movies_df[movies_df['movieId'].isin(top_ids)]

Unnamed: 0,movieId,title,genres,year
1824,1907,Mulan,"[Adventure, Animation, Children, Comedy, Drama...",1998
2902,2987,Who Framed Roger Rabbit?,"[Adventure, Animation, Children, Comedy, Crime...",1988
4212,4306,Shrek,"[Adventure, Animation, Children, Comedy, Fanta...",2001
7815,8444,"Chipmunk Adventure, The","[Adventure, Animation, Children, Comedy, Fanta...",1987
8605,26093,"Wonderful World of the Brothers Grimm, The","[Adventure, Animation, Children, Comedy, Drama...",1962
8783,26340,"Twelve Tasks of Asterix, The (Les douze travau...","[Action, Adventure, Animation, Children, Comed...",1976
9825,32031,Robots,"[Adventure, Animation, Children, Comedy, Fanta...",2005
10120,33463,DuckTales: The Movie - Treasure of the Lost Lamp,"[Adventure, Animation, Children, Comedy, Fantasy]",1990
10375,36397,Valiant,"[Adventure, Animation, Children, Comedy, Fanta...",2005
11751,51939,TMNT (Teenage Mutant Ninja Turtles),"[Action, Adventure, Animation, Children, Comed...",2007


In [53]:
# netflix n chill :D

In [62]:
def preprocessing_movies(df):
    '''
    CLEAN THE RAW MOVIES DATAFRAME
    df: dataframe with a ``title`` column formatted like "Toy Story (1995)"
        and a ``genres`` column of pipe-delimited genre names (movies_df)

    Returns a new dataframe (the input is left untouched) in which
      * ``year`` holds the four digits of the "(YYYY)" tag as a string
        (NaN when the title carries no such tag),
      * ``title`` has the "(YYYY)" tag and surrounding whitespace removed,
      * ``genres`` is a list of genre names.
    '''
    # Work on a copy so the caller's dataframe is not modified in place
    df = df.copy()
    # Only a parenthesised group counts as the year, so digits inside the title
    # itself ("2001: A Space Odyssey (1968)") are not picked up by mistake.
    df['year'] = df['title'].str.extract(r'\((\d{4})\)', expand=False)
    # regex=True must be explicit: pandas >= 2.0 does a literal replace by default
    df['title'] = df['title'].str.replace(r'(\(\d{4}\))', '', regex=True).str.strip()
    df['genres'] = df['genres'].str.split('|')
    return df
    
def preprocessing_ratings(df):
    '''
    REMOVE THE TIMESTAMP COLUMN FROM THE RATINGS DATAFRAME
    df: dataframe with a ``timestamp`` column (ratings_df)

    Returns a new dataframe without that column; the input is not modified.
    '''
    # `columns=` keyword: the positional axis form `drop('timestamp', 1)` is
    # deprecated and no longer accepted by pandas 2.x.
    return df.drop(columns='timestamp')

def genres_ohe(df):
    '''
    ABLE TO CONVERT movies_df TO A DATAFRAME WITH GENRE ONE HOT ENCODING
    df: dataframe whose ``genres`` column holds a list of genre names per row
        (movies_df after preprocessing)

    Returns a new dataframe with the columns of ``df`` followed by one float
    column per genre (1.0 when the movie has the genre, 0.0 otherwise), in
    order of first appearance. Any remaining NaN is filled with 0.
    '''
    # Collect the (row, column) position of every genre occurrence, then set
    # them all at once. This avoids the per-cell `.at` writes inside
    # `iterrows()`, which are very slow on a large catalogue.
    genre_positions = {}  # genre name -> column position, in order of first appearance
    row_idx, col_idx = [], []
    for row_no, genres in enumerate(df['genres']):
        for genre in genres:
            row_idx.append(row_no)
            col_idx.append(genre_positions.setdefault(genre, len(genre_positions)))

    genre_flags = np.zeros((len(df), len(genre_positions)))
    # Explicit integer dtype keeps the indexing valid when there are no genres at all
    genre_flags[np.asarray(row_idx, dtype=np.intp), np.asarray(col_idx, dtype=np.intp)] = 1.0

    movies_with_genres_df = pd.concat(
        [df, pd.DataFrame(genre_flags, index=df.index, columns=list(genre_positions))],
        axis=1,
    )
    # Fill NaN with 0
    movies_with_genres_df = movies_with_genres_df.fillna(0)
    return movies_with_genres_df

def avg_weight(genre_table,user_profile):
    '''
    SCORE EVERY MOVIE AGAINST THE USER'S PROFILE
    genre_table: dataframe of genre indicators, one row per movie
    user_profile: series with one weight per genre

    Returns a series with the weighted average of each movie (sum of its
    genre indicators times the profile weights, divided by the total weight),
    sorted from highest to lowest score.
    '''
    weighted_genres = genre_table * user_profile
    movie_totals = weighted_genres.sum(axis=1)
    total_weight = user_profile.sum()
    scores = movie_totals / total_weight
    return scores.sort_values(ascending=False)

def recommend(user_input, top_n=10, movies_catalog='movies.csv', ratings_catalog='ratings.csv'):
    '''
    ABLE TO RECOMMEND TOP N MOVIES BASED ON USERS PROFILE

    user_input: a list with dictionaries as elements as following [{'title':'','rating':},...]
                where title value is str and rating value is float (min=0.5,max=5,step=0.5).
                Titles must match the cleaned catalogue titles (without the "(YYYY)" tag).
    top_n: int. Number of best-scoring movies to select
    movies_catalog: file path. str
    ratings_catalog: file path. str. Kept so existing calls keep working; the
                     ratings are not used to build the recommendation, so the
                     file is not read.

    Returns the rows of the cleaned movies dataframe for the top_n best-scoring
    movies, in catalogue order (not ranked by score). Movies the user already
    rated are not excluded.

    Raises ValueError when none of the given titles is found in the catalogue.
    '''
    # Load and clean the catalogue from the path the caller asked for
    movies_df = preprocessing_movies(pd.read_csv(movies_catalog))
    movies_with_genres_df = genres_ohe(movies_df)

    # input to df
    input_movies = pd.DataFrame(user_input)

    # Add the movieId to the input movies (merge joins on the shared column, title)
    input_id = movies_df[movies_df['title'].isin(input_movies['title'].tolist())]
    input_movies = pd.merge(input_id, input_movies)
    input_movies = input_movies.drop(columns=['genres', 'year'])
    if input_movies.empty:
        # Without a single match the profile would be all zeros and every
        # score NaN, so the "recommendation" would be arbitrary rows.
        raise ValueError('None of the rated titles were found in the movies catalogue')

    # Genre indicators of the movies the user rated, in the same order as input_movies
    user_movies = movies_with_genres_df[movies_with_genres_df['movieId'].isin(input_movies['movieId'].tolist())]
    user_movies = user_movies.reset_index(drop=True)
    user_genre_table = user_movies.drop(columns=['movieId', 'title', 'genres', 'year'])

    # Learn the user's preferences: per genre, the sum of the ratings of its rated movies
    user_profile = user_genre_table.transpose().dot(input_movies['rating'])

    # Genre indicators of every movie, indexed by movieId
    genre_table = movies_with_genres_df.set_index('movieId')
    genre_table = genre_table.drop(columns=['title', 'genres', 'year'])

    # Weighted average of every movie based on the user's profile, best first
    recommendation_table_df = avg_weight(genre_table, user_profile)

    # the final recommendation table
    top_ids = recommendation_table_df.head(top_n).keys()
    return movies_df.loc[movies_df['movieId'].isin(top_ids)]

In [60]:
# Ratings given by a hypothetical user
user_input = [
    {'title': 'Toy Story', 'rating': 4.5},
    {'title': 'Pocahontas', 'rating': 2},
    {'title': 'Jumanji', 'rating': 3},
    {'title': 'Balto', 'rating': 4},
    {'title': 'Pulp Fiction', 'rating': 5},
]

In [63]:
recommend(user_input,10)

Unnamed: 0,movieId,title,genres,year
1824,1907,Mulan,"[Adventure, Animation, Children, Comedy, Drama...",1998
2902,2987,Who Framed Roger Rabbit?,"[Adventure, Animation, Children, Comedy, Crime...",1988
4212,4306,Shrek,"[Adventure, Animation, Children, Comedy, Fanta...",2001
7815,8444,"Chipmunk Adventure, The","[Adventure, Animation, Children, Comedy, Fanta...",1987
8605,26093,"Wonderful World of the Brothers Grimm, The","[Adventure, Animation, Children, Comedy, Drama...",1962
11883,53121,Shrek the Third,"[Adventure, Animation, Children, Comedy, Fantasy]",2007
12269,56152,Enchanted,"[Adventure, Animation, Children, Comedy, Fanta...",2007
16770,84637,Gnomeo & Juliet,"[Adventure, Animation, Children, Comedy, Fanta...",2011
18587,92348,Puss in Boots (Nagagutsu o haita neko),"[Adventure, Animation, Children, Comedy, Fanta...",1969
22778,108540,Ernest & Célestine (Ernest et Célestine),"[Adventure, Animation, Children, Comedy, Drama...",2012
