# 1. Content-based Recommender System

## Importing necessary items

In [1]:
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Reading data and storing in DataFrames

In [2]:
movies_df = pd.read_csv('movies.csv')
print('The shape of movies dataframe is:', movies_df.shape)
movies_df.head()

The shape of movies dataframe is: (9742, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings_df = pd.read_csv('ratings.csv')
print('The shape of rating dataframe is:', ratings_df.shape)
ratings_df.head()

The shape of rating dataframe is: (100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## Preprocessing the movies dataframe


In [4]:
# removing year from the titles and storing them in a new column
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))',expand=False)
# or movies_df['year'] = movies_df['title'].str.extract(r'(\d{4})',expand=False)
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)',expand=False)
movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())
# or movies_df['title'] = movies_df['title'].str.strip()
movies_df.head()

# In the context of the str.extract() method in pandas, the expand=False parameter controls...
# whether the result of the extraction should be returned as a Series (if expand=False) ...
# or a DataFrame (if expand=True or not specified).

  movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [5]:
# converting the genres column from items seperated by '|' to a list of items
movies_df['genres'] = movies_df.genres.str.split('|')
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


### converting the list of genres to a vector: 1 shows that a movie has that genre and 0 shows that it doesn't.

In [6]:
# making a copy from the original movies dataframe
moviesWithGenres_df = movies_df.copy()

for index, row in movies_df.iterrows():
    for genre in row['genres']:
        moviesWithGenres_df.at[index, genre] = 1
#If a movie doesn't have that column's genre Fill in the NaN values with 0 
moviesWithGenres_df = moviesWithGenres_df.fillna(0)
moviesWithGenres_df.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Preprocessing the ratings dataframe

In [7]:
ratings_df.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [8]:
# removing the timestamp column
ratings_df = ratings_df.drop('timestamp', 1)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


# Content-Based recommendation system

In [9]:
movies_df['title'][30:40]

30                       Dangerous Minds
31    Twelve Monkeys (a.k.a. 12 Monkeys)
32                                  Babe
33                      Dead Man Walking
34                          It Takes Two
35                              Clueless
36              Cry, the Beloved Country
37                           Richard III
38                       Dead Presidents
39                           Restoration
Name: title, dtype: object

In [10]:
# userInput = [
#             {'title':'Breakfast Club, The', 'rating':5},
#             {'title':'Toy Story', 'rating':3.5},
#             {'title':'Jumanji', 'rating':2},
#             {'title':"Pulp Fiction", 'rating':5},
#             {'title':'Akira', 'rating':4.5}
#          ] 
# inputMovies = pd.DataFrame(userInput)
# inputMovies

In [11]:
userInput = [
            {'title':'Dangerous Minds', 'rating':4},
            {'title':'It Takes Two', 'rating':2.5},
            {'title':'Clueless', 'rating':3},
            {'title':"Pulp Fiction", 'rating':2},
            {'title':'Richard III', 'rating':4.5},
            {'title':'Dead Presidents', 'rating':2},
            {'title':'Babe', 'rating':1},
            {'title':'Akira', 'rating':5}
         ] 
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,Dangerous Minds,4.0
1,It Takes Two,2.5
2,Clueless,3.0
3,Pulp Fiction,2.0
4,Richard III,4.5
5,Dead Presidents,2.0
6,Babe,1.0
7,Akira,5.0


### Adding movieId to inputMovies dataFrame

In [12]:
# Extracting the user input movies data from movies dataframe
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
inputId

Unnamed: 0,movieId,title,genres,year
30,31,Dangerous Minds,[Drama],1995
32,34,Babe,"[Children, Drama]",1995
34,38,It Takes Two,"[Children, Comedy]",1995
35,39,Clueless,"[Comedy, Romance]",1995
37,41,Richard III,"[Drama, War]",1995
38,42,Dead Presidents,"[Action, Crime, Drama]",1995
257,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994
973,1274,Akira,"[Action, Adventure, Animation, Sci-Fi]",1988


In [13]:
# merging it so we can get the movieId
inputMovies = pd.merge(inputId, inputMovies)
inputMovies

Unnamed: 0,movieId,title,genres,year,rating
0,31,Dangerous Minds,[Drama],1995,4.0
1,34,Babe,"[Children, Drama]",1995,1.0
2,38,It Takes Two,"[Children, Comedy]",1995,2.5
3,39,Clueless,"[Comedy, Romance]",1995,3.0
4,41,Richard III,"[Drama, War]",1995,4.5
5,42,Dead Presidents,"[Action, Crime, Drama]",1995,2.0
6,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994,2.0
7,1274,Akira,"[Action, Adventure, Animation, Sci-Fi]",1988,5.0


In [14]:
# Dropping genres and year columns
inputMovies = inputMovies.drop('genres', 1).drop('year', 1)
inputMovies

Unnamed: 0,movieId,title,rating
0,31,Dangerous Minds,4.0
1,34,Babe,1.0
2,38,It Takes Two,2.5
3,39,Clueless,3.0
4,41,Richard III,4.5
5,42,Dead Presidents,2.0
6,296,Pulp Fiction,2.0
7,1274,Akira,5.0


In [15]:
# Extracting the user input movies data from moviesWithGenres dataframe
userMovies = moviesWithGenres_df[moviesWithGenres_df['movieId'].isin(inputMovies['movieId'].tolist())]
userMovies

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
30,31,Dangerous Minds,[Drama],1995,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32,34,Babe,"[Children, Drama]",1995,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34,38,It Takes Two,"[Children, Comedy]",1995,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35,39,Clueless,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37,41,Richard III,"[Drama, War]",1995,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
38,42,Dead Presidents,"[Action, Crime, Drama]",1995,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
257,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
973,1274,Akira,"[Action, Adventure, Animation, Sci-Fi]",1988,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
#Resetting the index
userMovies = userMovies.reset_index(drop=True)
userMovies

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,31,Dangerous Minds,[Drama],1995,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,34,Babe,"[Children, Drama]",1995,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,38,It Takes Two,"[Children, Comedy]",1995,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,39,Clueless,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,41,Richard III,"[Drama, War]",1995,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,42,Dead Presidents,"[Action, Crime, Drama]",1995,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1274,Akira,"[Action, Adventure, Animation, Sci-Fi]",1988,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Dropping movieId, title, genres and year columns
userGenreTable = userMovies.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)
userGenreTable

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Learning the input's preferences!

### Turning each genre into weights


In [18]:
inputMovies['rating']

0    4.0
1    1.0
2    2.5
3    3.0
4    4.5
5    2.0
6    2.0
7    5.0
Name: rating, dtype: float64

In [19]:
#Dot produt to get weights
userProfile = userGenreTable.transpose().dot(inputMovies['rating'])
#The user profile
userProfile

Adventure              5.0
Animation              5.0
Children               3.5
Comedy                 7.5
Fantasy                0.0
Romance                3.0
Drama                 13.5
Action                 7.0
Crime                  4.0
Thriller               2.0
Horror                 0.0
Mystery                0.0
Sci-Fi                 5.0
War                    4.5
Musical                0.0
Documentary            0.0
IMAX                   0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

In [20]:
# extracting the genre table from the original dataframe
moviesWithGenres_df


Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,"[Action, Animation, Comedy, Fantasy]",2017,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9738,193583,No Game No Life: Zero,"[Animation, Comedy, Fantasy]",2017,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9739,193585,Flint,[Drama],2017,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9740,193587,Bungo Stray Dogs: Dead Apple,"[Action, Animation]",2018,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# setting the index of the DataFrame to the 'movieId' column.
genreTable = moviesWithGenres_df.set_index(moviesWithGenres_df['movieId'])
genreTable

Unnamed: 0_level_0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,193581,Black Butler: Book of the Atlantic,"[Action, Animation, Comedy, Fantasy]",2017,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,193583,No Game No Life: Zero,"[Animation, Comedy, Fantasy]",2017,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,193585,Flint,[Drama],2017,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,193587,Bungo Stray Dogs: Dead Apple,"[Action, Animation]",2018,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
#And drop the unnecessary information
genreTable = genreTable.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)
genreTable.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
genreTable.shape

(9742, 20)

## taking the weighted average of every movie based on the input profile


In [24]:
#Multiply the genres by the weights and then take the weighted average
recommendationTable_df = ((genreTable*userProfile).sum(axis=1))/(userProfile.sum())
recommendationTable_df.head()

movieId
1    0.350000
2    0.141667
3    0.175000
4    0.400000
5    0.125000
dtype: float64

In [25]:
#Sort our recommendations in descending order
recommendationTable_df = recommendationTable_df.sort_values(ascending=False)
#Just a peek at the values
recommendationTable_df.head()

movieId
4719     0.700000
26236    0.675000
81132    0.650000
4956     0.633333
1907     0.625000
dtype: float64

# The recommendation table:


In [26]:
#The final recommendation table
movies_df.loc[movies_df['movieId'].isin(recommendationTable_df.head(20).keys())]

Unnamed: 0,movieId,title,genres,year
1076,1396,Sneakers,"[Action, Comedy, Crime, Drama, Sci-Fi]",1992
1390,1907,Mulan,"[Adventure, Animation, Children, Comedy, Drama...",1998
2174,2890,Three Kings,"[Action, Adventure, Comedy, Drama, War]",1999
3460,4719,Osmosis Jones,"[Action, Animation, Comedy, Crime, Drama, Roma...",2001
3608,4956,"Stunt Man, The","[Action, Adventure, Comedy, Drama, Romance, Th...",1980
4005,5657,Flashback,"[Action, Adventure, Comedy, Crime, Drama]",1990
4681,6990,The Great Train Robbery,"[Action, Adventure, Comedy, Crime, Drama]",1978
5476,26236,"White Sun of the Desert, The (Beloe solntse pu...","[Action, Adventure, Comedy, Drama, Romance, War]",1970
5556,26701,Patlabor: The Movie (Kidô keisatsu patorebâ: T...,"[Action, Animation, Crime, Drama, Film-Noir, M...",1989
6094,42015,Casanova,"[Action, Adventure, Comedy, Drama, Romance]",2005


# 1. Collaborative Recommender System

In [27]:
# The same userInput as previous section is used
userInput 

[{'title': 'Dangerous Minds', 'rating': 4},
 {'title': 'It Takes Two', 'rating': 2.5},
 {'title': 'Clueless', 'rating': 3},
 {'title': 'Pulp Fiction', 'rating': 2},
 {'title': 'Richard III', 'rating': 4.5},
 {'title': 'Dead Presidents', 'rating': 2},
 {'title': 'Babe', 'rating': 1},
 {'title': 'Akira', 'rating': 5}]

### Finding the users who has seen the same movies as the user


In [28]:
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
userSubset.head()

Unnamed: 0,userId,movieId,rating
16,1,296,3.0
261,3,31,0.5
320,4,296,1.0
518,5,34,4.0
520,5,39,3.0


In [29]:
#Group the userSubset by users
userSubsetGroup = userSubset.groupby(['userId'])

In [30]:
#Sorting userSubsetGroup so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)
userSubsetGroup[0:2]

[(474,
         userId  movieId  rating
  73109     474       31     3.0
  73111     474       34     4.5
  73113     474       38     1.0
  73114     474       39     3.5
  73115     474       41     3.5
  73172     474      296     4.0
  73466     474     1274     2.0),
 (372,
         userId  movieId  rating
  56137     372       31     2.0
  56139     372       34     4.0
  56141     372       39     3.0
  56142     372       41     4.0
  56180     372      296     4.0
  56313     372     1274     1.0)]

In [31]:
userSubsetGroup = userSubsetGroup[0:100]
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:
    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')
    #Get the N for the formula
    nRatings = len(group)
    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
    #Now let's calculate the pearson correlation between two users, so called, x and y
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    
    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [32]:
pearsonCorrelationDict.items()

dict_items([(474, -0.41643997118104653), (372, -0.6668859288553503), (414, -0.25182262437346997), (599, -0.09644670050761485), (117, 0.22315184957216902), (608, -0.1889822365046137), (6, 0.13171584244345805), (68, 0.8907337387831413), (91, 0.33092473234361874), (109, -0.2581988897471611), (181, -0.5129891760425771), (240, 0.0), (314, 0.8268106308031118), (387, -0.1472642081021448), (434, 0.4023739080814782), (470, -0.7255892438417318), (480, -0.09759000729485331), (561, -0.20459830184114206), (600, 0.3281650616569468), (603, -0.7475319302283772), (5, -0.5), (8, -1.0), (33, 0.866025403784439), (40, -0.9449111825230633), (57, 0), (58, -0.8660254037844355), (64, -0.8660254037844448), (84, -0.9449111825230703), (94, -0.8660254037844387), (95, 0.8660254037844402), (132, 0.5960395606792697), (144, 0.5), (156, -0.7205766921228921), (174, 0.0), (177, -1.0), (182, -0.3592106040535473), (191, -0.7205766921228921), (198, -0.277350098112614), (200, 0.8660254037844388), (217, -0.9449111825230678), 

In [33]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,-0.41644,474
1,-0.666886,372
2,-0.251823,414
3,-0.096447,599
4,0.223152,117


In [34]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
81,1.0,111
91,1.0,162
64,1.0,18
66,1.0,23
67,1.0,26


In [35]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,1.0,111,5,3.5
1,1.0,111,16,4.5
2,1.0,111,24,2.5
3,1.0,111,34,2.5
4,1.0,111,39,4.0


In [36]:
#Multiplies the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,1.0,111,5,3.5,3.5
1,1.0,111,16,4.5,4.5
2,1.0,111,24,2.5,2.5
3,1.0,111,34,2.5,2.5
4,1.0,111,39,4.0,4.0


In [37]:
#Applies a sum to the topUsers after grouping it up by userId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,12.193356,44.828914
2,7.837443,25.651402
3,2.343258,6.595938
4,1.270899,3.509431
5,6.026394,17.634208


In [38]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()
#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.676503,1
2,3.27293,2
3,2.814857,3
4,2.761377,4
5,2.926162,5


In [39]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1299,19.205531,1299
3503,17.324413,3503
1261,16.786098,1261
4915,15.589969,4915
3852,12.363591,3852
250,12.203775,250
569,12.203775,569
274,12.203775,274
3032,11.194335,3032
93,10.836479,93


In [40]:
movies_df.loc[movies_df['movieId'].isin(recommendation_df.head(10)['movieId'].tolist())]

Unnamed: 0,movieId,title,genres,year
82,93,Vampire in Brooklyn,"[Comedy, Horror, Romance]",1995
214,250,Heavyweights (Heavy Weights),"[Children, Comedy]",1995
236,274,Man of the House,[Comedy],1995
494,569,Little Big League,"[Comedy, Drama]",1994
960,1261,Evil Dead II (Dead by Dawn),"[Action, Comedy, Fantasy, Horror]",1987
997,1299,"Killing Fields, The","[Drama, War]",1984
2285,3032,"Omega Man, The","[Action, Drama, Sci-Fi, Thriller]",1971
2618,3503,Solaris (Solyaris),"[Drama, Mystery, Sci-Fi]",1972
2881,3852,"Tao of Steve, The",[Comedy],2000
3585,4915,"Beastmaster, The","[Action, Adventure, Fantasy]",1982
