In [1]:
# This Lab has been implemented using 2 methods
# This is a recommender system for IMDB movies
# Documentation: Masoud Salehi
# Author: Saeed Aghabozorgi
# © IBM Corporation 2020.

In [2]:
# First Method
# Documentation
# Collaborative Filtering (User-Based)
# We have a table like the following one, listing the people who have seen and rated some movies:
#-----------------------------Rating Matrix-----------------------------
#-------------------------------MovieIds--------------------------------
# PersonIds      Movie1     Movie2      Movie3      Movie4       Movie5 
# Person1       rating:9   rating:6    rating:8    rating:4     Not seen
# Person2       rating:2   rating:10   rating:6    Not seen     rating:8
# Person3       rating:5   rating:9    Not seen    rating:10    rating:7
# Person4       rating:?   rating:10   rating:7    rating:8     rating:?

# We want to recommend a movie to Person4. Which movie would be the better recommendation: Movie1 or Movie5?
# Since we are using the user-based collaborative filtering method, we make the recommendation according to
# the similarity between Person4 and the other people.

# Assume we know how similar every other person is to Person4:

#-----------Similarity Matrix----------
# PersonIds      Similarity to Person4
# Person1               0.4
# Person2               0.9
# Person3               0.7

#--------Weighted Ratings Matrix--------------
# PersonIds        Movie1           Movie5 
# Person1       9 * 0.4 = 3.6      Not seen
# Person2       2 * 0.9 = 1.8    8 * 0.9 = 7.2
# Person3       5 * 0.7 = 3.5    7 * 0.7 = 4.9

#----------------Sum of the Weighted Ratings Matrix--------------
# Each sum is divided by the total similarity of the people who rated that movie.
# PersonIds              Movie1                       Movie5
# Person2,3       ---------------------          7.2 + 4.9 = 12.1
#                                                 (/(0.9+0.7))
# Person1,2,3     1.8 + 3.5 + 3.6 = 8.9          ----------------
#                  (/(0.4+0.9+0.7))

#---------------Recommendation Matrix-------
# MovieId     Movie1              Movie5
# Score       8.9/2.0 = 4.45      12.1/1.6 = 7.56

# Therefore we recommend Movie5 to Person4


In [3]:
import pandas as pd
#Math functions, we'll only need the sqrt function so let's import only that
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Load the movie catalogue from the local CSV file; the preview below shows its
# movieId, title and genres columns.
movies = pd.read_csv('movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Read the ratings table once and bind it to two names: `ratings` is the working
# name for the collaborative-filtering steps, `ratings_df` keeps a handle on the
# frame exactly as it was loaded.
ratings_df = ratings = pd.read_csv('ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
# Drop the timestamp column; the recommender does not use it.
# `columns=` replaces the positional axis argument (`drop('timestamp', 1)`), which
# pandas deprecated and which pandas 2.x rejects.
ratings = ratings.drop(columns='timestamp')
ratings.head()

  ratings = ratings.drop('timestamp', 1)


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [7]:
# Split titles such as "Toy Story (1995)" into a clean title and a separate `year` column.
# Raw strings keep the regex backslashes intact (no invalid-escape warnings).
# The capture group returns only the four digits of the first "(dddd)" found in the
# title; titles without such a pattern get NaN.
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)', expand=False)
# regex=True is required: newer pandas treats the pattern as a literal string by
# default, which would leave the "(1995)" suffix in the title.
movies['title'] = movies['title'].str.replace(r'\(\d{4}\)', '', regex=True)
# Trim the whitespace left behind where the year used to be.
movies['title'] = movies['title'].str.strip()

  movies['title'] = movies.title.str.replace('(\(\d\d\d\d\))', '')


In [8]:
# Keep a second name for the current movies frame (it still has the `genres` column);
# the content-based method further down works from `movies_df`.
movies_df = movies

In [9]:
# The collaborative-filtering method does not use genres, so drop that column here.
# `drop` returns a new frame, so `movies_df` still refers to the frame that has genres.
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
movies = movies.drop(columns='genres')
movies.head()

  movies = movies.drop('genres', 1)


Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [10]:
# The (hypothetical) input user: the movies they have seen and the rating given to each.
# The records are kept as a list of {'title', 'rating'} dicts and then loaded into a DataFrame.
userInput = [
    {'title': movie_title, 'rating': movie_rating}
    for movie_title, movie_rating in (
        ('Breakfast Club, The', 5),
        ('Toy Story', 3.5),
        ('Jumanji', 2),
        ("Pulp Fiction", 5),
        ('Akira', 4.5),
    )
]
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [11]:
# inputMovies only holds titles, not movie IDs.
# Select the catalogue rows whose title appears in inputMovies so the IDs can be attached.
# Matching is by title only, so every catalogue row with one of these titles is selected.
inputId = movies.loc[movies['title'].isin(inputMovies['title'])]
inputId.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
257,296,Pulp Fiction,1994
973,1274,Akira,1988
1445,1968,"Breakfast Club, The",1985


In [12]:
# Join the catalogue rows with the user's ratings. No key is given, so the merge
# uses the columns the two frames have in common; the result carries the catalogue
# columns plus the user's rating.
inputMovies = inputId.merge(inputMovies)
inputMovies.head()

Unnamed: 0,movieId,title,year,rating
0,1,Toy Story,1995,3.5
1,2,Jumanji,1995,2.0
2,296,Pulp Fiction,1994,5.0
3,1274,Akira,1988,4.5
4,1968,"Breakfast Club, The",1985,5.0


In [13]:
# The production year is not needed for the recommendation, so drop it.
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
inputMovies = inputMovies.drop(columns='year')

  inputMovies = inputMovies.drop('year', 1)


In [14]:
# Preview the input user's movies now that the year column has been dropped.
inputMovies.head()

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


In [15]:
# Keep only the ratings of movies that the input user has also rated.
# The result shows which users rated the same movies as the input user.
userSubset = ratings.loc[ratings['movieId'].isin(inputMovies['movieId'])]
userSubset.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
16,1,296,3.0
320,4,296,1.0
422,4,1968,4.0
516,5,1,4.0


In [16]:
# Group the ratings by user, so each group holds one user's ratings of the input movies.
# Group on the bare column name rather than ['userId']: with a one-element list, newer
# pandas yields 1-tuples such as (91,) as group keys during iteration. Those tuples
# would end up as the keys of the similarity dictionary and would no longer match
# the integer userId values when merging with `ratings`.
userSubsetGroup = userSubset.groupby('userId')
userSubsetGroup.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
16,1,296,3.0
320,4,296,1.0
422,4,1968,4.0
516,5,1,4.0
...,...,...,...
99510,609,296,4.0
99534,610,1,5.0
99552,610,296,5.0
99636,610,1274,5.0


In [17]:
# Order the users by how many of the input movies they have rated, in DESCENDING order:
# the more movies a user shares with the input user, the earlier they appear.
# Materialising the GroupBy gives a list of (userId, group) pairs, which is then sorted.
userSubsetGroup = list(userSubsetGroup)
userSubsetGroup.sort(key=lambda pair: len(pair[1]), reverse=True)
userSubsetGroup[0:3]

[(91,
         userId  movieId  rating
  14121      91        1     4.0
  14122      91        2     3.0
  14173      91      296     4.5
  14316      91     1274     5.0
  14383      91     1968     3.0),
 (177,
         userId  movieId  rating
  24900     177        1     5.0
  24901     177        2     3.5
  24930     177      296     5.0
  25069     177     1274     2.0
  25129     177     1968     3.5),
 (219,
         userId  movieId  rating
  31524     219        1     3.5
  31525     219        2     2.5
  31554     219      296     4.0
  31628     219     1274     2.5
  31680     219     1968     3.0)]

In [18]:
# Build the ***similarity matrix***: the Pearson correlation between the input user's
# ratings and each other user's ratings, computed over the movies that user has rated
# among the input movies. The result maps each group key (user id) to its correlation.
pearsonCorrelationDict = {}

# Sorting the input movies by movieId does not depend on the loop variable, so do it once.
inputMovies = inputMovies.sort_values(by='movieId')

# For every (user id, ratings) pair in our subset
for name, group in userSubsetGroup:
    # Sort this user's ratings by movieId so both rating lists follow the same movie order.
    group = group.sort_values(by='movieId')
    # How many movies are there in the current group?
    nRatings = len(group)
    # Input-user rows for the movies that are also in the current group.
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    # x: the input user's ratings for those movies.
    tempRatingList = temp_df['rating'].tolist()
    # y: the current user's ratings for the same movies.
    tempGroupList = group['rating'].tolist()

    # Pearson correlation r = Sxy / sqrt(Sxx * Syy), using the sum-of-squares formulas.
    Sxx = sum(i**2 for i in tempRatingList) - pow(sum(tempRatingList), 2)/float(nRatings)
    Syy = sum(i**2 for i in tempGroupList) - pow(sum(tempGroupList), 2)/float(nRatings)
    Sxy = sum(i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    # The correlation is undefined when either rating list has no variance (this includes
    # users sharing a single movie); store 0 in that case.
    # `> 0` instead of `!= 0`: Sxx and Syy cannot be negative mathematically, so a negative
    # value can only be floating-point residue and must not reach sqrt().
    if Sxx > 0 and Syy > 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [19]:
# Display the similarity dictionary built by the loop above (one entry per group key).
pearsonCorrelationDict

{91: 0.43852900965351443,
 177: 0.0,
 219: 0.45124262819713973,
 274: 0.716114874039432,
 298: 0.9592712306918567,
 414: 0.9376144618769914,
 474: 0.11720180773462392,
 477: 0.4385290096535153,
 480: 0.7844645405527362,
 483: 0.08006407690254357,
 599: 0.7666866491579839,
 608: 0.920736884379251,
 50: 0.15713484026367722,
 57: -0.7385489458759964,
 68: 0.0,
 103: 0.5222329678670935,
 135: 0.8703882797784892,
 182: 0.9428090415820635,
 202: 0.5222329678670935,
 217: 0.30151134457776363,
 226: 0.9438798074485389,
 288: 0.6005325641789633,
 307: 0.9655810287305759,
 318: 0.44486512077567225,
 322: 0.5057805388588731,
 330: 0.9035942578600878,
 357: 0.5606119105813882,
 434: 0.9864036607532465,
 448: 0.30151134457776363,
 469: 0.8164965809277261,
 561: 0.5222329678670935,
 600: 0.18442777839082938,
 606: 0.9146591207600472,
 610: -0.47140452079103173,
 18: 1.0,
 19: -0.5,
 21: 0,
 45: 0.5000000000000009,
 63: -0.4999999999999982,
 64: 0.0,
 66: 0.5000000000000009,
 107: -1.0,
 122: 0.86602

In [20]:
# Similarity Matrix
# Turn the {userId: correlation} dictionary into a two-column table
# (similarityIndex, userId) with a fresh 0..n-1 row index.
pearsonDF = (
    pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
    .rename(columns={0: 'similarityIndex'})
    .rename_axis('userId')
    .reset_index()
    [['similarityIndex', 'userId']]
)
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.438529,91
1,0.0,177
2,0.451243,219
3,0.716115,274
4,0.959271,298


In [21]:
# Keep the 50 users with the highest similarity to the input user.
topUsers = pearsonDF.sort_values('similarityIndex', ascending=False).iloc[:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
43,1.0,132
181,1.0,382
219,1.0,602
130,1.0,130
129,1.0,125


In [22]:
# Attach every rating made by the top users (movieId and rating columns).
# This table is the basis for the Weighted Ratings Matrix.
topUsersRating = pd.merge(topUsers, ratings, on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,1.0,132,1,2.0
1,1.0,132,17,3.0
2,1.0,132,29,2.0
3,1.0,132,32,3.0
4,1.0,132,34,1.5


In [23]:
# Weighted Ratings Matrix
# Each rating is weighted by the similarity of the user who gave it.
topUsersRating['weightedRating'] = topUsersRating['similarityIndex'].mul(topUsersRating['rating'])
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,1.0,132,1,2.0,2.0
1,1.0,132,17,3.0,3.0
2,1.0,132,29,2.0,2.0
3,1.0,132,32,3.0,3.0
4,1.0,132,34,1.5,1.5


In [24]:
# Sum of the Weighted Ratings Matrix
# Per movie: the total similarity of the users who rated it and the total of their
# weighted ratings. Both are needed for the recommendation score in the next step.
tempTopUsersRating = (
    topUsersRating
    .groupby('movieId')[['similarityIndex', 'weightedRating']]
    .sum()
    .rename(columns={
        'similarityIndex': 'sum_similarityIndex',
        'weightedRating': 'sum_weightedRating',
    })
)
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,36.0,124.0
2,18.0,58.0
3,3.0,11.0
5,3.0,8.5
6,13.0,49.5


In [25]:
# Recommendation Matrix
# Score per movie = sum of weighted ratings / sum of similarities, i.e. a weighted average.
# The frame is indexed by movieId and also carries movieId as a regular column.
recommendation_df1 = (
    tempTopUsersRating['sum_weightedRating'] / tempTopUsersRating['sum_similarityIndex']
).to_frame('weighted average recommendation score')
recommendation_df1['movieId'] = tempTopUsersRating.index
recommendation_df1.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.444444,1
2,3.222222,2
3,3.666667,3
5,2.833333,5
6,3.807692,6


In [26]:
# Second Method
# Content-Based
# Documentation
#-----The input user has rated the movies as shown below----
# Step 1:          Multiply Genres by Ratings
# MovieIds      Rating    MovieIds   Comedy  Adventure  Super-Hero Sci-Fi   MovieIds   Comedy  Adventure  Super-Hero Sci-Fi
# Movie1          2        Movie1       0       1           1        0       Movie1       0       2           2        0
# Movie2          10    *  Movie2       1       1           1        1   =   Movie2       10      10          10       10
# Movie3          8        Movie3       1       0           1        0       Movie3       8       0           8        0
#                                                                            Sum of the Scores = 2+2+10+10+10+10+8+8 = 60
# Step 2:  User Profile:  Comedy           Adventure      Super-Hero        Sci-Fi
#                     (8+10)/60=0.3     (10+2)/60=0.2    (8+10+2)/60=0.33  10/60=0.16
# Step 3:  Multiply each candidate movie's genre flags by the user profile and sum each row
#--------------Weighted Average Matrix-----------
# MovieIds   Comedy  Adventure  Super-Hero Sci-Fi                                      MovieIds      Sum of the rows
# Movie1      0.3     0.2         0        0.16                                        Movie1         0.66
# Movie2      0       0           0.33     0    -----> Recommendation Matrix           Movie2         0.33
# Movie3      0.3     0           0.33     0                                           Movie3         0.63

In [27]:
# Preview the movies frame that still has the genres column (genres as a '|'-separated string).
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [28]:
# Turn the '|'-separated genre string of every movie into a list of genre names.
# Only string values are split: if this cell is run a second time the column already
# holds lists, and `.str.split` on lists would replace every entry with NaN.
movies_df['genres'] = movies_df['genres'].apply(
    lambda genres: genres.split('|') if isinstance(genres, str) else genres
)
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [29]:
# Work on a copy so that movies_df itself keeps its original columns.
moviesWithGenres_df = movies_df.copy()

# One indicator column per genre: walk each movie's genre list and put a 1 in the
# column named after the genre (the column is created the first time a genre is seen).
for movie_idx, genre_list in movies_df['genres'].items():
    for genre_name in genre_list:
        moviesWithGenres_df.at[movie_idx, genre_name] = 1
# Cells that were never set are NaN; replace them with 0 ("movie does not have this genre").
# fillna(0) is applied to the whole frame, so a missing value in any other column
# (e.g. `year`) becomes 0 as well.
moviesWithGenres_df = moviesWithGenres_df.fillna(0)
moviesWithGenres_df.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Preview the ratings frame as loaded (it still includes the timestamp column).
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [31]:
# Same input user as in the first method. inputMovies is rebuilt from scratch here
# because the first method replaced it with a merged frame.
userInput = [
    {'title': movie_title, 'rating': movie_rating}
    for movie_title, movie_rating in (
        ('Breakfast Club, The', 5),
        ('Toy Story', 3.5),
        ('Jumanji', 2),
        ("Pulp Fiction", 5),
        ('Akira', 4.5),
    )
]
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [32]:
# Look up the catalogue rows for the movies the user has rated (matched by title),
# attach the user's ratings, and keep only movieId, title and rating.
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
# `columns=` replaces the chained positional-axis drops, which pandas 2.x rejects.
inputMovies = inputMovies.drop(columns=['genres', 'year'])
inputMovies

  inputMovies = inputMovies.drop('genres', 1).drop('year', 1)


Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


In [33]:
# Genre-indicator rows for just the movies that are in inputMovies.
userMovies = moviesWithGenres_df.loc[moviesWithGenres_df['movieId'].isin(inputMovies['movieId'])]
userMovies

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
257,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
973,1274,Akira,"[Action, Adventure, Animation, Sci-Fi]",1988,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1445,1968,"Breakfast Club, The","[Comedy, Drama]",1985,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Reset the row index to 0..n-1 so it lines up with the index of inputMovies['rating']
# when the two are combined in the dot product below.
userMovies = userMovies.reset_index(drop=True)
# Drop the non-genre columns so that only the genre indicator columns remain.
# `columns=` replaces the chained positional-axis drops, which pandas 2.x rejects.
userGenreTable = userMovies.drop(columns=['movieId', 'title', 'genres', 'year'])
userGenreTable

  userGenreTable = userMovies.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)


Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# The input user's ratings, in the same row order as inputMovies.
inputMovies['rating']

0    3.5
1    2.0
2    5.0
3    4.5
4    5.0
Name: rating, dtype: float64

In [36]:
#---------------------Step One----Multiply Genres by Ratings---------------
# Dot product of the (genre x movie) table with the user's ratings: for every genre,
# the sum of the ratings of the user's movies that have that genre.
userProfile = userGenreTable.T.dot(inputMovies['rating'])
# The user profile: one weight per genre
userProfile

Adventure             10.0
Animation              8.0
Children               5.5
Comedy                13.5
Fantasy                5.5
Romance                0.0
Drama                 10.0
Action                 4.5
Crime                  5.0
Thriller               5.0
Horror                 0.0
Mystery                0.0
Sci-Fi                 4.5
War                    0.0
Musical                0.0
Documentary            0.0
IMAX                   0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

In [37]:
# Genre indicators for every movie in the catalogue, indexed by movieId.
# set_index('movieId') moves the column into the index, so only the remaining
# non-genre columns need dropping. `columns=` replaces the chained positional-axis
# drops, which pandas 2.x rejects.
genreTable = (
    moviesWithGenres_df
    .set_index('movieId')
    .drop(columns=['title', 'genres', 'year'])
)
genreTable.head()

  genreTable = genreTable.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)


Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# (rows, columns) of the genre table: one row per movie, one column per genre indicator.
genreTable.shape

(9742, 20)

In [39]:
# Steps 2 and 3
# Weight every movie's genre indicators by the user profile, add them up per movie,
# and normalise by the total profile weight. The result is a Series of scores
# indexed by movieId.
recommendation_df2 = (
    genreTable
    .mul(userProfile, axis='columns')
    .sum(axis=1)
    .div(userProfile.sum())
)
recommendation_df2.head()

movieId
1    0.594406
2    0.293706
3    0.188811
4    0.328671
5    0.188811
dtype: float64

In [40]:
# Sort the recommendation scores in descending order, so the best-matching movieIds come first.
recommendation_df2 = recommendation_df2.sort_values(ascending=False)
# Peek at the top of the ranking (index = movieId, value = score).
recommendation_df2.head()

movieId
134853    0.734266
148775    0.685315
117646    0.678322
6902      0.678322
81132     0.671329
dtype: float64