# Acquiring the Data from MovieLens

In [1]:
!wget -O ml-25m.zip http://files.grouplens.org/datasets/movielens/ml-25m.zip
!unzip -o -j ml-25m.zip

--2020-05-22 15:20:57--  http://files.grouplens.org/datasets/movielens/ml-25m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 261978986 (250M) [application/zip]
Saving to: ‘ml-25m.zip’


2020-05-22 15:21:01 (78.8 MB/s) - ‘ml-25m.zip’ saved [261978986/261978986]

Archive:  ml-25m.zip
  inflating: tags.csv                
  inflating: links.csv               
  inflating: README.txt              
  inflating: ratings.csv             
  inflating: genome-tags.csv         
  inflating: genome-scores.csv       
  inflating: movies.csv              


# PreProcessing the Data

In [2]:
#Importing Data manipulation library
import pandas as pd
#Importing Array manipulation library
import numpy as np
#Importing Graphical plotting library
import matplotlib.pyplot as plt

In [3]:
#Storing the movie information into a pandas variable
movies_data = pd.read_csv('movies.csv')
#Storing the user information into a pandas variable
ratings_data = pd.read_csv('ratings.csv')

In [4]:
#Head is a function that gets the first N rows of a dataframe. N's default is 5.
print(movies_data.head(10))
print(ratings_data.head(10))

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   
5        6                         Heat (1995)   
6        7                      Sabrina (1995)   
7        8                 Tom and Huck (1995)   
8        9                 Sudden Death (1995)   
9       10                    GoldenEye (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
5                        Action|Crime|Thriller  
6                               Comedy|Romance  
7                           Adventure|Children  
8       

In [5]:
#Using regular expressions to find a year stored between parentheses
#We specify the parantheses so we don't conflict with movies that have years in their titles
movies_data['year'] = movies_data.title.str.extract('(\(\d\d\d\d\))',expand=False)
#Removing the parentheses
movies_data['year'] = movies_data.year.str.extract('(\d\d\d\d)',expand=False)
#Removing the years from the 'title' column
movies_data['title'] = movies_data.title.str.replace('(\(\d\d\d\d\))', '')
#Applying the strip function to get rid of any ending whitespace characters that may have appeared
movies_data['title'] = movies_data['title'].apply(lambda x: x.strip())
movies_data.head(10)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995
5,6,Heat,Action|Crime|Thriller,1995
6,7,Sabrina,Comedy|Romance,1995
7,8,Tom and Huck,Adventure|Children,1995
8,9,Sudden Death,Action,1995
9,10,GoldenEye,Action|Adventure|Thriller,1995


In [6]:
#Every genre is separated by a | so we simply have to call the split function on |
movies_data['genres'] = movies_data.genres.str.split('|')
movies_data.head(10)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995
5,6,Heat,"[Action, Crime, Thriller]",1995
6,7,Sabrina,"[Comedy, Romance]",1995
7,8,Tom and Huck,"[Adventure, Children]",1995
8,9,Sudden Death,[Action],1995
9,10,GoldenEye,"[Action, Adventure, Thriller]",1995


In [7]:
#Copying the movie dataframe into a new one since we won't need to use the genre information in our first case.
moviesWithGenres_data = movies_data.copy()

#For every row in the dataframe, iterate through the list of genres and place a 1 into the corresponding column
for index, row in movies_data.iterrows():
    for genre in row['genres']:
        moviesWithGenres_data.at[index, genre] = 1
#Filling in the NaN values with 0 to show that a movie doesn't have that column's genre
moviesWithGenres_data = moviesWithGenres_data.fillna(0)
moviesWithGenres_data.head(10)

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,6,Heat,"[Action, Crime, Thriller]",1995,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,7,Sabrina,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8,Tom and Huck,"[Adventure, Children]",1995,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,9,Sudden Death,[Action],1995,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,10,GoldenEye,"[Action, Adventure, Thriller]",1995,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
#Drop removes a specified row or column from a dataframe
ratings_data = ratings_data.drop('timestamp', 1)
ratings_data.head()

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


In [9]:
#Merge is a function that combines all the rows of the dataset with the specified dataframe.
data = ratings_data.merge(movies_data,on='movieId', how='left')
data.head(10)

Unnamed: 0,userId,movieId,rating,title,genres,year
0,1,296,5.0,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994
1,1,306,3.5,Three Colors: Red (Trois couleurs: Rouge),[Drama],1994
2,1,307,5.0,Three Colors: Blue (Trois couleurs: Bleu),[Drama],1993
3,1,665,5.0,Underground,"[Comedy, Drama, War]",1995
4,1,899,3.5,Singin' in the Rain,"[Comedy, Musical, Romance]",1952
5,1,1088,4.0,Dirty Dancing,"[Drama, Musical, Romance]",1987
6,1,1175,3.5,Delicatessen,"[Comedy, Drama, Romance]",1991
7,1,1217,3.5,Ran,"[Drama, War]",1985
8,1,1237,5.0,"Seventh Seal, The (Sjunde inseglet, Det)",[Drama],1957
9,1,1250,4.0,"Bridge on the River Kwai, The","[Adventure, Drama, War]",1957


# Content-Based Recommendation system

In [10]:
userInput = [
            {'title':'Fast & Furious (Fast and the Furious 4, The)', 'rating':4},
            {'title':'Jumanji: Welcome to the Jungle', 'rating':4.5},
            {'title':'Secret Superstar', 'rating':3},
            {'title':"Captain Marvel", 'rating':3.5},
            {'title':'Intern, The', 'rating':5}
         ] 
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,rating,title
0,4.0,"Fast & Furious (Fast and the Furious 4, The)"
1,4.5,Jumanji: Welcome to the Jungle
2,3.0,Secret Superstar
3,3.5,Captain Marvel
4,5.0,"Intern, The"


In [11]:
#Filtering out the movies by title
inputId = movies_data[movies_data['title'].isin(inputMovies['title'].tolist())]
#Then merging it so we can get the movieId. It's implicitly merging it by title.
inputMovies = pd.merge(inputId, inputMovies)
#Dropping information we won't use from the input dataframe
inputMovies = inputMovies.drop('genres', 1).drop('year', 1)
#Final input dataframe
#If a movie you added in above isn't here, then it might not be in the original 
#dataframe or it might spelled differently, please check capitalisation.
inputMovies

Unnamed: 0,movieId,title,rating
0,67923,"Fast & Furious (Fast and the Furious 4, The)",4.0
1,79022,"Intern, The",5.0
2,122910,Captain Marvel,3.5
3,179397,Secret Superstar,3.0
4,179401,Jumanji: Welcome to the Jungle,4.5


In [12]:
#Filtering out the movies from the input
userMovies = moviesWithGenres_data[moviesWithGenres_data['movieId'].isin(inputMovies['movieId'].tolist())]
userMovies

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
13226,67923,"Fast & Furious (Fast and the Furious 4, The)","[Action, Crime, Drama, Thriller]",2009,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14914,79022,"Intern, The",[Comedy],2000,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25066,122910,Captain Marvel,"[Action, Adventure, Sci-Fi]",2018,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49685,179397,Secret Superstar,[Drama],2017,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49687,179401,Jumanji: Welcome to the Jungle,"[Action, Adventure, Children]",2017,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
#Resetting the index to avoid future issues
userMovies = userMovies.reset_index(drop=True)
#Dropping unnecessary issues due to save memory and to avoid issues
userGenreTable = userMovies.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)
userGenreTable

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
inputMovies['rating']

0    4.0
1    5.0
2    3.5
3    3.0
4    4.5
Name: rating, dtype: float64

In [15]:
#Dot produt to get weights
userProfile = userGenreTable.transpose().dot(inputMovies['rating'])
#The user profile
userProfile

Adventure              8.0
Animation              0.0
Children               4.5
Comedy                 5.0
Fantasy                0.0
Romance                0.0
Drama                  7.0
Action                12.0
Crime                  4.0
Thriller               4.0
Horror                 0.0
Mystery                0.0
Sci-Fi                 3.5
IMAX                   0.0
Documentary            0.0
War                    0.0
Musical                0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

In [16]:
#Now let's get the genres of every movie in our original dataframe
genreTable = moviesWithGenres_data.set_index(moviesWithGenres_data['movieId'])
#And drop the unnecessary information
genreTable = genreTable.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)
genreTable.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
genreTable.shape

(62423, 20)

In [18]:
#Multiply the genres by the weights and then take the weighted average
recommendationTable_data = ((genreTable*userProfile).sum(axis=1))/(userProfile.sum())
recommendationTable_data.head()

movieId
1    0.364583
2    0.260417
3    0.104167
4    0.250000
5    0.104167
dtype: float64

In [19]:
#Sort our recommendations in descending order
recommendationTable_data = recommendationTable_data.sort_values(ascending=False)
#Just a peek at the values
recommendationTable_data.head()

movieId
144324    0.833333
122787    0.833333
81132     0.833333
64645     0.833333
115479    0.802083
dtype: float64

In [20]:
#The final recommendation table
movies_data.loc[movies_data['movieId'].isin(recommendationTable_data.head(20).keys())]

Unnamed: 0,movieId,title,genres,year
4850,4956,"Stunt Man, The","[Action, Adventure, Comedy, Drama, Romance, Th...",1980
5546,5657,Flashback,"[Action, Adventure, Comedy, Crime, Drama]",1990
6865,6990,The Great Train Robbery,"[Action, Adventure, Comedy, Crime, Drama]",1978
9315,27735,Unstoppable,"[Action, Adventure, Comedy, Drama, Thriller]",2004
11829,55116,"Hunting Party, The","[Action, Adventure, Comedy, Drama, Thriller]",2007
12879,64645,The Wrecking Crew,"[Action, Adventure, Comedy, Crime, Drama, Thri...",1968
13238,68033,Vigilante Force,"[Action, Adventure, Crime, Drama, Thriller]",1976
13380,69095,Graduation,"[Action, Adventure, Comedy, Crime, Drama]",2007
15389,81132,Rubber,"[Action, Adventure, Comedy, Crime, Drama, Film...",2010
15793,83266,Kaho Naa... Pyaar Hai,"[Action, Adventure, Comedy, Drama, Mystery, Ro...",2000


# Advantages and Disadvantages of Content-Based Filtering
## Advantages
    Learns user's preferences
    Highly personalized for the user
## Disadvantages
    Doesn't take into account what others think of the item, so low quality item recommendations might happen
    Extracting data is not always intuitive
    Determining what characteristics of the item the user dislikes or likes is not always obvious

# Collaborative Filtering based Recommendation System

In [21]:
#Filtering out users that have watched movies that the input has watched and storing it
userSubset = ratings_data[ratings_data['movieId'].isin(inputMovies['movieId'].tolist())]
userSubset.head()

Unnamed: 0,userId,movieId,rating
683,3,67923,3.5
1140,4,179401,3.5
6289,44,67923,4.5
6754,51,179401,4.0
14751,114,179401,3.5


In [22]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])

In [23]:
userSubsetGroup.get_group(114)

Unnamed: 0,userId,movieId,rating
14751,114,179401,3.5


In [24]:
#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)

In [25]:
userSubsetGroup[0:3]

[(6115,         userId  movieId  rating
  909126    6115    67923     3.5
  909480    6115   122910     4.5
  909633    6115   179397     3.5
  909634    6115   179401     3.5), (123832,           userId  movieId  rating
  19097191  123832    67923     3.0
  19097451  123832   122910     4.0
  19097752  123832   179397     4.0
  19097753  123832   179401     3.5), (130333,
            userId  movieId  rating
  20049601  130333    67923     4.5
  20050313  130333   122910     2.0
  20050995  130333   179397     0.5
  20050996  130333   179401     3.5)]

In [26]:
userSubsetGroup = userSubsetGroup[0:100]

In [27]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:
    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')
    #Get the N for the formula
    nRatings = len(group)
    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
    #Now let's calculate the pearson correlation between two users, so called, x and y
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    
    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        val = (Sxx*Syy)**(1/2)
        pearsonCorrelationDict[name] = Sxy/(val)
    else:
        pearsonCorrelationDict[name] = 0

In [28]:
pearsonCorrelationDict.items()

dict_items([(6115, -0.2581988897471611), (123832, -0.674199862463242), (130333, 0.8483677805978151), (997, 0), (1977, -0.9538209664765325), (4019, -0.9819805060619666), (4429, -0.1889822365046136), (4515, 0), (4642, 0.5), (5694, 0), (6184, 0), (6729, 0.0), (8476, 0.0), (9770, 0.0), (10502, 0.8660254037844448), (11005, 0.8660254037844387), (12552, 0.0), (13134, 0.0), (16774, 0.8660254037844355), (17669, 0), (18265, 0.8660254037844448), (18434, 0.8660254037844448), (18500, 1.0), (18580, -0.5), (19379, 0.0), (19475, -0.9933992677987827), (20068, 0), (20302, -0.8660254037844402), (20917, 0.5), (22241, 0.0), (22349, 0.8660254037844402), (22840, 0.0), (24523, 0.8660254037844402), (24906, 0.5), (25716, 0.8660254037844448), (28812, 0.8660254037844355), (30643, -0.32732683535398843), (32922, -0.1889822365046136), (33069, 0.8660254037844355), (35734, 0.8660254037844448), (37246, 0), (37438, 1.0), (37635, -0.8660254037844448), (38588, -1.0), (38672, 0.0), (38806, 0.0), (40006, 0.8660254037844448)

In [29]:
pearsonData = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonData.columns = ['similarityIndex']
pearsonData['userId'] = pearsonData.index
pearsonData.index = range(len(pearsonData))
pearsonData.head()

Unnamed: 0,similarityIndex,userId
0,-0.258199,6115
1,-0.6742,123832
2,0.848368,130333
3,0.0,997
4,-0.953821,1977


In [30]:
topUsers=pearsonData.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
22,1.0,18500
84,1.0,81694
48,1.0,45948
41,1.0,37438
71,0.981981,66315


In [31]:
topUsersRating=topUsers.merge(ratings_data, left_on='userId', right_on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,1.0,18500,1,4.5
1,1.0,18500,2,3.0
2,1.0,18500,6,4.5
3,1.0,18500,9,3.0
4,1.0,18500,16,4.0


In [32]:
#Multiplies the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,1.0,18500,1,4.5,4.5
1,1.0,18500,2,3.0,3.0
2,1.0,18500,6,4.5,4.5
3,1.0,18500,9,3.0,3.0
4,1.0,18500,16,4.0,4.0


In [33]:
#Applies a sum to the topUsers after grouping it up by userId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,27.056808,108.27377
2,19.817809,65.820656
3,3.155066,8.917317
4,0.240192,0.120096
5,3.387478,8.199806


In [34]:
#Creates an empty dataframe
recommendation_data = pd.DataFrame()
#Now we take the weighted average
recommendation_data['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_data['movieId'] = tempTopUsersRating.index
recommendation_data.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.00172,1
2,3.321288,2
3,2.826349,3
4,0.5,4
5,2.420622,5


In [35]:
recommendation_data = recommendation_data.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_data.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
82037,5.0,82037
2677,5.0,2677
96563,5.0,96563
101971,5.0,101971
101648,5.0,101648
100540,5.0,100540
99549,5.0,99549
168846,5.0,168846
99045,5.0,99045
196115,5.0,196115


In [36]:
movies_data.loc[movies_data['movieId'].isin(recommendation_data.head(10)['movieId'].tolist())]

Unnamed: 0,movieId,title,genres,year
2585,2677,Buena Vista Social Club,"[Documentary, Musical]",1999
15575,82037,"Tillman Story, The",[Documentary],2010
18436,96563,Paradise Lost 3: Purgatory,[Documentary],2011
19004,99045,Aftershock (Tangshan dadizhen),"[Drama, IMAX]",2010
19106,99549,Mansome,[Documentary],2012
19361,100540,Bronies: The Extremely Unexpected Adult Fans o...,[Documentary],2012
19557,101648,"Flat, The",[Documentary],2011
19643,101971,Never Sleep Again: The Elm Street Legacy,[Documentary],2010
44728,168846,Neal Brennan: 3 Mics,[Comedy],2017
57325,196115,Prosecuting Evil: The Extraordinary World of B...,[Documentary],2018


# Advantages and Disadvantages of Collaborative Filtering
## Advantages
    Takes other user's ratings into consideration
    Doesn't need to study or extract information from the recommended item
    Adapts to the user's interests which might change over time
## Disadvantages
    Approximation function can be slow
    There might be a low of amount of users to approximate
    Privacy issues when trying to learn the user's preferences