In [None]:
# Download the movie dataset archive and save it as moviedataset.zip (-O names the output file).
!wget -O moviedataset.zip https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/moviedataset.zip
print('unziping ...')
# Unpack into the current directory: -o overwrites existing files without prompting,
# -j discards the directory structure stored inside the archive.
!unzip -o -j moviedataset.zip 

--2020-07-17 19:30:28--  https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/moviedataset.zip
Resolving s3-api.us-geo.objectstorage.softlayer.net (s3-api.us-geo.objectstorage.softlayer.net)... 67.228.254.196
Connecting to s3-api.us-geo.objectstorage.softlayer.net (s3-api.us-geo.objectstorage.softlayer.net)|67.228.254.196|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 160301210 (153M) [application/zip]
Saving to: ‘moviedataset.zip’


2020-07-17 19:30:34 (30.2 MB/s) - ‘moviedataset.zip’ saved [160301210/160301210]

unziping ...
Archive:  moviedataset.zip
  inflating: links.csv               
  inflating: movies.csv              
  inflating: ratings.csv             
  inflating: README.txt              
  inflating: tags.csv                


**DATA PREPROCESSING**

In [None]:
import pandas as pd
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
# Load the movie metadata and the user ratings from the CSV files in the
# working directory, then preview the first rows of the movies table.
movies=pd.read_csv('movies.csv')
rating=pd.read_csv('ratings.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Preview the first rows of the ratings table loaded from ratings.csv.
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [None]:
# Move the release year out of the title into its own 'year' column.
# The capture group sits inside the literal parentheses, so 'year' holds just
# the four digits (NaN when a title carries no "(YYYY)" suffix).
# Raw strings keep the regex escapes literal, and regex=True is explicit
# because pandas >= 2.0 treats str.replace patterns as plain text by default.
# NOTE: re-running this cell after the titles were stripped would overwrite
# 'year' with NaN; reload movies.csv first if it has to be re-executed.
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)', expand=False)
movies['title'] = movies['title'].str.replace(r'\(\d{4}\)', '', regex=True).str.strip()
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [None]:
# Drop the 'genres' column; it is not used by the steps below.
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
movies = movies.drop(columns='genres')
movies.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [None]:
# Drop the 'timestamp' column from the ratings; it is not used by the steps below.
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
rating = rating.drop(columns='timestamp')
rating.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


**COLLABORATIVE FILTERING - USER FILTERING**

In [None]:
# Ratings supplied by the input user, as (title, rating) pairs.
ratedTitles = [
    ('Breakfast Club, The', 5),
    ('Toy Story', 3.5),
    ('Jumanji', 2),
    ("Pulp Fiction", 5),
    ('Akira', 4.5),
]
# One record per rated movie, then a two-column frame (title, rating).
userInput = [{'title': title, 'rating': score} for title, score in ratedTitles]
user = pd.DataFrame(userInput)
user.head()

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [None]:
# Look up the catalogue rows whose title the input user rated, then attach the
# user's ratings by merging on the columns the two frames share.
inpId = movies[movies['title'].isin(user['title'].tolist())]
user = pd.merge(inpId, user)
# 'year' is not needed for the similarity computation.
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
user = user.drop(columns='year')
user.head()

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


In [None]:
# Ratings given by any user to at least one of the input user's movies.
# The subset already carries every rating column, so no merge back onto the
# full `rating` table is needed; reset_index gives it a clean 0..n-1 index.
userSub = rating[rating['movieId'].isin(user['movieId'].tolist())].reset_index(drop=True)
userSub.head()

Unnamed: 0,userId,movieId,rating
0,4,296,4.0
1,12,1968,3.0
2,13,2,2.0
3,13,1274,5.0
4,14,296,2.0


In [None]:
# Group the rows by userId.
# A scalar key (not a one-element list) keeps every group name a plain userId;
# with a list, pandas >= 2.0 yields 1-tuples when the groups are iterated,
# which would turn the similarity keys built later into tuples.
userGp = userSub.groupby('userId')
# Spot-check the rows of one user.
userGp.get_group(1130)

Unnamed: 0,userId,movieId,rating
900,1130,1,0.5
901,1130,2,4.0
902,1130,296,4.0
903,1130,1274,4.5
904,1130,1968,4.5


In [None]:
# Order the (userId, ratings) pairs so that users sharing the most movies
# with the input user come first.
userGp = list(userGp)
userGp.sort(key=lambda entry: entry[1].shape[0], reverse=True)
userGp[:3]

[(75,     userId  movieId  rating
  54      75        1     5.0
  55      75        2     3.5
  56      75      296     5.0
  57      75     1274     4.5
  58      75     1968     5.0), (106,     userId  movieId  rating
  72     106        1     2.5
  73     106        2     3.0
  74     106      296     3.5
  75     106     1274     3.0
  76     106     1968     3.5), (686,      userId  movieId  rating
  538     686        1     4.0
  539     686        2     3.0
  540     686      296     4.0
  541     686     1274     4.0
  542     686     1968     5.0)]

**Similarities of users to input user**

In [None]:
# Keep only the first 100 groups to bound the time and memory spent below.
userGp = userGp[:100]

In [None]:
# Pearson correlation between the input user and each candidate user, computed
# over the candidate's ratings of the input user's movies.
# Result: pc maps each group name (the userId) to its correlation.
pc = {}
# Sorted once, outside the loop: the input user's ratings do not change per candidate.
inputMovies = user.sort_values(by='movieId')
for name, group in userGp:
    # Sort by movieId so both rating lists line up movie-for-movie.
    group = group.sort_values(by='movieId')
    n = len(group)
    # Input-user ratings restricted to the movies this candidate also rated.
    temp = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    tempRating = temp['rating'].tolist()
    tempGroup = group['rating'].tolist()
    # Sums of squares and cross-products about the mean (computational form).
    xx = sum(i**2 for i in tempRating) - pow(sum(tempRating), 2) / float(n)
    yy = sum(i**2 for i in tempGroup) - pow(sum(tempGroup), 2) / float(n)
    xy = sum(i * j for i, j in zip(tempRating, tempGroup)) - sum(tempRating) * sum(tempGroup) / float(n)
    # A non-positive spread means one side has no variation (or rounding pushed
    # an exact zero slightly negative): the correlation is undefined, so use 0.
    # Requiring > 0 also guarantees sqrt never receives a negative product.
    if xx > 0 and yy > 0:
        pc[name] = xy / sqrt(xx * yy)
    else:
        pc[name] = 0

In [None]:
# Inspect the (userId, Pearson correlation) pairs stored in pc.
pc.items()

dict_items([(75, 0.8272781516947562), (106, 0.5860090386731182), (686, 0.8320502943378437), (815, 0.5765566601970551), (1040, 0.9434563530497265), (1130, 0.2891574659831201), (1502, 0.8770580193070299), (1599, 0.4385290096535153), (1625, 0.716114874039432), (1950, 0.179028718509858), (2065, 0.4385290096535153), (2128, 0.5860090386731196), (2432, 0.1386750490563073), (2791, 0.8770580193070299), (2839, 0.8204126541423674), (2948, -0.11720180773462392), (3025, 0.45124262819713973), (3040, 0.89514359254929), (3186, 0.6784622064861935), (3271, 0.26989594817970664), (3429, 0.0), (3734, -0.15041420939904673), (4099, 0.05860090386731196), (4208, 0.29417420270727607), (4282, -0.4385290096535115), (4292, 0.6564386345361464), (4415, -0.11183835382312353), (4586, -0.9024852563942795), (4725, -0.08006407690254357), (4818, 0.4885967564883424), (5104, 0.7674257668936507), (5165, -0.4385290096535153), (5547, 0.17200522903844556), (6082, -0.04728779924109591), (6207, 0.9615384615384616), (6366, 0.65779

In [None]:
# Turn the {userId: correlation} dict into a two-column frame
# (similarityIndex, userId) with a plain 0..n-1 index.
pcDF = (
    pd.DataFrame.from_dict(pc, orient='index')
    .set_axis(['similarityIndex'], axis=1)
    .assign(userId=lambda frame: frame.index)
    .reset_index(drop=True)
)
pcDF.head()

Unnamed: 0,similarityIndex,userId
0,0.827278,75
1,0.586009,106
2,0.83205,686
3,0.576557,815
4,0.943456,1040


In [None]:
# Keep the 50 users with the highest similarity to the input user.
topUsers = pcDF.sort_values(by='similarityIndex', ascending=False).head(50)
topUsers.head()

Unnamed: 0,similarityIndex,userId
64,0.961678,12325
34,0.961538,6207
55,0.961538,10707
67,0.960769,13053
4,0.943456,1040


In [None]:
# Attach every rating made by the selected users (inner join on userId).
topRating = pd.merge(topUsers, rating, on='userId', how='inner')
topRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,0.961678,12325,1,3.5
1,0.961678,12325,2,1.5
2,0.961678,12325,3,3.0
3,0.961678,12325,5,0.5
4,0.961678,12325,6,2.5


In [None]:
# Weight each rating by the similarity of the user who gave it.
topRating['weightedRating'] = topRating['similarityIndex'].mul(topRating['rating'])
topRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,0.961678,12325,1,3.5,3.365874
1,0.961678,12325,2,1.5,1.442517
2,0.961678,12325,3,3.0,2.885035
3,0.961678,12325,5,0.5,0.480839
4,0.961678,12325,6,2.5,2.404196


In [None]:
# Per movie (grouped by movieId), total the similarity weights and the
# similarity-weighted ratings of the selected users who rated it.
# The two columns are selected before aggregating so nothing else is summed.
tempTopUsersRating = (
    topRating.groupby('movieId')[['similarityIndex', 'weightedRating']]
    .sum()
    .rename(columns={
        'similarityIndex': 'sum_similarityIndex',
        'weightedRating': 'sum_weightedRating',
    })
)
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,38.376281,140.800834
2,38.376281,96.656745
3,10.253981,27.254477
4,0.929294,2.787882
5,11.723262,27.151751


In [None]:
# Weighted average per movie: summed weighted ratings divided by summed weights.
weightedAverage = tempTopUsersRating['sum_weightedRating'].div(tempTopUsersRating['sum_similarityIndex'])
# Start the frame from the score (keeps the movieId index), then expose
# movieId as a regular column as well.
recommendation_df = weightedAverage.to_frame('weighted average recommendation score')
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.668955,1
2,2.518658,2
3,2.657941,3
4,3.0,4
5,2.316058,5


**The top 10 movies that the algorithm recommended**

In [None]:
# Rank the movies from the highest to the lowest weighted average score.
recommendation_df = recommendation_df.sort_values(
    'weighted average recommendation score',
    ascending=False,
)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
5073,5.0,5073
3329,5.0,3329
2284,5.0,2284
26801,5.0,26801
6776,5.0,6776
6672,5.0,6672
3759,5.0,3759
3769,5.0,3769
3775,5.0,3775
90531,5.0,90531


In [None]:
# Show the catalogue rows of the ten best-scoring movies
# (rows appear in catalogue order, not in ranking order).
topMovieIds = recommendation_df['movieId'].head(10).tolist()
movies[movies['movieId'].isin(topMovieIds)]

Unnamed: 0,movieId,title,year
2200,2284,Bandit Queen,1994
3243,3329,"Year My Voice Broke, The",1987
3669,3759,Fun and Fancy Free,1947
3679,3769,Thunderbolt and Lightfoot,1974
3685,3775,Make Mine Music,1946
4978,5073,"Son's Room, The (Stanza del figlio, La)",2001
6563,6672,War Photographer,2001
6667,6776,Lagaan: Once Upon a Time in India,2001
9064,26801,Dragon Inn (Sun lung moon hak chan),1992
18106,90531,Shame,2011
