# Collaborative Filtering
## Machine Learning 
### Recomender Systems
### MOVIES
### RATING 

## Imports 

In [17]:
import pandas as pd 
from math import sqrt
import numpy as np

## Data Set 

In [2]:
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings_sample.csv')
movies_df


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
34203,151697,Grand Slam (1967),Thriller
34204,151701,Bloodmoney (2010),(no genres listed)
34205,151703,The Butterfly Circus (2009),Drama
34206,151709,Zero (2015),Drama|Sci-Fi


In [3]:
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))' , expand=False)
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)' , expand=False)
movies_df['title'] = movies_df['title'].str.replace(r'\s*\(\d{4}\)', '', regex=True)
movies_df['title'] = movies_df['title'].str.strip()
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [4]:
movies_df = movies_df.drop(columns='genres',axis=1)
movies_df

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995
...,...,...,...
34203,151697,Grand Slam,1967
34204,151701,Bloodmoney,2010
34205,151703,The Butterfly Circus,2009
34206,151709,Zero,2015


In [5]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496
...,...,...,...,...
3899994,42130,1028,4.0,860215847
3899995,42130,1030,3.0,860216420
3899996,42130,1035,5.0,860215295
3899997,42130,1066,3.0,860218569


In [6]:
ratings_df = ratings_df.drop(columns='timestamp',axis=1)
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0
...,...,...,...
3899994,42130,1028,4.0
3899995,42130,1030,3.0
3899996,42130,1035,5.0
3899997,42130,1066,3.0




# Collaborative Filtering


In [7]:
userInput = [
            {'title':'Breakfast Club, The', 'rating':5},
            {'title':'Toy Story', 'rating':3.5},
            {'title':'Jumanji', 'rating':2},
            {'title':"Pulp Fiction", 'rating':5},
            {'title':'Akira', 'rating':4.5}
         ] 
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [8]:
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies.drop(columns='year', axis=1)
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


In [9]:
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
userSubsetGroup = userSubset.groupby(['userId'])
userSubsetGroup.get_group(1130)

  userSubsetGroup.get_group(1130)


Unnamed: 0,userId,movieId,rating
104167,1130,1,0.5
104168,1130,2,4.0
104214,1130,296,4.0
104363,1130,1274,4.5
104443,1130,1968,4.5


In [10]:
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)
userSubsetGroup[0:3]

[((75,),
        userId  movieId  rating
  7507      75        1     5.0
  7508      75        2     3.5
  7540      75      296     5.0
  7633      75     1274     4.5
  7673      75     1968     5.0),
 ((106,),
        userId  movieId  rating
  9083     106        1     2.5
  9084     106        2     3.0
  9115     106      296     3.5
  9198     106     1274     3.0
  9238     106     1968     3.5),
 ((686,),
         userId  movieId  rating
  61336     686        1     4.0
  61337     686        2     3.0
  61377     686      296     4.0
  61478     686     1274     4.0
  61569     686     1968     5.0)]

In [11]:
pearsonCorrelationDict = {}
for name , group in userSubsetGroup:
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')
    nRatings = len(group)
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    tempRatingList = temp_df['rating'].tolist()
    tempGroupList = group['rating'].tolist()
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0
    
pearsonCorrelationDict.items()




dict_items([((75,), 0.8272781516947562), ((106,), 0.5860090386731182), ((686,), 0.8320502943378437), ((815,), 0.5765566601970551), ((1040,), 0.9434563530497265), ((1130,), 0.2891574659831201), ((1502,), 0.8770580193070299), ((1599,), 0.4385290096535153), ((1625,), 0.716114874039432), ((1950,), 0.179028718509858), ((2065,), 0.4385290096535153), ((2128,), 0.5860090386731196), ((2432,), 0.1386750490563073), ((2791,), 0.8770580193070299), ((2839,), 0.8204126541423674), ((2948,), -0.11720180773462392), ((3025,), 0.45124262819713973), ((3040,), 0.89514359254929), ((3186,), 0.6784622064861935), ((3271,), 0.26989594817970664), ((3429,), 0.0), ((3734,), -0.15041420939904673), ((4099,), 0.05860090386731196), ((4208,), 0.29417420270727607), ((4282,), -0.4385290096535115), ((4292,), 0.6564386345361464), ((4415,), -0.11183835382312353), ((4586,), -0.9024852563942795), ((4725,), -0.08006407690254357), ((4818,), 0.4885967564883424), ((5104,), 0.7674257668936507), ((5165,), -0.4385290096535153), ((554

In [12]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.827278,"(75,)"
1,0.586009,"(106,)"
2,0.83205,"(686,)"
3,0.576557,"(815,)"
4,0.943456,"(1040,)"


In [13]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
3483,1.0,"(40314,)"
3434,1.0,"(39386,)"
3579,1.0,"(42025,)"
3238,1.0,"(36011,)"
3239,1.0,"(36019,)"


In [20]:
topUsers['userId']=topUsers['userId'].astype(np.str_)
ratings_df['userId'] = ratings_df['userId'].astype(np.str_)
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
topUsersRating.head()


Unnamed: 0,similarityIndex,userId,movieId,rating


In [25]:
common_user = set(topUsers['userId']).intersection(set(ratings_df['userId']))
print(len(common_user))
print("________________________________________________________")
#-----------------------------------------------------------------------------------
print(ratings_df['userId'])
print("________________________________________________________")
print(topUsers['userId'])

0
________________________________________________________
0              1
1              1
2              1
3              2
4              2
           ...  
3899994    42130
3899995    42130
3899996    42130
3899997    42130
3899998    42130
Name: userId, Length: 3899999, dtype: object
________________________________________________________
3483    (40314,)
3434    (39386,)
3579    (42025,)
3238    (36011,)
3239    (36019,)
3046    (32451,)
2243    (17971,)
2369    (20192,)
1754     (9749,)
3496    (40611,)
1660     (8051,)
1279      (707,)
2551    (23330,)
3329    (37463,)
2141    (16326,)
3052    (32518,)
2082    (15421,)
1996    (13898,)
3482    (40297,)
2850    (29143,)
1709     (8902,)
1793    (10372,)
3411    (38935,)
3285    (36820,)
2977    (31330,)
1424     (3699,)
3373    (38180,)
1724     (9093,)
1878    (11907,)
2641    (25033,)
2254    (18163,)
1306     (1213,)
2481    (22251,)
2352    (19924,)
2761    (27400,)
1846    (11383,)
1344     (1898,)
3321    (37356,)
1409  

In [38]:
topUsers['userId'] = topUsers['userId'].str.replace(',','',regex=False)
topUsers['userId'] = topUsers['userId'].str.replace('(','',regex=False)
topUsers['userId'] = topUsers['userId'].str.replace(')','',regex=False)
topUsers['userId']=topUsers['userId'].astype(int)
topUsers['userId']

3483    40314
3434    39386
3579    42025
3238    36011
3239    36019
3046    32451
2243    17971
2369    20192
1754     9749
3496    40611
1660     8051
1279      707
2551    23330
3329    37463
2141    16326
3052    32518
2082    15421
1996    13898
3482    40297
2850    29143
1709     8902
1793    10372
3411    38935
3285    36820
2977    31330
1424     3699
3373    38180
1724     9093
1878    11907
2641    25033
2254    18163
1306     1213
2481    22251
2352    19924
2761    27400
1846    11383
1344     1898
3321    37356
1409     3408
3187    35085
2152    16549
3281    36775
2124    16140
3140    34327
1654     7984
1449     4145
2304    19100
1429     3843
2359    20082
3288    36837
Name: userId, dtype: int64

In [39]:
ratings_df['userId'] = ratings_df['userId'].astype(int)
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
topUsersRating.head()


Unnamed: 0,similarityIndex,userId,movieId,rating
0,1.0,40314,1,3.5
1,1.0,40314,6,4.0
2,1.0,40314,16,4.5
3,1.0,40314,18,4.5
4,1.0,40314,24,3.5


In [40]:
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,1.0,40314,1,3.5,3.5
1,1.0,40314,6,4.0,4.0
2,1.0,40314,16,4.5,4.5
3,1.0,40314,18,4.5,4.5
4,1.0,40314,24,3.5,3.5


In [41]:
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,43.0,143.5
2,7.0,20.0
3,6.0,16.0
4,4.0,6.5
5,8.0,18.5


In [42]:
recommendation_df = pd.DataFrame()
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.337209,1
2,2.857143,2
3,2.666667,3
4,1.625,4
5,2.3125,5


In [43]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1112,5.0,1112
1161,5.0,1161
1264,5.0,1264
1256,5.0,1256
1273,5.0,1273
1289,5.0,1289
60904,5.0,60904
1010,5.0,1010
67422,5.0,67422
123,5.0,123


In [44]:
movies_df.loc[movies_df['movieId'].isin(recommendation_df.head(10)['movieId'].tolist())]

Unnamed: 0,movieId,title,year
121,123,Chungking Express (Chung Hing sam lam),1994
991,1010,"Love Bug, The",1969
1089,1112,Palookaville,1996
1138,1161,"Tin Drum, The (Blechtrommel, Die)",1979
1228,1256,Duck Soup,1933
1236,1264,Diva,1981
1245,1273,Down by Law,1986
1261,1289,Koyaanisqatsi (a.k.a. Koyaanisqatsi: Life Out ...,1983
12904,60904,Heart of a Dog (Sobachye serdtse),1988
13588,67422,California Dreamin' (Nesfarsit),2007
