In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances


### Read the MovieLens data set

In [3]:
# Column labels for the two raw files; neither file carries a header row.
rating_columns = ['userId', 'itemId', 'rating', 'timestamp']
movie_columns = [
    'itemId', 'title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation',
    # Spelled without an apostrophe: the original literal 'Children''s' was
    # two adjacent strings that Python concatenates into 'Childrens'.
    'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
    'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
    'Thriller', 'War', 'Western',
]

# Ratings: tab-separated file, one row per rating.
data = pd.read_table('u.data', sep='\t', header=None)
data.columns = rating_columns

# Movie metadata: pipe-separated file, read as latin-1.
movie = pd.read_table('u.item', sep='|', encoding='latin-1', header=None)
movie.columns = movie_columns

In [40]:
# Mean rating given by each user (one row per userId).
Mean = data.groupby(by="userId", as_index=False)['rating'].mean()

# Attach each user's mean to every one of their ratings. After the merge,
# 'rating_x' is the raw rating and 'rating_y' is the user's mean rating.
# 'stdzd_rating' is the mean-centered rating (raw minus user mean).
Rating_avg = data.merge(Mean, on='userId').assign(
    stdzd_rating=lambda frame: frame['rating_x'] - frame['rating_y']
)
Rating_avg.head()

Unnamed: 0,userId,itemId,rating_x,timestamp,rating_y,stdzd_rating
0,196,242,3,881250949,3.615385,-0.615385
1,196,393,4,881251863,3.615385,0.384615
2,196,381,4,881251728,3.615385,0.384615
3,196,251,3,881251274,3.615385,-0.615385
4,196,655,5,881251793,3.615385,1.384615


### Create a pivot Table of the dataset

In [5]:
# User x item matrix of raw ratings; item/user pairs with no rating are NaN.
check = Rating_avg.pivot_table(
    values='rating_x',
    index='userId',
    columns='itemId',
)
check.head()

itemId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [41]:

# User x item matrix of mean-centered ratings; unrated pairs are NaN.
final = Rating_avg.pivot_table(
    values='stdzd_rating',
    index='userId',
    columns='itemId',
)
final.head()

itemId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.389706,-0.610294,0.389706,-0.610294,-0.610294,1.389706,0.389706,-2.610294,1.389706,-0.610294,...,,,,,,,,,,
2,0.290323,,,,,,,,,-1.709677,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,1.125714,0.125714,,,,,,,,,...,,,,,,,,,,


### Impute null values with the mean

In [8]:
# Replacing NaN by Movie Average: each missing entry takes the mean of its
# item column (the mean of that item's mean-centered ratings).
final_movie = final.fillna(final.mean(axis=0))

# Replacing NaN by user Average: each missing entry takes the mean of its
# user row. Done with a vectorised `where` aligned on the row index instead
# of a Python-level lambda applied to every row. Because the ratings are
# already centered per user, this fill value is zero up to rounding error.
final_user = final.where(final.notna(), final.mean(axis=1), axis=0)

In [9]:
final_movie.head()

itemId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.389706,-0.610294,0.389706,-0.610294,-0.610294,1.389706,0.389706,-2.610294,1.389706,-0.610294,...,-1.147059,-0.137056,-0.45933,-1.45933,-0.211982,-2.121495,-0.121495,-1.121495,0.019337,-0.365931
2,0.290323,-0.253455,-0.406476,-0.02917,-0.206708,0.099592,0.241369,0.370904,0.316282,-1.709677,...,-1.147059,-0.137056,-0.45933,-1.45933,-0.211982,-2.121495,-0.121495,-1.121495,0.019337,-0.365931
3,0.299264,-0.253455,-0.406476,-0.02917,-0.206708,0.099592,0.241369,0.370904,0.316282,0.251461,...,-1.147059,-0.137056,-0.45933,-1.45933,-0.211982,-2.121495,-0.121495,-1.121495,0.019337,-0.365931
4,0.299264,-0.253455,-0.406476,-0.02917,-0.206708,0.099592,0.241369,0.370904,0.316282,0.251461,...,-1.147059,-0.137056,-0.45933,-1.45933,-0.211982,-2.121495,-0.121495,-1.121495,0.019337,-0.365931
5,1.125714,0.125714,-0.406476,-0.02917,-0.206708,0.099592,0.241369,0.370904,0.316282,0.251461,...,-1.147059,-0.137056,-0.45933,-1.45933,-0.211982,-2.121495,-0.121495,-1.121495,0.019337,-0.365931


In [10]:
final_user.head()

itemId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.389706,-0.6102941,0.3897059,-0.6102941,-0.6102941,1.389706,0.3897059,-2.610294,1.389706,-0.6102941,...,2.579636e-16,2.579636e-16,2.579636e-16,2.579636e-16,2.579636e-16,2.579636e-16,2.579636e-16,2.579636e-16,2.579636e-16,2.579636e-16
2,0.2903226,4.655774e-16,4.655774e-16,4.655774e-16,4.655774e-16,4.655774e-16,4.655774e-16,4.655774e-16,4.655774e-16,-1.709677,...,4.655774e-16,4.655774e-16,4.655774e-16,4.655774e-16,4.655774e-16,4.655774e-16,4.655774e-16,4.655774e-16,4.655774e-16,4.655774e-16
3,1.151342e-16,1.151342e-16,1.151342e-16,1.151342e-16,1.151342e-16,1.151342e-16,1.151342e-16,1.151342e-16,1.151342e-16,1.151342e-16,...,1.151342e-16,1.151342e-16,1.151342e-16,1.151342e-16,1.151342e-16,1.151342e-16,1.151342e-16,1.151342e-16,1.151342e-16,1.151342e-16
4,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,...,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16
5,1.125714,0.1257143,2.131628e-16,2.131628e-16,2.131628e-16,2.131628e-16,2.131628e-16,2.131628e-16,2.131628e-16,2.131628e-16,...,2.131628e-16,2.131628e-16,2.131628e-16,2.131628e-16,2.131628e-16,2.131628e-16,2.131628e-16,2.131628e-16,2.131628e-16,2.131628e-16


### Calculate Cosine Similarity

In [11]:
# User-user cosine similarity on the matrix whose NaNs were replaced by the
# user average.
b = cosine_similarity(final_user)

# Zero out the diagonal (each user's similarity with themselves).
np.fill_diagonal(b, 0)

# Label both axes with the user ids.
similarity_with_user = pd.DataFrame(
    b,
    index=final_user.index,
    columns=final_user.index,
)
similarity_with_user.head()

userId,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.043411,0.011051,0.059303,0.134514,0.103373,0.110556,0.180891,0.012253,-0.000621,...,0.025835,-0.047952,0.087224,0.007718,0.074378,0.078714,0.067433,0.02879,-0.03127,0.032123
2,0.043411,0.0,0.013658,-0.017016,0.03577,0.094503,0.089408,0.05564,0.027294,0.097846,...,0.012853,-0.028798,0.056659,0.197835,0.090009,0.032505,0.015053,-0.017344,0.012068,0.039173
3,0.011051,0.013658,0.0,-0.059638,0.016037,-0.017158,0.016141,0.041177,-0.010093,0.023856,...,0.001615,0.000658,-0.006888,0.036157,-0.018513,-0.00624,-0.023907,0.034414,-0.009187,0.001489
4,0.059303,-0.017016,-0.059638,0.0,0.007373,-0.053929,-0.025604,0.136046,0.016082,-0.013588,...,0.011895,0.002174,-0.028,-0.025021,0.022882,-0.00596,0.279818,0.258594,0.064504,-0.019222
5,0.134514,0.03577,0.016037,0.007373,0.0,0.038484,0.067874,0.140106,0.010195,0.014335,...,0.070014,-0.070821,0.024278,0.038672,0.093567,0.051782,0.02954,0.036234,0.043318,0.099324


In [12]:
# User-user cosine similarity on the matrix whose NaNs were replaced by the
# item (movie) average.
cosine = cosine_similarity(final_movie)

# Zero out the diagonal (each user's similarity with themselves).
np.fill_diagonal(cosine, 0)

# Label both axes from final_movie, the frame the matrix was computed from,
# so this cell does not depend on final_user for its column labels.
similarity_with_movie = pd.DataFrame(
    cosine,
    index=final_movie.index,
    columns=final_movie.index,
)
similarity_with_movie.head()

userId,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.843356,0.826035,0.857827,0.764312,0.779988,0.713977,0.853533,0.855259,0.816118,...,0.780533,0.83732,0.818577,0.83578,0.794697,0.851909,0.817304,0.859819,0.831436,0.745902
2,0.843356,0.0,0.927383,0.956761,0.843712,0.872662,0.804099,0.941021,0.956912,0.933291,...,0.884964,0.946499,0.906733,0.951505,0.885671,0.952297,0.909712,0.961835,0.947336,0.857147
3,0.826035,0.927383,0.0,0.93998,0.82725,0.852937,0.779676,0.923743,0.939207,0.917426,...,0.867559,0.930667,0.889823,0.926949,0.868076,0.934073,0.892905,0.94632,0.924428,0.839377
4,0.857827,0.956761,0.93998,0.0,0.855949,0.879266,0.8018,0.959257,0.974333,0.94694,...,0.898505,0.964633,0.919858,0.953728,0.900577,0.967555,0.939123,0.98232,0.963425,0.868326
5,0.764312,0.843712,0.82725,0.855949,0.0,0.768636,0.706424,0.844057,0.854108,0.82936,...,0.791333,0.837326,0.806724,0.840056,0.797082,0.851854,0.811623,0.858221,0.843267,0.771807


### Get the movies to recommend to a user

In [20]:
def get_user_similar_movies(user1, user2, ratings=None, movies=None):
    """Return the movies that both ``user1`` and ``user2`` have rated.

    Parameters
    ----------
    user1, user2 : values compared against the ``userId`` column.
    ratings : DataFrame, optional
        Ratings table with at least ``userId`` and ``itemId`` columns.
        Defaults to the notebook-level ``Rating_avg`` frame.
    movies : DataFrame, optional
        Movie metadata table with an ``itemId`` column. Defaults to the
        notebook-level ``movie`` frame.

    Returns
    -------
    DataFrame
        One row per item rated by both users: the two users' rating rows
        inner-joined on ``itemId`` and then joined with the movie metadata.
        Columns present for both users get pandas' default ``_x`` (user1)
        and ``_y`` (user2) suffixes, e.g. ``rating_x_x`` / ``rating_x_y``.
    """
    # Fall back to the notebook globals only when no explicit frames are
    # given, so the function can also be used (and tested) on its own.
    if ratings is None:
        ratings = Rating_avg
    if movies is None:
        movies = movie

    ratings_user1 = ratings[ratings.userId == user1]
    ratings_user2 = ratings[ratings.userId == user2]

    common_movies = ratings_user1.merge(ratings_user2, on="itemId", how="inner")
    return common_movies.merge(movies, on="itemId")
   

In [35]:
# Movies rated by both user 370 and user 721, keeping only each user's raw
# rating (rating_x_x for 370, rating_x_y for 721) and the movie title.
a = get_user_similar_movies(370, 721).loc[
    :, ['rating_x_x', 'rating_x_y', 'title']
]
a.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25 entries, 0 to 24
Data columns (total 34 columns):
userId_x              25 non-null int64
itemId                25 non-null int64
rating_x_x            25 non-null int64
timestamp_x           25 non-null int64
rating_y_x            25 non-null float64
adg_rating_x          25 non-null float64
userId_y              25 non-null int64
rating_x_y            25 non-null int64
timestamp_y           25 non-null int64
rating_y_y            25 non-null float64
adg_rating_y          25 non-null float64
title                 25 non-null object
release date          25 non-null object
video release date    0 non-null float64
IMDb URL              25 non-null object
unknown               25 non-null int64
Action                25 non-null int64
Adventure             25 non-null int64
Animation             25 non-null int64
Childrens             25 non-null int64
Comedy                25 non-null int64
Crime                 25 non-null int64
Docum

Unnamed: 0,rating_x_x,rating_x_y,title
0,2,3,Pulp Fiction (1994)
1,5,3,This Is Spinal Tap (1984)
2,3,5,Brazil (1985)
3,4,5,Braveheart (1995)
4,4,5,"Crying Game, The (1992)"
