# importing packages


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# reading data

In [2]:
header = ['user_id','item_id','rating','timestamp']
df=pd.read_csv('ml-100k/u.data',sep='\t',names=header)

In [3]:
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
#unique users
n_users=df['user_id'].nunique()
print(n_users)

943


In [5]:
#unique movies
n_items=df['item_id'].nunique()
print(n_items)

1682


In [6]:
#train_test_split
from sklearn.cross_validation import train_test_split
train,test = train_test_split(df,test_size=0.25,random_state=0)



In [7]:
header1=['user_id','age','gender','occupation','zip code']
user_details=pd.read_csv('ml-100k/u.user',delimiter='|',names=header1)

In [8]:
user_details.head()

Unnamed: 0,user_id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [9]:
header2=['movie_id','movie_title','release_date','video_release_date','IMDb_URL','unknown','Action','Adventure','Animation','Childrens','Comedy','Crime','Documentary','Drama','Fantasy',
        'Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
movie_details=pd.read_csv('ml-100k/u.item',sep='|',encoding='latin',names=header2)

In [10]:
movie_details.drop('video_release_date',axis=1,inplace=True)

In [11]:
movie_details.head()

Unnamed: 0,movie_id,movie_title,release_date,IMDb_URL,unknown,Action,Adventure,Animation,Childrens,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [12]:
train_matrix=np.zeros((n_users,n_items))

In [13]:
train_matrix.shape

(943, 1682)

In [14]:
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [15]:
train.head()

Unnamed: 0,user_id,item_id,rating,timestamp
2606,208,88,5,883108324
57228,676,902,4,892685740
8382,374,231,2,880939228
60281,378,566,3,880045856
48432,726,845,3,889832358


In [16]:
train.reset_index(drop=True,inplace=True)

In [17]:
test.head()

Unnamed: 0,user_id,item_id,rating,timestamp
3582,23,528,4,874786974
60498,695,242,5,888805837
53227,774,28,3,888556698
21333,417,550,3,879649178
3885,234,1035,3,892335142


In [18]:
test.reset_index(drop=True,inplace=True)

In [19]:
for i in range(0,train.shape[0]):
    user=train.loc[i]['user_id']-1
    item=train.loc[i]['item_id']-1
    rating=train.loc[i]['rating']
    train_matrix[user,item]=rating

In [20]:
train_matrix

array([[ 5.,  3.,  4., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

# user similarity based collaborative filtering

In [21]:
#cosine user-item similarity
from sklearn.metrics.pairwise import pairwise_distances
user_similarity=pairwise_distances(train_matrix,metric='cosine')

In [22]:
user_similarity

array([[ 0.        ,  0.85911263,  0.96301907, ...,  0.91032902,
         0.85885377,  0.66871691],
       [ 0.85911263,  0.        ,  0.90275428, ...,  0.92026073,
         0.85022457,  0.88139105],
       [ 0.96301907,  0.90275428,  0.        , ...,  0.90883637,
         0.87352037,  0.96262931],
       ..., 
       [ 0.91032902,  0.92026073,  0.90883637, ...,  0.        ,
         0.89514991,  0.89212373],
       [ 0.85885377,  0.85022457,  0.87352037, ...,  0.89514991,
         0.        ,  0.88108541],
       [ 0.66871691,  0.88139105,  0.96262931, ...,  0.89212373,
         0.88108541,  0.        ]])

In [23]:
mean_user_rating=train_matrix.mean(axis=1)

In [24]:
print(mean_user_rating)

[ 0.42390012  0.10225922  0.06123662  0.05469679  0.21700357  0.33472057
  0.73840666  0.09571938  0.04756243  0.34720571  0.28180737  0.11296076
  0.88703924  0.16706302  0.13198573  0.27526754  0.03269917  0.46789536
  0.02497027  0.07193817  0.20214031  0.21343639  0.23959572  0.12366231
  0.14268728  0.14209275  0.03388823  0.12782402  0.05410226  0.06777646
  0.06064209  0.05469679  0.04161712  0.03269917  0.03686088  0.03745541
  0.09750297  0.21224732  0.03567182  0.04102259  0.08561237  0.28894174
  0.35850178  0.2372176   0.08680143  0.06183115  0.04637337  0.1058264
  0.23246136  0.0332937   0.03032105  0.10107015  0.05053508  0.12425684
  0.03269917  0.32580262  0.16646849  0.26456599  0.67122473  0.38049941
  0.03269917  0.35077289  0.12485137  0.33650416  0.14030916  0.05410226
  0.04875149  0.04994055  0.10047562  0.19857313  0.06420927  0.22711058
  0.10285375  0.06123662  0.11533888  0.14149822  0.11890606  0.03091558
  0.08739596  0.04102259  0.08858502  0.22889417  0.

In [26]:
diff=train_matrix-mean_user_rating[:,np.newaxis]

In [27]:
predictions=mean_user_rating[:,np.newaxis]+user_similarity.dot(diff)/np.array([np.abs(user_similarity).sum(axis=1)]).T

In [28]:
predictions

array([[ 1.55761338,  0.55001439,  0.45264786, ...,  0.27276193,
         0.2751335 ,  0.27502667],
       [ 1.33672239,  0.28091445,  0.13725968, ..., -0.06722227,
        -0.06394202, -0.06373785],
       [ 1.31775749,  0.2361101 ,  0.10236985, ..., -0.10826225,
        -0.10501474, -0.10490476],
       ..., 
       [ 1.22189231,  0.20533654,  0.06823778, ..., -0.12834738,
        -0.12527257, -0.12516588],
       [ 1.37500469,  0.31023194,  0.19818487, ..., -0.00694323,
        -0.00419427, -0.00380607],
       [ 1.4145586 ,  0.39035534,  0.29106064, ...,  0.10930852,
         0.11177964,  0.11183897]])

In [29]:
from sklearn import metrics
from math import sqrt

In [30]:
test_matrix=np.zeros((n_users,n_items))
for i in range(0,test.shape[0]):
    user=test.loc[i]['user_id']-1
    item=test.loc[i]['item_id']-1
    rating=test.loc[i]['rating']
    test_matrix[user,item]=rating

In [31]:
#RMSE
predict=predictions[test_matrix.nonzero()].flatten()
truth=test_matrix[test_matrix.nonzero()].flatten()
print("Error:")
print(sqrt(metrics.mean_squared_error(predict,truth)))

Error:
3.120061571425749


# item similarity based collaborative filtering

In [32]:
item_similarity=pairwise_distances(train_matrix.T,metric='cosine')

In [33]:
item_similarity.shape

(1682, 1682)

In [34]:
item_similarity

array([[ 0.        ,  0.63858615,  0.71961953, ...,  1.        ,
         0.94544045,  0.94544045],
       [ 0.63858615,  0.        ,  0.76539316, ...,  1.        ,
         0.90925741,  0.90925741],
       [ 0.71961953,  0.76539316,  0.        , ...,  1.        ,
         1.        ,  0.88537589],
       ..., 
       [ 1.        ,  1.        ,  1.        , ...,  0.        ,
         1.        ,  1.        ],
       [ 0.94544045,  0.90925741,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       [ 0.94544045,  0.90925741,  0.88537589, ...,  1.        ,
         1.        ,  0.        ]])

In [35]:
predictions1=train_matrix.dot(item_similarity)/np.array([np.abs(item_similarity).sum(axis=1)])

In [36]:
predictions1.shape

(943, 1682)

In [37]:
predictions1

array([[ 0.34861052,  0.36322961,  0.38026644, ...,  0.42415229,
         0.41718186,  0.41315619],
       [ 0.08917586,  0.09996194,  0.09586072, ...,  0.10232005,
         0.10226914,  0.10273929],
       [ 0.05861111,  0.06223505,  0.059921  , ...,  0.06127305,
         0.06119454,  0.06184447],
       ..., 
       [ 0.02844689,  0.03448916,  0.03352695, ...,  0.03866746,
         0.03793393,  0.03836359],
       [ 0.13461872,  0.1406997 ,  0.14912976, ...,  0.15526472,
         0.15275802,  0.15501734],
       [ 0.2118463 ,  0.21274084,  0.23097108, ...,  0.26293873,
         0.2564759 ,  0.25720922]])

In [38]:
predict1=predictions1[test_matrix.nonzero()].flatten()
truth=test_matrix[test_matrix.nonzero()].flatten()

In [39]:
print("Error:")
print(sqrt(metrics.mean_squared_error(predict1,truth)))

Error:
3.4472573910990616
