In [1]:
import numpy as np
import pandas as pd

from sklearn import metrics

In [2]:
user_cols = ['user_id',
             'age',
             'gender',
             'occupation',
             'zip_code']

users = pd.read_csv('ml-100k/u.user',
                    sep='|',
                    names=user_cols)

In [3]:
rating_cols = ['user_id',
               'movie_id',
               'rating',
               'timestamp']

ratings = pd.read_csv('ml-100k/u.data',
                      sep='\t',
                      names=rating_cols)

In [4]:
item_cols = ['movie id',
             'movie title',
             'release date',
             'video release date',
             'IMDb URL',
             'Unknown',
             'Action',
             'Adventure',
             'Animation',
             'Childrens',
             'Comedy',
             'Crime',
             'Documentary',
             'Drama',
             'Fantasy',
             'FilmNoir',
             'Horror',
             'Musical',
             'Mystery',
             'Romance',
             'SciFi',
             'Thriller',
             'War',
             'Western']

items = pd.read_csv('ml-100k/u.item',
                    sep='|',
                    names=item_cols,
                    encoding='latin-1')

In [5]:
items.head()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,Unknown,Action,Adventure,Animation,Childrens,...,Fantasy,FilmNoir,Horror,Musical,Mystery,Romance,SciFi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
utility = ratings.pivot(index='user_id',
                        columns='movie_id',
                        values='rating')

utility.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [7]:
userMeans = utility.mean(axis=1)
userMeans.head()

user_id
1    3.610294
2    3.709677
3    2.796296
4    4.333333
5    2.874286
dtype: float64

In [8]:
utilityCentered = utility - userMeans
utilityCentered = utilityCentered.where((pd.notnull(utilityCentered)),0)
utilityNew = utility.where((pd.notnull(utility)),0)
utilityCentered.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.389706,-0.709677,1.203704,-1.333333,0.125714,1.364929,0.034739,-2.79661,0.727273,-1.206522,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.389706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.206522,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.389706,-0.709677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
item1 = items[94:95]
item1.head()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,Unknown,Action,Adventure,Animation,Childrens,...,Fantasy,FilmNoir,Horror,Musical,Mystery,Romance,SciFi,Thriller,War,Western
94,95,Aladdin (1992),01-Jan-1992,,http://us.imdb.com/M/title-exact?Aladdin%20(1992),0,0,0,1,1,...,0,0,0,1,0,0,0,0,0,0


In [10]:
feat1 = item1.iloc[:,5:24]
feat1.head()

Unnamed: 0,Unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,Fantasy,FilmNoir,Horror,Musical,Mystery,Romance,SciFi,Thriller,War,Western
94,0,0,0,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0


In [11]:
itemProfile = items.iloc[:,5:24]
itemProfile.head()

Unnamed: 0,Unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,Fantasy,FilmNoir,Horror,Musical,Mystery,Romance,SciFi,Thriller,War,Western
0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [17]:
userProfile = np.dot(utilityNew,itemProfile)
userProfile200 = userProfile[199]
userProfile15 = userProfile[14]
print("\nUser Profile:\n", userProfile)
print("\nUser Profile (200):\n", userProfile200)
print("\nUser Profile (15):\n", userProfile15)


User Profile:
 [[   4.  250.  123. ...,  188.   92.   22.]
 [   0.   38.   13. ...,   43.   11.    0.]
 [   0.   39.   14. ...,   53.   14.    0.]
 ..., 
 [   0.   38.   27. ...,   28.    5.    0.]
 [   0.   74.   52. ...,   80.   47.   14.]
 [   0.  227.  114. ...,  134.   53.   23.]]

User Profile (200):
 [   0.  332.  235.   66.  166.  193.   37.    2.  251.   41.   10.   44.
   72.   15.  148.  188.  201.   73.   16.]

User Profile (15):
 [   0.   59.   36.    2.   13.   75.   15.    0.  153.   10.    6.    2.
    4.   17.   86.   32.   59.   34.    0.]


In [14]:
cosine = metrics.pairwise.cosine_similarity(userProfile,feat1)

In [15]:
print("Cosine Similarity (User 200):", cosine[199])
print("Cosine Similarity (User 15): ", cosine[14])
print("Cosine Distance (User 200):  ", 1-cosine[199])
print("Cosine Distance (User 15):   ", 1-cosine[14])

Cosine Similarity (User 200): [ 0.38745727]
Cosine Similarity (User 15):  [ 0.21517341]
Cosine Distance (User 200):   [ 0.61254273]
Cosine Distance (User 15):    [ 0.78482659]
