# Recommender System : Collaborative Filtering
### Utility Matrix : Ordinal Ratings 

#### Part 1 : User-User Filtering 
1. Pearson Correlation Measure

#### Part 2 : Item-Item Filtering
1. Adjusted Cosine Similarity Measure (ratings are mean-centred per user before the cosine is taken)

#### Reference : Recommender Systems: The Textbook, Charu C. Aggarwal

In [1]:
import numpy as np
import pandas as pd

In [2]:
import os, sys

## Data

In [3]:
# Toy explicit ratings keyed by user; an item missing from a user's dict
# means that user has not rated it.
ratings_by_user = {
    'user1': {'item1': 7, 'item2': 6, 'item3': 7, 'item4': 4, 'item5': 5, 'item6': 4},
    'user2': {'item1': 6, 'item2': 7, 'item4': 4, 'item5': 3, 'item6': 4},
    'user3': {'item2': 3, 'item3': 3, 'item4': 1, 'item5': 1},
    'user4': {'item1': 1, 'item2': 2, 'item3': 2, 'item4': 3, 'item5': 3, 'item6': 4},
    'user5': {'item1': 1, 'item3': 1, 'item4': 2, 'item5': 3, 'item6': 3},
}

# Flatten into three parallel lists, one entry per (user, item, rating) triple.
triples = [
    (user, item, score)
    for user, item_ratings in ratings_by_user.items()
    for item, score in item_ratings.items()
]
userid = [t[0] for t in triples]
itemid = [t[1] for t in triples]
rating = [t[2] for t in triples]

In [4]:
# Long-format table: one row per (user, item, rating) observation.
transaction = pd.DataFrame(
    list(zip(userid, itemid, rating)),
    columns=['userid', 'itemid', 'rating'],
)
transaction.head(3)

Unnamed: 0,userid,itemid,rating
0,user1,item1,7
1,user1,item2,6
2,user1,item3,7


In [5]:
# Reshape the long table into a user x item utility matrix; (user, item)
# pairs with no rating become NaN. Axis names are blanked for display.
user_item = (
    transaction
    .set_index(['userid', 'itemid'])['rating']
    .unstack('itemid')
    .rename_axis(index='', columns='')
)
user_item

Unnamed: 0,item1,item2,item3,item4,item5,item6
,,,,,,
user1,7.0,6.0,7.0,4.0,5.0,4.0
user2,6.0,7.0,,4.0,3.0,4.0
user3,,3.0,3.0,1.0,1.0,
user4,1.0,2.0,2.0,3.0,3.0,4.0
user5,1.0,,1.0,2.0,3.0,3.0


In [63]:
# Mean-centre each user's row: subtract the mean of that user's observed
# (non-NaN) ratings from every rating in the row.
user_means = user_item.mean(axis=1)
adjusted_user_item = user_item.sub(user_means, axis=0)
adjusted_user_item

Unnamed: 0,item1,item2,item3,item4,item5,item6
,,,,,,
user1,1.5,0.5,1.5,-1.5,-0.5,-1.5
user2,1.2,2.2,,-0.8,-1.8,-0.8
user3,,1.0,1.0,-1.0,-1.0,
user4,-1.5,-0.5,-0.5,0.5,0.5,1.5
user5,-1.0,,-1.0,0.0,1.0,1.0


#### Check : User-User Similarity

In [6]:
# 3 - 1 : raw cosine between user3 and user1 over item2..item5
user1_ratings = (6, 7, 4, 5)
user3_ratings = (3, 3, 1, 1)
u31 = sum(r1 * r3 for r1, r3 in zip(user1_ratings, user3_ratings))
u1 = np.sqrt(sum(r ** 2 for r in user1_ratings))
u3 = np.sqrt(sum(r ** 2 for r in user3_ratings))
round(u31 / (u3 * u1), 3)

0.956

In [7]:
# 3 - 2 : raw cosine between user3 and user2 over item2..item5.
# user2 has no rating for item3, so that position is entered as 0 for both.
user2_ratings = (7, 0, 4, 3)
user3_ratings = (3, 0, 1, 1)
u32 = sum(r2 * r3 for r2, r3 in zip(user2_ratings, user3_ratings))
u2 = np.sqrt(sum(r ** 2 for r in user2_ratings))
u3 = np.sqrt(sum(r ** 2 for r in user3_ratings))
round(u32 / (u3 * u2), 3)

0.981

In [8]:
# 3 - 4 : raw cosine between user3 and user4 over item2..item5
user4_ratings = (2, 2, 3, 3)
user3_ratings = (3, 3, 1, 1)
u34 = sum(r4 * r3 for r4, r3 in zip(user4_ratings, user3_ratings))
u4 = np.sqrt(sum(r ** 2 for r in user4_ratings))
u3 = np.sqrt(sum(r ** 2 for r in user3_ratings))
round(u34 / (u3 * u4), 3)

0.789

In [9]:
# 3 - 5 : raw cosine between user3 and user5.
# The first position is entered as 0 for both users, so only the remaining
# three positions contribute to the dot product and the norms.
user5_ratings = (0, 1, 2, 3)
user3_ratings = (0, 3, 1, 1)
u35 = sum(r5 * r3 for r5, r3 in zip(user5_ratings, user3_ratings))
u5 = np.sqrt(sum(r ** 2 for r in user5_ratings))
u3 = np.sqrt(sum(r ** 2 for r in user3_ratings))
round(u35 / (u3 * u5), 3)

0.645

#### Check : Item-Item Similarity

In [10]:
# Raw cosine between item5 and item1 over the users who rated both
# (user1, user2, user4, user5).
item5_col = (5, 3, 3, 3)
item1_col = (7, 6, 1, 1)
i51 = sum(p * q for p, q in zip(item5_col, item1_col))
i5 = np.sqrt(sum(p ** 2 for p in item5_col))
i1 = np.sqrt(sum(q ** 2 for q in item1_col))
print(round(i51 / (i5 * i1), 4))

0.8772


In [11]:
# Raw cosine between item5 and item2 over the users who rated both
# (user1, user2, user3, user4).
item5_col = (5, 3, 1, 3)
item2_col = (6, 7, 3, 2)
i52 = sum(p * q for p, q in zip(item5_col, item2_col))
i5 = np.sqrt(sum(p ** 2 for p in item5_col))
i2 = np.sqrt(sum(q ** 2 for q in item2_col))
print(round(i52 / (i5 * i2), 4))

0.9137


In [12]:
# Raw cosine between item5 and item3 over the users who rated both
# (user1, user3, user4, user5).
item5_col = (5, 1, 3, 3)
item3_col = (7, 3, 2, 1)
i53 = sum(p * q for p, q in zip(item5_col, item3_col))
i5 = np.sqrt(sum(p ** 2 for p in item5_col))
i3 = np.sqrt(sum(q ** 2 for q in item3_col))
print(round(i53 / (i5 * i3), 4))

0.8927


In [13]:
# Raw cosine between item5 and item4; all five users rated both items.
item5_col = (5, 3, 1, 3, 3)
item4_col = (4, 4, 1, 3, 2)
i54 = sum(p * q for p, q in zip(item5_col, item4_col))
i5 = np.sqrt(sum(p ** 2 for p in item5_col))
i4 = np.sqrt(sum(q ** 2 for q in item4_col))
print(round(i54 / (i5 * i4), 4))

0.9721


In [14]:
# Raw cosine between item5 and item6 over the users who rated both
# (user1, user2, user4, user5).
item5_col = (5, 3, 3, 3)
item6_col = (4, 4, 4, 3)
i56 = sum(p * q for p, q in zip(item5_col, item6_col))
i5 = np.sqrt(sum(p ** 2 for p in item5_col))
i6 = np.sqrt(sum(q ** 2 for q in item6_col))
print(round(i56 / (i5 * i6), 4))

0.9735


## Similarity Measure

In [15]:
# Two toy rating vectors; NaN in `b` marks an unrated position.
a = np.asarray([3, 3, 1, 5, 2])
b = np.asarray([np.nan, 1, 4, 3, np.nan])
for vec in (a, b):
    print(vec)

[3 3 1 5 2]
[nan  1.  4.  3. nan]


In [16]:
# Centre each vector on the mean of its observed (non-NaN) entries;
# NaN positions stay NaN.
for vec in (a, b):
    print(vec - np.nanmean(vec))

[ 0.2  0.2 -1.8  2.2 -0.8]
[        nan -1.66666667  1.33333333  0.33333333         nan]


In [17]:
def findSimilarity(a, b, mean_adjusted):
    """Cosine similarity of two rating vectors over their co-rated positions.

    Positions where either vector is NaN are ignored. With
    ``mean_adjusted=True`` each vector is first centred on the mean of its
    own observed (non-NaN) entries, which turns the cosine into a Pearson-style
    correlation; the mean is taken over all observed entries of the vector,
    not only the co-rated ones.

    Parameters
    ----------
    a, b : array-like of numbers
        Rating vectors of equal length; NaN marks a missing rating.
    mean_adjusted : bool
        Whether to mean-centre each vector before computing the cosine.

    Returns
    -------
    float
        Similarity rounded to 4 decimals, or NaN when it is undefined
        (no co-rated positions, or one of the co-rated vectors has zero norm).
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    if mean_adjusted:
        a = a - np.nanmean(a)
        b = b - np.nanmean(b)

    # Keep only positions observed in BOTH vectors.
    co_rated = ~(np.isnan(a) | np.isnan(b))
    _a = a[co_rated]
    _b = b[co_rated]

    # Element-wise product replaces np.product, which was removed in NumPy 2.0.
    num = np.sum(_a * _b)
    dnum = np.sqrt(np.dot(_a, _a)) * np.sqrt(np.dot(_b, _b))

    # Undefined similarity: return NaN explicitly instead of dividing 0 by 0
    # and emitting a RuntimeWarning.
    if dnum == 0:
        return np.nan

    return round(num / dnum, 4)

## Part 1: Find User Similarity

In [59]:
# Pairwise similarity between every pair of users (rows of user_item);
# each user's vector is mean-centred inside findSimilarity.
mean_adjusted = True
userlen = user_item.shape[0]
sim = np.zeros((userlen, userlen))

user_vectors = user_item.values
for i, j in np.ndindex(sim.shape):
    a, b = user_vectors[i], user_vectors[j]
    sim[i, j] = findSimilarity(a, b, mean_adjusted)

In [60]:
# Keep the user-user matrix under its own name, then print it with
# 4 decimals of precision.
user_user = sim
np.set_printoptions(precision=4)
print(user_user)

[[ 1.      0.7007  0.8944 -0.8992 -0.822 ]
 [ 0.7007  1.      0.9385 -0.7171 -0.8987]
 [ 0.8944  0.9385  1.     -1.     -0.8165]
 [-0.8992 -0.7171 -1.      1.      0.8729]
 [-0.822  -0.8987 -0.8165  0.8729  1.    ]]


In [61]:
# Label the similarity matrix with user ids, then melt it into long form:
# one row per (level_0, level_1) user pair with its score in `dist`.
user_list = list(user_item.index)
user_user = pd.DataFrame(user_user, index=user_list, columns=user_list)
user_stacked = user_user.stack().rename('dist').reset_index()
user_stacked.head(3)

Unnamed: 0,level_0,level_1,dist
0,user1,user1,1.0
1,user1,user2,0.7007
2,user1,user3,0.8944


In [21]:
# Rank each user's neighbours by descending similarity. A two-key sort gives
# the same per-user ordering as groupby().apply(sort_values) without relying
# on the grouping column surviving inside apply, which newer pandas drops.
user_sorted = user_stacked.sort_values(['level_0', 'dist'], ascending=[True, False])
user_sorted = user_sorted.reset_index(drop=True)

# Remove self-pairs by identity rather than by score: filtering on
# `dist != 1.0` would also discard a genuine neighbour whose similarity is
# exactly 1.0 and would keep a self-pair whose similarity is NaN.
user_clipped = user_sorted[user_sorted['level_0'] != user_sorted['level_1']]
user_clipped = user_clipped.reset_index(drop=True)
user_clipped.head(3)

Unnamed: 0,level_0,level_1,dist
0,user1,user3,0.8944
1,user1,user2,0.7007
2,user1,user5,-0.822


#### User Similarity Dictionary

In [22]:
# Map every user to a list of (other_user, similarity) tuples, appended in
# the order the rows appear in user_clipped.
pearsonDict = {user: [] for user in user_list}

keys = user_clipped['level_0'].to_numpy()
vals = user_clipped['level_1'].to_numpy()
dist = user_clipped['dist'].to_numpy()

for owner, neighbour, score in zip(keys, vals, dist):
    if owner not in pearsonDict:
        print('The user is not in the record')
        continue
    pearsonDict[owner].append((neighbour, round(score, 4)))
        

In [23]:
def userSimilarity_pearson(euclideanDict, search_item):
    """Look up the stored neighbour list for ``search_item``.

    Parameters
    ----------
    euclideanDict : dict
        Mapping from an id to its list of ``(neighbour_id, similarity)``
        tuples. The parameter name is kept for backward compatibility; this
        notebook passes the Pearson-based dictionary here.
    search_item : hashable
        Id whose neighbours are requested.

    Returns
    -------
    list or None
        The stored list for ``search_item``; ``None`` (after printing a
        message) when the id is not a key of the dictionary.
    """
    # Direct dict membership avoids rebuilding a list of keys on every call.
    if search_item in euclideanDict:
        return euclideanDict[search_item]

    print('The user is not in the record')
    return None

In [24]:
# Neighbours of user3 with their similarity scores, as stored in pearsonDict.
user_weight = userSimilarity_pearson(euclideanDict=pearsonDict, search_item='user3')
user_weight

[('user2', 0.9385), ('user1', 0.8944), ('user5', -0.8165), ('user4', -1.0)]

In [25]:
#del user_user, user_stacked, user_sorted

----------------------------------------------------

## Part 2: Find Item Similarity

In [51]:
# Mean-centre each user's row (NaN entries are ignored when taking the mean)
# so the item vectors built from it yield an adjusted cosine.
user_means = user_item.mean(axis=1)
adjusted_user_item = user_item.sub(user_means, axis=0)

In [52]:
# user_item.T.apply(lambda x : x-np.nanmean(x), axis=1)

In [57]:
# Pairwise cosine between item vectors (rows of the transposed, user-centred
# matrix). No further centring is requested from findSimilarity because
# adjusted_user_item already has each user's mean removed.
item_user = adjusted_user_item.T
itemlen = item_user.shape[0]
mean_adjusted = False
sim = np.zeros((itemlen, itemlen))

item_vectors = item_user.values
for i, j in np.ndindex(sim.shape):
    a, b = item_vectors[i], item_vectors[j]
    sim[i, j] = findSimilarity(a, b, mean_adjusted)

In [58]:
# Keep the item-item matrix under its own name, then print it with
# 4 decimals of precision.
item_item = sim
np.set_printoptions(precision=4)
print(item_item)

[[ 1.      0.7351  0.9117 -0.8483 -0.8125 -0.9896]
 [ 0.7351  1.      0.8729 -0.7339 -0.996  -0.6223]
 [ 0.9117  0.8729  1.     -0.8819 -0.8944 -0.9117]
 [-0.8483 -0.7339 -0.8819  1.      0.7057  0.829 ]
 [-0.8125 -0.996  -0.8944  0.7057  1.      0.7303]
 [-0.9896 -0.6223 -0.9117  0.829   0.7303  1.    ]]


In [28]:
# Label the similarity matrix with item ids, then melt it into long form:
# one row per (level_0, level_1) item pair with its score in `dist`.
item_list = list(user_item.columns)
item_item = pd.DataFrame(item_item, index=item_list, columns=item_list)
item_stacked = item_item.stack().rename('dist').reset_index()
item_stacked.head(3)

Unnamed: 0,level_0,level_1,dist
0,item1,item1,1.0
1,item1,item2,0.9395
2,item1,item3,0.9462


In [29]:
# Rank each item's neighbours by descending similarity. A two-key sort gives
# the same per-item ordering as groupby().apply(sort_values) without relying
# on the grouping column surviving inside apply, which newer pandas drops.
item_sorted = item_stacked.sort_values(['level_0', 'dist'], ascending=[True, False])
item_sorted = item_sorted.reset_index(drop=True)

# Remove self-pairs by identity rather than by score: filtering on
# `dist != 1.0` would also discard a genuine neighbour whose similarity is
# exactly 1.0 and would keep a self-pair whose similarity is NaN.
item_sorted = item_sorted[item_sorted['level_0'] != item_sorted['level_1']]
item_sorted.head(3)

Unnamed: 0,level_0,level_1,dist
1,item1,item3,0.9462
2,item1,item2,0.9395
3,item1,item4,0.7885


#### Item Similarity Dictionary

In [30]:
# Map every item to a list of (other_item, similarity) tuples, appended in
# the order the rows appear in item_sorted.
cosineDict = {item: [] for item in item_list}

keys = item_sorted['level_0'].to_numpy()
vals = item_sorted['level_1'].to_numpy()
dist = item_sorted['dist'].to_numpy()

for owner, neighbour, score in zip(keys, vals, dist):
    if owner not in cosineDict:
        print('Cosine Measure : The item is not in the list')
        continue
    cosineDict[owner].append((neighbour, round(score, 4)))

In [31]:
def userSimilarity_cosine(cosineDict, search_item):
    """Look up the stored neighbour list for ``search_item``.

    Parameters
    ----------
    cosineDict : dict
        Mapping from an id to its list of ``(neighbour_id, similarity)``
        tuples.
    search_item : hashable
        Id whose neighbours are requested.

    Returns
    -------
    list or None
        The stored list for ``search_item``; ``None`` (after printing a
        message) when the id is not a key of the dictionary.
    """
    # Direct dict membership avoids rebuilding a list of keys on every call.
    if search_item in cosineDict:
        return cosineDict[search_item]

    print('Cosine Measure : The item is not in the record')
    return None

In [32]:
# Neighbours of item2 with their similarity scores, as stored in cosineDict.
item_weight = userSimilarity_cosine(cosineDict=cosineDict, search_item='item2')
item_weight

[('item1', 0.9395),
 ('item3', 0.7027),
 ('item4', 0.684),
 ('item5', 0.5145),
 ('item6', 0.2255)]

In [33]:
#del item_item, item_stacked, item_sorted