# Recommender System : Collaborative Filtering

## Neighborhood-based Filtering
## Synthetic Data
### Utility Matirx : Binary Ratings

Item-Item Filtering Recommends a List of Items to 'Item A' using

1. Cosine Distance
2. Dot Product

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
np.set_printoptions(precision=2)

In [2]:
import os, sys

In [3]:
print( os.curdir )
print( os.getcwd() )
print( os.path.abspath( os.getcwd() ) )

.
/Users/nururrahman/myRecSystem
/Users/nururrahman/myRecSystem


## Data

In [4]:
userid = ['u1','u1','u1','u1','u1',
          'u2','u2','u2','u2',
          'u3','u3','u3','u3',
          'u4','u4','u4','u4',
          'u5','u5','u5','u5']
itemid = ['rice','flour','garlic','sugar','oil',
          'rice','lentil','salt','oil',
          'flour','onion','sugar','salt',
          'lentil','onion','garlic','oil',
          'lentil','flour','onion','sugar']
transaction = pd.DataFrame({'userid':userid, 'itemid':itemid})
transaction.head(3)

Unnamed: 0,userid,itemid
0,u1,rice
1,u1,flour
2,u1,garlic


In [5]:
trans_count = transaction.groupby(['userid', 'itemid']).size().reset_index(drop=False).rename(columns={0:'count'})
trans_count.head(3)

Unnamed: 0,userid,itemid,count
0,u1,flour,1
1,u1,garlic,1
2,u1,oil,1


#### User-Item Matrix

In [6]:
user_item = trans_count.pivot(index='userid', columns='itemid', values='count')
user_item.columns.name = ''
user_item.index.name = ''
user_item

Unnamed: 0,flour,garlic,lentil,oil,onion,rice,salt,sugar
,,,,,,,,
u1,1.0,1.0,,1.0,,1.0,,1.0
u2,,,1.0,1.0,,1.0,1.0,
u3,1.0,,,,1.0,,1.0,1.0
u4,,1.0,1.0,1.0,1.0,,,
u5,1.0,,1.0,,1.0,,,1.0


#### Transpose : Item-User Matrix

In [7]:
item_user = user_item.T
item_user.head(3)

Unnamed: 0,u1,u2,u3,u4,u5
,,,,,
flour,1.0,,1.0,,1.0
garlic,1.0,,,1.0,
lentil,,1.0,,1.0,1.0


## Part 1
## Method 1 : Item- Item Cosine Distance

#### Utility Matrix : Fill NaN values with 0

In [8]:
utility = item_user.fillna(value=0, inplace=False)
utility = utility.astype('int')
utility

Unnamed: 0,u1,u2,u3,u4,u5
,,,,,
flour,1.0,0.0,1.0,0.0,1.0
garlic,1.0,0.0,0.0,1.0,0.0
lentil,0.0,1.0,0.0,1.0,1.0
oil,1.0,1.0,0.0,1.0,0.0
onion,0.0,0.0,1.0,1.0,1.0
rice,1.0,1.0,0.0,0.0,0.0
salt,0.0,1.0,1.0,0.0,0.0
sugar,1.0,0.0,1.0,0.0,1.0


#### Item-Item Cosine Distance

In [9]:
from sklearn.metrics import pairwise_distances
cos_dist = pairwise_distances(utility.values, metric='cosine')

#### Format Results

In [10]:
items = np.unique( np.array(itemid) )
item_item = pd.DataFrame(cos_dist, index=items, columns=items)
item_item.head(3)

Unnamed: 0,flour,garlic,lentil,oil,onion,rice,salt,sugar
flour,0.0,0.591752,0.666667,0.666667,0.333333,0.591752,0.591752,0.0
garlic,0.591752,0.0,0.591752,0.183503,0.591752,0.5,1.0,0.591752
lentil,0.666667,0.591752,0.0,0.333333,0.333333,0.591752,0.591752,0.666667


#### Convert Results into Stacked Form

In [11]:
item_stacked = item_item.stack().reset_index(drop=False).rename(columns={0:'dist'})
item_stacked.head(5)

Unnamed: 0,level_0,level_1,dist
0,flour,flour,0.0
1,flour,garlic,0.591752
2,flour,lentil,0.666667
3,flour,oil,0.666667
4,flour,onion,0.333333


#### Sort rows by distance 

In [12]:
item_sorted = item_stacked.groupby(['level_0']).apply(lambda col: col.sort_values(["dist"], ascending=True))
item_sorted = item_sorted.reset_index(drop=True)
item_sorted.head(5)

Unnamed: 0,level_0,level_1,dist
0,flour,flour,0.0
1,flour,sugar,0.0
2,flour,onion,0.333333
3,flour,garlic,0.591752
4,flour,rice,0.591752


In [13]:
item_sorted.dist.values

array([0.  , 0.  , 0.33, 0.59, 0.59, 0.59, 0.67, 0.67, 0.  , 0.18, 0.5 ,
       0.59, 0.59, 0.59, 0.59, 1.  , 0.  , 0.33, 0.33, 0.59, 0.59, 0.59,
       0.67, 0.67, 0.  , 0.18, 0.18, 0.33, 0.59, 0.67, 0.67, 0.67, 0.  ,
       0.33, 0.33, 0.33, 0.59, 0.59, 0.67, 1.  , 0.  , 0.18, 0.5 , 0.5 ,
       0.59, 0.59, 0.59, 1.  , 0.  , 0.5 , 0.59, 0.59, 0.59, 0.59, 0.59,
       1.  , 0.  , 0.  , 0.33, 0.59, 0.59, 0.59, 0.67, 0.67])

#### Remove distances between an Item with Itself

In [14]:
item_sorted = item_sorted[ item_sorted.dist != 0.0 ]
item_sorted.head(5)

Unnamed: 0,level_0,level_1,dist
2,flour,onion,0.333333
3,flour,garlic,0.591752
4,flour,rice,0.591752
5,flour,salt,0.591752
6,flour,lentil,0.666667


#### Create Item Similarity Dictionary to Help Searching

In [15]:
cosineDict = {key:[] for key in items}

keys = item_sorted['level_0'].to_numpy()
vals = item_sorted['level_1'].to_numpy()

for i in range( len(keys) ):
    if keys[i] in list(cosineDict.keys()):
        cosineDict[keys[i]].append( vals[i] )
    else:
        print('the item is not in the list')
        

In [16]:
def itemSimilarity_cosine(search_item, cosineDict):
    if search_item in list(cosineDict.keys()):
        return cosineDict[search_item][0:5]
    else:
        print('The item is not in the record')

#### Recommend a List of Items to 'Item A'

In [17]:
itemA = 'flour'
itemSimilarity_cosine(itemA, cosineDict)

['onion', 'garlic', 'rice', 'salt', 'lentil']

In [18]:
"""Clean Workspace"""
del item_item, item_stacked, item_sorted

----------------------------------------------------

## Part 2 
## Method 2 : Item - Item  Dot Product

#### Utility Matrix : Fill NaN Values with 0

In [19]:
utility = user_item.fillna(value=0, inplace=False)
utility = utility.astype('int')
utility

Unnamed: 0,flour,garlic,lentil,oil,onion,rice,salt,sugar
,,,,,,,,
u1,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
u2,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
u3,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
u4,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
u5,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


#### Item-Item Dot Product

In [20]:
dot_prod = np.dot(utility.T, utility)
np.fill_diagonal(dot_prod, 99)

#### Format Results

In [21]:
items = np.unique( np.array(itemid) )
item_item = pd.DataFrame(dot_prod, index=items, columns=items)
item_item.head(3)

Unnamed: 0,flour,garlic,lentil,oil,onion,rice,salt,sugar
flour,99,1,1,1,2,1,1,3
garlic,1,99,1,2,1,1,0,1
lentil,1,1,99,2,2,1,1,1


#### Convert Results into Stacked Form

In [22]:
item_stacked = item_item.stack().reset_index(drop=False).rename(columns={0:'dist'})
item_stacked.head(5)

Unnamed: 0,level_0,level_1,dist
0,flour,flour,99
1,flour,garlic,1
2,flour,lentil,1
3,flour,oil,1
4,flour,onion,2


#### Sort rows by distance

In [23]:
item_sorted = item_stacked.groupby(['level_0']).apply(lambda col: col.sort_values(["dist"], ascending=False))
item_sorted = item_sorted.reset_index(drop=True)
item_sorted.head(5)

Unnamed: 0,level_0,level_1,dist
0,flour,flour,99
1,flour,sugar,3
2,flour,onion,2
3,flour,garlic,1
4,flour,lentil,1


In [24]:
item_sorted.dist.values

array([99,  3,  2,  1,  1,  1,  1,  1, 99,  2,  1,  1,  1,  1,  1,  0, 99,
        2,  2,  1,  1,  1,  1,  1, 99,  2,  2,  2,  1,  1,  1,  1, 99,  2,
        2,  2,  1,  1,  1,  0, 99,  2,  1,  1,  1,  1,  1,  0, 99,  1,  1,
        1,  1,  1,  1,  0, 99,  3,  2,  1,  1,  1,  1,  1])

#### Remove distances between an Item with Itself

In [25]:
item_sorted = item_sorted[ item_sorted.dist != 99 ]
item_sorted.head(5)

Unnamed: 0,level_0,level_1,dist
1,flour,sugar,3
2,flour,onion,2
3,flour,garlic,1
4,flour,lentil,1
5,flour,oil,1


#### Create Item Similarity Dictionary to Help Searching

In [26]:
dotprodDict = {key:[] for key in items}

keys = item_sorted['level_0'].to_numpy()
vals = item_sorted['level_1'].to_numpy()

for i in range( len(keys) ):
    if keys[i] in list(dotprodDict.keys()):
        dotprodDict[keys[i]].append( vals[i] )
    else:
        print('the item is not in the list')

In [27]:
def itemSimilarity_dotprod(search_item, dotprodDict):
    if search_item in list(dotprodDict.keys()):
        return dotprodDict[search_item][0:5]
    else:
        print('The item is not in the record')

#### Recommend a List of Items to 'Item A'

In [28]:
itemA = 'flour'
itemSimilarity_dotprod(itemA, dotprodDict)

['sugar', 'onion', 'garlic', 'lentil', 'oil']

In [29]:
"""Clean Workspace"""
del item_item, item_stacked, item_sorted

--------------------------------------------------------------

## Test Recommendations from Two Different Rec Systems

In [30]:
itemA = 'sugar'
print( itemSimilarity_cosine(itemA, cosineDict) )
print( itemSimilarity_dotprod(itemA, dotprodDict) )

['onion', 'garlic', 'rice', 'salt', 'lentil']
['flour', 'onion', 'garlic', 'lentil', 'oil']


In [31]:
itemA = 'onion'
print( itemSimilarity_cosine(itemA, cosineDict) )
print( itemSimilarity_dotprod(itemA, dotprodDict) )

['flour', 'lentil', 'sugar', 'garlic', 'salt']
['flour', 'lentil', 'sugar', 'garlic', 'oil']


In [32]:
itemA = 'lentil'
print( itemSimilarity_cosine(itemA, cosineDict) )
print( itemSimilarity_dotprod(itemA, dotprodDict) )

['oil', 'onion', 'garlic', 'rice', 'salt']
['oil', 'onion', 'flour', 'garlic', 'rice']
