# Recommender System : Collaborative Filtering

## Neighborhood-based Filtering
## Synthetic Data
### Utility Matrix : Ordinal Ratings 

#### Part 1 : User-User Filtering

1. Euclidean Distance
2. Pearson Correlation Measure

#### Part 2 : Item Recommendation

1. Using Rating Thresholds
2. Using Distance Weights

#### Reference : Programming Collective Intelligence, Toby Segaran

In [1]:
import numpy as np
import pandas as pd
np.set_printoptions(precision=2)

In [2]:
import os, sys

## Data

In [3]:
# Raw ratings keyed by critic, then by movie title.
# Insertion order is kept as-is because later cells flatten this mapping
# row by row.
critics_dict = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'Lulu and the River': 1.0,
        'You, Me and Dupree': 3.5,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Lulu and the River': 2.0,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    'Toby Segaran': {
        'Snakes on a Plane': 4.5,
        'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0,
    },
}

In [4]:
# Critic names, in the insertion order of critics_dict
critic_list = [*critics_dict]
critic_list

['Lisa Rose',
 'Gene Seymour',
 'Michael Phillips',
 'Claudia Puig',
 'Mick LaSalle',
 'Jack Matthews',
 'Toby Segaran']

In [5]:
# Titles rated by each of the two critics being compared
s1 = set(critics_dict['Lisa Rose'])
s2 = set(critics_dict['Gene Seymour'])

In [6]:
# Movies rated by both critics
s1 & s2

{'Just My Luck',
 'Lady in the Water',
 'Snakes on a Plane',
 'Superman Returns',
 'The Night Listener',
 'You, Me and Dupree'}

### Convert Data from Dictionary to DataFrame

In [7]:
# Flatten the nested dict into one row per "<critic>_<movie>" key,
# with the rating as the single value column.
critics_data = (
    pd.json_normalize(critics_dict, sep='_')
    .T
    .reset_index()
    .rename(columns={'index': 'user_item', 0: 'rating'})
)

In [8]:
# Split the flattened "<critic>_<movie>" key into its two parts.
# The column is addressed by label rather than by position (integer
# positional access on a labelled row is deprecated in pandas), and the
# split happens only at the first separator so a title that itself
# contains '_' stays intact.
name_parts = critics_data['user_item'].str.split('_', n=1, expand=True)
critics_data['critics'] = name_parts[0]
critics_data['movies'] = name_parts[1]

# Keep only the tidy (critic, movie, rating) columns
critics_data = critics_data[['critics', 'movies', 'rating']]
critics_data.head()

Unnamed: 0,critics,movies,rating
0,Lisa Rose,Lady in the Water,2.5
1,Lisa Rose,Snakes on a Plane,3.5
2,Lisa Rose,Just My Luck,3.0
3,Lisa Rose,Superman Returns,3.5
4,Lisa Rose,"You, Me and Dupree",2.5


### Re-shape Data

In [9]:
# Long -> wide: one row per critic, one column per movie; pairs without a
# rating become NaN.
user_item = critics_data.pivot(
    index='critics',
    columns='movies',
    values='rating',
)

#### Utility Matrix : Fill NaN Values with 0

In [10]:
# A rating of 0 stands for "not rated"; user_item itself is left untouched
utility = user_item.fillna(0)
utility

movies,Just My Luck,Lady in the Water,Lulu and the River,Snakes on a Plane,Superman Returns,The Night Listener,"You, Me and Dupree"
critics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Claudia Puig,3.0,0.0,0.0,3.5,4.0,4.5,2.5
Gene Seymour,1.5,3.0,1.0,3.5,5.0,3.0,3.5
Jack Matthews,0.0,3.0,0.0,4.0,5.0,3.0,3.5
Lisa Rose,3.0,2.5,0.0,3.5,3.5,3.0,2.5
Michael Phillips,0.0,2.5,2.0,3.0,3.5,4.0,0.0
Mick LaSalle,2.0,3.0,0.0,4.0,3.0,3.0,2.0
Toby Segaran,0.0,0.0,0.0,4.5,4.0,0.0,1.0


#### User List

In [11]:
# Row labels of the utility matrix (critic names) as a NumPy array
user_list = np.asarray(utility.index)
print(user_list)

['Claudia Puig' 'Gene Seymour' 'Jack Matthews' 'Lisa Rose'
 'Michael Phillips' 'Mick LaSalle' 'Toby Segaran']


----------------------------------------------------------------------

## Part 1 : User Similarity Measure
### Method 1 : Pairwise Euclidean Distance among Users

In [12]:
from sklearn.metrics import pairwise_distances

# Pairwise Euclidean distance between the rows (critics) of the utility matrix.
# The matrix has already been zero-filled, so the default finiteness
# validation is sufficient; the former `force_all_finite` keyword is
# deprecated in recent scikit-learn releases and is no longer passed.
euclidean_dist = pairwise_distances(utility.values, utility.values, metric='euclidean')

In [13]:
# Display the square distance matrix; rows/columns follow the row order of `utility`
euclidean_dist

array([[0.  , 4.06, 4.74, 2.96, 5.12, 3.71, 5.7 ],
       [4.06, 0.  , 1.87, 2.6 , 4.39, 2.78, 5.43],
       [4.74, 1.87, 0.  , 3.57, 4.56, 3.2 , 5.05],
       [2.96, 2.6 , 3.57, 0.  , 4.53, 1.41, 5.27],
       [5.12, 4.39, 4.56, 4.53, 0.  , 3.81, 5.45],
       [3.71, 2.78, 3.2 , 1.41, 3.81, 0.  , 4.92],
       [5.7 , 5.43, 5.05, 5.27, 5.45, 4.92, 0.  ]])

#### Re-scale Pairwise Distances into Similarity Scores between 0 and 1

0 = Dissimilar
1 = Most Similar

In [14]:
np.set_printoptions(precision=4)

# Map distance d to similarity 1 / (1 + d): d = 0 gives 1, and larger
# distances approach 0.
rescaled_dist = np.reciprocal(1.0 + euclidean_dist)
rescaled_dist

array([[1.    , 0.1975, 0.1741, 0.2527, 0.1633, 0.2124, 0.1492],
       [0.1975, 1.    , 0.3483, 0.2779, 0.1856, 0.2643, 0.1555],
       [0.1741, 0.3483, 1.    , 0.2188, 0.18  , 0.238 , 0.1653],
       [0.2527, 0.2779, 0.2188, 1.    , 0.1809, 0.4142, 0.1595],
       [0.1633, 0.1856, 0.18  , 0.1809, 1.    , 0.208 , 0.1549],
       [0.2124, 0.2643, 0.238 , 0.4142, 0.208 , 1.    , 0.1688],
       [0.1492, 0.1555, 0.1653, 0.1595, 0.1549, 0.1688, 1.    ]])

In [15]:
# Label the similarity matrix with critic names on both axes
user_user = pd.DataFrame(
    data=rescaled_dist,
    index=user_list,
    columns=user_list,
)
user_user.head()

Unnamed: 0,Claudia Puig,Gene Seymour,Jack Matthews,Lisa Rose,Michael Phillips,Mick LaSalle,Toby Segaran
Claudia Puig,1.0,0.19755,0.174112,0.25265,0.163306,0.2124,0.149234
Gene Seymour,0.19755,1.0,0.348331,0.277926,0.185615,0.264279,0.155487
Jack Matthews,0.174112,0.348331,1.0,0.218784,0.180011,0.238007,0.165296
Lisa Rose,0.25265,0.277926,0.218784,1.0,0.180907,0.414214,0.159545
Michael Phillips,0.163306,0.185615,0.180011,0.180907,1.0,0.207992,0.154934


#### Convert Results into Stacked Form

In [16]:
# Wide -> long: one row per (level_0, level_1) user pair with its score in 'dist'
user_stacked = (
    user_user
    .stack()
    .reset_index()
    .rename(columns={0: 'dist'})
)
user_stacked.head()

Unnamed: 0,level_0,level_1,dist
0,Claudia Puig,Claudia Puig,1.0
1,Claudia Puig,Gene Seymour,0.19755
2,Claudia Puig,Jack Matthews,0.174112
3,Claudia Puig,Lisa Rose,0.25265
4,Claudia Puig,Michael Phillips,0.163306


#### Sort rows by distance

In [17]:
# Order rows by user (ascending) and, within each user, by similarity
# (descending). A two-key sort yields the same ordering as the former
# groupby/apply, without relying on groupby.apply keeping the grouping
# column in the result (that behaviour is deprecated in pandas).
user_sorted = user_stacked.sort_values(['level_0', 'dist'], ascending=[True, False])
user_sorted = user_sorted.reset_index(drop=True)

#### Remove Distances between a User and Itself

In [18]:
# Drop the self-pairs by comparing the two user labels. Filtering on the
# score being ~1 would also discard two *different* users whose ratings are
# identical (distance 0 -> similarity 1).
is_self_pair = user_sorted['level_0'] == user_sorted['level_1']
user_clipped = user_sorted[~is_self_pair]
user_clipped = user_clipped.reset_index(drop=True)
user_clipped.head(5)

Unnamed: 0,level_0,level_1,dist
0,Claudia Puig,Lisa Rose,0.25265
1,Claudia Puig,Mick LaSalle,0.2124
2,Claudia Puig,Gene Seymour,0.19755
3,Claudia Puig,Jack Matthews,0.174112
4,Claudia Puig,Michael Phillips,0.163306


#### Create User Similarity Dictionary to Help Searching

In [19]:
# user -> list of (other user, similarity) tuples. The list order follows
# the row order of user_clipped.
euclideanDict = {key: [] for key in user_list}

keys = user_clipped['level_0'].to_numpy()
vals = user_clipped['level_1'].to_numpy()
dist = user_clipped['dist'].to_numpy()

# Walk the three columns in lockstep; membership is tested on the dict
# itself instead of rebuilding a list of its keys on every iteration.
for user, neighbour, similarity in zip(keys, vals, dist):
    if user in euclideanDict:
        euclideanDict[user].append((neighbour, round(similarity, 6)))
    else:
        print('The user is not in the record')
        

In [20]:
def userSimilarity_euclidean(euclideanDict, search_item):
    """
    Look up the neighbours of a user in the Euclidean similarity dictionary.

    inputs
    euclideanDict : The userSimilarity dictionary (user -> list of neighbours)
    search_item   : The key (user) to search for in the dictionary

    outputs
    theValue      : The complete list stored under the key. If the key is
                    missing, a message is printed and None is returned.
    """
    # Direct dict membership test: no temporary list of keys is built
    if search_item not in euclideanDict:
        print('The user is not in the record')
        return None

    # Return the entire list; slice it at the call site for a top-N view
    return euclideanDict[search_item]

In [21]:
# Neighbours of one critic according to the Euclidean-based similarity
userA = 'Lisa Rose'
user_weight = userSimilarity_euclidean(euclideanDict, search_item=userA)
user_weight

[('Mick LaSalle', 0.414214),
 ('Gene Seymour', 0.277926),
 ('Claudia Puig', 0.25265),
 ('Jack Matthews', 0.218784),
 ('Michael Phillips', 0.180907),
 ('Toby Segaran', 0.159545)]

In [22]:
"""Clean Workspace"""
# Remove the intermediate Euclidean frames; these names are re-created in
# the Pearson section. user_clipped is not deleted here.
del user_user, user_stacked, user_sorted

----------------------------------------------------

## Part 1 : User Similarity Measure
### Method 2 : Pearson Correlation among Critics

In [23]:
# Row-wise Pearson correlation: every row of the utility matrix (one critic)
# is treated as a variable. The zero-filled "not rated" cells take part in
# the computation like any other value.
pearson_dist = np.corrcoef(utility.to_numpy())
pearson_dist

array([[1.    , 0.5294, 0.4729, 0.7575, 0.2853, 0.5739, 0.4757],
       [0.5294, 1.    , 0.9738, 0.7023, 0.4343, 0.7071, 0.7075],
       [0.4729, 0.9738, 1.    , 0.6576, 0.505 , 0.7513, 0.7025],
       [0.7575, 0.7023, 0.6576, 1.    , 0.2099, 0.9006, 0.5369],
       [0.2853, 0.4343, 0.505 , 0.2099, 1.    , 0.4561, 0.3693],
       [0.5739, 0.7071, 0.7513, 0.9006, 0.4561, 1.    , 0.5804],
       [0.4757, 0.7075, 0.7025, 0.5369, 0.3693, 0.5804, 1.    ]])

In [24]:
# Label the correlation matrix with critic names on both axes
user_user = pd.DataFrame(
    data=pearson_dist,
    index=user_list,
    columns=user_list,
)
user_user.head()

Unnamed: 0,Claudia Puig,Gene Seymour,Jack Matthews,Lisa Rose,Michael Phillips,Mick LaSalle,Toby Segaran
Claudia Puig,1.0,0.529426,0.472908,0.757476,0.285299,0.573944,0.475731
Gene Seymour,0.529426,1.0,0.973789,0.70226,0.43434,0.707141,0.707452
Jack Matthews,0.472908,0.973789,1.0,0.657584,0.504987,0.75126,0.702479
Lisa Rose,0.757476,0.70226,0.657584,1.0,0.209933,0.900552,0.536855
Michael Phillips,0.285299,0.43434,0.504987,0.209933,1.0,0.456148,0.369321


#### Convert Results into Stacked Form

In [25]:
# Wide -> long: one row per (level_0, level_1) user pair with its score in 'dist'
user_stacked = (
    user_user
    .stack()
    .reset_index()
    .rename(columns={0: 'dist'})
)
user_stacked.head()

Unnamed: 0,level_0,level_1,dist
0,Claudia Puig,Claudia Puig,1.0
1,Claudia Puig,Gene Seymour,0.529426
2,Claudia Puig,Jack Matthews,0.472908
3,Claudia Puig,Lisa Rose,0.757476
4,Claudia Puig,Michael Phillips,0.285299


#### Sort rows by distance 

In [26]:
# Order rows by user (ascending) and, within each user, by correlation
# (descending). A two-key sort yields the same ordering as the former
# groupby/apply, without relying on groupby.apply keeping the grouping
# column in the result (that behaviour is deprecated in pandas).
user_sorted = user_stacked.sort_values(['level_0', 'dist'], ascending=[True, False])
user_sorted = user_sorted.reset_index(drop=True)

#### Remove Distances between a User and Itself

In [27]:
# Drop the self-pairs by comparing the two user labels. Filtering on the
# score being ~1 would also discard two *different* users that are perfectly
# correlated. Undefined (NaN) correlations are dropped as well, which the
# previous numeric filter did implicitly.
is_self_pair = user_sorted['level_0'] == user_sorted['level_1']
user_clipped = user_sorted[~is_self_pair & user_sorted['dist'].notna()]
user_clipped = user_clipped.reset_index(drop=True)
user_clipped.head(5)

Unnamed: 0,level_0,level_1,dist
0,Claudia Puig,Lisa Rose,0.757476
1,Claudia Puig,Mick LaSalle,0.573944
2,Claudia Puig,Gene Seymour,0.529426
3,Claudia Puig,Toby Segaran,0.475731
4,Claudia Puig,Jack Matthews,0.472908


#### Create User Similarity Dictionary to Help Searching
For a given test user, this dictionary stores the user's similarity to every other user.

In [28]:
# user -> list of (other user, correlation) tuples. The list order follows
# the row order of user_clipped.
pearsonDict = {key: [] for key in user_list}

keys = user_clipped['level_0'].to_numpy()
vals = user_clipped['level_1'].to_numpy()
dist = user_clipped['dist'].to_numpy()

# Walk the three columns in lockstep; membership is tested on the dict
# itself instead of rebuilding a list of its keys on every iteration.
for user, neighbour, correlation in zip(keys, vals, dist):
    if user in pearsonDict:
        pearsonDict[user].append((neighbour, round(correlation, 4)))
    else:
        print('Pearson Correlation : The item is not in the list')

In [29]:
def userSimilarity_pearson(pearsonDict, search_item):
    """
    Look up the neighbours of a user in the Pearson similarity dictionary.

    inputs
    search_item  : The key to search for in the userSimilarity dictionary
    pearsonDict  : The userSimilarity dictionary

    outputs
    theValue     : A list, the value corresponding to the key. If the key is
                   missing, a message is printed and None is returned.
    """
    # Direct dict membership test: no temporary list of keys is built
    if search_item not in pearsonDict:
        print('Pearson Correlation : The item is not in the record')
        return None

    # Return the entire list; slice it at the call site for a top-N view
    return pearsonDict[search_item]

In [30]:
# Neighbours of one critic according to the Pearson correlation
userA = 'Lisa Rose'
user_weight = userSimilarity_pearson(pearsonDict, search_item=userA)
user_weight

[('Mick LaSalle', 0.9006),
 ('Claudia Puig', 0.7575),
 ('Gene Seymour', 0.7023),
 ('Jack Matthews', 0.6576),
 ('Toby Segaran', 0.5369),
 ('Michael Phillips', 0.2099)]

In [31]:
"""Clean Workspace"""
# Remove all intermediate frames of the Pearson section, including user_clipped
del user_user, user_stacked, user_sorted, user_clipped

-----------------------------------------------------------

### Test : User Similarity Dictionary

In [32]:
# Spot-check the Euclidean dictionary for one critic
userA = 'Toby Segaran'
userSimilarity_euclidean(euclideanDict, search_item=userA)

[('Mick LaSalle', 0.168793),
 ('Jack Matthews', 0.165296),
 ('Lisa Rose', 0.159545),
 ('Gene Seymour', 0.155487),
 ('Michael Phillips', 0.154934),
 ('Claudia Puig', 0.149234)]

In [33]:
# Spot-check the Pearson dictionary for one critic
userA = 'Lisa Rose'
userSimilarity_pearson(pearsonDict, search_item=userA)

[('Mick LaSalle', 0.9006),
 ('Claudia Puig', 0.7575),
 ('Gene Seymour', 0.7023),
 ('Jack Matthews', 0.6576),
 ('Toby Segaran', 0.5369),
 ('Michael Phillips', 0.2099)]

------------------------------------------------------------------

## Part 2 : Recommend Items

#### Steps

1. Choose a test user 

2a. Select the row of the rating matrix corresponding to the test user

2b. Select columns (i.e. movies) with cell value = 0.0. These are the movies that the test user has not seen

3. Use the test user as the 'key' to find its 'value' in the userSimilarity dictionary. The value is a list of (other user name, similarity_distance) tuples, sorted by similarity_distance

4. Get other users and their similarity distances from the sorted list in 3


5a. Select rows from the rating matrix corresponding to the user names. 
The selection of rows depends on similarity_distance cutoffs. For example, for the closest 
3 users the new reduced rating matrix will have 3 rows; for the closest 10 users the new 
reduced rating matrix will have 10 rows, etc. This cutoff is used when creating the 
dictionary.

5b. Select columns from the rating matrix using 2a, 2b

6. Create a new dataframe using 5a,5b

7. Find max rating for each of the column (movie) as a series

8. Sort the series in descending order of ratings

9. Apply threshold to select movies with ratings >= threshold

10a. If there are no movies above the threshold, return None

10b. If there are movies above the threshold, return recommended movies and their ratings 
as a dictionary

### Method 1 : Recommend Movies using Rating Thresholds

#### Recommend Movies with Ratings Higher than 3.0

In [34]:
def recommendMovie(data, distanceDict, similarUser, testUser, threshold):
    """
    Recommend unseen movies whose best rating among similar users reaches a threshold.

    inputs
    data            : Utility matrix (users as row labels, movies as columns);
                      a cell value of 0 is read as "not watched"
    distanceDict    : Dictionary containing Euclidean or Pearson distances among users
    similarUser     : Function to find a set of users similar to the test user.
                      Called as similarUser(distanceDict, testUser); its result is
                      read as a sequence of (user name, similarity) items
    testUser        : The user to whom movies should be recommended
    threshold       : Minimum rating a movie must reach to be recommended

    outputs
    recommend_movies : dict {movie: highest rating given by the similar users},
                       ordered by rating (descending), restricted to movies not
                       watched by the test user with rating >= threshold.
                       None when no recommendation can be made.
    """

    # Movies not watched by the test user (rating 0 in the test user's row)
    test_user_ratings = data.loc[testUser, :]
    not_watched_by_test_user = test_user_ratings[test_user_ratings == 0].index.to_numpy()

    # If the test user has watched every movie there is nothing to recommend
    if len(not_watched_by_test_user) == 0:
        print(f'The test user : {testUser} : has watched all movies')
        print('No recommendation can be provided')
        return None

    # Pre-computed similarity dictionary: neighbours of the test user
    user_test_dist = similarUser(distanceDict, testUser)

    # The lookup yields None for an unknown user; without neighbours no
    # rating can be borrowed.
    if user_test_dist is None or len(user_test_dist) == 0:
        print(f'No similar users found for the test user : {testUser}')
        return None

    # Only the neighbour names matter here; the similarity values are not
    # used by the threshold method.
    user_name = [item[0] for item in user_test_dist]

    # Ratings given by the neighbours to the movies the test user has not watched
    theData = data.loc[user_name, not_watched_by_test_user]

    # Highest neighbour rating per movie, in descending order
    sorted_rating = theData.max(axis=0).sort_values(ascending=False)

    # Keep movies with rating >= threshold
    selected_movies = sorted_rating[sorted_rating.values >= threshold]

    if len(selected_movies) == 0:
        print(f"Test user : {testUser}")
        print(f"No recommendation above the chosen threshold {threshold} can be provided")
        return None

    # Return the recommended movies as a dictionary
    return selected_movies.to_dict()

In [35]:
# Candidate test users for the threshold-based recommender
test_list = [
    'Claudia Puig',
    'Mick LaSalle',
    'Lisa Rose',
    'Michael Phillips',
    'Gene Seymour',
    'Toby Segaran',
]

test_user = test_list[1]
threshold = 3
recommendMovie(utility, pearsonDict, userSimilarity_pearson, test_user, threshold)

Test user : Mick LaSalle
No recommendation above the chosen threshold 3 can be provided


## Part 2 : Recommend Items

### Method 2  : Recommend Movies based on Weighted Ratings 

#### Steps

1. Choose a test user 

2a. Select the row of the rating matrix corresponding to the test user

2b. For that row, select columns (i.e. movies) with cell value = 0.0. These are the movies 
that the test user has not seen

3. Use the test user as the 'key' to find its 'value' in the userSimilarity dictionary. The value is a list of (other user name, similarity_distance) tuples, sorted by similarity_distance

4. Get other users and their similarity distances from the sorted list in 3

5a. Select rows from the rating matrix corresponding to other users. 
The selection of rows depends on similarity_distance cutoffs. For example, for the closest 
3 users the new reduced rating matrix will have 3 rows; for the closest 10 users the 
new reduced rating matrix will have 10 rows, etc. This cutoff is used when creating the dictionary.

5b. Select columns from the rating matrix using 2a, 2b

6. Create a new dataframe using 5a,5b

7. Add a new column to the dataframe from step 6 and name it 'weights'. The values of this column are the user similarity distances from step 4

8. Calculate weighted ratings of movies not watched by the test user. This is known as the 'neighborhood based' rating prediction of movies 

9. Sort weighted ratings

10. Return movie and rating as a dictionary

In [36]:
def recommendMovie(data, distanceDict, similarUser, testUser, totalRecommend):
    """
    Recommend unseen movies ranked by similarity-weighted neighbour ratings.

    NOTE: this definition reuses the name of the threshold-based
    recommendMovie defined earlier in the file and replaces it once executed.

    inputs
    data            : Rating matrix (users as row labels, movies as columns);
                      a cell value of 0 is read as "not watched"
    distanceDict    : Dictionary containing Euclidean or Pearson distances among users
    similarUser     : Function to find a set of users similar to the test user.
                      Called as similarUser(distanceDict, testUser); its result is
                      read as a sequence of (user name, similarity) items
    testUser        : The user to whom movies should be recommended.
                      It is the label of the index, not the position
    totalRecommend  : Maximum number of movies to return

    outputs
    recommend_movies : dict {movie: weighted rating rounded to 2 decimals},
                       ordered by weighted rating (descending), for movies not
                       watched by the test user. None when no recommendation
                       can be made.
    """

    # Movies not watched by the test user (rating 0 in the test user's row)
    test_user_ratings = data.loc[testUser, :]
    not_watched_list = test_user_ratings[test_user_ratings == 0].index.to_numpy()

    # If the test user has watched every movie there is nothing to recommend
    if len(not_watched_list) == 0:
        print(f'The test user : {testUser} : has watched all movies')
        print('No recommendation can be provided')
        return None

    # Pre-computed similarity dictionary: neighbours of the test user
    user_test_dist = similarUser(distanceDict, testUser)

    # The lookup yields None for an unknown user; without neighbours no
    # weighted rating can be formed.
    if user_test_dist is None or len(user_test_dist) == 0:
        print(f'No similar users found for the test user : {testUser}')
        return None

    user_name = [item[0] for item in user_test_dist]
    user_dist = np.array([item[1] for item in user_test_dist], dtype=float)

    # Ratings given by the neighbours to the movies the test user has not watched
    theData = data.loc[user_name, not_watched_list]

    # Weighted rating per movie: sum(similarity * rating) / sum(similarity).
    # The weights are applied row-wise without adding a helper column to the
    # rating frame, so a movie that happens to be called 'weights' is safe.
    # A neighbour's 0 ("not watched") enters the sum as a rating of 0, as before.
    weighted_rating = theData.mul(user_dist, axis=0).sum(axis=0, skipna=False)
    weighted_rating = weighted_rating / user_dist.sum()

    # Sort in descending order, then prettify the rating values
    sorted_weighted_rating = weighted_rating.sort_values(ascending=False).round(2)

    # Limit the number of recommended movies
    sorted_weighted_rating = sorted_weighted_rating.iloc[:totalRecommend]

    # Return the recommended movies as a dictionary
    return sorted_weighted_rating.to_dict()

In [37]:
# Candidate test users for the weighted-rating recommender
test_list = [
    'Claudia Puig',
    'Mick LaSalle',
    'Lisa Rose',
    'Michael Phillips',
    'Gene Seymour',
    'Toby Segaran',
]
test_user = test_list[1]

# Ask for at most five recommendations
recommendMovie(utility, pearsonDict, userSimilarity_pearson, test_user, 5)

{'Lulu and the River': 0.41}