# Collaborative user-user filtering

Imports

In [1]:
import numpy as np
import pandas as pd

The original file contains
- 1,019,318 unique users
- 48,373,586 user-song.play count triplets

A subset of 1000 triplets can be found in triplets_1000.txt, where each line is in the format:
    
    userID \tab songID \tab play_count

Read in the data:

In [9]:
user_profiles = pd.read_csv('triplets_1000.txt', sep='\t', names = ['userID','songID', 'play_count'])

The problem: the original dataset of triplets is too large to be converted in this way.
Possible solutions:
1. dtype optimization
2. Split data into chunks

In [10]:
user_profiles

Unnamed: 0,userID,songID,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1
...,...,...,...
995,5a905f000fc1ff3df7ca807d57edb608863db05d,SOYPJMP12AF72A901D,1
996,5a905f000fc1ff3df7ca807d57edb608863db05d,SOYRHNG12A8C14002E,1
997,5a905f000fc1ff3df7ca807d57edb608863db05d,SOYVSHP12A6702016E,2
998,5a905f000fc1ff3df7ca807d57edb608863db05d,SOYYYFE12A81C2395E,1


Pivot to tranform the data from long to wide:

In [10]:
user_profiles = user_profiles.pivot(index='userID', columns='songID', values='play_count')

In [11]:
user_profiles

songID,SOAARXR12A8C133D15,SOABRAB12A6D4F7AAF,SOACPBY12A8C13FEF9,SOACWYB12AF729E581,SOADGFH12A8C143D89,SOADQPP12A67020C82,SOAFOBL12AF72A25BA,SOAFPAX12AB0187A17,SOAFTRR12AF72A8D4D,SOAIILB12A58A776F7,...,SOZMJFG12AB017BDAF,SOZMNAX12A58A77F88,SOZNBQP12A6310D8AA,SOZOBWN12A8C130999,SOZPQES12A6D4F8E57,SOZRBOZ12A58A7AD7E,SOZRLJL12A8C14415F,SOZVCRW12A67ADA0B7,SOZWVEH12A6D4F7C37,SOZZHXI12A8C13BF7D
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17aa9f6dbdf753831da8f38c71b66b64373de613,1.0,,1.0,,,,,,,,...,,,,,,,1.0,,,
4bd88bfb25263a75bbdd467e74018f4ae570e5df,,,,,,,,,,,...,,,,,,,,,,
5a905f000fc1ff3df7ca807d57edb608863db05d,,,,,11.0,,12.0,,1.0,3.0,...,,,,,,,,,,
85c1f87fea955d09b4bec2e36aee110927aedf9a,,,,2.0,,,,,,,...,,,,,,,,,,
8937134734f869debcab8f23d77465b4caaa85df,,,,,,,,6.0,,,...,,,,,,,,,,
969cc6fb74e076a68e36a04409cb9d3765757508,,2.0,,,,,,,,,...,,,,,,1.0,,,1.0,
9bb911319fbc04f01755814cb5edb21df3d1a336,,,,,,,,,,,...,,,,,,,,,,
9d6f0ead607ac2a6c2460e4d14fb439a146b7dec,,,,,,,,,,,...,,,,,,,,,,
b64cdd1a0bd907e5e00b39e345194768e330d652,,,,,,,,,3.0,,...,,2.0,,,2.0,,,,,
b80344d063b5ccb3212f76538f3d9e43d87dca9e,,,,,,,,,,,...,,,,1.0,,,,,,1.0


In [8]:
#save it as a csv (do it only once)
#user_profiles.to_csv(path_or_buf= 'data/user_profile_from_1000_triplets.csv')

Drop the columns where all elements are NaN

In [13]:
user_profiles = user_profiles.dropna(axis=1, how='all')
user_profiles

songID,SOAARXR12A8C133D15,SOABRAB12A6D4F7AAF,SOACPBY12A8C13FEF9,SOACWYB12AF729E581,SOADGFH12A8C143D89,SOADQPP12A67020C82,SOAFOBL12AF72A25BA,SOAFPAX12AB0187A17,SOAFTRR12AF72A8D4D,SOAIILB12A58A776F7,...,SOZMJFG12AB017BDAF,SOZMNAX12A58A77F88,SOZNBQP12A6310D8AA,SOZOBWN12A8C130999,SOZPQES12A6D4F8E57,SOZRBOZ12A58A7AD7E,SOZRLJL12A8C14415F,SOZVCRW12A67ADA0B7,SOZWVEH12A6D4F7C37,SOZZHXI12A8C13BF7D
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17aa9f6dbdf753831da8f38c71b66b64373de613,1.0,,1.0,,,,,,,,...,,,,,,,1.0,,,
4bd88bfb25263a75bbdd467e74018f4ae570e5df,,,,,,,,,,,...,,,,,,,,,,
5a905f000fc1ff3df7ca807d57edb608863db05d,,,,,11.0,,12.0,,1.0,3.0,...,,,,,,,,,,
85c1f87fea955d09b4bec2e36aee110927aedf9a,,,,2.0,,,,,,,...,,,,,,,,,,
8937134734f869debcab8f23d77465b4caaa85df,,,,,,,,6.0,,,...,,,,,,,,,,
969cc6fb74e076a68e36a04409cb9d3765757508,,2.0,,,,,,,,,...,,,,,,1.0,,,1.0,
9bb911319fbc04f01755814cb5edb21df3d1a336,,,,,,,,,,,...,,,,,,,,,,
9d6f0ead607ac2a6c2460e4d14fb439a146b7dec,,,,,,,,,,,...,,,,,,,,,,
b64cdd1a0bd907e5e00b39e345194768e330d652,,,,,,,,,3.0,,...,,2.0,,,2.0,,,,,
b80344d063b5ccb3212f76538f3d9e43d87dca9e,,,,,,,,,,,...,,,,1.0,,,,,,1.0


In [15]:
# Replace the NaN with 0s.
user_profiles = user_profiles.fillna(0)

Get **cosine similarity** for play counts between users

In [16]:
# pairwise_distances is the distance between counts, thus 1 - pairwise_distances is the similarity between counts
from sklearn.metrics import pairwise_distances

cosine_sim = 1-pairwise_distances(user_profiles , metric="cosine")

In [20]:
# Calculate the cosine similarity matrix for the users
M_cosine = pd.DataFrame(cosine_sim)
M_cosine

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.0,0.0,0.00046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.026845,0.0,0.0,0.0,0.0,0.0,0.007645,0.0,0.0,0.0,0.0
2,0.00046,0.026845,1.0,0.0,0.039402,0.002493,0.004943,0.064458,0.034851,0.0,0.13721,0.000173,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.039402,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.002493,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.013717,0.0
6,0.0,0.0,0.004943,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.064458,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.007645,0.034851,0.0,0.0,0.0,0.0,0.0,1.0,0.003045,0.0,0.002877,0.009309
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003045,1.0,0.0,0.0,0.008786


Get **pearson similarity** for all users

In [21]:
pearson_sim = 1-pairwise_distances(user_profiles, metric="correlation")
M_pearson = pd.DataFrame(pearson_sim)
M_pearson

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.0,-0.033196,-0.15966,-0.044875,-0.032843,-0.039742,-0.029074,-0.032717,-0.10091,-0.081124,-0.032498,-0.06292,-0.04154
1,-0.033196,1.0,-0.022217,-0.014686,-0.010749,-0.013006,-0.009515,-0.010707,-0.024962,-0.02655,-0.010636,-0.020592,-0.013595
2,-0.15966,-0.022217,1.0,-0.070875,-0.007531,-0.059955,-0.040363,0.02086,-0.118452,-0.128125,0.10306,-0.099177,-0.065607
3,-0.044875,-0.014686,-0.070875,1.0,-0.01453,-0.017582,-0.012863,-0.014474,-0.044644,-0.03589,-0.014378,-0.027836,-0.018378
4,-0.032843,-0.010749,-0.007531,-0.01453,1.0,-0.012868,-0.009414,-0.010593,-0.032674,-0.026267,-0.010523,-0.020373,-0.01345
5,-0.039742,-0.013006,-0.059955,-0.017582,-0.012868,1.0,-0.011391,-0.012819,-0.039537,-0.031785,-0.012733,-0.010561,-0.016275
6,-0.029074,-0.009515,-0.040363,-0.012863,-0.009414,-0.011391,1.0,-0.009378,-0.028924,-0.023253,-0.009315,-0.018035,-0.011907
7,-0.032717,-0.010707,0.02086,-0.014474,-0.010593,-0.012819,-0.009378,1.0,-0.032549,-0.026166,-0.010482,-0.020295,-0.013399
8,-0.10091,-0.024962,-0.118452,-0.044644,-0.032674,-0.039537,-0.028924,-0.032549,1.0,-0.07741,-0.032331,-0.059519,-0.031477
9,-0.081124,-0.02655,-0.128125,-0.03589,-0.026267,-0.031785,-0.023253,-0.026166,-0.07741,1.0,-0.025991,-0.050322,-0.024079


Same for euclidean and hamming :

In [24]:
euclidean_sim = 1-pairwise_distances(user_profiles, metric="euclidean")
M_euclidean = pd.DataFrame(euclidean_sim)

hamming_sim = 1-pairwise_distances(user_profiles, metric="hamming")
M_hamming = pd.DataFrame(hamming_sim)



A function that finds k similar users given userID and the user_profiles matrix

In [90]:
from sklearn.neighbors import NearestNeighbors

def get_similarusers(userID, user_profiles, similarity_metric , k):
    '''Find k most similar users to a given userID'''
    similarity = list()
    neigh_ind = list()
    
    knn = NearestNeighbors(metric = similarity_metric , algorithm = 'brute')
    knn.fit(user_profiles.values) #taking .values to avoid sklearn warning
                                #UserWarning: X does not have valid feature names, but NearestNeighbors was fitted with feature names
    
    neigh_dist, neigh_ind = knn.kneighbors(user_profiles.loc[userID].values.reshape(1,-1), n_neighbors = k+1) #plus one, bcs it includes the user we want to compare against 
    similarity = 1-neigh_dist.flatten()
    print('{} most similar users to user {}, using {} similarity:\n'.format(k, userID, similarity_metric))
    
    for i in range(0,len(neigh_ind.flatten())):
        if user_profiles.index[neigh_ind.flatten()[i]] == userID:
            continue;
        else:
            print('{}: User {}, with similarity of {}'.format(i, user_profiles.index[indices.flatten()[i]], similarity.flatten()[i]))
            
    return similarity,neigh_ind

In [93]:
similarities,indices = get_similarusers( '5a905f000fc1ff3df7ca807d57edb608863db05d', user_profiles , similarity_metric = 'cosine', k = 4)

4 most similar users to user 5a905f000fc1ff3df7ca807d57edb608863db05d, using cosine similarity:

1: User bd4c6e843f00bd476847fb75c47b4fb430a06856, with similarity of 0.1372096300756751
2: User 9d6f0ead607ac2a6c2460e4d14fb439a146b7dec, with similarity of 0.06445781527405536
3: User 8937134734f869debcab8f23d77465b4caaa85df, with similarity of 0.03940217146290359
4: User 4bd88bfb25263a75bbdd467e74018f4ae570e5df, with similarity of 0.03485128089051981


In [96]:
similarities,indices = get_similarusers( '5a905f000fc1ff3df7ca807d57edb608863db05d', user_profiles , similarity_metric = 'correlation', k = 4)

4 most similar users to user 5a905f000fc1ff3df7ca807d57edb608863db05d, using correlation similarity:

1: User bd4c6e843f00bd476847fb75c47b4fb430a06856, with similarity of 0.10306040261066751
2: User 9d6f0ead607ac2a6c2460e4d14fb439a146b7dec, with similarity of 0.02085982519104146
3: User 85c1f87fea955d09b4bec2e36aee110927aedf9a, with similarity of -0.007531103718259935
4: User b64cdd1a0bd907e5e00b39e345194768e330d652, with similarity of -0.022216901663766775
