# Collaborative user-user filtering

Imports

In [1]:
import numpy as np
import pandas as pd

The original file contains
- 1,019,318 unique users
- 48,373,586 (user, song, play count) triplets

A subset of 50000 triplets can be found in triplets_50000.txt, where each line is in the format:
    
    userID \tab songID \tab play_count

Read in the data:

In [2]:
# Load the tab-separated triplets; the column names are supplied via `names`.
user_profiles = pd.read_csv(
    'data/triplets_50000.txt',
    sep='\t',
    names=['userID', 'songID', 'play_count'],
)

The problem: the full original dataset of triplets is too large to be loaded and converted into a user-by-song matrix in this way.
Possible solutions:
1. dtype optimization
2. Split data into chunks

In [10]:
# Inspect the triplets frame that was just read in.
user_profiles

Unnamed: 0,userID,songID,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1
...,...,...,...
995,5a905f000fc1ff3df7ca807d57edb608863db05d,SOYPJMP12AF72A901D,1
996,5a905f000fc1ff3df7ca807d57edb608863db05d,SOYRHNG12A8C14002E,1
997,5a905f000fc1ff3df7ca807d57edb608863db05d,SOYVSHP12A6702016E,2
998,5a905f000fc1ff3df7ca807d57edb608863db05d,SOYYYFE12A81C2395E,1


Pivot to transform the data from long to wide:

In [3]:
# Reshape long -> wide: one row per userID, one column per songID, play counts as cells.
user_profiles = user_profiles.pivot(
    index='userID',
    columns='songID',
    values='play_count',
)

In [7]:
# Inspect the wide user-by-song frame produced by the pivot.
user_profiles

songID,SOAAAGQ12A8C1420C8,SOAACPJ12A81C21360,SOAACTC12AB0186A20,SOAADCB12A81C22AFA,SOAAEJI12AB0188AB5,SOAAEKX12A6D4F7E4E,SOAAFAC12A67ADF7EB,SOAAFYH12A8C13717A,SOAAIJG12AAA15D821,SOAAIWE12A8AE4706B,...,SOZZVFP12A8C140F14,SOZZVMW12AB0183B52,SOZZVNT12AF729EBC9,SOZZVWB12AB0189C30,SOZZWZV12A67AE140F,SOZZXAO12A58A7D379,SOZZYAO12A6701FF36,SOZZYDA12AB01824FB,SOZZYMH12AB0180A51,SOZZZFB12A8AE45CDC
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0007c0e74728ca9ef0fe4eb7f75732e8026a278b,,,,,,,,,,,...,,,,,,,,,,
0039bd8483d578997718cdc0bf6c7c88b679f488,,,,,,,,,,,...,,,,,,,,,,
00498f4bab2bfeb17680113c7d9525ad5b0ad401,,,,,,,,,,,...,,,,,,,,,,
00a443baf550f4bbdd974ba73720abf2759166f3,,,,,,,,,,,...,,,,,,,,,,
0152fcbd02b172a874c75a57a913f0f0109ba272,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fe979a7b199de3ee8a78486c10e5ed13587fc359,,,,,,,,,,,...,,,,,,,,,,
fed37c4c49c9f217b3371c2f2c0e7541656e55cf,,,,,,,,,,,...,,,,,,,,,,
ff18ea9a13583f7f7aaa83719e0b22ce5618e9cf,,,,,,,,,,,...,,,,,,,,,,
ff4322e94814d3c7895d07e6f94139b092862611,,,,,,,,,,,...,,,,,,,,,,


Drop the columns where all elements are NaN

In [8]:
# Remove every song column that holds no play count at all.
# NOTE: this is not expected to drop anything -- a song column only exists
# in the pivot because at least one user has listened to it.
user_profiles = user_profiles.dropna(axis='columns', how='all')
user_profiles

songID,SOAAAGQ12A8C1420C8,SOAACPJ12A81C21360,SOAACTC12AB0186A20,SOAADCB12A81C22AFA,SOAAEJI12AB0188AB5,SOAAEKX12A6D4F7E4E,SOAAFAC12A67ADF7EB,SOAAFYH12A8C13717A,SOAAIJG12AAA15D821,SOAAIWE12A8AE4706B,...,SOZZVFP12A8C140F14,SOZZVMW12AB0183B52,SOZZVNT12AF729EBC9,SOZZVWB12AB0189C30,SOZZWZV12A67AE140F,SOZZXAO12A58A7D379,SOZZYAO12A6701FF36,SOZZYDA12AB01824FB,SOZZYMH12AB0180A51,SOZZZFB12A8AE45CDC
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0007c0e74728ca9ef0fe4eb7f75732e8026a278b,,,,,,,,,,,...,,,,,,,,,,
0039bd8483d578997718cdc0bf6c7c88b679f488,,,,,,,,,,,...,,,,,,,,,,
00498f4bab2bfeb17680113c7d9525ad5b0ad401,,,,,,,,,,,...,,,,,,,,,,
00a443baf550f4bbdd974ba73720abf2759166f3,,,,,,,,,,,...,,,,,,,,,,
0152fcbd02b172a874c75a57a913f0f0109ba272,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fe979a7b199de3ee8a78486c10e5ed13587fc359,,,,,,,,,,,...,,,,,,,,,,
fed37c4c49c9f217b3371c2f2c0e7541656e55cf,,,,,,,,,,,...,,,,,,,,,,
ff18ea9a13583f7f7aaa83719e0b22ce5618e9cf,,,,,,,,,,,...,,,,,,,,,,
ff4322e94814d3c7895d07e6f94139b092862611,,,,,,,,,,,...,,,,,,,,,,


In [4]:
# User/song pairs without a play count are NaN after the pivot; store them as 0.
user_profiles = user_profiles.fillna(value=0)

In [8]:
#save it as a csv (do it only once)
#user_profiles.to_csv(path_or_buf= 'user_profile_from_50000_triplets.csv')

Get **cosine similarity** for play counts between users

In [5]:
# pairwise_distances returns the distance between every pair of user rows,
# so subtracting it from 1 turns the cosine distance into a cosine similarity.
from sklearn.metrics import pairwise_distances

cosine_sim = 1 - pairwise_distances(
    user_profiles,
    metric="cosine",
)

In [20]:
# Wrap the user-by-user cosine similarity matrix in a DataFrame for display.
M_cosine = pd.DataFrame(data=cosine_sim)
M_cosine

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.0,0.0,0.00046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.026845,0.0,0.0,0.0,0.0,0.0,0.007645,0.0,0.0,0.0,0.0
2,0.00046,0.026845,1.0,0.0,0.039402,0.002493,0.004943,0.064458,0.034851,0.0,0.13721,0.000173,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.039402,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.002493,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.013717,0.0
6,0.0,0.0,0.004943,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.064458,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.007645,0.034851,0.0,0.0,0.0,0.0,0.0,1.0,0.003045,0.0,0.002877,0.009309
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003045,1.0,0.0,0.0,0.008786


Get **pearson similarity** for all users

In [21]:
# Pearson similarity is 1 minus the "correlation" distance between user rows.
pearson_sim = 1 - pairwise_distances(
    user_profiles,
    metric="correlation",
)
M_pearson = pd.DataFrame(data=pearson_sim)
M_pearson

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.0,-0.033196,-0.15966,-0.044875,-0.032843,-0.039742,-0.029074,-0.032717,-0.10091,-0.081124,-0.032498,-0.06292,-0.04154
1,-0.033196,1.0,-0.022217,-0.014686,-0.010749,-0.013006,-0.009515,-0.010707,-0.024962,-0.02655,-0.010636,-0.020592,-0.013595
2,-0.15966,-0.022217,1.0,-0.070875,-0.007531,-0.059955,-0.040363,0.02086,-0.118452,-0.128125,0.10306,-0.099177,-0.065607
3,-0.044875,-0.014686,-0.070875,1.0,-0.01453,-0.017582,-0.012863,-0.014474,-0.044644,-0.03589,-0.014378,-0.027836,-0.018378
4,-0.032843,-0.010749,-0.007531,-0.01453,1.0,-0.012868,-0.009414,-0.010593,-0.032674,-0.026267,-0.010523,-0.020373,-0.01345
5,-0.039742,-0.013006,-0.059955,-0.017582,-0.012868,1.0,-0.011391,-0.012819,-0.039537,-0.031785,-0.012733,-0.010561,-0.016275
6,-0.029074,-0.009515,-0.040363,-0.012863,-0.009414,-0.011391,1.0,-0.009378,-0.028924,-0.023253,-0.009315,-0.018035,-0.011907
7,-0.032717,-0.010707,0.02086,-0.014474,-0.010593,-0.012819,-0.009378,1.0,-0.032549,-0.026166,-0.010482,-0.020295,-0.013399
8,-0.10091,-0.024962,-0.118452,-0.044644,-0.032674,-0.039537,-0.028924,-0.032549,1.0,-0.07741,-0.032331,-0.059519,-0.031477
9,-0.081124,-0.02655,-0.128125,-0.03589,-0.026267,-0.031785,-0.023253,-0.026166,-0.07741,1.0,-0.025991,-0.050322,-0.024079


Same for euclidean and hamming :

In [24]:
# Euclidean distance is unbounded (it can be far larger than 1), so
# "1 - distance" is not a similarity: it goes arbitrarily negative.
# Map the distance into (0, 1] instead: identical rows -> 1, far apart -> ~0.
euclidean_sim = 1 / (1 + pairwise_distances(user_profiles, metric="euclidean"))
M_euclidean = pd.DataFrame(euclidean_sim)

# Hamming distance is the fraction of entries that differ, so it already lies
# in [0, 1] and "1 - distance" is a valid similarity.
hamming_sim = 1 - pairwise_distances(user_profiles, metric="hamming")
M_hamming = pd.DataFrame(hamming_sim)



## Find k similar users to a given user

A function that finds k similar users given userID and the user_profiles matrix

In [5]:
from sklearn.neighbors import NearestNeighbors

def get_similarusers(userID, user_profiles, similarity_metric , k):
    '''Find the k most similar users to a given userID.

    A brute-force nearest-neighbour model is fitted on every row of
    ``user_profiles`` and queried with the row of ``userID``. Distances are
    turned into similarities as ``1 - distance``; this is only a meaningful
    similarity for bounded distances such as 'cosine', 'correlation' or
    'hamming'.

    Parameters
    ----------
    userID : label
        Row label of the user to compare against (must be in ``user_profiles.index``).
    user_profiles : pandas.DataFrame
        One row per user, one column per song.
    similarity_metric : str
        Distance metric name passed to ``NearestNeighbors``.
    k : int
        Number of neighbours to report, not counting the queried user.

    Returns
    -------
    similarity : numpy.ndarray
        1-D array of ``1 - distance`` for every returned neighbour. The queried
        user itself is normally part of this list.
    neigh_ind : numpy.ndarray
        Array of shape (1, n) with the positional row indices of the neighbours,
        aligned with ``similarity``.

    Raises
    ------
    KeyError
        If ``userID`` is not a row label of ``user_profiles``.
    '''
    knn = NearestNeighbors(metric = similarity_metric , algorithm = 'brute')
    # Fit on the bare array: fitting on the DataFrame makes sklearn warn
    # "X does not have valid feature names" when queried with a plain array.
    knn.fit(user_profiles.values)

    query = user_profiles.loc[userID].values.reshape(1, -1)
    # Ask for one extra neighbour because the queried user is returned too,
    # but never for more rows than were fitted (sklearn raises in that case).
    n_neighbors = min(k + 1, len(user_profiles))
    neigh_dist, neigh_ind = knn.kneighbors(query, n_neighbors = n_neighbors)
    similarity = 1 - neigh_dist.flatten()
    print('{} most similar users to user {}, using {} similarity:\n'.format(k, userID, similarity_metric))

    # Number the neighbours with their own counter: the queried user is not
    # guaranteed to be the first hit (e.g. another user with an identical
    # profile), so the raw loop position would mislabel the ranks.
    rank = 0
    for position, neighbour_similarity in zip(neigh_ind.flatten(), similarity):
        neighbour_id = user_profiles.index[position]
        if neighbour_id == userID:
            continue
        rank += 1
        print('{}: User {}, with similarity of {}'.format(rank, neighbour_id, neighbour_similarity))

    return similarity,neigh_ind

In [6]:
similarities, indices = get_similarusers(
    userID='5a905f000fc1ff3df7ca807d57edb608863db05d',
    user_profiles=user_profiles,
    similarity_metric='cosine',
    k=4,
)

4 most similar users to user 5a905f000fc1ff3df7ca807d57edb608863db05d, using cosine similarity:

1: User 5696bf760215a24ba6f381b8e466828131ddf13b, with similarity of 0.24960454854408132
2: User cbf471272db7a183cf596a4cb79aa27e7d72dda1, with similarity of 0.18669122855068843
3: User bd4c6e843f00bd476847fb75c47b4fb430a06856, with similarity of 0.13388321848534124
4: User b98490b4e714cbdf98f4e819c150fc6eff83cb28, with similarity of 0.12962980922945277


In [8]:
similarities, indices = get_similarusers(
    userID='5a905f000fc1ff3df7ca807d57edb608863db05d',
    user_profiles=user_profiles,
    similarity_metric='correlation',
    k=4,
)

4 most similar users to user 5a905f000fc1ff3df7ca807d57edb608863db05d, using correlation similarity:

1: User 5696bf760215a24ba6f381b8e466828131ddf13b, with similarity of 0.24690946133746028
2: User cbf471272db7a183cf596a4cb79aa27e7d72dda1, with similarity of 0.1855051128720917
3: User bd4c6e843f00bd476847fb75c47b4fb430a06856, with similarity of 0.13279425369885756
4: User b98490b4e714cbdf98f4e819c150fc6eff83cb28, with similarity of 0.1290498311209165


## Predict play count for a user-song combination based on user-user similarity

In [7]:
def predict_play_count_uu(userID, songID, user_profiles, similarity_metric, k):
    '''Predict the play count for a user-song pair from the k most similar users.

    The prediction is the user's own mean play count plus the similarity-weighted
    average of how far each neighbour's play count for ``songID`` lies from that
    neighbour's own mean:

        prediction = mean(user) + sum_i(sim_i * (count_i - mean_i)) / sum_i(|sim_i|)

    Intended for use with cosine similarity.

    Parameters
    ----------
    userID : label
        Row label of the target user in ``user_profiles``.
    songID : label
        Column label of the song in ``user_profiles``.
    user_profiles : pandas.DataFrame
        One row per user, one column per song, play counts as values.
    similarity_metric : str
        Metric name forwarded to ``get_similarusers``.
    k : int
        Number of neighbours to base the prediction on.

    Returns
    -------
    float
        The predicted play count. Falls back to the user's mean play count when
        the neighbours carry no similarity weight at all.

    Raises
    ------
    KeyError
        If ``userID`` or ``songID`` is not present in ``user_profiles``.
    '''
    similarity, indices = get_similarusers(userID, user_profiles, similarity_metric, k)
    neighbour_positions = indices.flatten()
    similarity = np.asarray(similarity).flatten()

    # Mean play count of the target user, used as the baseline of the prediction.
    mean_play_count = user_profiles.loc[userID, :].mean()
    song_position = user_profiles.columns.get_loc(songID)

    weighted_sum = 0.0
    sum_of_similarity = 0.0
    for position, neighbour_similarity in zip(neighbour_positions, similarity):
        if user_profiles.index[position] == userID:
            continue
        neighbour_row = user_profiles.iloc[position, :]
        # Centre the neighbour's play count on the neighbour's own mean.
        play_count_dif = neighbour_row.iloc[song_position] - neighbour_row.mean()
        weighted_sum += play_count_dif * neighbour_similarity
        # Accumulate the normaliser over exactly the neighbours that were used,
        # instead of assuming the target user is in the list with similarity 1.
        # Absolute values equal the plain sum for non-negative similarities and
        # stop negative similarities from cancelling the normaliser.
        sum_of_similarity += abs(neighbour_similarity)

    if sum_of_similarity == 0:
        # No neighbour carries any weight: the best available estimate is the user's mean.
        prediction = mean_play_count
    else:
        prediction = mean_play_count + (weighted_sum/sum_of_similarity)
    print('Predicted rating for user {} -> song {}: {}'.format(userID, songID, prediction))

    return prediction

In [8]:
predict_play_count_uu(
    userID='5a905f000fc1ff3df7ca807d57edb608863db05d',
    songID='SOZZYAO12A6701FF36',
    user_profiles=user_profiles,
    similarity_metric='cosine',
    k=4,
)

4 most similar users to user 5a905f000fc1ff3df7ca807d57edb608863db05d, using cosine similarity:

1: User 5696bf760215a24ba6f381b8e466828131ddf13b, with similarity of 0.24960454854408132
2: User cbf471272db7a183cf596a4cb79aa27e7d72dda1, with similarity of 0.18669122855068843
3: User bd4c6e843f00bd476847fb75c47b4fb430a06856, with similarity of 0.13388321848534124
4: User b98490b4e714cbdf98f4e819c150fc6eff83cb28, with similarity of 0.12962980922945277
Predicted rating for user 5a905f000fc1ff3df7ca807d57edb608863db05d -> song SOZZYAO12A6701FF36: 0.046185464411638434


0.046185464411638434

In [58]:
# Would filtering out rarely played songs give better results?
# Start from the total play count of every song (sum over all users).
sum_col = user_profiles.sum(axis='index')

In [66]:
import statistics

In [67]:
# Print max, min, median and mean of the per-song totals, in that order.
for summarize in (max, min, statistics.median, statistics.mean):
    print(summarize(sum_col))


1890.0
1.0
2.0
5.197863646139508


In [76]:
# Collect the positions of the song columns whose total play count is 2 or
# less, and report how many there are.
ignore_indexes = [position for position, total in enumerate(sum_col) if total <= 2]
count = len(ignore_indexes)
print(count)
print(len(ignore_indexes))

16601
16601


In [74]:
# (number of users, number of songs) before any song columns are dropped.
user_profiles.shape

(1010, 27898)

In [77]:
# drop columns based on index list created above
# Drop the low-play song columns by label. The labels are looked up from the
# column positions in ignore_indexes, which avoids materialising a throw-away
# slice of the frame just to pass it to drop() as a list of column names.
df2 = user_profiles.drop(columns=user_profiles.columns[ignore_indexes])

In [78]:
# Shape after dropping the columns listed in ignore_indexes.
df2.shape

(1010, 11297)

In [79]:
# Sanity check: song columns before the drop minus the columns dropped.
# Computed from the data rather than from numbers copied out of earlier outputs.
user_profiles.shape[1] - len(ignore_indexes)

11297

In [82]:
# Inspect the reduced user-by-song frame.
df2

songID,SOAAAGQ12A8C1420C8,SOAADCB12A81C22AFA,SOAAFAC12A67ADF7EB,SOAAKFY12A6D4F7B03,SOAAKPM12A58A77210,SOAAOYI12AB01831CE,SOAAPTP12A8AE46F86,SOAAROC12A6D4FA420,SOAATLI12A8C13E319,SOAAVUV12AB0186646,...,SOZZISR12A6D4F9391,SOZZKMI12AB017F208,SOZZLZN12A8AE48D6D,SOZZNDF12A8C143567,SOZZPSS12A6D4F3C71,SOZZRHE12A6702165F,SOZZTCU12AB0182C58,SOZZTNF12A8C139916,SOZZVWB12AB0189C30,SOZZWZV12A67AE140F
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0007c0e74728ca9ef0fe4eb7f75732e8026a278b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0039bd8483d578997718cdc0bf6c7c88b679f488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00498f4bab2bfeb17680113c7d9525ad5b0ad401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00a443baf550f4bbdd974ba73720abf2759166f3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0152fcbd02b172a874c75a57a913f0f0109ba272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fe979a7b199de3ee8a78486c10e5ed13587fc359,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fed37c4c49c9f217b3371c2f2c0e7541656e55cf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ff18ea9a13583f7f7aaa83719e0b22ce5618e9cf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ff4322e94814d3c7895d07e6f94139b092862611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:

# Repeat the prediction on the filtered matrix, for user
# 00a443baf550f4bbdd974ba73720abf2759166f3 and song SOAADCB12A81C22AFA.
predict_play_count_uu(
    userID='00a443baf550f4bbdd974ba73720abf2759166f3',
    songID='SOAADCB12A81C22AFA',
    user_profiles=df2,
    similarity_metric='cosine',
    k=10,
)

10 most similar users to user 00a443baf550f4bbdd974ba73720abf2759166f3, using cosine similarity:

1: User 176b5e98c1f01dba146ce8c6e15cf6fc344485ec, with similarity of 0.3153445573487389
2: User 07caa920795cd4f20bfeeb0e192a5ddd9566ecdd, with similarity of 0.3000190494332986
3: User 38c11af0c42bb21cf5b9ffb535f76c7967241b52, with similarity of 0.21962073608048815
4: User 49917cc2a92adfa6e6637bcd9e7f6756bbe51be1, with similarity of 0.19676150771782186
5: User 4e32e7efbea85d6f73a1cad6abb9a0863973d433, with similarity of 0.19579893574797724
6: User 878bdaeb36c5de29e4b9224332c93dde3e1c6e5d, with similarity of 0.19071273661496624
7: User dc777dbe9ced6ddb5b813e7e0bbb718171b87870, with similarity of 0.1542560241214721
8: User de4cec62bfb6a7be09ba1123bf03c315377be7fd, with similarity of 0.1542069601954823
9: User 9fbc0cc4fe6191cabdddf41124bd507dca08ceb6, with similarity of 0.14771764147848476
10: User bb84b605789d898993e2c6fbda4d57a8bc8da369, with similarity of 0.14445748309144124
Predicted ratin

0