# Collaborative user-user filtering
Inspired by https://github.com/csaluja/JupyterNotebooks-Medium/blob/master/CF%20Recommendation%20System-Examples.ipynb

Imports

In [1]:
import numpy as np
import pandas as pd

The original file contains
- 1,019,318 unique users
- 48,373,586 user-song.play count triplets

A subset of 50000 triplets can be found in triplets_50000.txt, where each line is in the format:
    
    userID \tab songID \tab play_count

Read in the data:

In [2]:
user_profiles = pd.read_csv('data/triplets_50000.txt', sep='\t', names = ['userID','songID', 'play_count'])

The problem: the original dataset of triplets is too large to be converted in this way.
Possible solutions:
1. dtype optimization
2. Split data into chunks

In [10]:
user_profiles

Unnamed: 0,userID,songID,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1
...,...,...,...
995,5a905f000fc1ff3df7ca807d57edb608863db05d,SOYPJMP12AF72A901D,1
996,5a905f000fc1ff3df7ca807d57edb608863db05d,SOYRHNG12A8C14002E,1
997,5a905f000fc1ff3df7ca807d57edb608863db05d,SOYVSHP12A6702016E,2
998,5a905f000fc1ff3df7ca807d57edb608863db05d,SOYYYFE12A81C2395E,1


Pivot to tranform the data from long to wide:

In [3]:
user_profiles = user_profiles.pivot(index='userID', columns='songID', values='play_count')

In [7]:
user_profiles

songID,SOAAAGQ12A8C1420C8,SOAACPJ12A81C21360,SOAACTC12AB0186A20,SOAADCB12A81C22AFA,SOAAEJI12AB0188AB5,SOAAEKX12A6D4F7E4E,SOAAFAC12A67ADF7EB,SOAAFYH12A8C13717A,SOAAIJG12AAA15D821,SOAAIWE12A8AE4706B,...,SOZZVFP12A8C140F14,SOZZVMW12AB0183B52,SOZZVNT12AF729EBC9,SOZZVWB12AB0189C30,SOZZWZV12A67AE140F,SOZZXAO12A58A7D379,SOZZYAO12A6701FF36,SOZZYDA12AB01824FB,SOZZYMH12AB0180A51,SOZZZFB12A8AE45CDC
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0007c0e74728ca9ef0fe4eb7f75732e8026a278b,,,,,,,,,,,...,,,,,,,,,,
0039bd8483d578997718cdc0bf6c7c88b679f488,,,,,,,,,,,...,,,,,,,,,,
00498f4bab2bfeb17680113c7d9525ad5b0ad401,,,,,,,,,,,...,,,,,,,,,,
00a443baf550f4bbdd974ba73720abf2759166f3,,,,,,,,,,,...,,,,,,,,,,
0152fcbd02b172a874c75a57a913f0f0109ba272,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fe979a7b199de3ee8a78486c10e5ed13587fc359,,,,,,,,,,,...,,,,,,,,,,
fed37c4c49c9f217b3371c2f2c0e7541656e55cf,,,,,,,,,,,...,,,,,,,,,,
ff18ea9a13583f7f7aaa83719e0b22ce5618e9cf,,,,,,,,,,,...,,,,,,,,,,
ff4322e94814d3c7895d07e6f94139b092862611,,,,,,,,,,,...,,,,,,,,,,


Drop the columns where all elements are NaN

In [8]:
user_profiles = user_profiles.dropna(axis=1, how='all') #doesn't make sense, a song will only exist if a user has listend to it
user_profiles

songID,SOAAAGQ12A8C1420C8,SOAACPJ12A81C21360,SOAACTC12AB0186A20,SOAADCB12A81C22AFA,SOAAEJI12AB0188AB5,SOAAEKX12A6D4F7E4E,SOAAFAC12A67ADF7EB,SOAAFYH12A8C13717A,SOAAIJG12AAA15D821,SOAAIWE12A8AE4706B,...,SOZZVFP12A8C140F14,SOZZVMW12AB0183B52,SOZZVNT12AF729EBC9,SOZZVWB12AB0189C30,SOZZWZV12A67AE140F,SOZZXAO12A58A7D379,SOZZYAO12A6701FF36,SOZZYDA12AB01824FB,SOZZYMH12AB0180A51,SOZZZFB12A8AE45CDC
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0007c0e74728ca9ef0fe4eb7f75732e8026a278b,,,,,,,,,,,...,,,,,,,,,,
0039bd8483d578997718cdc0bf6c7c88b679f488,,,,,,,,,,,...,,,,,,,,,,
00498f4bab2bfeb17680113c7d9525ad5b0ad401,,,,,,,,,,,...,,,,,,,,,,
00a443baf550f4bbdd974ba73720abf2759166f3,,,,,,,,,,,...,,,,,,,,,,
0152fcbd02b172a874c75a57a913f0f0109ba272,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fe979a7b199de3ee8a78486c10e5ed13587fc359,,,,,,,,,,,...,,,,,,,,,,
fed37c4c49c9f217b3371c2f2c0e7541656e55cf,,,,,,,,,,,...,,,,,,,,,,
ff18ea9a13583f7f7aaa83719e0b22ce5618e9cf,,,,,,,,,,,...,,,,,,,,,,
ff4322e94814d3c7895d07e6f94139b092862611,,,,,,,,,,,...,,,,,,,,,,


In [4]:
# Replace the NaN with 0s.
user_profiles = user_profiles.fillna(0)

In [8]:
#save it as a csv (do it only once)
#user_profiles.to_csv(path_or_buf= 'user_profile_from_50000_triplets.csv')

Get **cosine similarity** for play counts between users

In [5]:
# pairwise_distances is the distance between counts, thus 1 - pairwise_distances is the similarity between counts
from sklearn.metrics import pairwise_distances

cosine_sim = 1-pairwise_distances(user_profiles , metric="cosine")

In [20]:
# Calculate the cosine similarity matrix for the users
M_cosine = pd.DataFrame(cosine_sim)
M_cosine

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.0,0.0,0.00046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.026845,0.0,0.0,0.0,0.0,0.0,0.007645,0.0,0.0,0.0,0.0
2,0.00046,0.026845,1.0,0.0,0.039402,0.002493,0.004943,0.064458,0.034851,0.0,0.13721,0.000173,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.039402,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.002493,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.013717,0.0
6,0.0,0.0,0.004943,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.064458,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.007645,0.034851,0.0,0.0,0.0,0.0,0.0,1.0,0.003045,0.0,0.002877,0.009309
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003045,1.0,0.0,0.0,0.008786


Get **pearson similarity** for all users

In [21]:
pearson_sim = 1-pairwise_distances(user_profiles, metric="correlation")
M_pearson = pd.DataFrame(pearson_sim)
M_pearson

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.0,-0.033196,-0.15966,-0.044875,-0.032843,-0.039742,-0.029074,-0.032717,-0.10091,-0.081124,-0.032498,-0.06292,-0.04154
1,-0.033196,1.0,-0.022217,-0.014686,-0.010749,-0.013006,-0.009515,-0.010707,-0.024962,-0.02655,-0.010636,-0.020592,-0.013595
2,-0.15966,-0.022217,1.0,-0.070875,-0.007531,-0.059955,-0.040363,0.02086,-0.118452,-0.128125,0.10306,-0.099177,-0.065607
3,-0.044875,-0.014686,-0.070875,1.0,-0.01453,-0.017582,-0.012863,-0.014474,-0.044644,-0.03589,-0.014378,-0.027836,-0.018378
4,-0.032843,-0.010749,-0.007531,-0.01453,1.0,-0.012868,-0.009414,-0.010593,-0.032674,-0.026267,-0.010523,-0.020373,-0.01345
5,-0.039742,-0.013006,-0.059955,-0.017582,-0.012868,1.0,-0.011391,-0.012819,-0.039537,-0.031785,-0.012733,-0.010561,-0.016275
6,-0.029074,-0.009515,-0.040363,-0.012863,-0.009414,-0.011391,1.0,-0.009378,-0.028924,-0.023253,-0.009315,-0.018035,-0.011907
7,-0.032717,-0.010707,0.02086,-0.014474,-0.010593,-0.012819,-0.009378,1.0,-0.032549,-0.026166,-0.010482,-0.020295,-0.013399
8,-0.10091,-0.024962,-0.118452,-0.044644,-0.032674,-0.039537,-0.028924,-0.032549,1.0,-0.07741,-0.032331,-0.059519,-0.031477
9,-0.081124,-0.02655,-0.128125,-0.03589,-0.026267,-0.031785,-0.023253,-0.026166,-0.07741,1.0,-0.025991,-0.050322,-0.024079


Same for euclidean and hamming :

In [24]:
euclidean_sim = 1-pairwise_distances(user_profiles, metric="euclidean")
M_euclidean = pd.DataFrame(euclidean_sim)

hamming_sim = 1-pairwise_distances(user_profiles, metric="hamming")
M_hamming = pd.DataFrame(hamming_sim)



## Find k similar users to a given user

A function that finds k similar users given userID and the user_profiles matrix

In [5]:
from sklearn.neighbors import NearestNeighbors

def get_similarusers(userID, user_profiles, similarity_metric , k):
    '''Find k most similar users to a given userID'''
    similarity = list()
    neigh_ind = list()
    
    knn = NearestNeighbors(metric = similarity_metric , algorithm = 'brute')
    knn.fit(user_profiles.values) #taking .values to avoid sklearn warning
                                #UserWarning: X does not have valid feature names, but NearestNeighbors was fitted with feature names
    
    neigh_dist, neigh_ind = knn.kneighbors(user_profiles.loc[userID].values.reshape(1,-1), n_neighbors = k+1) #plus one, bcs it includes the user we want to compare against 
    similarity = 1-neigh_dist.flatten()
    print('{} most similar users to user {}, using {} similarity:\n'.format(k, userID, similarity_metric))
    
    for i in range(0,len(neigh_ind.flatten())):
        if user_profiles.index[neigh_ind.flatten()[i]] == userID:
            continue;
        else:
            print('{}: User {}, with similarity of {}'.format(i, user_profiles.index[neigh_ind.flatten()[i]], similarity.flatten()[i]))
            
    return similarity,neigh_ind

In [6]:
similarities,indices = get_similarusers( '5a905f000fc1ff3df7ca807d57edb608863db05d', user_profiles , similarity_metric = 'cosine', k = 4)

4 most similar users to user 5a905f000fc1ff3df7ca807d57edb608863db05d, using cosine similarity:

1: User 5696bf760215a24ba6f381b8e466828131ddf13b, with similarity of 0.24960454854408132
2: User cbf471272db7a183cf596a4cb79aa27e7d72dda1, with similarity of 0.18669122855068843
3: User bd4c6e843f00bd476847fb75c47b4fb430a06856, with similarity of 0.13388321848534124
4: User b98490b4e714cbdf98f4e819c150fc6eff83cb28, with similarity of 0.12962980922945277


In [8]:
similarities,indices = get_similarusers( '5a905f000fc1ff3df7ca807d57edb608863db05d', user_profiles , similarity_metric = 'correlation', k = 4)

4 most similar users to user 5a905f000fc1ff3df7ca807d57edb608863db05d, using correlation similarity:

1: User 5696bf760215a24ba6f381b8e466828131ddf13b, with similarity of 0.24690946133746028
2: User cbf471272db7a183cf596a4cb79aa27e7d72dda1, with similarity of 0.1855051128720917
3: User bd4c6e843f00bd476847fb75c47b4fb430a06856, with similarity of 0.13279425369885756
4: User b98490b4e714cbdf98f4e819c150fc6eff83cb28, with similarity of 0.1290498311209165


## Predict play count for a user-song combination based on user-user

In [7]:
def predict_play_count_uu(userID, songID, user_profiles, similarity_metric, k):
    '''Predict play count for a particular user-song tuple, based on user-to-user similarity. Use with cosine similarity.'''
    prediction = 0
    similarity, indices = get_similarusers(userID, user_profiles, similarity_metric, k) #similar users based on cosine similarity
    # get mean play count for a user, to adjust
    mean_play_count = user_profiles.loc[userID, :].mean() 
    # weight_i is the similarity of neigbhor_i to user X
    sum_of_similarity = np.sum(similarity) - 1 # -1 because user 1 is included, has a similarity of 1
    
    # initializing variables
    w_similarity = 1
    weighted_sum = 0
    
    for i in range(0, len(indices.flatten())):
        if user_profiles.index[indices.flatten()[i]] == userID:
            continue;
        else:
            # Normalize ratings for a given user by subtracting row mean (centered cosine, or pearson cor)
            play_count_dif = user_profiles.iloc[indices.flatten()[i],user_profiles.columns.get_loc(songID)] - np.mean(user_profiles.iloc[indices.flatten()[i],:])
            w_similarity = play_count_dif*similarity[i]
            weighted_sum += w_similarity
            
    prediction = mean_play_count + (weighted_sum/sum_of_similarity)
    print('Predicted rating for user {} -> song {}: {}'.format(userID, songID, prediction))

    return prediction

In [8]:
predict_play_count_uu('5a905f000fc1ff3df7ca807d57edb608863db05d', 'SOZZYAO12A6701FF36', user_profiles, 'cosine', 4 )

4 most similar users to user 5a905f000fc1ff3df7ca807d57edb608863db05d, using cosine similarity:

1: User 5696bf760215a24ba6f381b8e466828131ddf13b, with similarity of 0.24960454854408132
2: User cbf471272db7a183cf596a4cb79aa27e7d72dda1, with similarity of 0.18669122855068843
3: User bd4c6e843f00bd476847fb75c47b4fb430a06856, with similarity of 0.13388321848534124
4: User b98490b4e714cbdf98f4e819c150fc6eff83cb28, with similarity of 0.12962980922945277
Predicted rating for user 5a905f000fc1ff3df7ca807d57edb608863db05d -> song SOZZYAO12A6701FF36: 0.046185464411638434


0.046185464411638434

In [58]:
# filter out low play songs to get better results?
sum_col = user_profiles.sum(axis = 0)

In [66]:
import statistics

In [67]:
print(max(sum_col))
print(min(sum_col))
print(statistics.median(sum_col))
print(statistics.mean(sum_col))


1890.0
1.0
2.0
5.197863646139508


In [76]:
# how many columns have a sum of 2 or less play counts
ignore_indexes, count= [], 0
for i, value in enumerate(sum_col):
    if value <= 2:
        count += 1
        ignore_indexes.append(i)
print(count)
print(len(ignore_indexes))

16601
16601


In [74]:
user_profiles.shape

(1010, 27898)

In [77]:
# drop columns based on index list created above
df2 = user_profiles.drop(user_profiles.iloc[:, ignore_indexes], axis = 1)

In [78]:
df2.shape

(1010, 11297)

In [79]:
27898 - 16601

11297

In [82]:
df2

songID,SOAAAGQ12A8C1420C8,SOAADCB12A81C22AFA,SOAAFAC12A67ADF7EB,SOAAKFY12A6D4F7B03,SOAAKPM12A58A77210,SOAAOYI12AB01831CE,SOAAPTP12A8AE46F86,SOAAROC12A6D4FA420,SOAATLI12A8C13E319,SOAAVUV12AB0186646,...,SOZZISR12A6D4F9391,SOZZKMI12AB017F208,SOZZLZN12A8AE48D6D,SOZZNDF12A8C143567,SOZZPSS12A6D4F3C71,SOZZRHE12A6702165F,SOZZTCU12AB0182C58,SOZZTNF12A8C139916,SOZZVWB12AB0189C30,SOZZWZV12A67AE140F
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0007c0e74728ca9ef0fe4eb7f75732e8026a278b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0039bd8483d578997718cdc0bf6c7c88b679f488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00498f4bab2bfeb17680113c7d9525ad5b0ad401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00a443baf550f4bbdd974ba73720abf2759166f3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0152fcbd02b172a874c75a57a913f0f0109ba272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fe979a7b199de3ee8a78486c10e5ed13587fc359,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fed37c4c49c9f217b3371c2f2c0e7541656e55cf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ff18ea9a13583f7f7aaa83719e0b22ce5618e9cf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ff4322e94814d3c7895d07e6f94139b092862611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:

# now repeat prediction for user ff4322e94814d3c7895d07e6f94139b092862611 and song SOAADCB12A81C22AFA
predict_play_count_uu('00a443baf550f4bbdd974ba73720abf2759166f3', 'SOAADCB12A81C22AFA', df2, 'cosine', 10 )

10 most similar users to user 00a443baf550f4bbdd974ba73720abf2759166f3, using cosine similarity:

1: User 176b5e98c1f01dba146ce8c6e15cf6fc344485ec, with similarity of 0.3153445573487389
2: User 07caa920795cd4f20bfeeb0e192a5ddd9566ecdd, with similarity of 0.3000190494332986
3: User 38c11af0c42bb21cf5b9ffb535f76c7967241b52, with similarity of 0.21962073608048815
4: User 49917cc2a92adfa6e6637bcd9e7f6756bbe51be1, with similarity of 0.19676150771782186
5: User 4e32e7efbea85d6f73a1cad6abb9a0863973d433, with similarity of 0.19579893574797724
6: User 878bdaeb36c5de29e4b9224332c93dde3e1c6e5d, with similarity of 0.19071273661496624
7: User dc777dbe9ced6ddb5b813e7e0bbb718171b87870, with similarity of 0.1542560241214721
8: User de4cec62bfb6a7be09ba1123bf03c315377be7fd, with similarity of 0.1542069601954823
9: User 9fbc0cc4fe6191cabdddf41124bd507dca08ceb6, with similarity of 0.14771764147848476
10: User bb84b605789d898993e2c6fbda4d57a8bc8da369, with similarity of 0.14445748309144124
Predicted ratin

0

# 30-11

Load libraries

In [3]:
import numpy as np
import pandas as pd
from io import BytesIO
from zipfile import ZipFile
import pickle

Read-in the data

In [4]:
# Read the user tastes' dataset

with ZipFile('users_cleaned.zip','r') as zip:
    data = zip.read('out.csv')

users_cleaned = pd.read_csv(BytesIO(data))
print(len(users_cleaned))
users_cleaned.head()

697064


Unnamed: 0,userID,songID,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOWEZSI12A81C21CE6,1
1,4bd88bfb25263a75bbdd467e74018f4ae570e5df,SODCXXY12AB0187452,2
2,b64cdd1a0bd907e5e00b39e345194768e330d652,SOLXDDC12A6701FBFD,1
3,b64cdd1a0bd907e5e00b39e345194768e330d652,SONQBUB12A6D4F8ED0,2
4,5a905f000fc1ff3df7ca807d57edb608863db05d,SOFKTPP12A8C1385CA,1


In [5]:
# create the utility matrix
user_profiles = users_cleaned.pivot(index='userID', columns='songID', values='play_count')

In [9]:
user_profiles.head()

songID,SOAAAQN12AB01856D3,SOAANKE12A8C13CF5C,SOAASSD12AB0181AA6,SOABLAF12AB018E1D9,SOABRXK12A8C130A36,SOABTKM12A8AE4721E,SOABVPU12AB018AA22,SOABVWD12A58A7C3FF,SOACEDS12A6701EAAA,SOACFRH12A8C13E183,...,SOZWCKB12AB0186C5B,SOZWECJ12A6D4F5229,SOZWVCA12A6D4F9774,SOZXHBQ12AB0186626,SOZXTKD12A8C13FC43,SOZYPNV12A6701E3B8,SOZYZDZ12AB01873CA,SOZZPYH12AB0187578,SOZZQBH12A6D4FAFD8,SOZZVMW12AB0183B52
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00001638d6189236866af9bbf309ae6c2347ffdc,,,,,,,,,,,...,,,,,,,,,,
00004fb90a86beb8bed1e9e328f5d9b6ee7dc03e,,,,,,,,,,,...,,,,,,,,,,
000060ca4e6bea0a5c9037fc1bbd7bbabb98c754,,,,,,,,,,,...,,,,,,,,,,
00009d93dc719d1dbaf13507725a03b9fdeebebb,,,,,,,,,,,...,,,,,,,,,,
0000bb531aaa657c932988bc2f7fd7fc1b2050ec,,,,,,,,,,,...,,,,,,,,,,


In [8]:
user_profiles.shape # we have 386670 users and 3195 songs

(386670, 3195)


In [None]:
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances


In [15]:
# Trying to work around the nan values existing in the utility matrix
# here I am implementing a cosine distance metric that does not take into
# consideration nan's. But it is still not accepting Nan's.
# I also tried with sklearn.metrics.pairwise.nan_euclidean_distances : not working


# def my_metric(X,Y):
#     dist = np.nansum(X*Y)/np.norm(a)*np.norm(b)
#     return dist

# knn = NearestNeighbors(metric = my_metric , algorithm = 'auto')
# knn.fit(user_profiles.values)

In [35]:
# keep 10k users and replace the NaN with 0s. 
u1 = user_profiles[:10000]

In [38]:
np.nan_to_num(u1,copy=False)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [39]:
# u_profiles contains 5 k users
# u1 contains 10k users

songID,SOAAAQN12AB01856D3,SOAANKE12A8C13CF5C,SOAASSD12AB0181AA6,SOABLAF12AB018E1D9,SOABRXK12A8C130A36,SOABTKM12A8AE4721E,SOABVPU12AB018AA22,SOABVWD12A58A7C3FF,SOACEDS12A6701EAAA,SOACFRH12A8C13E183,...,SOZWCKB12AB0186C5B,SOZWECJ12A6D4F5229,SOZWVCA12A6D4F9774,SOZXHBQ12AB0186626,SOZXTKD12A8C13FC43,SOZYPNV12A6701E3B8,SOZYZDZ12AB01873CA,SOZZPYH12AB0187578,SOZZQBH12A6D4FAFD8,SOZZVMW12AB0183B52
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00001638d6189236866af9bbf309ae6c2347ffdc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00004fb90a86beb8bed1e9e328f5d9b6ee7dc03e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000060ca4e6bea0a5c9037fc1bbd7bbabb98c754,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00009d93dc719d1dbaf13507725a03b9fdeebebb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0000bb531aaa657c932988bc2f7fd7fc1b2050ec,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
06b3d85b93084237374b04a9c228c2339fa20cc9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
06b40fcbefcb60d9211ef221fb2a34a19676d96a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
06b4396e8981c5deb5c381d116f103e4c798fff9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
06b45679bef48d79a5412afa9c5475650c55b002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
from sklearn.impute import KNNImputer

In [20]:
#define imputer
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

In [21]:
# fit on the dataset
imputer.fit(u_profiles)

In [22]:
# transform the dataset
Utrans =  imputer.transform(u_profiles)

In [82]:
Utrans

array([[1.        , 3.125     , 2.5       , ..., 1.        , 2.66666667,
        1.5       ],
       [1.        , 1.2       , 2.5       , ..., 1.        , 2.66666667,
        1.5       ],
       [1.        , 3.125     , 2.5       , ..., 1.        , 2.66666667,
        1.5       ],
       ...,
       [1.        , 3.125     , 2.5       , ..., 1.        , 2.66666667,
        1.5       ],
       [1.        , 1.2       , 2.5       , ..., 1.        , 2.66666667,
        1.5       ],
       [1.        , 1.2       , 2.5       , ..., 1.        , 2.66666667,
        1.5       ]])

In [25]:
from sklearn.neighbors import NearestNeighbors

def get_similarusers(userID, user_profiles, similarity_metric , k):
    '''Find k most similar users to a given userID'''
    similarity = list()
    neigh_ind = list()
    
    knn = NearestNeighbors(metric = similarity_metric , algorithm = 'brute')
    knn.fit(user_profiles.values) #taking .values to avoid sklearn warning
                                #UserWarning: X does not have valid feature names, but NearestNeighbors was fitted with feature names
    
    neigh_dist, neigh_ind = knn.kneighbors(user_profiles.loc[userID].values.reshape(1,-1), n_neighbors = k+1) #plus one, bcs it includes the user we want to compare against 
    similarity = 1-neigh_dist.flatten()
    print('{} most similar users to user {}, using {} similarity:\n'.format(k, userID, similarity_metric))
    
    for i in range(0,len(neigh_ind.flatten())):
        if user_profiles.index[neigh_ind.flatten()[i]] == userID:
            continue;
        else:
            print('{}: User {}, with similarity of {}'.format(i, user_profiles.index[neigh_ind.flatten()[i]], similarity.flatten()[i]))
            
    return similarity,neigh_ind

In [57]:
u1.head()

songID,SOAAAQN12AB01856D3,SOAANKE12A8C13CF5C,SOAASSD12AB0181AA6,SOABLAF12AB018E1D9,SOABRXK12A8C130A36,SOABTKM12A8AE4721E,SOABVPU12AB018AA22,SOABVWD12A58A7C3FF,SOACEDS12A6701EAAA,SOACFRH12A8C13E183,...,SOZWCKB12AB0186C5B,SOZWECJ12A6D4F5229,SOZWVCA12A6D4F9774,SOZXHBQ12AB0186626,SOZXTKD12A8C13FC43,SOZYPNV12A6701E3B8,SOZYZDZ12AB01873CA,SOZZPYH12AB0187578,SOZZQBH12A6D4FAFD8,SOZZVMW12AB0183B52
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00001638d6189236866af9bbf309ae6c2347ffdc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00004fb90a86beb8bed1e9e328f5d9b6ee7dc03e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000060ca4e6bea0a5c9037fc1bbd7bbabb98c754,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00009d93dc719d1dbaf13507725a03b9fdeebebb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0000bb531aaa657c932988bc2f7fd7fc1b2050ec,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
user = '06b4caaf4dcc2476b5ac096f08f4356b6ba9a86a'
#user = '00038cf792e9f9a1cb593dea5779f96195aac68c'
#user = '0002b896949cb2899feaed47104406e99eafa983' 
#

In [65]:
similarities,indices = get_similarusers( user , u1 , similarity_metric = 'correlation', k = 10)

10 most similar users to user 06b4caaf4dcc2476b5ac096f08f4356b6ba9a86a, using correlation similarity:

1: User 04aee86b83a249a84d267474ca47a3be98c50016, with similarity of 0.9999999999997699
2: User 0662de274cd00814ef7156f3c906eca82866ffe9, with similarity of -0.0003130870381964357
3: User 0444a16baa033a9407814f83e4fc77b83ba8f0e5, with similarity of -0.0003130870381964357
4: User 030bcadb27f719932e7f8608bb0e29c0864635e8, with similarity of -0.0003130870381964357
5: User 0467a3e9602c043c5ae24ff3541090d60568ef74, with similarity of -0.0003130870381964357
6: User 01d6bbe45959b37bf823999e16a32b72ca5ccc60, with similarity of -0.0003130870381964357
7: User 0689759327822b798dfa9ae273530afc300ece4f, with similarity of -0.0003130870381964357
8: User 013cd8e206fae2536badb08781ecd2847821043f, with similarity of -0.0003130870381964357
9: User 05502ee1aa29d013ddec9853f8fbaacee0e34e88, with similarity of -0.0003130870381964357
10: User 033a570ca52ae9195e780dcb98482850624514ab, with similarity of -0.

In [174]:
def predict_playcount_uu(userID, songID, user_profiles, similarity_metric, k):
    '''Predict play count for a particular user-song tuple, based on user-to-user similarity. Use with cosine similarity.'''
    prediction = 0
    similarity, indices = get_similarusers(userID, user_profiles, similarity_metric, k) #similar users based on cosine similarity
    # get mean play count for a user, to adjust
    mean_play_count = user_profiles.loc[userID, :].mean() 
    # weight_i is the similarity of neigbhor_i to user X
    sum_of_similarity = np.sum(similarity) - 1 # -1 because user 1 is included, has a similarity of 1
    
    # initializing variables
    w_similarity = 1
    weighted_sum = 0
  
                                                                                              
    for i in range(0, len(indices.flatten())):
        if user_profiles.index[indices.flatten()[i]] == userID:
            continue;
        else:
            # Normalize ratings for a given user by subtracting row mean (centered cosine, or pearson cor)
            play_count_dif = user_profiles.iloc[indices.flatten()[i],user_profiles.columns.get_loc(songID)] - np.mean(user_profiles.iloc[indices.flatten()[i],:])
            w_similarity = play_count_dif*similarity[i]
            weighted_sum += w_similarity
            
    prediction = mean_play_count + (weighted_sum/sum_of_similarity)
    print('Predicted rating for user {} -> song {}: {}'.format(userID, songID, prediction))

    return prediction

In [77]:
song = 'SOAPNML12A8C13B696'
#song = 'SOSHUVD12A6701F8F9'

In [93]:
predict_play_count_uu(user , song , u1, 'cosine', 10 )

10 most similar users to user 06b4caaf4dcc2476b5ac096f08f4356b6ba9a86a, using cosine similarity:

0: User 04aee86b83a249a84d267474ca47a3be98c50016, with similarity of 1.0
2: User 047dd7b194e93ef872131502b558e28bb117c8d7, with similarity of 0.0
3: User 047e058275e0bf2aa8baecc353dfbc53b290abe2, with similarity of 0.0
4: User 047cc7c59d227f42607884d13221c51b3a1ebe81, with similarity of 0.0
5: User 047cc9b1e0ffcf5be3e4352c98219b0335d79ed8, with similarity of 0.0
6: User 047d1656d5b6434cd2745b44a8837fd74efff4d7, with similarity of 0.0
7: User 047da1a39c07ca2849b42fb25d157d591c860950, with similarity of 0.0
8: User 047dc87cda681c0ce1c6e6eff0be595bfe7ca363, with similarity of 0.0
9: User 047dd29561be5eec32c3366b08901731be61391c, with similarity of 0.0
10: User 047c08097da03449e19fa0263c2c1f9b6c7e7c5a, with similarity of 0.0
Predicted rating for user 06b4caaf4dcc2476b5ac096f08f4356b6ba9a86a -> song SOAPNML12A8C13B696: -0.00970266040688576


-0.00970266040688576

## item based 

In [106]:

def get_similar_songs(songID, user_profiles, similarity_metric , k):
    '''Find k most similar songs to a given songID'''
    similarity = list()
    neigh_ind = list()
    song_profiles=user_profiles.T
    
    knn = NearestNeighbors(metric = similarity_metric , algorithm = 'brute')
    knn.fit(song_profiles.values) #taking .values to avoid sklearn warning
                                #UserWarning: X does not have valid feature names, but NearestNeighbors was fitted with feature names
    
    neigh_dist, neigh_ind = knn.kneighbors(song_profiles.loc[songID].values.reshape(1,-1), n_neighbors = k+1) #plus one, bcs it includes the user we want to compare against 
    similarity = 1-neigh_dist.flatten()
    print('{} most similar songs to song {}, using {} similarity:\n'.format(k, songID, similarity_metric))
    
    for i in range(0,len(neigh_ind.flatten())):
        if song_profiles.index[neigh_ind.flatten()[i]] == songID:
            continue;
        else:
            print('{}: Song {}, with similarity of {}'.format(i, song_profiles.index[neigh_ind.flatten()[i]], similarity.flatten()[i]))
            
    return similarity,neigh_ind

In [95]:
similarities,indices = get_similar_songs(song, u1, similarity_metric = 'correlation', k = 15)

15 most similar songs to song SOAPNML12A8C13B696, using correlation similarity:

1: User SOGJPMB12A8C13A9DB, with similarity of 0.07731937458471294
2: User SOKNWHQ12AB017CC6C, with similarity of -0.00010001000100001711
3: User SOKRMXG12A8C13AB54, with similarity of -0.00010001000100001711
4: User SOCVDYI12AC4687DDB, with similarity of -0.00010001000100001711
5: User SOKQYMF12AB0180F2A, with similarity of -0.00010001000100001711
6: User SOKPXQU12AB0187ED6, with similarity of -0.00010001000100001711
7: User SOKPGFN12A8C143481, with similarity of -0.00010001000100001711
8: User SOKSRWI12A8C146997, with similarity of -0.00010001000100001711
9: User SOKODRA12A8C13B4B5, with similarity of -0.00010001000100001711
10: User SOKLDSG12AB0188032, with similarity of -0.00010001000100001711
11: User SOKMFTY12AB0183216, with similarity of -0.00010001000100001711
12: User SOKKMRS12A67ADD134, with similarity of -0.00010001000100001711
13: User SOKWQCX12AF729F29A, with similarity of -0.00010001000100001

In [107]:
#This function predicts the rating for specified user-item combination based on item-based approach
def predict_playcount_ii(userID, songID, user_profiles, similarity_metric , k):
    '''Predict play count for a particular user-song tuple, based on item-to-item similarity. Use with correlation coefficients.'''
    prediction= 0 
    similarities, indices=get_similar_songs(songID,  user_profiles, similarity_metric,k) #similar items based on correlation coefficients
    
    # weight_i is the similarity of neigbhor_i to user X
    sum_of_similarity = np.sum(similarities)-1 # -1 because user 1 is included, has a similarity of 1
    
    # initializing variables
    w_similarity = 1
    weighted_sum = 0
    
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i]+1 == songID:
            continue;
        else:
            w_similarity = user_profiles.iloc[user_profiles.index.get_loc(userID),indices.flatten()[i]] * (similarities[i])
            weighted_sum += w_similarity
                                        
    prediction = int(round(weighted_sum/sum_of_similarity))
    print('\nPredicted rating for user {} -> item {}: {}'.format(userID,songID,prediction) )    

    return prediction

In [108]:
prediction = predict_playcount_ii(user, song, u1, 'correlation', 2)

2 most similar songs to song SOAPNML12A8C13B696, using correlation similarity:

1: Song SOGJPMB12A8C13A9DB, with similarity of 0.07731937458471294
2: Song SOAASSD12AB0181AA6, with similarity of -0.00010001000100001711

Predicted rating for user 06b4caaf4dcc2476b5ac096f08f4356b6ba9a86a -> item SOAPNML12A8C13B696: 0


Difference between Pearson's correlation and adjusted cosine correlation:

   - In pearson correlation, the mean which subtracted is about the particular item itself (ratings from all users), mean(Ri)
   - In adjusted cosine correlation, the mean is about the particular user (ratings to all items), mean(Ru)


In [204]:
from scipy.spatial.distance import pdist, squareform
def get_adj_cosine_M(user_profiles):
    M = user_profiles.to_numpy()
    M_u = M.mean(axis=1)
    item_mean_subtracted = M - M_u[:, None]
    similarity_matrix = 1 - squareform(pdist(item_mean_subtracted.T, 'cosine'))
    
    return pd.DataFrame(similarity_matrix, index=user_profiles.columns, columns=user_profiles.columns)

In [131]:
adjcos_sim = get_adj_cosine_M(u1)

In [132]:
adjcos_sim.head()

songID,SOAAAQN12AB01856D3,SOAANKE12A8C13CF5C,SOAASSD12AB0181AA6,SOABLAF12AB018E1D9,SOABRXK12A8C130A36,SOABTKM12A8AE4721E,SOABVPU12AB018AA22,SOABVWD12A58A7C3FF,SOACEDS12A6701EAAA,SOACFRH12A8C13E183,...,SOZWCKB12AB0186C5B,SOZWECJ12A6D4F5229,SOZWVCA12A6D4F9774,SOZXHBQ12AB0186626,SOZXTKD12A8C13FC43,SOZYPNV12A6701E3B8,SOZYZDZ12AB01873CA,SOZZPYH12AB0187578,SOZZQBH12A6D4FAFD8,SOZZVMW12AB0183B52
songID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SOAAAQN12AB01856D3,1.0,0.29244,-0.007602,-0.002954,0.020376,-0.006374,0.29244,0.087016,0.002715,0.29244,...,0.044596,0.025695,0.087016,0.049976,0.087277,0.29244,0.29244,0.004949,0.044596,0.033585
SOAANKE12A8C13CF5C,0.29244,1.0,-0.024976,-0.009122,0.069797,-0.020824,1.0,0.297635,0.009959,1.0,...,0.152583,0.088186,0.297635,0.171125,0.298499,1.0,1.0,0.018078,0.152583,0.115555
SOAASSD12AB0181AA6,-0.007602,-0.024976,1.0,-0.000938,-0.00189,-0.000636,-0.024976,-0.007533,-0.001051,-0.024976,...,-0.003914,-0.002584,-0.007533,-0.00455,-0.007521,-0.024976,-0.024976,-0.001824,-0.003914,-0.003732
SOABLAF12AB018E1D9,-0.002954,-0.009122,-0.000938,1.0,-0.000778,-0.000922,-0.009122,-0.002811,-0.000863,-0.009122,...,-0.001491,-0.001172,-0.002811,-0.001826,-0.002787,-0.009122,-0.009122,-0.001485,-0.001491,-0.001868
SOABRXK12A8C130A36,0.020376,0.069797,-0.00189,-0.000778,1.0,-0.001593,0.069797,0.020762,0.000598,0.069797,...,0.010637,0.006109,0.020762,0.011911,0.020826,0.069797,0.069797,0.001096,0.010637,0.007963


In [140]:
indices = adjcos_sim['SOAAAQN12AB01856D3'].sort_values(ascending=False)
indices

songID
SOAAAQN12AB01856D3    1.000000
SONLCFV12A6D4F8AC3    0.292440
SONGITI12AF72A2AF5    0.292440
SONGVVU12AB018719F    0.292440
SONHFAB12AB018B767    0.292440
                        ...   
SOGOAGM12AB017E99E   -0.054432
SOVAGPG12AB0189963   -0.054536
SOJHGZJ12AB0187E31   -0.068171
SOIZLKI12A6D4F7B61   -0.068326
SOVTGXQ12A6D4FB9CA   -0.098233
Name: SOAAAQN12AB01856D3, Length: 3195, dtype: float64

In [153]:
indices = adjcos_sim['SOAAAQN12AB01856D3'].sort_values(ascending=False)[:1+1].index

In [154]:
indices

Index(['SOAAAQN12AB01856D3', 'SONLCFV12A6D4F8AC3'], dtype='object', name='songID')

In [164]:
#This function finds k similar items given the item_id and ratings matrix M

def get_similar_songs_adjcos(songID, user_profiles, k):
    
    sim_matrix = get_adj_cosine_M(user_profiles)
    # sort the similarities and grab k highest values
    similarities = sim_matrix[songID].sort_values(ascending=False)[:k+1].values
    # grab the songIDs
    indices = sim_matrix[songID].sort_values(ascending=False)[:k+1].index
    
    print('{} most similar items for item {}:\n'.format(k,songID))
    for i in range(0, len(indices)):
            # first index is songID by default
            if indices[i] == songID:
                continue;

            else:
                print('{}: Song {} , with similarity of {}'.format(i,indices[i], similarities[i]))
        
    return similarities ,indices

In [163]:


similarities, indices = get_similar_songs_adjcos(song, u1, 5)

5 most similar items for item SOAPNML12A8C13B696:

1: Song SOJATJX12A6D4F6E77 , with similarity of 0.2984992012966514
2: Song SOJBYED12A6D4FBE81 , with similarity of 0.2984992012966514
3: Song SOJBVYM12AB018BD69 , with similarity of 0.2984992012966514
4: Song SOSFURN12A58A7AC5F , with similarity of 0.2984992012966514
5: Song SOJBTHL12A6D4F7409 , with similarity of 0.2984992012966514


NameError: name 'indice' is not defined

In [206]:
indices.to_list()

['SOAAAQN12AB01856D3', 'SONLCFV12A6D4F8AC3']

In [177]:
#This function predicts the rating for specified user-item combination for adjusted cosine item-based approach

def predict_ii_adjcos(userID, songID, user_profiles, k):
    pred=0

    similarities, indices=get_similar_songs_adjcos(songID, user_profiles, k) 
    sum_of_similarity = np.sum(similarities)-1

    w_similarity=1
    weighted_sum = 0 
    for i in range(0, len(indices)):
        if indices[i] == songID:
            continue;
        else:
            w_similarity = user_profiles.loc[userID,indices[i]] * (similarities[i])
            weighted_sum += w_similarity                              
    pred = int(round(weighted_sum/sum_of_similarity))

    print('\nPredicted rating for user {} -> item {}: {}'.format(userID,songID,pred))
        
    return pred

In [178]:
prediction = predict_ii_adjcos(user, song, u1, 5)

5 most similar items for item SOAPNML12A8C13B696:

1: Song SOJATJX12A6D4F6E77 , with similarity of 0.2984992012966514
2: Song SOJBYED12A6D4FBE81 , with similarity of 0.2984992012966514
3: Song SOJBVYM12AB018BD69 , with similarity of 0.2984992012966514
4: Song SOSFURN12A58A7AC5F , with similarity of 0.2984992012966514
5: Song SOJBTHL12A6D4F7409 , with similarity of 0.2984992012966514

Predicted rating for user 06b4caaf4dcc2476b5ac096f08f4356b6ba9a86a -> item SOAPNML12A8C13B696: 0


In [181]:
import ipywidgets as widgets
#This function utilizes above function to recommend items for selected approach. Recommendations are made if the predicted
#rating for an item is greater than or equal to 6, and the items has not been rated already
def recommendItem(userID, songID, user_profiles, k):
    
    if userID not in user_profiles.index:
        print('User does not exist. Try a different user')
    else:    
        ids = ['User-based CF (cosine)','User-based CF (correlation)','Item-based CF (cosine)',
               'Item-based CF (adjusted cosine)']

        approach = widgets.Dropdown(options=ids, value=ids[0],
                               description='Select Approach', width='500px')
        
        def on_change(change):
            prediction = 0
            clear_output(wait=True)
            if change['type'] == 'change' and change['name'] == 'value':            
                if (approach.value == 'User-based CF (cosine)'):
                    metric = 'cosine'
                    prediction = predict_playcount_uu(userID, songD, user_profiles, metric)
                elif (approach.value == 'User-based CF (correlation)')  :                       
                    metric = 'correlation'               
                    prediction = predict_playcount_uu(userID, songD, user_profiles, metric)
                elif (approach.value == 'Item-based CF (cosine)'):
                    prediction = predict_playcount_ii(userID, songD, user_profiles,)
                else:
                    prediction = predict_ii_adjcos(userID, songD, user_profiles,)

                if user_profiles[songID][userID] != 0: 
                    print('Item already rated')
                else:
                    if prediction>=0:
                        print('\nItem recommended')
                    else:
                        print('Item not recommended')

        approach.observe(on_change)
        display(approach)

ModuleNotFoundError: No module named 'ipywidgets'

In [182]:
recommendItem(user,song,u1, 3)

NameError: name 'widgets' is not defined

In [194]:
idx = u1.index.get_loc(user)
u1.iloc[:, np.argsort(-u1.values[idx])[:5]]

songID,SOCYIJP12AB018135F,SOQJLFV12AB01897C7,SOQJPYF12AF72AA8E2,SOQJWZI12A8C140181,SOQJYCE12A6D4F4844
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00001638d6189236866af9bbf309ae6c2347ffdc,0.0,0.0,0.0,0.0,0.0
00004fb90a86beb8bed1e9e328f5d9b6ee7dc03e,0.0,0.0,0.0,0.0,0.0
000060ca4e6bea0a5c9037fc1bbd7bbabb98c754,0.0,0.0,0.0,0.0,0.0
00009d93dc719d1dbaf13507725a03b9fdeebebb,0.0,0.0,0.0,0.0,0.0
0000bb531aaa657c932988bc2f7fd7fc1b2050ec,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...
06b3d85b93084237374b04a9c228c2339fa20cc9,0.0,0.0,0.0,0.0,0.0
06b40fcbefcb60d9211ef221fb2a34a19676d96a,0.0,0.0,0.0,0.0,0.0
06b4396e8981c5deb5c381d116f103e4c798fff9,0.0,0.0,0.0,0.0,0.0
06b45679bef48d79a5412afa9c5475650c55b002,0.0,0.0,0.0,0.0,0.0


In [196]:
np.argsort(-u1.values[idx])[:5]

array([ 436, 2124, 2125, 2126, 2127], dtype=int64)

In [203]:
u1.iloc[idx, np.argsort(-u1.values[idx])[:5]].index

Index(['SOCYIJP12AB018135F', 'SOQJLFV12AB01897C7', 'SOQJPYF12AF72AA8E2',
       'SOQJWZI12A8C140181', 'SOQJYCE12A6D4F4844'],
      dtype='object', name='songID')

In [195]:
u1.values[idx]

array([0., 0., 0., ..., 0., 0., 0.])

In [213]:
#This function finds k similar items given the item_id and adjusted cosine matrix M

def get_similar_songs_adj(songID, adj_sim_m , k):
    
    # sort the similarities and grab k highest values
    similarities = adj_sim_m [songID].sort_values(ascending=False)[:k+1].values
    # grab the songIDs
    indices = adj_sim_m [songID].sort_values(ascending=False)[:k+1].index
    
    print('{} most similar items for item {}:\n'.format(k,songID))
    for i in range(0, len(indices)):
             first index is songID by default
            if indices[i] == songID:
                continue;

            else:
                print('{}: Song {} , with similarity of {}'.format(i,indices[i], similarities[i]))
        
    return similarities ,indices.to_list()

In [227]:
def recommend10Items(user_list, database):
    result = dict()
    # compute adjusted cosine similarity matrix
    sim_matrix = get_adj_cosine_M(database)
    
    for userID in user_list:
        # find row corresponding to user in database
        idx = database.index.get_loc(userID)
        #get the top 5 songs that he already likes
        likes = database.iloc[idx, np.argsort(-database.values[idx])[:5]].index
        
        for songID in likes:
            similarities, recommendations = get_similar_songs_adj(songID, sim_matrix, 1)
            
            if userID not in result:
                result[userID] = recommendations
            elif type(result[userID]) == list:
                result[userID].append(recommendations)
            else:
                result[userID] = [result[userID], recommendations]
    
    return result

In [215]:
samplelist = ['06b4caaf4dcc2476b5ac096f08f4356b6ba9a86a',
              '00038cf792e9f9a1cb593dea5779f96195aac68c',
              '0002b896949cb2899feaed47104406e99eafa983' ]


In [228]:
recomend = recommend10Items(samplelist, u1)

In [229]:
recomend

{'06b4caaf4dcc2476b5ac096f08f4356b6ba9a86a': ['SOCYIJP12AB018135F',
  'SOHZXJH12A6D4F3B87',
  ['SOQJLFV12AB01897C7', 'SOLTAOU12A8C1375CB'],
  ['SOQJPYF12AF72AA8E2', 'SOPPBPK12A8C14683C'],
  ['SOLTAOU12A8C1375CB', 'SOSJIMT12A8C137360'],
  ['SOQJYCE12A6D4F4844', 'SOIKYQE12A81356CFD']],
 '00038cf792e9f9a1cb593dea5779f96195aac68c': ['SOIZLKI12A6D4F7B61',
  'SOVAGPG12AB0189963',
  ['SOAAAQN12AB01856D3', 'SONLCFV12A6D4F8AC3'],
  ['SOQJLFV12AB01897C7', 'SOLTAOU12A8C1375CB'],
  ['SOQJPYF12AF72AA8E2', 'SOPPBPK12A8C14683C'],
  ['SOLTAOU12A8C1375CB', 'SOSJIMT12A8C137360']],
 '0002b896949cb2899feaed47104406e99eafa983': ['SOSQIHH12A8C13370B',
  'SOQLJFO12A6D4F7503',
  ['SOLTAOU12A8C1375CB', 'SOSJIMT12A8C137360'],
  ['SOQJLFV12AB01897C7', 'SOLTAOU12A8C1375CB'],
  ['SOQJPYF12AF72AA8E2', 'SOPPBPK12A8C14683C'],
  ['SOLTAOU12A8C1375CB', 'SOSJIMT12A8C137360']]}

In [238]:

def get_similar_songs_kNN(songID, user_profiles, similarity_metric , k):
    '''Find k most similar songs to a given songID'''
    similarity = list()
    neigh_ind = list()
    song_profiles=user_profiles.T
    
    knn = NearestNeighbors(metric = similarity_metric , algorithm = 'brute')
    knn.fit(song_profiles.values) #taking .values to avoid sklearn warning
                                #UserWarning: X does not have valid feature names, but NearestNeighbors was fitted with feature names
    
    neigh_dist, neigh_ind = knn.kneighbors(song_profiles.loc[songID].values.reshape(1,-1), n_neighbors = k+1) #plus one, bcs it includes the user we want to compare against 
    similarity = 1-neigh_dist.flatten()
    
    similar_songs = []
    for i in range(0,len(neigh_ind.flatten())):
        if song_profiles.index[neigh_ind.flatten()[i]] == songID:
            continue;
        else:
            similar_songs.append(song_profiles.index[neigh_ind.flatten()[i]])
            
    return similar_songs

In [239]:
def recommend10Items_kNN(user_list, database):
    result = dict()
    
    for userID in user_list:
        # find row corresponding to user in database
        idx = database.index.get_loc(userID)
        #get the top 5 songs that he already likes
        likes = database.iloc[idx, np.argsort(-database.values[idx])[:5]].index
        
        for songID in likes:
            simsongs= get_similar_songs_kNN(songID, database, 'cosine', 2)
            
            if userID not in result:
                result[userID] = simsongs
            elif type(result[userID]) == list:
                result[userID].append(simsongs)
            else:
                result[userID] = [result[userID], simsongs]
    
    return result

In [240]:
recomend_kNN = recommend10Items_kNN(samplelist, u1)

In [241]:
recomend_kNN

{'06b4caaf4dcc2476b5ac096f08f4356b6ba9a86a': ['SOAANKE12A8C13CF5C',
  'SOAASSD12AB0181AA6',
  ['SOAANKE12A8C13CF5C', 'SOAASSD12AB0181AA6'],
  ['SOPPBPK12A8C14683C', 'SOFSDOM12AB01830F1'],
  ['SOAAAQN12AB01856D3', 'SOAANKE12A8C13CF5C', 'SOAASSD12AB0181AA6'],
  ['SOIKYQE12A81356CFD', 'SOMOHHA12A81356D12']],
 '00038cf792e9f9a1cb593dea5779f96195aac68c': ['SOVAGPG12AB0189963',
  'SORJVDO12AF72A1970',
  ['SOBJUKG12A58A7DCA8', 'SOCSXZF12AC468D3E3'],
  ['SOAANKE12A8C13CF5C', 'SOAASSD12AB0181AA6'],
  ['SOPPBPK12A8C14683C', 'SOFSDOM12AB01830F1'],
  ['SOAAAQN12AB01856D3', 'SOAANKE12A8C13CF5C', 'SOAASSD12AB0181AA6']],
 '0002b896949cb2899feaed47104406e99eafa983': ['SOQLJFO12A6D4F7503',
  'SONJMCI12AB018B16A',
  ['SOAAAQN12AB01856D3', 'SOAANKE12A8C13CF5C', 'SOAASSD12AB0181AA6'],
  ['SOAANKE12A8C13CF5C', 'SOAASSD12AB0181AA6'],
  ['SOPPBPK12A8C14683C', 'SOFSDOM12AB01830F1'],
  ['SOAAAQN12AB01856D3', 'SOAANKE12A8C13CF5C', 'SOAASSD12AB0181AA6']]}