# Item based collaborative filtering (Angeliki)

<sup>Inspired by https://github.com/csaluja/JupyterNotebooks-Medium/blob/master/CF%20Recommendation%20System-Examples.ipynb<sup>

Load libraries

In [1]:
import numpy as np
import pandas as pd
from io import BytesIO
from zipfile import ZipFile

from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import pdist, squareform



Read-in the data

In [2]:
# Read the user tastes' dataset

with ZipFile('users_cleaned.zip','r') as zip:
    data = zip.read('out.csv')

users_cleaned = pd.read_csv(BytesIO(data))
print(len(users_cleaned))
users_cleaned.head()

697064


Unnamed: 0,userID,songID,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOWEZSI12A81C21CE6,1
1,4bd88bfb25263a75bbdd467e74018f4ae570e5df,SODCXXY12AB0187452,2
2,b64cdd1a0bd907e5e00b39e345194768e330d652,SOLXDDC12A6701FBFD,1
3,b64cdd1a0bd907e5e00b39e345194768e330d652,SONQBUB12A6D4F8ED0,2
4,5a905f000fc1ff3df7ca807d57edb608863db05d,SOFKTPP12A8C1385CA,1


In [3]:
# create the utility matrix
user_profiles = users_cleaned.pivot(index='userID', columns='songID', values='play_count')

# Includes 386670 users and 3195 songs
user_profiles.shape 

(386670, 3195)

In [4]:
# Narrow down to 10k users and replace the NaN with 0s. 
u1 = user_profiles[:10000]
np.nan_to_num(u1,copy=False)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [5]:
# users and songs to experiment with
user = '06b4caaf4dcc2476b5ac096f08f4356b6ba9a86a'
#user = '00038cf792e9f9a1cb593dea5779f96195aac68c'
#user = '0002b896949cb2899feaed47104406e99eafa983'
song = 'SOAPNML12A8C13B696'
#song = 'SOSHUVD12A6701F8F9'

samplelist = ['06b4caaf4dcc2476b5ac096f08f4356b6ba9a86a',
              '00038cf792e9f9a1cb593dea5779f96195aac68c',
              '0002b896949cb2899feaed47104406e99eafa983' ]

**I am implementing item based collaborative filtering as it outperforms user based and items are simpler than user tastes**

### Recommend 10 songs to a list of users using adjusted cosine correlation

Difference between Pearson's correlation and adjusted cosine correlation:

   - In pearson correlation, the mean which subtracted is about the particular item itself (ratings from all users), mean(Ri)
   - In adjusted cosine correlation, the mean is about the particular user (ratings to all items), mean(Ru)



In [6]:
# This function computes a adjusted cosine correlation matrix from a utility matrix
def get_adj_cosine_M(user_profiles):
    M = user_profiles.to_numpy()
    M_u = M.mean(axis=1)
    item_mean_subtracted = M - M_u[:, None]
    similarity_matrix = 1 - squareform(pdist(item_mean_subtracted.T, 'cosine'))
    
    return pd.DataFrame(similarity_matrix, index=user_profiles.columns, columns=user_profiles.columns)

In [7]:
adjcos_sim = get_adj_cosine_M(u1)
adjcos_sim.head()

songID,SOAAAQN12AB01856D3,SOAANKE12A8C13CF5C,SOAASSD12AB0181AA6,SOABLAF12AB018E1D9,SOABRXK12A8C130A36,SOABTKM12A8AE4721E,SOABVPU12AB018AA22,SOABVWD12A58A7C3FF,SOACEDS12A6701EAAA,SOACFRH12A8C13E183,...,SOZWCKB12AB0186C5B,SOZWECJ12A6D4F5229,SOZWVCA12A6D4F9774,SOZXHBQ12AB0186626,SOZXTKD12A8C13FC43,SOZYPNV12A6701E3B8,SOZYZDZ12AB01873CA,SOZZPYH12AB0187578,SOZZQBH12A6D4FAFD8,SOZZVMW12AB0183B52
songID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SOAAAQN12AB01856D3,1.0,0.29244,-0.007602,-0.002954,0.020376,-0.006374,0.29244,0.087016,0.002715,0.29244,...,0.044596,0.025695,0.087016,0.049976,0.087277,0.29244,0.29244,0.004949,0.044596,0.033585
SOAANKE12A8C13CF5C,0.29244,1.0,-0.024976,-0.009122,0.069797,-0.020824,1.0,0.297635,0.009959,1.0,...,0.152583,0.088186,0.297635,0.171125,0.298499,1.0,1.0,0.018078,0.152583,0.115555
SOAASSD12AB0181AA6,-0.007602,-0.024976,1.0,-0.000938,-0.00189,-0.000636,-0.024976,-0.007533,-0.001051,-0.024976,...,-0.003914,-0.002584,-0.007533,-0.00455,-0.007521,-0.024976,-0.024976,-0.001824,-0.003914,-0.003732
SOABLAF12AB018E1D9,-0.002954,-0.009122,-0.000938,1.0,-0.000778,-0.000922,-0.009122,-0.002811,-0.000863,-0.009122,...,-0.001491,-0.001172,-0.002811,-0.001826,-0.002787,-0.009122,-0.009122,-0.001485,-0.001491,-0.001868
SOABRXK12A8C130A36,0.020376,0.069797,-0.00189,-0.000778,1.0,-0.001593,0.069797,0.020762,0.000598,0.069797,...,0.010637,0.006109,0.020762,0.011911,0.020826,0.069797,0.069797,0.001096,0.010637,0.007963


In [10]:
# This function finds k similar songs given songID and adjusted cosine matrix
def get_similar_songs_adjcosine(songID, adj_sim_m , k):
    '''Find k similar songs given songID and adjusted cosine matrix '''
    
    # sort the similarities and grab k highest values
    similarities = adj_sim_m [songID].sort_values(ascending=False)[:k+1].values
    # grab the songIDs
    indices = adj_sim_m [songID].sort_values(ascending=False)[:k+1].index
    
    print('{} most similar items for item {}:\n'.format(k,songID))
    for i in range(0, len(indices)):
            #first index is songID by default
            if indices[i] == songID:
                continue;

            else:
                print('{}: Song {} , with similarity of {}'.format(i,indices[i], similarities[i]))
        
    return similarities ,indices.to_list()

In [11]:
#This function recommends 10 songs based on item-item collaborative filtering
# given a list of users and a utility matrix (database)
def recommend10Items(user_list, database):
    result = dict()
    # compute adjusted cosine similarity matrix
    sim_matrix = get_adj_cosine_M(database)
    
    for userID in user_list:
        # find row corresponding to user in database
        idx = database.index.get_loc(userID)
        #get the top 5 songs that he already likes
        likes = database.iloc[idx, np.argsort(-database.values[idx])[:5]].index
        
        for songID in likes:
            similarities, recommendations = get_similar_songs_adjcosine(songID, sim_matrix, 1)
            
            if userID not in result:
                result[userID] = recommendations
            elif type(result[userID]) == list:
                result[userID].append(recommendations)
            else:
                result[userID] = [result[userID], recommendations]
    
    return result

In [12]:
get_recom = recommend10Items(samplelist, u1)
get_recom

1 most similar items for item SOCYIJP12AB018135F:

1: Song SOHZXJH12A6D4F3B87 , with similarity of -0.0005469405185896115
1 most similar items for item SOQJLFV12AB01897C7:

1: Song SOLTAOU12A8C1375CB , with similarity of 0.1372249279229687
1 most similar items for item SOQJPYF12AF72AA8E2:

1: Song SOPPBPK12A8C14683C , with similarity of 0.29088700016095737
1 most similar items for item SOQJWZI12A8C140181:

0: Song SOLTAOU12A8C1375CB , with similarity of 1.0
1: Song SOSJIMT12A8C137360 , with similarity of 1.0
1 most similar items for item SOQJYCE12A6D4F4844:

1: Song SOIKYQE12A81356CFD , with similarity of 0.723010040396943
1 most similar items for item SOIZLKI12A6D4F7B61:

1: Song SOVAGPG12AB0189963 , with similarity of 0.12419616842422565
1 most similar items for item SOAAAQN12AB01856D3:

1: Song SONLCFV12A6D4F8AC3 , with similarity of 0.292439824681179
1 most similar items for item SOQJLFV12AB01897C7:

1: Song SOLTAOU12A8C1375CB , with similarity of 0.1372249279229687
1 most similar 

{'06b4caaf4dcc2476b5ac096f08f4356b6ba9a86a': ['SOCYIJP12AB018135F',
  'SOHZXJH12A6D4F3B87',
  ['SOQJLFV12AB01897C7', 'SOLTAOU12A8C1375CB'],
  ['SOQJPYF12AF72AA8E2', 'SOPPBPK12A8C14683C'],
  ['SOLTAOU12A8C1375CB', 'SOSJIMT12A8C137360'],
  ['SOQJYCE12A6D4F4844', 'SOIKYQE12A81356CFD']],
 '00038cf792e9f9a1cb593dea5779f96195aac68c': ['SOIZLKI12A6D4F7B61',
  'SOVAGPG12AB0189963',
  ['SOAAAQN12AB01856D3', 'SONLCFV12A6D4F8AC3'],
  ['SOQJLFV12AB01897C7', 'SOLTAOU12A8C1375CB'],
  ['SOQJPYF12AF72AA8E2', 'SOPPBPK12A8C14683C'],
  ['SOLTAOU12A8C1375CB', 'SOSJIMT12A8C137360']],
 '0002b896949cb2899feaed47104406e99eafa983': ['SOSQIHH12A8C13370B',
  'SOQLJFO12A6D4F7503',
  ['SOLTAOU12A8C1375CB', 'SOSJIMT12A8C137360'],
  ['SOQJLFV12AB01897C7', 'SOLTAOU12A8C1375CB'],
  ['SOQJPYF12AF72AA8E2', 'SOPPBPK12A8C14683C'],
  ['SOLTAOU12A8C1375CB', 'SOSJIMT12A8C137360']]}

In [66]:
# flatten the recommendation list
from pandas.core.common import flatten

for user, rec in get_recom.items():
    get_recom[user] = list(flatten(rec))

user_recommendations = get_recom
user_recommendations

{'06b4caaf4dcc2476b5ac096f08f4356b6ba9a86a': ['SOCYIJP12AB018135F',
  'SOHZXJH12A6D4F3B87',
  'SOQJLFV12AB01897C7',
  'SOLTAOU12A8C1375CB',
  'SOQJPYF12AF72AA8E2',
  'SOPPBPK12A8C14683C',
  'SOLTAOU12A8C1375CB',
  'SOSJIMT12A8C137360',
  'SOQJYCE12A6D4F4844',
  'SOIKYQE12A81356CFD'],
 '00038cf792e9f9a1cb593dea5779f96195aac68c': ['SOIZLKI12A6D4F7B61',
  'SOVAGPG12AB0189963',
  'SOAAAQN12AB01856D3',
  'SONLCFV12A6D4F8AC3',
  'SOQJLFV12AB01897C7',
  'SOLTAOU12A8C1375CB',
  'SOQJPYF12AF72AA8E2',
  'SOPPBPK12A8C14683C',
  'SOLTAOU12A8C1375CB',
  'SOSJIMT12A8C137360'],
 '0002b896949cb2899feaed47104406e99eafa983': ['SOSQIHH12A8C13370B',
  'SOQLJFO12A6D4F7503',
  'SOLTAOU12A8C1375CB',
  'SOSJIMT12A8C137360',
  'SOQJLFV12AB01897C7',
  'SOLTAOU12A8C1375CB',
  'SOQJPYF12AF72AA8E2',
  'SOPPBPK12A8C14683C',
  'SOLTAOU12A8C1375CB',
  'SOSJIMT12A8C137360']}

#### Intra-list similarity

Intra-list similarity is the average cosine similarity of all items in a list of recommendations.
Intra-list similarity can be calculated for each user, and averaged over all users in the test set to get an estimate of intra-list similarity for the model.

In [67]:
# Cosine similarity
from numpy.linalg import norm
def cosine_similarity(song1, song2):
    a = vector_representation[song1]
    b = vector_representation[song2]
    dist = np.dot(a,b)/(norm(a)*norm(b))
    return dist

# Intra list similarity function
def intra_list_similarity(user):
    rec = user_recommendations[user]
    
    # All possible pairs in list
    pairs = [(a, b) for idx, a in enumerate(rec) for b in rec[idx + 1:]]
    
    # Compute the average distances between the pairs of the recommended songs
    distances = []
    for pair in pairs:
        distances.append(cosine_similarity(pair[0], pair[1]))
    
    return np.mean(distances)

In [68]:
# Get the intra-list similarity of the model
# Average of all intra list similarities

intra_list_similar = []
for user in user_recommendations.keys():
    intra_list_similar.append(intra_list_similarity(user))

print("The intra list similarity for Item based collaborative filtering model is:" ,np.mean(intra_list_similar))

NameError: name 'vector_representation' is not defined

### Recommend 10 songs to a list of users by implementig kNN search for item based filtering

In [None]:

def get_similar_songs_kNN(songID, user_profiles, similarity_metric , k):
    '''Find k most similar songs to a given songID'''
    similarity = list()
    neigh_ind = list()
    song_profiles=user_profiles.T
    
    knn = NearestNeighbors(metric = similarity_metric , algorithm = 'brute')
    knn.fit(song_profiles.values) #taking .values to avoid sklearn warning
                                #UserWarning: X does not have valid feature names, but NearestNeighbors was fitted with feature names
    
    neigh_dist, neigh_ind = knn.kneighbors(song_profiles.loc[songID].values.reshape(1,-1), n_neighbors = k+1) #plus one, bcs it includes the user we want to compare against 
    similarity = 1-neigh_dist.flatten()
    
    similar_songs = []
    for i in range(0,len(neigh_ind.flatten())):
        if song_profiles.index[neigh_ind.flatten()[i]] == songID:
            continue;
        else:
            similar_songs.append(song_profiles.index[neigh_ind.flatten()[i]])
            
    return similar_songs

In [None]:
def recommend10Items_kNN(user_list, database):
    result = dict()
    
    for userID in user_list:
        # find row corresponding to user in database
        idx = database.index.get_loc(userID)
        #get the top 5 songs that he already likes
        likes = database.iloc[idx, np.argsort(-database.values[idx])[:5]].index
        
        for songID in likes:
            simsongs= get_similar_songs_kNN(songID, database, 'cosine', 2)
            
            if userID not in result:
                result[userID] = simsongs
            elif type(result[userID]) == list:
                result[userID].append(simsongs)
            else:
                result[userID] = [result[userID], simsongs]
    
    return result

In [None]:
get_recom_kNN = recommend10Items_kNN(samplelist, u1)
get_recom_kNN