# Computational tools for data science: Comparing Recommendation Systems for MillionSongsDataset

In [1]:
# Imports
import math
import os
import pickle
import pprint
import re
import statistics
import time
from collections import defaultdict
from io import BytesIO
from zipfile import ZipFile

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from datasketch import MinHash, MinHashLSHForest
from mlxtend.frequent_patterns import apriori
from mlxtend.preprocessing import TransactionEncoder
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from scipy.sparse import csr_matrix
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import paired_distances
from sklearn.neighbors import NearestNeighbors
from tqdm.notebook import tqdm
from wordcloud import STOPWORDS, ImageColorGenerator, WordCloud

import hdf5_getters as hdf5_getters

  from .utilsextension import (
  from .utilsextension import (
  min_numpy_version = LooseVersion('1.9.3')
  min_numexpr_version = LooseVersion('2.6.2')
  min_hdf5_version = LooseVersion('1.8.4')
  min_blosc_version = LooseVersion("1.4.1")
  min_blosc_bitshuffle_version = LooseVersion("1.8.0")
  blosc_version = LooseVersion(tables.which_lib_version("blosc")[1])
  hdf5_version = LooseVersion(tables.hdf5_version)
  blosc_version = LooseVersion(tables.which_lib_version("blosc")[1])


## Read the data

Assuming that you have all the data in a folder called ```data```

In [2]:
# Read the songs dataset
with ZipFile('data/songs_cleaned.zip', 'r') as zipfile:
    data = zipfile.read('out.csv')

songs_cleaned = pd.read_csv(BytesIO(data))

In [3]:
# And pre-process the artist_terms feature

def ConvertStringtoList(string):
    s = string.replace('[','')
    s = s.replace(']','')
    s = s.replace('\'','')
    s = s.replace(' ','')
    li = list(s.split(","))
    return li

i = 0
for string in songs_cleaned['artist_terms']:
    list_ = ConvertStringtoList(string)
    songs_cleaned['artist_terms'][i] = list_
    i += 1

In [4]:
songs_cleaned.head()

Unnamed: 0,artist_id,song_id,artist_terms
0,ARMJAGH1187FB546F3,SOCIWDW12A8C13D406,"[blue-eyedsoul, poprock, blues-rock, beachmusi..."
1,ARXR32B1187FB57099,SOFSOCN12A8C143F5D,"[poppunk, skapunk, breakcore, alternativemetal..."
2,AR10USD1187B99F3F1,SOHKNRJ12A6701D1F8,"[post-hardcore, screamo, emo, hardcore, punkre..."
3,ARC43071187B990240,SOKEJEJ12A8C13E0D0,"[ccm, religiousmusic, losangeles, christianroc..."
4,ARL7K851187B99ACD2,SOMUYGI12AB0188633,"[bachata, merengue, reggaeton, latinpop, spani..."


In [5]:
# Read the user tastes' dataset

with ZipFile('users_cleaned.zip', 'r') as zipfile:
    data = zipfile.read('out.csv')

users_cleaned = pd.read_csv(BytesIO(data))
print(len(users_cleaned))
users_cleaned.head()

697064


Unnamed: 0,userID,songID,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOWEZSI12A81C21CE6,1
1,4bd88bfb25263a75bbdd467e74018f4ae570e5df,SODCXXY12AB0187452,2
2,b64cdd1a0bd907e5e00b39e345194768e330d652,SOLXDDC12A6701FBFD,1
3,b64cdd1a0bd907e5e00b39e345194768e330d652,SONQBUB12A6D4F8ED0,2
4,5a905f000fc1ff3df7ca807d57edb608863db05d,SOFKTPP12A8C1385CA,1


In [6]:
# How many different users there are
print("We have a total of", len(set(users_cleaned['userID'])), "users.")

# Let's take a subset of users to evaluate the performance of the various recommendation systems
nb_of_users = 5000

users_subset = users_cleaned[users_cleaned['userID'].isin(list(set(users_cleaned['userID']))[:nb_of_users])]
print("In the subset for the evaluation we have a total number of", len(set(users_subset['userID'])), "users.")

We have a total of 386670 users.
In the subset for the evaluation we have a total number of 5000 users.


In [7]:
users_subset.head()

Unnamed: 0,userID,songID,play_count
226,3fd3acaa8dfeb94b0602a33085b44ebe80545dd2,SOBRZCG12A6702187D,1
227,3fd3acaa8dfeb94b0602a33085b44ebe80545dd2,SONQBUB12A6D4F8ED0,1
262,c231bc806c239b1322421e66fc001822a9b2c2f0,SOBEVGM12A67ADBCA7,1
296,76bcebcaf7b1f20c857bb8a23d0030b086cf292f,SOTEFFR12A8C144765,1
330,9b392166d01817895c03dc190f4eff58153a25e3,SOCHPTV12A6BD53113,2


## Content based recommendation

### Represent songs as vectors

We use the feature artist_terms and implement one-hot-encoding. With one-hot-encoding, we convert each categorical value into a new categorical column and assign a binary value 1 or 0 to each feature if the term was initially in the terms of a song.

In [8]:
df_songs = songs_cleaned

In [9]:
# First, extract the set of artist_terms

all_terms = []
for row in range(len(df_songs)):
    if df_songs['artist_terms'][row] == []:
        print('vacia')
    all_terms.append(df_songs['artist_terms'][row])

all_terms = np.concatenate(all_terms)
all_terms = list(set(all_terms)) # we have gotten a list of the set of all artist_terms

d = len(all_terms) # dimension of the vectors we are representing
print('Songs will be represented as binary vectors of dimension', d)

Songs will be represented as binary vectors of dimension 2294


In [10]:
# Now, we intend to create a binary vector (length = d) that represents a song, 
# with 1s if the song has this term and 0s if it hasn't.

def vectorize(song):

    index = int(df_songs.index[df_songs['song_id'] == song][0])
    vector = np.zeros(len(all_terms))

    for i in range(len(vector)):
        if all_terms[i] in df_songs['artist_terms'][index]:
            vector[i] = 1
            
    return vector

In [11]:
# Vectorize ALL the songs and save it into a dictionary

vector_representation = {}

for song in df_songs['song_id']:
    vector_representation[song] = vectorize(song)
    
#vector_representation # we end up with a dictionary of songs with their vector representation

### Get the user profiles

We will represent each user as a vector of dimension d. 
The representation is the weighted average of the songs that the user has already listened to.

In [12]:
# Compute all the results only for the subset of 5000 users
user_plays = users_subset

In [13]:
# Get the User profile: that will be a vector of dimension d computed as the weighted average of his played songs
# Get user profile for all users

def getUserProfile(user):

    song_counts = list(user_plays[user_plays['userID'] == user]['play_count'])
    song_list = list(user_plays[user_plays['userID'] == user]['songID'])


    # Check if the user songs are in the song dataset and get the indices of the songs
    indices = []
    for song in song_list:
        if song in list(df_songs['song_id']):
            indices.append(song_list.index(song))
        else:
            continue


    # Get the User profile (i.e. Compute the (weighted) average of the songs of a user)
    a = [song_counts[i] for i in indices]
    b = [vector_representation[song_list[i]] for i in indices]

    numerator = np.zeros(d)
    for i in range(len(a)):
        numerator = numerator + ( a[i] * np.asarray(b[i]) )

    user_profile = numerator / sum(a)
    return user_profile

In [14]:
# Get user profiles for every user in the users_plays dataset
# If users_plays = users_cleaned --> Very slow cell (it lasts 1 day). Set to True to run this cell. Or, read the already created dictionary in the file user_profiles.pkl
# Otherwise if users_plays = users_subset, (since it is a subset) OK.

create_user_profiles = True

if create_user_profiles:
    users = set(list(user_plays['userID']))
    user_profiles = {}
    for user in tqdm(users):
        user_profiles[user] = getUserProfile(user)

  0%|          | 0/5000 [00:00<?, ?it/s]

In [15]:
# Save dictionary of user_profiles
# Set to True if you want to save the user_profiles dictionary as a separate file
if create_user_profiles:
    with open("user_profiles.pkl", "wb") as f:
        pickle.dump(user_profiles, f)

In [16]:
# Read user_profiles file and save it into a dictionary
if not create_user_profiles:
    with open("user_profiles.pkl", "rb") as a_file:
        user_profiles = pickle.load(a_file)
print("We have", len(user_profiles), "users in this dictionary.")

We have 5000 users in this dictionary.


### Get recommendation

We will recommend the songs of our dataset that are more similar to the user profile.

In [17]:
# Recommendations for all users

def getRecommendation(user):
    user_profile = user_profiles[user]
    
    scores_dict = {}
    for song in df_songs['song_id']:
        dist = np.linalg.norm(user_profile - vector_representation[song]) # euclidean distance
        scores_dict[song] = dist

    # R best recommendations
    R = 10
    recommended_songs = []
    for score in np.sort(list(scores_dict.values()))[0:R]:
        recommended_songs.append(list({i for i in scores_dict if scores_dict[i]==score}))

    recommended_songs = list(set(np.concatenate(recommended_songs)))

    return recommended_songs

In [18]:
# Create dictionary of user_recommendations
# Set to True if you want to create it again.
# If users_plays = users_cleaned --> Very slow cell (it takes forever)
# Otherwise if users_plays = users_subset, OK.
if True:
    user_recommendations = {}
    counter = 0
    for key in tqdm(user_profiles.keys()):
        if counter < 10000: # control parameter, if we have more than a certain nb of users, don't compute it all
            user_recommendations[key] = getRecommendation(key)
            counter += 1
        else:
            break

  0%|          | 0/5000 [00:00<?, ?it/s]

In [19]:
# Save dictionary of user_recommendations
# Set to True if you want to save the user_recommendations dictionary as a separate file
save_user_recommendations = True
if save_user_recommendations:
    with open("user_recommendations.pkl", "wb") as f:
        pickle.dump(user_recommendations, f)

In [20]:
# Read user_recommendations file and save it into a dictionary
if not save_user_recommendations:
    with open("user_recommendations.pkl", "rb") as a_file:
        user_recommendations = pickle.load(a_file)
    print("We have", len(user_recommendations), "users with their recommendations.")

### Evaluation of recommendation

We can use different methods:

#### Intra-list similarity

Intra-list similarity is the average cosine similarity of all items in a list of recommendations.
Intra-list similarity can be calculated for each user, and averaged over all users in the test set to get an estimate of intra-list similarity for the model.

In [21]:
# Cosine similarity
from numpy.linalg import norm
def cosine_similarity(song1, song2):
    a = vector_representation[song1]
    b = vector_representation[song2]
    dist = np.dot(a,b)/(norm(a)*norm(b))
    return dist

# Intra list similarity function
def intra_list_similarity(user):
    rec = user_recommendations[user]
    
    # All possible pairs in list
    pairs = [(a, b) for idx, a in enumerate(rec) for b in rec[idx + 1:]]
    
    # Compute the average distances between the pairs of the recommended songs
    distances = []
    for pair in pairs:
        distances.append(cosine_similarity(pair[0], pair[1]))
    
    return np.mean(distances)

In [22]:
# Get the intra-list similarity of the model
# Average of all intra list similarities

intra_list_similar = []
for user in user_recommendations.keys():
    intra_list_similar.append(intra_list_similarity(user))

print("The intra list similarity for Content based model is:" , np.mean(intra_list_similar))

The intra list similarity for Content based model is: 0.5209447271563011


#### Personalization (in progress)

https://towardsdatascience.com/evaluation-metrics-for-recommender-systems-df56c6611093

#### Evaluation that I explain in the whatsapp video (Ale)

In [23]:
# Compute the average of the distances of the recommended songs to the user profile 
# Distance can be calculated by euclidean or cosine similarity

def average_distance(user):
    rec = user_recommendations[user]
    user_profile = user_profiles[user]

    distances = []
    for song in rec:
        a = vector_representation[song]
        distances.append(np.linalg.norm(a-user_profile)) # euclidean distance
    return np.mean(distances)

In [24]:
# Get the average similarity of the model
# Average of all similarities

average_similar = []
for user in user_recommendations.keys():
    average_similar.append(average_distance(user))

print("The average similarity for Content based model is:" , np.mean(average_similar))

The average similarity for Content based model is: 3.120989606575042


# Item based collaborative filtering (Angeliki)

<sup>Inspired by https://github.com/csaluja/JupyterNotebooks-Medium/blob/master/CF%20Recommendation%20System-Examples.ipynb<sup>

Load libraries

In [25]:
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import pdist, squareform

Read-in the data and create the utility matrix

In [26]:
# create the utility matrix
utility_matrix = users_cleaned.pivot(index='userID', columns='songID', values='play_count')

# Includes 386670 users and 3195 songs
utility_matrix.shape 

(386670, 3195)

In [27]:
# Narrow down to 10k users and replace the NaN with 0s.

df1 = utility_matrix[utility_matrix.index.isin(list(users_subset['userID']))] # the 5000 users subset
df2 = utility_matrix[~utility_matrix.index.isin(list(users_subset['userID']))][:5000] # other 5000 subset
u1 = pd.concat([df1, df2])
np.nan_to_num(u1,copy=False)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [28]:
# users and songs to experiment with
user = '06b4caaf4dcc2476b5ac096f08f4356b6ba9a86a'
#user = '00038cf792e9f9a1cb593dea5779f96195aac68c'
#user = '0002b896949cb2899feaed47104406e99eafa983'
song = 'SOAPNML12A8C13B696'
#song = 'SOSHUVD12A6701F8F9'

samplelist = list(users_subset['userID'][:10])

**I am implementing item based collaborative filtering as it outperforms user based and items are simpler than user tastes**

### Recommend 10 songs to a list of users using adjusted cosine correlation

Difference between Pearson's correlation and adjusted cosine correlation:

   - In pearson correlation, the mean which subtracted is about the particular item itself (ratings from all users), mean(Ri)
   - In adjusted cosine correlation, the mean is about the particular user (ratings to all items), mean(Ru)



In [29]:
# This function computes a adjusted cosine correlation matrix from a utility matrix
def get_adj_cosine_M(utility_matrix):
    M = utility_matrix.to_numpy()
    M_u = M.mean(axis=1)
    item_mean_subtracted = M - M_u[:, None]
    similarity_matrix = 1 - squareform(pdist(item_mean_subtracted.T, 'cosine'))
    
    return pd.DataFrame(similarity_matrix, index=utility_matrix.columns, columns=utility_matrix.columns)

In [30]:
adjcos_sim = get_adj_cosine_M(u1)
adjcos_sim.head()

songID,SOAAAQN12AB01856D3,SOAANKE12A8C13CF5C,SOAASSD12AB0181AA6,SOABLAF12AB018E1D9,SOABRXK12A8C130A36,SOABTKM12A8AE4721E,SOABVPU12AB018AA22,SOABVWD12A58A7C3FF,SOACEDS12A6701EAAA,SOACFRH12A8C13E183,...,SOZWCKB12AB0186C5B,SOZWECJ12A6D4F5229,SOZWVCA12A6D4F9774,SOZXHBQ12AB0186626,SOZXTKD12A8C13FC43,SOZYPNV12A6701E3B8,SOZYZDZ12AB01873CA,SOZZPYH12AB0187578,SOZZQBH12A6D4FAFD8,SOZZVMW12AB0183B52
songID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SOAAAQN12AB01856D3,1.0,0.048477,0.048477,-0.001083,0.003297,-0.001342,0.048477,0.048477,0.001969,0.048477,...,0.004875,0.004948,0.014631,0.014579,0.048477,0.048477,0.048477,4.6e-05,0.048477,0.00172
SOAANKE12A8C13CF5C,0.048477,1.0,1.0,-0.013425,0.069061,-0.019706,1.0,1.0,0.045263,1.0,...,0.101682,0.105709,0.302511,0.301665,1.0,1.0,1.0,0.009334,1.0,0.03967
SOAASSD12AB0181AA6,0.048477,1.0,1.0,-0.013425,0.069061,-0.019706,1.0,1.0,0.045263,1.0,...,0.101682,0.105709,0.302511,0.301665,1.0,1.0,1.0,0.009334,1.0,0.03967
SOABLAF12AB018E1D9,-0.001083,-0.013425,-0.013425,1.0,-0.00108,-0.000901,-0.013425,-0.013425,-0.001284,-0.013425,...,-0.001529,-0.001949,-0.004162,-0.004184,-0.013425,-0.013425,-0.013425,-0.001348,-0.013425,-0.001143
SOABRXK12A8C130A36,0.003297,0.069061,0.069061,-0.00108,1.0,-0.001498,0.069061,0.069061,0.003046,0.069061,...,0.007003,0.007238,0.02088,0.020817,0.069061,0.069061,0.069061,0.000501,0.069061,0.002668


In [31]:
# This function finds k similar songs given songID and adjusted cosine matrix
def get_similar_songs_adjcosine(songID, adj_sim_m , k):
    '''Find k similar songs given songID and adjusted cosine matrix '''
    
    # sort the similarities and grab k highest values
    similarities = adj_sim_m [songID].sort_values(ascending=False)[:k+1].values
    # grab the songIDs
    indices = adj_sim_m [songID].sort_values(ascending=False)[:k+1].index
    
    print('{} most similar items for item {}:\n'.format(k,songID))
    for i in range(0, len(indices)):
            #first index is songID by default
            if indices[i] == songID:
                continue;

            else:
                print('{}: Song {} , with similarity of {}'.format(i,indices[i], similarities[i]))
        
    return similarities ,indices.to_list()

In [32]:
#This function recommends 10 songs based on item-item collaborative filtering
# given a list of users and a utility matrix (database)
def recommend10Items(user_list, database):
    result = dict()
    # compute adjusted cosine similarity matrix
    sim_matrix = get_adj_cosine_M(database)
    
    for userID in user_list:
        # find row corresponding to user in database
        idx = database.index.get_loc(userID)
        #get the top 5 songs that he already likes
        likes = database.iloc[idx, np.argsort(-database.values[idx])[:5]].index
        
        for songID in likes:
            similarities, recommendations = get_similar_songs_adjcosine(songID, sim_matrix, 1)
            
            if userID not in result:
                result[userID] = recommendations
            elif type(result[userID]) == list:
                result[userID].append(recommendations)
            else:
                result[userID] = [result[userID], recommendations]
    
    return result

In [33]:
get_recom = recommend10Items(samplelist, u1)

# flatten the recommendation list
from pandas.core.common import flatten

for user, rec in get_recom.items():
    get_recom[user] = list(flatten(rec))

user_recommendations = get_recom
user_recommendations

1 most similar items for item SOBRZCG12A6702187D:

1: Song SOZQSGL12AF72A9145 , with similarity of 0.28970648489491746
1 most similar items for item SONQBUB12A6D4F8ED0:

1: Song SOCHRXB12A8AE48069 , with similarity of 0.25577003422799693
1 most similar items for item SOAAAQN12AB01856D3:

1: Song SOYJNHO12AB01856DC , with similarity of 0.5735637332552541
1 most similar items for item SOQJPYF12AF72AA8E2:

1: Song SOHTWLT12A8C13CFE1 , with similarity of 0.13713815953117536
1 most similar items for item SOQJWZI12A8C140181:

0: Song SOLTAOU12A8C1375CB , with similarity of 1.0
1: Song SOLEIOS12AB018372B , with similarity of 1.0
1 most similar items for item SOBRZCG12A6702187D:

1: Song SOZQSGL12AF72A9145 , with similarity of 0.28970648489491746
1 most similar items for item SONQBUB12A6D4F8ED0:

1: Song SOCHRXB12A8AE48069 , with similarity of 0.25577003422799693
1 most similar items for item SOAAAQN12AB01856D3:

1: Song SOYJNHO12AB01856DC , with similarity of 0.5735637332552541
1 most similar

{'3fd3acaa8dfeb94b0602a33085b44ebe80545dd2': ['SOBRZCG12A6702187D',
  'SOZQSGL12AF72A9145',
  'SONQBUB12A6D4F8ED0',
  'SOCHRXB12A8AE48069',
  'SOAAAQN12AB01856D3',
  'SOYJNHO12AB01856DC',
  'SOQJPYF12AF72AA8E2',
  'SOHTWLT12A8C13CFE1',
  'SOLTAOU12A8C1375CB',
  'SOLEIOS12AB018372B',
  'SOBRZCG12A6702187D',
  'SOZQSGL12AF72A9145',
  'SONQBUB12A6D4F8ED0',
  'SOCHRXB12A8AE48069',
  'SOAAAQN12AB01856D3',
  'SOYJNHO12AB01856DC',
  'SOQJPYF12AF72AA8E2',
  'SOHTWLT12A8C13CFE1',
  'SOLTAOU12A8C1375CB',
  'SOLEIOS12AB018372B'],
 'c231bc806c239b1322421e66fc001822a9b2c2f0': ['SOBEVGM12A67ADBCA7',
  'SOHPAVE12A8AE47190',
  'SOAAAQN12AB01856D3',
  'SOYJNHO12AB01856DC',
  'SOQJLFV12AB01897C7',
  'SOLTAOU12A8C1375CB',
  'SOQJPYF12AF72AA8E2',
  'SOHTWLT12A8C13CFE1',
  'SOLTAOU12A8C1375CB',
  'SOLEIOS12AB018372B'],
 '76bcebcaf7b1f20c857bb8a23d0030b086cf292f': ['SOTEFFR12A8C144765',
  'SOREYUK12A58A7A253',
  'SOAAAQN12AB01856D3',
  'SOYJNHO12AB01856DC',
  'SOQJLFV12AB01897C7',
  'SOLTAOU12A8C1375CB',
  

#### Intra-list similarity

Intra-list similarity is the average cosine similarity of all items in a list of recommendations.
Intra-list similarity can be calculated for each user, and averaged over all users in the test set to get an estimate of intra-list similarity for the model.

In [34]:
# Get the intra-list similarity of the model
# Average of all intra list similarities

intra_list_similar = []
for user in user_recommendations.keys():
    intra_list_similar.append(intra_list_similarity(user))

print("The intra list similarity for Item based collaborative filtering model is:" ,np.mean(intra_list_similar))

The intra list similarity for Item based collaborative filtering model is: 0.21325619488889447


#### Evaluation that I explain in the whatsapp video (Ale)

In [35]:
# Get the average similarity of the model
# Average of all similarities

average_similar = []
for user in user_recommendations.keys():
    average_similar.append(average_distance(user))

print("The average similarity for Item based collaborative filtering model is:" , np.mean(average_similar))

The average similarity for Item based collaborative filtering model is: 5.660672626625526


### Recommend 10 songs to a list of users by implementig kNN search for item based filtering

In [36]:
def get_similar_songs_kNN(songID, user_profiles, similarity_metric , k):
    '''Find k most similar songs to a given songID'''
    similarity = list()
    neigh_ind = list()
    song_profiles=user_profiles.T
    
    knn = NearestNeighbors(metric = similarity_metric , algorithm = 'brute')
    knn.fit(song_profiles.values) #taking .values to avoid sklearn warning
                                #UserWarning: X does not have valid feature names, but NearestNeighbors was fitted with feature names
    
    neigh_dist, neigh_ind = knn.kneighbors(song_profiles.loc[songID].values.reshape(1,-1), n_neighbors = k+1) #plus one, bcs it includes the user we want to compare against 
    similarity = 1-neigh_dist.flatten()
    
    similar_songs = []
    for i in range(0,len(neigh_ind.flatten())):
        if song_profiles.index[neigh_ind.flatten()[i]] == songID:
            continue;
        else:
            similar_songs.append(song_profiles.index[neigh_ind.flatten()[i]])
            
    return similar_songs

In [37]:
def recommend10Items_kNN(user_list, database):
    result = dict()
    
    for userID in user_list:
        # find row corresponding to user in database
        idx = database.index.get_loc(userID)
        #get the top 5 songs that he already likes
        likes = database.iloc[idx, np.argsort(-database.values[idx])[:5]].index
        
        for songID in likes:
            simsongs= get_similar_songs_kNN(songID, database, 'cosine', 2)
            
            if userID not in result:
                result[userID] = simsongs
            elif type(result[userID]) == list:
                result[userID].append(simsongs)
            else:
                result[userID] = [result[userID], simsongs]
    
    return result

In [38]:
get_recom_kNN = recommend10Items_kNN(samplelist, u1)
get_recom_kNN

# flatten the recommendation list
for user, rec in get_recom_kNN.items():
    get_recom_kNN[user] = list(flatten(rec))

user_recommendations = get_recom_kNN
user_recommendations

{'3fd3acaa8dfeb94b0602a33085b44ebe80545dd2': ['SOZQSGL12AF72A9145',
  'SOBSEGK12A58A7BEBF',
  'SOCHRXB12A8AE48069',
  'SOIHJSD12A6701EB04',
  'SOYJNHO12AB01856DC',
  'SOSPNDI12AB017F769',
  'SOHTWLT12A8C13CFE1',
  'SOQFUXL12A8C136D6B',
  'SOAAAQN12AB01856D3',
  'SOAANKE12A8C13CF5C',
  'SOAASSD12AB0181AA6',
  'SOZQSGL12AF72A9145',
  'SOBSEGK12A58A7BEBF',
  'SOCHRXB12A8AE48069',
  'SOIHJSD12A6701EB04',
  'SOYJNHO12AB01856DC',
  'SOSPNDI12AB017F769',
  'SOHTWLT12A8C13CFE1',
  'SOQFUXL12A8C136D6B',
  'SOAAAQN12AB01856D3',
  'SOAANKE12A8C13CF5C',
  'SOAASSD12AB0181AA6'],
 'c231bc806c239b1322421e66fc001822a9b2c2f0': ['SOHPAVE12A8AE47190',
  'SOKXYUW12A8C140229',
  'SOYJNHO12AB01856DC',
  'SOSPNDI12AB017F769',
  'SOAANKE12A8C13CF5C',
  'SOAASSD12AB0181AA6',
  'SOHTWLT12A8C13CFE1',
  'SOQFUXL12A8C136D6B',
  'SOAAAQN12AB01856D3',
  'SOAANKE12A8C13CF5C',
  'SOAASSD12AB0181AA6'],
 '76bcebcaf7b1f20c857bb8a23d0030b086cf292f': ['SOREYUK12A58A7A253',
  'SOTPQFM12AB017AC9E',
  'SOYJNHO12AB01856DC',
  

#### Intra-list similarity

Intra-list similarity is the average cosine similarity of all items in a list of recommendations.
Intra-list similarity can be calculated for each user, and averaged over all users in the test set to get an estimate of intra-list similarity for the model.

In [39]:
# Get the intra-list similarity of the model
# Average of all intra list similarities

intra_list_similar = []
for user in user_recommendations.keys():
    intra_list_similar.append(intra_list_similarity(user))

print("The intra list similarity for Item based collaborative filtering by implementing kNN model is:" ,np.mean(intra_list_similar))

The intra list similarity for Item based collaborative filtering by implementing kNN model is: 0.13883288337015995


#### Evaluation that I explain in the whatsapp video (Ale)

In [40]:
# Get the average similarity of the model
# Average of all similarities

average_similar = []
for user in user_recommendations.keys():
    average_similar.append(average_distance(user))

print("The average similarity for Item based collaborative filtering by implementing kNN model is:" , np.mean(average_similar))

The average similarity for Item based collaborative filtering by implementing kNN model is: 6.164115742941303


# User-user Collaborative filtering

The original file contains
- 1,019,318 unique users
- 48,373,586 user-song.play count triplets

A subset of 50000 triplets can be found in triplets_50000.txt, where each line is in the format:
    
    userID \tab songID \tab play_count

Read in the data:

In [None]:
user_profiles = pd.read_csv('data/triplets_1000.txt', sep='\t', names = ['userID','songID', 'play_count'])

The problem: the original dataset of triplets is too large to be converted in this way.
Possible solutions:
1. dtype optimization
2. Split data into chunks

In [None]:
user_profiles

Pivot to tranform the data from long to wide:

In [None]:
user_profiles = user_profiles.pivot(index='userID', columns='songID', values='play_count')
user_profiles

Drop the columns where all elements are NaN

In [None]:
user_profiles = user_profiles.dropna(axis=1, how='all') #doesn't make sense, a song will only exist if a user has listend to it
user_profiles

In [None]:
# Replace the NaN with 0s.
user_profiles = user_profiles.fillna(0)

In [None]:
#save it as a csv (do it only once)
#user_profiles.to_csv(path_or_buf= 'user_profile_from_50000_triplets.csv')

Get **cosine similarity** for play counts between users

In [None]:
# pairwise_distances is the distance between counts, thus 1 - pairwise_distances is the similarity between counts
cosine_sim = 1-pairwise_distances(user_profiles , metric="cosine")

In [None]:
# Calculate the cosine similarity matrix for the users
M_cosine = pd.DataFrame(cosine_sim)
M_cosine

Get **pearson similarity** for all users

In [None]:
pearson_sim = 1-pairwise_distances(user_profiles, metric="correlation")
M_pearson = pd.DataFrame(pearson_sim)
M_pearson

Same for euclidean and hamming :

In [None]:
euclidean_sim = 1-pairwise_distances(user_profiles, metric="euclidean")
M_euclidean = pd.DataFrame(euclidean_sim)

hamming_sim = 1-pairwise_distances(user_profiles, metric="hamming")
M_hamming = pd.DataFrame(hamming_sim)

## Find k similar users to a given user

A function that finds k similar users given userID and the user_profiles matrix

In [None]:
def get_similarusers(userID, user_profiles, similarity_metric , k):
    '''Find k most similar users to a given userID'''
    similarity = list()
    neigh_ind = list()
    
    knn = NearestNeighbors(metric = similarity_metric , algorithm = 'brute')
    knn.fit(user_profiles.values) #taking .values to avoid sklearn warning
                                #UserWarning: X does not have valid feature names, but NearestNeighbors was fitted with feature names
    
    neigh_dist, neigh_ind = knn.kneighbors(user_profiles.loc[userID].values.reshape(1,-1), n_neighbors = k+1) #plus one, bcs it includes the user we want to compare against 
    similarity = 1-neigh_dist.flatten()
    print('{} most similar users to user {}, using {} similarity:\n'.format(k, userID, similarity_metric))
    
    for i in range(0,len(neigh_ind.flatten())):
        if user_profiles.index[neigh_ind.flatten()[i]] == userID:
            continue;
        else:
            print('{}: User {}, with similarity of {}'.format(i, user_profiles.index[neigh_ind.flatten()[i]], similarity.flatten()[i]))
            
    return similarity,neigh_ind

In [None]:
# Using cosine similarity
similarities,indices = get_similarusers( '5a905f000fc1ff3df7ca807d57edb608863db05d', user_profiles , similarity_metric = 'cosine', k = 4)

In [None]:
# Using correlation similarity
similarities,indices = get_similarusers( '5a905f000fc1ff3df7ca807d57edb608863db05d', user_profiles , similarity_metric = 'correlation', k = 4)

## Predict play count for a user-song combination based on user-user

In [None]:
def predict_play_count_uu(userID, songID, user_profiles, similarity_metric, k):
    '''Predict play count for a particular user-song tuple, based on user-to-user similarity. Use with cosine similarity.'''
    prediction = 0
    similarity, indices = get_similarusers(userID, user_profiles, similarity_metric, k) #similar users based on cosine similarity
    # get mean play count for a user, to adjust

    mean_play_count = user_profiles.loc[userID, :].mean() 
    # weight_i is the similarity of neigbhor_i to user X
    sum_of_similarity = np.sum(similarity) - 1 # -1 because user 1 is included, has a similarity of 1
    
    # initializing variables
    w_similarity = 1
    weighted_sum = 0
    
    for i in range(0, len(indices.flatten())):
        if user_profiles.index[indices.flatten()[i]] == userID:
            continue;
        else:
            # Normalize ratings for a given user by subtracting row mean (centered cosine, or pearson cor)
            try:
                songidx = user_profiles.columns.get_loc(songID)
            except KeyError:
                print(f'Warning: song {songID} not found for user {user_profiles.index[indices.flatten()[i]]}')
                continue
            try:
                person = indices.flatten()[i]
            except KeyError:
                print(f'Warning: user {person} not found')
                continue
            play_count_dif = user_profiles.iloc[person, songidx] - np.mean(user_profiles.iloc[indices.flatten()[i],:])
               
            w_similarity = play_count_dif*similarity[i]
            weighted_sum += w_similarity
            
    prediction = mean_play_count + (weighted_sum/sum_of_similarity)
    print('Predicted rating for user {} -> song {}: {}'.format(userID, songID, prediction))

    return prediction

In [None]:
predict_play_count_uu('5a905f000fc1ff3df7ca807d57edb608863db05d', 'SOZZYAO12A6701FF36', user_profiles, 'cosine', 4 )

In [None]:
# filter out low play songs to get better results?
sum_col = user_profiles.sum(axis = 0)

In [None]:
print(max(sum_col))
print(min(sum_col))
print(statistics.median(sum_col))
print(statistics.mean(sum_col))

In [None]:
# how many columns have a sum of 2 or less play counts
ignore_indexes, count= [], 0
for i, value in enumerate(sum_col):
    if value <= 2:
        count += 1
        ignore_indexes.append(i)
print(count)
print(len(ignore_indexes))

In [None]:
user_profiles.shape

In [None]:
# drop columns based on index list created above
df2 = user_profiles.drop(user_profiles.iloc[:, ignore_indexes], axis = 1)

In [None]:
df2.shape

In [None]:
27898 - 16601

In [None]:
df2

In [None]:

# now repeat prediction for user ff4322e94814d3c7895d07e6f94139b092862611 and song SOAADCB12A81C22AFA
predict_play_count_uu('b80344d063b5ccb3212f76538f3d9e43d87dca9e', 'SOAADCB12A81C22AFA', df2, 'cosine', 10 )


# Item-Item collaborative filtering

Read in the data:

In [None]:
# load sparse preprocessed pandas dataframe if available 
if os.path.exists('data/df.pkl'): 
    with open('data/df.pkl', 'rb') as f:
        user_profiles = pickle.load(f)

In [None]:
if not 'user_profiles' in globals():
    frame = pd.read_csv('triplets_1000.txt', sep='\t', names = ['userID','songID', 'play_count'])

In [None]:
if not 'user_profiles' in globals():
    person_u = list(frame.userID.unique())
    thing_u = list(frame.songID.unique())

    data = frame['play_count'].tolist()
    row = frame.userID.astype('category').cat.codes
    col = frame.songID.astype('category').cat.codes
    sparse_matrix = csr_matrix((data, (row, col)), shape=(len(person_u), len(thing_u)))
    user_profiles = pd.DataFrame.sparse.from_spmatrix(sparse_matrix, index=person_u, columns=thing_u)
user_profiles

In [None]:
if not os.path.exists('data/df.pkl'): 
    with open('data/df.pkl', 'wb') as f:
        pickle.dump(user_profiles, f)

In [None]:
# http://millionsongdataset.com/sites/default/files/AdditionalFiles/unique_tracks.txt
songs = pd.read_csv('data/unique_tracks.txt' ,sep='<SEP>', names=['track_id',  'song_id',  'artist_name', 'song_title'], engine='python') 
songs.head()

In [None]:
id_to_song_name = defaultdict(lambda : 'NA', zip(songs.song_id, songs.song_title))
unique_songs = user_profiles.columns
names = []
for song in unique_songs:
    names.append(id_to_song_name[song]) 
# check number of missing songs
a = np.array(names)
a[a == 'NA'].size

In [None]:
song_profiles = user_profiles.T
# song_profiles = song_profiles.fillna(0)
song_profiles = song_profiles#.sparse.to_dense()
song_profiles

In [None]:
def get_similar_songs(songID, song_profiles, similarity_metric, k):
    '''Find k most similar users to a given userID'''
    similarity = list()
    neigh_ind = list()
    
    knn = NearestNeighbors(metric = similarity_metric, algorithm = 'brute')
    knn.fit(song_profiles.values) #taking .values to avoid sklearn warning
                                #UserWarning: X does not have valid feature names, but NearestNeighbors was fitted with feature names
    
    neigh_dist, neigh_ind = knn.kneighbors(song_profiles.loc[songID].values.reshape(1,-1), n_neighbors = k+1) #plus one, bcs it includes the user we want to compare against 
    similarity = 1-neigh_dist.flatten()
    print('{} most similar song to song {}, using {} similarity:\n'.format(k, id_to_song_name[songID], similarity_metric))
    
    for i in range(0,len(neigh_ind.flatten())):
        song_id = song_profiles.index[neigh_ind.flatten()[i]]
        if song_id == songID:
            continue;
        else:
            print('{}: song {}, with similarity of {}'.format(i, id_to_song_name[song_id], similarity.flatten()[i]))
            
    return similarity, neigh_ind

In [None]:
song = 'SOAARXR12A8C133D15'
print('song name: ', id_to_song_name[song])
similarities,indices = get_similar_songs(song, song_profiles, similarity_metric = 'correlation', k = 15)

In [None]:
def random_recommendation(song, data, n):
    '''Randomly recommend n songs to a user'''
    # get a list of all the songs
    all_songs = np.array(data.index)
    # randomly sample n songs
    random_song_ids = np.random.randint(0, len(all_songs), n)
    rec_songs = all_songs[random_song_ids]
    query_songidx = np.where(data.index == song)[0][0]
    sims = []
    for i in range(len(rec_songs)):
        # maybe a problem that it uses the cosine similarity, but pearson is not implemented for paired distances in this way
        sim = 1-paired_distances(np.array(data.iloc[query_songidx,:]).reshape(1, -1), np.array(data.iloc[random_song_ids[i],:]).reshape(1, -1), method='cosine')
        sims.append(sim)
    song_id = song_profiles.index[random_song_ids.flatten()]
    rec_songs = [(id_to_song_name[song_id[i]], sims[i][0]) for i in range(len(sims))]
    return rec_songs
song = 'SOAARXR12A8C133D15'
print('query',id_to_song_name[song])
random_recommendation(song, song_profiles, 10)

# Locality Sensitive hashing in Collaborative item-item filtering (inspired by week5 : SimilarItems)

Explain:
- LSH vs KNN, and why LSH is more efficient
- Present a scheme of how LSH algorithm operates
- Why we use Jaccard distance here (and how we make the decision for binary outcome, "likes":1/"dislikes":0 )

In [None]:
################################ Alejandra's ############################################
if not os.path.exists('data/MillionSongSubset.pkl'):
    path = 'MillionSongSubset'
    songs_list = []

    for (root, dirs, file) in os.walk(path):
        for f in file:
            songs_list.append(os.path.dirname(f))
    ## READ DATA PATH FROM FILE
    songs_file_paths = []

    for root, dirs, files in os.walk(os.path.abspath(path)):
        for file in files:        
            strs = os.path.join(root, file)
            new_strs = strs.replace('\\','/')
            songs_file_paths.append(new_strs)
            
    ### CREATE PANDAS TABLE

    N = len(songs_file_paths)
    data = []

    for i in tqdm(range(N)):
        record = []
        
        # Open specific song path
        h5 = hdf5_getters.open_h5_file_read(songs_file_paths[i])
        artist_id = hdf5_getters.get_artist_id(h5)
        artist_id = artist_id.decode("utf-8")   
        
        song_id = hdf5_getters.get_song_id(h5)
        song_id = song_id.decode("utf-8")
        
        song_name = hdf5_getters.get_title(h5)
        song_name = song_name.decode("utf-8")
        artist_terms_ = hdf5_getters.get_artist_terms(h5)
        artist_terms = []
        for j in range(len(artist_terms_)):
            artist_terms.append(artist_terms_[j].decode("utf-8"))
        
        # Close file
        h5.close()
        record.append(artist_id)
        record.append(song_id)
        record.append(song_name)
        record.append(artist_terms)
        data.append(record)
        
    df = pd.DataFrame(data, columns=['artist_id','song_id','song_name','artist_terms'])

In [None]:
if not os.path.exists('data/MillionSongSubset.pkl'):
    with open('data/MillionSongSubset.pkl', 'wb') as f:
        pickle.dump(df, f)
else:
    with open('data/MillionSongSubset.pkl', 'rb') as f:
        df = pickle.load(f)

In [None]:
df.loc[df['song_id'] == 'SOOWVHQ12A8AE476A1']['song_name']

Read in the data:
    
    userID \tab songID \tab play_count

In [None]:
u_matrix = pd.read_csv('data/triplets_50000.txt', sep='\t', names = ['userID','songID', 'play_count'])

Pivot to tranform the data from long to wide:

In [None]:
u_matrix = u_matrix.pivot(index="userID", columns="songID", values="play_count")

In [None]:
u_matrix[356:370]

In [None]:
# u_m = u_matrix.fillna(0) # Replace the NaN with 0s.

In [None]:
total_count_list = u_matrix.sum(axis=1, skipna=True)

In [None]:
print("The greatest listener's total play count is", np.max(total_count_list))
print("The lowest total play count of a user is", np.min(total_count_list))

### Way num 1: normalize users and tranform to 0, 1

So it's a good idea, before we start, to normalize for "big" and "low" listeners:

In [None]:
u_normalized = u_matrix.sub(u_matrix.mean(axis=1, skipna=True), axis=0) # substract from each cell the row mean

In [None]:
u_normalized.shape

In [None]:
u_normalized.head()

Lets use the rule:

    if the norm play count is < 0 == the user didn't like the song
    if the norm play count is > 0 == the user likes the song

In [None]:
# u_binary = np.where(u_normalized[u_normalized.columns] < 0, 0, 1)

In [None]:
df_ = u_normalized.copy()

In [None]:
df_[df_ >= 0] = 1
df_[df_ < 0] = 0

In [None]:
df_.shape

In [None]:
df_ = df_.T
df_

In [None]:
# Transform the data for the LSH algo
start_time = time.time()
cols = df_.columns.to_numpy() # the users 
vectors_list = [cols[x].tolist() for x in df_.eq(1).to_numpy()] # each vector is a song, contains users that liked that song
print('It took %s seconds.' %(time.time()-start_time))

In [None]:
# sanity check
len(vectors_list) # should be equal to the 27898 columns (songs)

In [None]:
df_new = pd.DataFrame(vectors_list, index = df_.index)
# df_new.shape # (27898, 49)


In [None]:
# drop rows with none
df_new_reduced = df_new.mask(df_new.eq('None')).dropna(how = 'all') 

In [None]:
df_new_reduced['users'] = df_new_reduced[df_new_reduced.columns].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)
df_new_reduced

In [None]:
data = df_new_reduced['users'].copy()

In [None]:
data = pd.DataFrame(data)

In [None]:
data.head()

Choose parameters:

In [None]:
#Number of Permutations
permutations = 128

#Number of Recommendations to return
num_recommendations = 1

Create MiniHash forest

In [None]:
#Preprocess will split a string of text into individual tokens/shingles based on ",".
def preprocess(text):
    tokens = text.split()
    return tokens


In [None]:
def get_forest(data, perms):
    start_time = time.time()
    
    minhash = []
    
    for users in data['users']:
        tokens = preprocess(users) # list of users
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf8'))
        minhash.append(m)
        
    forest = MinHashLSHForest(num_perm=perms)
    
    for i,m in enumerate(minhash):
        forest.add(i,m)
        
    forest.index()
    
    print('It took %s seconds to build forest.' %(time.time()-start_time))
    
    return forest


In [None]:
forest = get_forest(data, permutations)

In [None]:
def predict(song_profile, database, perms, num_results, forest): # song_profile in list form
    start_time = time.time()
    
    m = MinHash(num_perm=perms)
    for users in song_profile:
        m.update(users.encode('utf8'))
        
    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None # if your query is empty, return none
    
    result = database.iloc[idx_array]['users']
    
    print('It took %s seconds to query forest.' %(time.time()-start_time))
    
    return result


In [None]:
num_recommendations = 20
song_profile = [ '5d5e0142e54c3bb7b69f548c2ee55066c90700eb'] # i made this random profile of an imaginary user manually, make it work with songID
result = predict(song_profile, data, permutations, num_recommendations, forest)
print('\n Top Recommendation(s) is(are) \n', result)


In [None]:
df

In [None]:

for song in result.index:
    print(df.loc[df['song_id'] == song]['song_name'])

## Way num. 2 pick for each song the top n users 

In [None]:
n = 10
t3 = u_matrix.T.apply(lambda x: pd.Series(x.nlargest(n).index), axis=1)

In [None]:
print(u_matrix.T.shape,'\n', t3.shape)

In [None]:
t3.head(10)

In [None]:
# Tranform in the prefered format for the function 
c = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
t3['users'] = t3.iloc[:, :].apply(",".join, axis=1)

In [None]:
t3.head()

In [None]:
data2 = t3.drop(t3.columns[range(0,10)], axis=1)

In [None]:
data2

Now we are ready to use the LSH function

In [None]:
forest2 = get_forest(data2, permutations)

In [None]:
num_recommendations = 10
user_profile = [ '00498f4bab2bfeb17680113c7d9525ad5b0ad401'] # i made this random profile of an imaginary user manually, make it work with songID
result2 = predict(song_profile, data2, permutations, num_recommendations, forest2)
print('\n Top Recommendation(s) is(are) \n', result2)

# Apriori algorithm based recommendation system


https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/#apriori-frequent-itemsets-via-the-apriori-algorithm

https://rasbt.github.io/mlxtend/

Week 6 exercises

In [None]:
#df = pd.DataFrame(data, columns=['artist_id', 'artist_name', 'artist_location', 'song_id', 'song_name', 'song_hottness','time_signature','artist_terms','artist_mbtags','mode','year','latitude','longitude'])
with open('data/MillionSongSubset.pkl', 'rb') as f:
    df = pickle.load(f)
df

In [None]:
df1 = df[df.duplicated(['artist_id'], keep=False)]
df1 

In [None]:
baskets = df1['artist_terms'].tolist()
items = set(list(np.concatenate(baskets).flat))
len(items)

In [None]:
### hash all singletons
df_item_hash = pd.DataFrame(range(len(items)), index = list(items), columns =['hashcode'], dtype=int)
df_item_hash

In [None]:
### count the items, store the count into the hashed array index
item_count_arr = np.zeros((len(baskets),1))

for b in baskets:
    for item in b:
            idx = df_item_hash.loc[item,'hashcode']
            item_count_arr[idx] += 1
            
### find frequent items with support > s1 (here s1 = 0.02), and hash back from array index to items           
freq_items = [df_item_hash[df_item_hash['hashcode']==x].index[0] for x in np.where(item_count_arr > 0.02*len(baskets))[0]] 
print(len(freq_items))

In [None]:
df_freq_item_hash = pd.DataFrame(range(1,len(freq_items)+1), index=freq_items, columns=['hashcode'])
df_freq_item_hash

In [None]:
pair_mat_hashed = np.zeros((len(freq_items)+1,len(freq_items)+1))

In [None]:
for b in baskets:
    cand_list = [item for item in b if item in freq_items]
    if len(cand_list)<2:
        continue
    for idx, item1 in enumerate(cand_list):
        for item2 in cand_list[idx+1:]:
            i = df_freq_item_hash.loc[item1,'hashcode'] 
            j = df_freq_item_hash.loc[item2,'hashcode'] 
            pair_mat_hashed[max(i,j),min(i,j)]+=1

# pair_mat
pair_mat_hashed

In [None]:
freq_pairs = [[df_freq_item_hash[df_freq_item_hash['hashcode']==x].index[0], df_freq_item_hash[df_freq_item_hash['hashcode']==y].index[0]] for x, y in zip(*np.where(pair_mat_hashed > 0.02*len(baskets)))]
freq_pairs[0:10]

In [None]:
te = TransactionEncoder()
te_ary = te.fit(baskets).transform(baskets)
df2 = pd.DataFrame(te_ary, columns=te.columns_)
df2

In [None]:
apr = apriori(df2, min_support=0.05, use_colnames=True)

In [None]:
apr['length'] = apr['itemsets'].apply(lambda x: len(x))
apr

In [None]:
ml_freq_pairs = []
for i in apr[apr['length']==5].itemsets.values:
    ml_freq_pairs.append(list(i))

print(len(ml_freq_pairs))

In [None]:
artist_terms_df = df['artist_terms']
artist_terms_df

In [None]:
### GET USERS TASTE
#triples

user_plays = pd.read_csv('data/triplets_50000.txt', sep='\t', names = ['userID','songID', 'play_count'])

In [None]:
user_plays

In [None]:
all_user_songs = user_plays['songID']
myset_user = set(all_user_songs)
print(len(myset_user))

In [None]:
songs_set = set(df1['song_id'])
z = myset_user.intersection(songs_set)
print(len(z))

In [None]:
songs_cleaned = df.loc[df['song_id'].isin(z)]

In [None]:
if not os.path.exists('data/users_cleaned.zip'):
    compression_opts = dict(method='zip', archive_name='out.csv')  
    users_cleaned.to_csv('data/users_cleaned.zip', index=False, compression=compression_opts)  
    songs_cleaned.to_csv('data/songs_cleaned.zip', index=False, compression=compression_opts)  

In [None]:
def getUserSongTags(userID):
    one_user_data = users_cleaned.loc[users_cleaned['userID'] == userID]
    songs = one_user_data['songID']
    tag_list = []
    print(one_user_data)
    
    for song in songs:
        song_data = songs_cleaned.loc[songs_cleaned['song_id'] == song]
        tags = song_data['artist_terms']
        tag_list.append(list(tags))
    
    return tag_list
    

In [None]:
def getUserSongTagsMULTIPLIED(userID):
    one_user_data = users_cleaned.loc[users_cleaned['userID'] == userID]
    songs = one_user_data['songID']
    tag_list = []
    
    
    for song in songs:
        song_data = songs_cleaned.loc[songs_cleaned['song_id'] == song]
        a = one_user_data.loc[one_user_data['songID'] == song]
        song_play_count = a['play_count']
        tags = song_data['artist_terms']
        #print(song_play_count)
        ints = int(song_play_count)
        #print('AFTER : ', ints)
        for i in range(0, (ints+1)):
            tag_list.append(list(tags))
            #print(i)
        
        #print(tags)
        #print(song_play_count)
    
    return tag_list
    

In [None]:
user_tags = getUserSongTagsMULTIPLIED('8305c896f42308824da7d4386f4b9ee584281412')

#print((user_tags))

#print(user_tags)

In [None]:
def listToString(s):
    # initialize an empty string
    str1 = " "
   
    # return string 
    sj = (str1.join(s))
    return sj

In [None]:
def getTFIDFuser(userID):
    user_tags = getUserSongTagsMULTIPLIED(userID)
    #print(user_tags)
    
    tf_dc = {}
    other_documents = []
    porter = nltk.PorterStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    doc = []
    
    for i in range(len(user_tags)):
        var = user_tags[i][0]
        #print(var)
        doc.append(listToString(var))
        #print(doc)

    doc = listToString(doc)
    other_documents.append(doc)
    tokens = tokenizer.tokenize(doc)
    tokens = [w.lower() for w in tokens]
    # tokens = [porter.stem(w) for w in tokens]

    for token in tokens:
        if token in tf_dc:
            tf_dc[token] += 1
        else:
            tf_dc[token] = 1
            
    tf_dc = dict(sorted(tf_dc.items(), key=lambda item: item[1], reverse=True))
    # We have only two documents
    N = (len(user_tags)) 
    idf_dc = {}

    for word in tf_dc.keys():
        n = 0
        for doc in other_documents:
            if word in doc:
                n += 1

        idf_dc[word] = math.log(N / (n + 1)) + 1
        
    tf_idf_dc = {}
    for word in tf_dc.keys():
        tf_idf_dc[word] = tf_dc[word] * idf_dc[word]
    
    return tf_idf_dc

In [None]:
######## This function is missing??????
user_tf_if = getTFIDFuser('b80344d063b5ccb3212f76538f3d9e43d87dca9e')
print(user_tf_if)

for genre, freq in user_tf_if.items():
    print(genre, ' : ', freq)

In [None]:
first2pairs = {k: user_tf_if[k] for k in list(user_tf_if)[:5]}
print(first2pairs)

In [None]:
plt.figure(figsize = (26, 8), facecolor = None)
plt.bar(user_tf_if.keys(), user_tf_if.values(), 1, color='g')


In [None]:
user_tfidf = getTFIDFuser('b80344d063b5ccb3212f76538f3d9e43d87dca9e')


first2pairs = {k: user_tfidf[k] for k in list(user_tfidf)[:7]}
user_favourite_tags = []


for genre, weight in first2pairs.items():
    print(genre, ' : ',weight)
    user_favourite_tags.append(genre)

print(user_favourite_tags)

In [None]:
item_tags_for_finding_songs = []

for items in ml_freq_pairs:
    res = len(set(user_favourite_tags) & set(items))
    uncommon_elements = set(user_favourite_tags) ^ set(items)
    if(res >= 5 ):
        #print(items)
        item_tags_for_finding_songs.append(uncommon_elements)   

In [None]:
songid_matches = {}

for index, row in songs_cleaned.iterrows():
    
    res = len(set(user_favourite_tags) & set(row['artist_terms']))
    if(res >= 6 ):
        songid_matches[row['song_id']] = res
        #print(res)
    
    #uncommon_elements = set(user_favourite_tags) ^ set(items)
    #print(row)

In [None]:

a = dict(sorted(songid_matches.items(), key=lambda x: x[1],  reverse=True) )
top10recommended = {k: a[k] for k in list(a)[:10]}

top10recommended

In [None]:
test_list1 = [5, 6, 4, 10, 7, 1, 19]
test_list2 = [6, 6, 10, 3, 7, 10, 19]
 
# printing original lists
print("The original list 1 is : " + str(test_list1))
print("The original list 2 is : " + str(test_list2))
 
# Identical element summation in lists
# using set() + len()
res = len(set(test_list1) & set(test_list2));
 
# printing result
print("Summation of Identical elements : " + str(res))

# Evaluation
* song is a good recommendation if it is the same genre

we decide that a it is a good recommendation if half the genres overlap. Between the query and the recommendations

In [None]:

def evaluate(query, recommendations):
    K = 10 # number of retrieved items to query song
    aps = []
    for i, song in enumerate(recommended_songs):
        p = np.zeros(K)  # precisions at k
        r = np.zeros(K)  # recalls at k
        df.song[]
        y_true = query # genres of the query
        y_pred = 1 # genres of the recommendation

        # k ranking
        for k in range(1, K+1):
            tp = np.sum((y_true == y_pred[:k]))

            p[k-1] = tp/len(y_pred[:k])
            # fraction of objects predicted to be positive among all positive objects
            r[k-1] = tp/K
            # True Positive Identification Rate (TPIR): 
            # Probability of observing the correct identity within the top K ranks

        # binarize predictions
        y_pred[y_pred != y_true] = 0
        y_pred[y_pred == y_true] = 1

        ap = 1/(y_pred.sum() + 1e-9) * (p @ y_pred)
        aps.append(ap)

    maP = np.mean(aps)
    return maP

In [None]:
songs_to_
for user in users:
    recommendation = get_recommendation(song)