Loading libraries

In [1]:
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import csr_matrix
import os
import time
from datasketch import MinHash, MinHashLSHForest


Read in users play counts

In [2]:
# # Method 1
# start_time = time.time()
# m = pd.read_csv('usertriplets_clean.csv')
# m = m.pivot(index='userID', columns='songID', values='play_count')
# print('It took %s seconds.' %(time.time()-start_time))
# #m.head()

In [3]:
# # Method 2
# start_time = time.time()
# frame = pd.read_csv('usertriplets_clean.csv')

# if not 'user_profiles' in globals():
#     person_u = list(frame.userID.unique())
#     thing_u = list(frame.songID.unique())

#     data = frame['play_count'].tolist()
#     row = frame.userID.astype('category').cat.codes
#     col = frame.songID.astype('category').cat.codes
#     sparse_matrix = csr_matrix((data, (row, col)), shape=(len(person_u), len(thing_u)))
#     user_profiles = pd.DataFrame.sparse.from_spmatrix(sparse_matrix, index=person_u, columns=thing_u)
# print('It took %s seconds.' %(time.time()-start_time))
# #user_profiles.shape()



The second method improves runtime 10X times.

In [4]:
# #u = u.sparse.to_dense()
# start_time = time.time()

# similarities,indices = get_similarusers( '5a905f000fc1ff3df7ca807d57edb608863db05d', u  , similarity_metric = 'correlation', k = 4)
# print('It took %s seconds.' %(time.time()-start_time))


Since classic collaborative filtering method is too memory heavy (and cant run on my pc); try LSH approach

In [33]:

def triplets_to_utility_matrix(path_to_file):
    start_time = time.time()
    frame = pd.read_csv(path_to_file)
    person_u = list(frame.userID.unique())
    thing_u = list(frame.songID.unique())
    data = frame['play_count'].tolist()
    row = frame.userID.astype('category').cat.codes
    col = frame.songID.astype('category').cat.codes
    sparse_matrix = csr_matrix((data, (row, col)), shape=(len(person_u), len(thing_u)))
    user_profiles = pd.DataFrame.sparse.from_spmatrix(sparse_matrix, index=person_u, columns=thing_u)
    # Creater smaller dataset
    split = int(386670/5)
    u1 = user_profiles[:split]
    u1 = u1.sparse.to_dense()
    # Normalize play counts
    u1.replace(0, np.nan, inplace=True)
    u1 = u1.sub(u1.mean(axis=1, skipna=True), axis=0) # substract from each cell the row mean
    # if value >=0 user likes song
    u1[u1 >= 0] = 1
    u1[u1 < 0] = 0
    u1 = u1.T
    # Keep users that have liked more than 4 songs 
    cols = np.where(u1.notna().sum() >= 4)
    sub1 = u1.iloc[:, np.r_[cols]]
    print('It took %s seconds.' %(time.time()-start_time))

    return sub1

In [6]:
# Create utility matrix in chunks

# m with user_profiles[:split]
m = triplets_to_utility_matrix('usertriplets_clean.csv')

It took 45.75522828102112 seconds.


In [30]:
# Make changes to triplets_to_utility_matrix functions !

# m2 with user_profiles[split:split*2]
m2 = triplets_to_utility_matrix('usertriplets_clean.csv')
# m3 with user_profiles[split*2:split*3]
m3 = triplets_to_utility_matrix('usertriplets_clean.csv')
# m4 with user_profiles[split*3:split*4]
m4 = triplets_to_utility_matrix('usertriplets_clean.csv')

# Merging
m_total = pd.concat([m, m2, m3, m4], axis=1)

It took 46.413442611694336 seconds.


In [47]:
m_total

Unnamed: 0,bd8475385f0aa78830fa6dfce9e7242164b035c8,bf30441e24ef5326354295723d9fe1edf59b8554,951945330eb5df161ac4f97729647514001cd102,523a8a39456d5a96ae8f4d5e8b8b60f3bfb31528,1a849df9dabb15845eb932d46d81e2fd77176786,a4a1daac7ba5534ade06e32c5b9d220eb5ce145d,9b887e10a4711486085c4fae2d2599fc0d2c484d,8dcb524ff75e0ad0f0a80aaccadf7bbaa3b89a41,4691b4c353503da2c108e372ff056a9ac847c4d1,956999576244ad42d6d41faac8505fbef0a4ccc1,...,a9e01d8d871c468467d28b7e8d678d44a2b4005e,a0449ef813f2f0f63012eb57f00f9641b53e376c,05e3da99d1fc26977ae5f72e0f074dcb4224b0d9,c8a5c8a51896b2663bfeb8586430c7f87474f239,ef8d0169f39fd36e5f6d817145383c286ad0b4c3,e507969026ea80b8c467794c3844837b544ac9bf,a5bae51074cedb5d495c0d43ebc61c143f0f3cb9,5478929bddd74d5de447b3b6821f2c21516f4d5d,1a1b99c4858c2612078af62e99bf29eda8210ab1,fff6e149e03e127049923416942a9da755b7426d
SOWEZSI12A81C21CE6,,,,,,,,,,,...,,,,,,,,,,
SODCXXY12AB0187452,,,,,,,,,,,...,,,,,,,,,,
SOLXDDC12A6701FBFD,,,,,,,,,,,...,,,,,,,,,,
SONQBUB12A6D4F8ED0,,,,,,,,,,,...,,,,,,,,,,
SOFKTPP12A8C1385CA,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SOJSIAL12AF72A275E,,,,,,,,,,,...,,,,,,,,,,
SOKTKTM12AB0189837,,,,,,,,,,,...,,,,,,,,,,
SOMLFVN12A8C136135,,,,,,,,,,,...,,,,,,,,,,
SOYUIMG12A8C135784,,,,,,,,,,,...,,,,,,,,,,


In [12]:
# Create database for the LSH algo
def create_database(utility_matrix):
    start_time = time.time()
    cols = utility_matrix.columns.to_numpy() # the users 
    vectors_list = [cols[x].tolist() for x in utility_matrix.eq(1).to_numpy()] # each vector is a song, contains users that liked that song
    df_new = pd.DataFrame(vectors_list, index = utility_matrix.index)
    df_new_reduced = df_new.mask(df_new.eq('None')).dropna(how = 'all')
    df_new_reduced['users'] = df_new_reduced[df_new_reduced.columns].apply( lambda x: ','.join(x.dropna().astype(str)),axis=1)
    data = df_new_reduced['users'].copy()
    database = pd.DataFrame(data)
    print('It took %s seconds.' %(time.time()-start_time))

    return database

In [13]:
db = create_database(m)

It took 0.9921047687530518 seconds.


In [48]:
db_total = create_database(m_total)

It took 4.521599531173706 seconds.


In [51]:
db

Unnamed: 0,users
SOWEZSI12A81C21CE6,"61a57fae431b7ab17ebb9bf3c408e2a7d6ff0d62,70c5c..."
SOLXDDC12A6701FBFD,b015c0d9bb403ca218d0672769cb8468bcd50a3f
SONQBUB12A6D4F8ED0,"90b35559e7272da839b2bea34ccc241278da5324,97193..."
SOFKTPP12A8C1385CA,873f91eaf7f92497e08313fa32d05b779e99140e
SOGJPMB12A8C13A9DB,"2a68c931ed5701078c0cb96a3bb2bd6a8fec8c3f,dfbc0..."
...,...
SOVYKHK12AC3DF4D3A,"c27a66e7e40af122dc1ba50a45d77e1f23a6c247,ca40e..."
SOJSIAL12AF72A275E,c75aa16dc040b99a803b59cd9418f16a0e4f3519
SOMLFVN12A8C136135,"8b85519a101b28804ecd587783c5fcdffdf3bcfb,d82ce..."
SOYUIMG12A8C135784,80ba64fe659d9352b9b764302a7d85b31f630958


In [134]:
db_total

Unnamed: 0,users
SOWEZSI12A81C21CE6,"61a57fae431b7ab17ebb9bf3c408e2a7d6ff0d62,70c5c..."
SODCXXY12AB0187452,666db87fe5a15521407b7ccc308208f35be51530
SOLXDDC12A6701FBFD,"b015c0d9bb403ca218d0672769cb8468bcd50a3f,02fdc..."
SONQBUB12A6D4F8ED0,"90b35559e7272da839b2bea34ccc241278da5324,97193..."
SOFKTPP12A8C1385CA,"873f91eaf7f92497e08313fa32d05b779e99140e,06104..."
...,...
SOJSIAL12AF72A275E,"c75aa16dc040b99a803b59cd9418f16a0e4f3519,5e259..."
SOKTKTM12AB0189837,58b17834dc9fe66c897b5220f3b5a0875b378fae
SOMLFVN12A8C136135,"8b85519a101b28804ecd587783c5fcdffdf3bcfb,d82ce..."
SOYUIMG12A8C135784,"80ba64fe659d9352b9b764302a7d85b31f630958,58d55..."


In [135]:
def get_items_from_user(userID, utility_matrix):
    '''Given a user find the list of items he likes'''
    #list containing the songs the user likes
    songs = utility_matrix.index[utility_matrix[userID].eq(1)].tolist() 
    
    return songs

In [136]:
#Preprocess will split a string of text into individual tokens/shingles based on ",".
def preprocess(text):
    tokens = text.split()
    return tokens

Create MiniHash forest

In [137]:
def get_forest(data, perms):
    start_time = time.time()
    
    minhash = []
    
    for users in data['users']:
        tokens = preprocess(users) # list of users
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf8'))
        minhash.append(m)
        
    forest = MinHashLSHForest(num_perm=perms)
    
    for i,m in enumerate(minhash):
        forest.add(i,m)
        
    forest.index()
    
    print('It took %s seconds to build forest.' %(time.time()-start_time))
    
    return forest

In [138]:
def predict(song_profile, database, perms, num_results, forest): # song_profile in list form
    
    #song_profiles = get_items_from_user(userID, utility_matrix ,database)
    start_time = time.time()
    
    m = MinHash(num_perm=perms)
    for users in song_profile:
        m.update(users.encode('utf8'))
        
    idx_array = np.array(forest.query(m, num_results*2))
    if len(idx_array) == 0:
        return None # if your query is empty, return none
    
    #result = database.iloc[idx_array]['users']
    result = database.iloc[idx_array]['users']

    print('It took %s seconds to query forest.' %(time.time()-start_time))
    
    return result, idx_array


Choose parameters:

In [21]:
#Number of Permutations
permutations = 256

#Number of recommendations to return
num_recommendations = 2

In [52]:
forest = get_forest(db_total, permutations)

It took 7.574982643127441 seconds to build forest.


In [127]:
def get_recommendations(userID, utility_matrix, database, permutations, num_recommendations, forest):
    items = get_items_from_user(userID, utility_matrix)
    for i in range(0,len(items)):
        # for each song user likes get the song's item_profile vector
        item_profile = database.loc[items[i]] #.to_numpy()
        # get recommendetions based on each song user likes
        result, i = predict(item_profile, database, permutations, num_recommendations, forest)
        print('\n Top Recommendation(s) is(are) \n', result)


In [24]:
# userID = 'bf30441e24ef5326354295723d9fe1edf59b8554'
# utility_matrix = m
# database =db

In [132]:
get_recommendations('bf30441e24ef5326354295723d9fe1edf59b8554',m_total,db_total, permutations, num_recommendations, forest)

It took 0.015625 seconds to query forest.

 Top Recommendation(s) is(are) 
 SOVGXVD12A58A7D23B    bf30441e24ef5326354295723d9fe1edf59b8554,722c6...
Name: users, dtype: object
It took 0.015626907348632812 seconds to query forest.

 Top Recommendation(s) is(are) 
 SOZQVTJ12A6701D96B    bf30441e24ef5326354295723d9fe1edf59b8554,34618...
Name: users, dtype: object


In [133]:
# # Try with content based

# # predict function need to be changed


# # def predict(song_profile, database, perms, num_results, forest): # song_profile in list form
    
# #     #song_profiles = get_items_from_user(userID, utility_matrix ,database)
# #     start_time = time.time()
     
# #     tokens = preprocess(song_profile)
# #     m = MinHash(num_perm=perms)
# #     for s in tokens:
# #         m.update(s.encode('utf8'))
        
# #     idx_array = np.array(forest.query(m, num_results*2))
# #     if len(idx_array) == 0:
# #         return None # if your query is empty, return none
    
# #     #result = database.iloc[idx_array]['users']
# #     result = database.iloc[idx_array]['terms']

# #     print('It took %s seconds to query forest.' %(time.time()-start_time))
    
# #     return result, idx_array

# f = pd.read_csv('songs_clean.csv')
# # keep only 2 columns : song_id , artist_terms
# ff = f.iloc[:,[1,2]]
# ff = ff.set_index('song_id')
# import re
# regex = re.compile(r'[\"\'\\\[\]]')
# ff['artist_terms'] = ff['artist_terms'].apply(lambda x: regex.sub('', x)).astype('string')
# ff.loc['SOCIWDW12A8C13D406']

In [130]:
# Error
#get_recommendations('bf30441e24ef5326354295723d9fe1edf59b8554',m_total,ff, permutations, num_recommendations, forest)

## Predict play count for a user-song combination based on user-user

In [26]:
# def predict_play_count_uu(userID, songID, user_profiles, similarity_metric, k):
#     '''Predict play count for a particular user-song tuple, based on user-to-user similarity. Use with cosine similarity.'''
#     prediction = 0
#     similarity, indices = get_similarusers(userID, user_profiles, similarity_metric, k) #similar users based on cosine similarity
#     # get mean play count for a user, to adjust
#     mean_play_count = user_profiles.loc[userID, :].mean() 
#     # weight_i is the similarity of neigbhor_i to user X
#     sum_of_similarity = np.sum(similarity) - 1 # -1 because user 1 is included, has a similarity of 1
    
#     # initializing variables
#     w_similarity = 1
#     weighted_sum = 0
    
#     for i in range(0, len(indices.flatten())):
#         if user_profiles.index[indices.flatten()[i]] == userID:
#             continue;
#         else:
#             # Normalize ratings for a given user by subtracting row mean (centered cosine, or pearson cor)
#             play_count_dif = user_profiles.iloc[indices.flatten()[i],user_profiles.columns.get_loc(songID)] - np.mean(user_profiles.iloc[indices.flatten()[i],:])
#             w_similarity = play_count_dif*similarity[i]
#             weighted_sum += w_similarity
            
#     prediction = mean_play_count + (weighted_sum/sum_of_similarity)
#     print('Predicted rating for user {} -> song {}: {}'.format(userID, songID, prediction))

#     return prediction

In [27]:
#predict_play_count_uu('5a905f000fc1ff3df7ca807d57edb608863db05d', 'SOZZYAO12A6701FF36', user_profiles, 'cosine', 4 )