Loading libraries

In [1]:
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import csr_matrix
import os
import time
from datasketch import MinHash, MinHashLSHForest


Read in users play counts

In [2]:
# # Method 1
# start_time = time.time()
# m = pd.read_csv('usertriplets_clean.csv')
# m = m.pivot(index='userID', columns='songID', values='play_count')
# print('It took %s seconds.' %(time.time()-start_time))
# #m.head()

In [3]:
# # Method 2
# start_time = time.time()
# frame = pd.read_csv('usertriplets_clean.csv')

# if not 'user_profiles' in globals():
#     person_u = list(frame.userID.unique())
#     thing_u = list(frame.songID.unique())

#     data = frame['play_count'].tolist()
#     row = frame.userID.astype('category').cat.codes
#     col = frame.songID.astype('category').cat.codes
#     sparse_matrix = csr_matrix((data, (row, col)), shape=(len(person_u), len(thing_u)))
#     user_profiles = pd.DataFrame.sparse.from_spmatrix(sparse_matrix, index=person_u, columns=thing_u)
# print('It took %s seconds.' %(time.time()-start_time))
# #user_profiles.shape()



The second method improves runtime 10X times.

In [4]:
# #u = u.sparse.to_dense()
# start_time = time.time()

# similarities,indices = get_similarusers( '5a905f000fc1ff3df7ca807d57edb608863db05d', u  , similarity_metric = 'correlation', k = 4)
# print('It took %s seconds.' %(time.time()-start_time))


Since classic collaborative filtering method is too memory heavy (and cant run on my pc); try LSH approach

In [5]:

def triplets_to_utility_matrix(path_to_file ):
    start_time = time.time()
    frame = pd.read_csv(path_to_file)
    person_u = list(frame.userID.unique())
    thing_u = list(frame.songID.unique())
    data = frame['play_count'].tolist()
    row = frame.userID.astype('category').cat.codes
    col = frame.songID.astype('category').cat.codes
    sparse_matrix = csr_matrix((data, (row, col)), shape=(len(person_u), len(thing_u)))
    user_profiles = pd.DataFrame.sparse.from_spmatrix(sparse_matrix, index=person_u, columns=thing_u)
    # Creater smaller dataset
    split = int(386670/5)
    u1 = user_profiles[:split]
    u1 = u1.sparse.to_dense()
    # Normalize play counts
    u1.replace(0, np.nan, inplace=True)
    u1 = u1.sub(u1.mean(axis=1, skipna=True), axis=0) # substract from each cell the row mean
    # if value >=0 user likes song
    u1[u1 >= 0] = 1
    u1[u1 < 0] = 0
    u1 = u1.T
    # Keep users that have liked more than 4 songs 
    cols = np.where(u1.notna().sum() >= 4)
    sub1 = u1.iloc[:, np.r_[cols]]
    print('It took %s seconds.' %(time.time()-start_time))

    return sub1

In [7]:
# Create utility matrix in chunks

# m with user_profiles[:split]
m = triplets_to_utility_matrix('data/usertriplets_clean.csv')

It took 70.11948895454407 seconds.


Run the above function with different splitting strategy to create a bigger database in chunks as shown in the commented cell bellow.

In [None]:
# # Make changes to triplets_to_utility_matrix functions !
#
# # m2 with user_profiles[split:split*2]
# m2 = triplets_to_utility_matrix('usertriplets_clean.csv')
# # m3 with user_profiles[split*2:split*3]
# m3 = triplets_to_utility_matrix('usertriplets_clean.csv')
# # m4 with user_profiles[split*3:split*4]
# m4 = triplets_to_utility_matrix('usertriplets_clean.csv')
#
# # Merging
# m_total = pd.concat([m, m2, m3, m4], axis=1)
#

In [8]:
# Create database for the LSH algo
def create_database(utility_matrix):
    start_time = time.time()
    cols = utility_matrix.columns.to_numpy() # the users 
    vectors_list = [cols[x].tolist() for x in utility_matrix.eq(1).to_numpy()] # each vector is a song, contains users that liked that song
    df_new = pd.DataFrame(vectors_list, index = utility_matrix.index)
    df_new_reduced = df_new.mask(df_new.eq('None')).dropna(how = 'all')
    df_new_reduced['users'] = df_new_reduced[df_new_reduced.columns].apply( lambda x: ','.join(x.dropna().astype(str)),axis=1)
    data = df_new_reduced['users'].copy()
    database = pd.DataFrame(data)
    print('It took %s seconds.' %(time.time()-start_time))

    return database

In [9]:
db = create_database(m)

It took 1.1063482761383057 seconds.


In [None]:
#db_total = create_database(m_total)

In [10]:
def get_items_from_user(userID, utility_matrix):
    '''Given a user find the list of items he likes'''
    #list containing the songs the user likes
    songs = utility_matrix.index[utility_matrix[userID].eq(1)].tolist() 
    
    return songs

In [11]:
#Preprocess will split a string of text into individual tokens/shingles based on ",".
def preprocess(text):
    tokens = text.split()
    return tokens

Create MiniHash forest

In [12]:
def get_forest(data, perms):
    start_time = time.time()
    
    minhash = []
    
    for users in data['users']:
        tokens = preprocess(users) # list of users
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf8'))
        minhash.append(m)
        
    forest = MinHashLSHForest(num_perm=perms)
    
    for i,m in enumerate(minhash):
        forest.add(i,m)
        
    forest.index()
    
    print('It took %s seconds to build forest.' %(time.time()-start_time))
    
    return forest

In [13]:
def predict(song_profile, database, perms, num_results, forest): # song_profile in list form
    
    #song_profiles = get_items_from_user(userID, utility_matrix ,database)
    start_time = time.time()
    
    m = MinHash(num_perm=perms)
    for users in song_profile:
        m.update(users.encode('utf8'))
        
    idx_array = np.array(forest.query(m, num_results*2))
    if len(idx_array) == 0:
        return None # if your query is empty, return none
    
    #result = database.iloc[idx_array]['users']
    result = database.iloc[idx_array]['users']

    print('It took %s seconds to query forest.' %(time.time()-start_time))
    
    return result, idx_array


Choose parameters:

In [14]:
#Number of Permutations
permutations = 256

#Number of recommendations to return
num_recommendations = 2

In [16]:
forest = get_forest(db, permutations)

It took 5.261036396026611 seconds to build forest.


In [17]:
def get_recommendations(userID, utility_matrix, database, permutations, num_recommendations, forest):
    items = get_items_from_user(userID, utility_matrix)
    for i in range(0,len(items)):
        # for each song user likes get the song's item_profile vector
        item_profile = database.loc[items[i]] #.to_numpy()
        # get recommendetions based on each song user likes
        result, i = predict(item_profile, database, permutations, num_recommendations, forest)
        print('\n Top Recommendation(s) is(are) \n', result)


In [None]:
# userID = 'bf30441e24ef5326354295723d9fe1edf59b8554'
# utility_matrix = m_total
# database = db_total

In [18]:
get_recommendations('bf30441e24ef5326354295723d9fe1edf59b8554',m,db, permutations, num_recommendations, forest)

It took 0.017992734909057617 seconds to query forest.

 Top Recommendation(s) is(are) 
 SOVGXVD12A58A7D23B    bf30441e24ef5326354295723d9fe1edf59b8554,722c6...
Name: users, dtype: object
It took 0.009572744369506836 seconds to query forest.

 Top Recommendation(s) is(are) 
 SOZQVTJ12A6701D96B    bf30441e24ef5326354295723d9fe1edf59b8554,34618...
Name: users, dtype: object


**Troubleshooting**: This implementation is not producing the expected outcome. Given more time, I would do debugging and further investigation. The LSH method is used extensively for document recommendation (with shingling).
As a **future extension**, I would try to apply this method to lyrics documents that match with our database data. In this way songs would be recommended based on their lyrics compared to lyrics of songs that the user likes.

In [None]:
# # Try with content based

# # predict function need to be changed


# # def predict(song_profile, database, perms, num_results, forest): # song_profile in list form
    
# #     #song_profiles = get_items_from_user(userID, utility_matrix ,database)
# #     start_time = time.time()
     
# #     tokens = preprocess(song_profile)
# #     m = MinHash(num_perm=perms)
# #     for s in tokens:
# #         m.update(s.encode('utf8'))
        
# #     idx_array = np.array(forest.query(m, num_results*2))
# #     if len(idx_array) == 0:
# #         return None # if your query is empty, return none
    
# #     #result = database.iloc[idx_array]['users']
# #     result = database.iloc[idx_array]['terms']

# #     print('It took %s seconds to query forest.' %(time.time()-start_time))
    
# #     return result, idx_array

# f = pd.read_csv('songs_clean.csv')
# # keep only 2 columns : song_id , artist_terms
# ff = f.iloc[:,[1,2]]
# ff = ff.set_index('song_id')
# import re
# regex = re.compile(r'[\"\'\\\[\]]')
# ff['artist_terms'] = ff['artist_terms'].apply(lambda x: regex.sub('', x)).astype('string')
# ff.loc['SOCIWDW12A8C13D406']

In [None]:
# Error
#get_recommendations('bf30441e24ef5326354295723d9fe1edf59b8554',m_total,ff, permutations, num_recommendations, forest)