# Locality Sensitive hashing in Collaborative item-item filtering (inspired by week5 : SimilarItems)

Explain:
- LSH vs KNN, and why LSH is more efficient
- Present a scheme of how LSH algorithm operates
- Why we use Jaccard distance here (and how we make the decision for binary outcome, "likes":1/"dislikes":0 )

Imports

In [1]:
import numpy as np
import pandas as pd
import time
from datasketch import MinHash, MinHashLSHForest

Read in the data:
    
    userID \tab songID \tab play_count

In [5]:
u_matrix = pd.read_csv('triplets_50000.txt', sep='\t', names = ['userID','songID', 'play_count'])

Pivot to tranform the data from long to wide:

In [6]:
u_matrix = u_matrix.pivot(index="userID", columns="songID", values="play_count")

In [5]:
# u_m = u_matrix.fillna(0) # Replace the NaN with 0s.

In [5]:
total_count_list = u_matrix.sum(axis=1, skipna=True)

In [6]:
print("The greatest listener's total play count is", np.max(total_count_list))
print("The lowest total play count of a user is", np.min(total_count_list))

The greatest listener's total play count is 2478.0
The lowest total play count of a user is 10.0


### Way num 1: normalize users and tranform to 0, 1

So it's a good idea, before we start, to normalize for "big" and "low" listeners:

In [7]:
u_normalized = u_matrix.sub(u_matrix.mean(axis=1, skipna=True), axis=0) # substract from each cell the row mean

In [8]:
u_normalized.shape

(1010, 27898)

In [4]:
u_normalized

NameError: name 'u_normalized' is not defined

Lets use the rule:

    if the norm play count is < 0 == the user didn't like the song
    if the norm play count is > 0 == the user likes the song

In [10]:
# u_binary = np.where(u_normalized[u_normalized.columns] < 0, 0, 1)

In [9]:
df = u_normalized.copy()

In [10]:
df[df >= 0] = 1
df[df < 0] = 0

In [11]:
df.shape

(1010, 27898)

In [12]:
df = df.T
df

userID,0007c0e74728ca9ef0fe4eb7f75732e8026a278b,0039bd8483d578997718cdc0bf6c7c88b679f488,00498f4bab2bfeb17680113c7d9525ad5b0ad401,00a443baf550f4bbdd974ba73720abf2759166f3,0152fcbd02b172a874c75a57a913f0f0109ba272,01655ae6bc52e29c9cd100a7dde4e9eeae5e4031,019d0d1c7a01f8736ba59a124160e5fc70666db7,02c2dbc1119bca9513259f1b1903432538037d10,037167e01a2b265b8ee59694db943f9556876be2,03c90bfd09151973863c4cadd5a749cd7982abc0,...,fca15b7964a099b2860dfdb158a2430fb10c4384,fd13b9d49c54e00ff413fe3c095ba581c7fc611e,fd585aef5c32d3943bd6e7f9f39aa216ba659fd0,fdb815231ee1d66f383b80d279bd58769dfe59ff,fe76c9d535c5834e4a9b91c13e29be6460cb79c4,fe979a7b199de3ee8a78486c10e5ed13587fc359,fed37c4c49c9f217b3371c2f2c0e7541656e55cf,ff18ea9a13583f7f7aaa83719e0b22ce5618e9cf,ff4322e94814d3c7895d07e6f94139b092862611,ffadf9297a99945c0513cd87939d91d8b602936b
songID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SOAAAGQ12A8C1420C8,,,,,,,,,,,...,,,,,,,,,,
SOAACPJ12A81C21360,,,,,,,,,,,...,,,,,,,,,,
SOAACTC12AB0186A20,,,,,,,,,,,...,,,,,,,,,,
SOAADCB12A81C22AFA,,,,,,,,,,,...,,,,,,,,,,
SOAAEJI12AB0188AB5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SOZZXAO12A58A7D379,,,,,,,,,,,...,,,,,,,,,,
SOZZYAO12A6701FF36,,,,,,,,,,,...,,,,,,,,,,
SOZZYDA12AB01824FB,,,,,,,,,,,...,,,,,,,,,,
SOZZYMH12AB0180A51,,,,,,,,,,,...,,,,,,,,,,


In [13]:
# Transform the data for the LSH algo
start_time = time.time()
cols = df.columns.to_numpy() # the users 
vectors_list = [cols[x].tolist() for x in df.eq(1).to_numpy()] # each vector is a song, contains users that liked that song
print('It took %s seconds.' %(time.time()-start_time))

It took 0.11248517036437988 seconds.


In [14]:
# sanity check
len(vectors_list) # should be equal to the 27898 columns (songs)

27898

In [27]:
df_new = pd.DataFrame(vectors_list, index = df.index)
# df_new.shape # (27898, 49)


In [29]:
# drop rows with none
df_new_reduced = df_new.mask(df_new.eq('None')).dropna(how = 'all') 

In [86]:
df_new_reduced['users'] = df_new_reduced[df_new_reduced.columns].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)
df_new_reduced

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,users
songID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SOAADCB12A81C22AFA,18dc95f354220b343e98ebbb7c8564291284ed9f,,,,,,,,,,...,,,,,,,,,,18dc95f354220b343e98ebbb7c8564291284ed9f
SOAAEKX12A6D4F7E4E,087b949b3fde63ddab83d61d4ba82a6f162bd50c,,,,,,,,,,...,,,,,,,,,,087b949b3fde63ddab83d61d4ba82a6f162bd50c
SOAAFYH12A8C13717A,5f07309f801582fc0fce1c176718d065fa6ea079,,,,,,,,,,...,,,,,,,,,,5f07309f801582fc0fce1c176718d065fa6ea079
SOAAKFY12A6D4F7B03,5d579eba844597fd498217472f07a83c26d71e57,,,,,,,,,,...,,,,,,,,,,5d579eba844597fd498217472f07a83c26d71e57
SOAAKPM12A58A77210,c3953e49b785c940f53a556abd9ca681c5ef48c5,,,,,,,,,,...,,,,,,,,,,c3953e49b785c940f53a556abd9ca681c5ef48c5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SOZZPSS12A6D4F3C71,5a905f000fc1ff3df7ca807d57edb608863db05d,,,,,,,,,,...,,,,,,,,,,5a905f000fc1ff3df7ca807d57edb608863db05d
SOZZTCU12AB0182C58,be0a4b64e9689c46e94b5a9a9c7910ee61aeb16f,ec0bb33707cdc61a6999b41585a0e1f39d0ae6d3,,,,,,,,,...,,,,,,,,,,"be0a4b64e9689c46e94b5a9a9c7910ee61aeb16f,ec0bb..."
SOZZTNF12A8C139916,76235885b32c4e8c82760c340dc54f9b608d7d7e,,,,,,,,,,...,,,,,,,,,,76235885b32c4e8c82760c340dc54f9b608d7d7e
SOZZWZV12A67AE140F,956044d724390e40c8511b49e5bf6bc28071de3a,,,,,,,,,,...,,,,,,,,,,956044d724390e40c8511b49e5bf6bc28071de3a


In [87]:
data = df_new_reduced['users'].copy()

In [88]:
data = pd.DataFrame(data)

In [89]:
data

Unnamed: 0_level_0,users
songID,Unnamed: 1_level_1
SOAADCB12A81C22AFA,18dc95f354220b343e98ebbb7c8564291284ed9f
SOAAEKX12A6D4F7E4E,087b949b3fde63ddab83d61d4ba82a6f162bd50c
SOAAFYH12A8C13717A,5f07309f801582fc0fce1c176718d065fa6ea079
SOAAKFY12A6D4F7B03,5d579eba844597fd498217472f07a83c26d71e57
SOAAKPM12A58A77210,c3953e49b785c940f53a556abd9ca681c5ef48c5
...,...
SOZZPSS12A6D4F3C71,5a905f000fc1ff3df7ca807d57edb608863db05d
SOZZTCU12AB0182C58,"be0a4b64e9689c46e94b5a9a9c7910ee61aeb16f,ec0bb..."
SOZZTNF12A8C139916,76235885b32c4e8c82760c340dc54f9b608d7d7e
SOZZWZV12A67AE140F,956044d724390e40c8511b49e5bf6bc28071de3a


Choose parameters:

In [75]:
#Number of Permutations
permutations = 128

#Number of Recommendations to return
num_recommendations = 1

Create MiniHash forest

In [76]:
#Preprocess will split a string of text into individual tokens/shingles based on ",".
def preprocess(text):
    tokens = text.split()
    return tokens


In [77]:
def get_forest(data, perms):
    start_time = time.time()
    
    minhash = []
    
    for users in data['users']:
        tokens = preprocess(users) # list of users
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf8'))
        minhash.append(m)
        
    forest = MinHashLSHForest(num_perm=perms)
    
    for i,m in enumerate(minhash):
        forest.add(i,m)
        
    forest.index()
    
    print('It took %s seconds to build forest.' %(time.time()-start_time))
    
    return forest


In [90]:
forest = get_forest(data, permutations)

It took 20.24768352508545 seconds to build forest.


In [78]:
def predict(song_profile, database, perms, num_results, forest): # song_profile in list form
    start_time = time.time()
    
    m = MinHash(num_perm=perms)
    for users in song_profile:
        m.update(users.encode('utf8'))
        
    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None # if your query is empty, return none
    
    result = database.iloc[idx_array]['users']
    
    print('It took %s seconds to query forest.' %(time.time()-start_time))
    
    return result


In [91]:
num_recommendations = 10
song_profile = [ '18dc95f354220b343e98ebbb7c8564291284ed9f', '5a905f000fc1ff3df7ca807d57edb608863db05d'] # i made this random profile of an imaginary user manually, make it work with songID
result = predict(song_profile, data, permutations, num_recommendations, forest)
print('\n Top Recommendation(s) is(are) \n', result)


It took 0.007995843887329102 seconds to query forest.

 Top Recommendation(s) is(are) 
 songID
SODWVWI12A81C21791    5a905f000fc1ff3df7ca807d57edb608863db05d
SODZLWJ12AF729FB85    5a905f000fc1ff3df7ca807d57edb608863db05d
SODBCUG12A67AE0927    5a905f000fc1ff3df7ca807d57edb608863db05d
SOAPIHX12AB0184CB1    5a905f000fc1ff3df7ca807d57edb608863db05d
SOADGFH12A8C143D89    5a905f000fc1ff3df7ca807d57edb608863db05d
SOBJIZY12A6701F11A    5a905f000fc1ff3df7ca807d57edb608863db05d
SOCSHHV12A8AE45F45    5a905f000fc1ff3df7ca807d57edb608863db05d
SOCQOKM12AB018C7F8    5a905f000fc1ff3df7ca807d57edb608863db05d
SOAFOBL12AF72A25BA    5a905f000fc1ff3df7ca807d57edb608863db05d
SODTJFU12B0B80C9BE    5a905f000fc1ff3df7ca807d57edb608863db05d
Name: users, dtype: object


## Way num. 2 pick for each song the top n users 

In [61]:
n = 10
t3 = u_matrix.T.apply(lambda x: pd.Series(x.nlargest(n).index), axis=1)

In [62]:
print(u_matrix.T.shape,'\n', t3.shape)

(27898, 1010) 
 (27898, 10)


In [63]:
t3.head(10)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
songID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
SOAAAGQ12A8C1420C8,3ff7a31452eeabd7a4e07f0d243c674e3d0adf46,0007c0e74728ca9ef0fe4eb7f75732e8026a278b,0039bd8483d578997718cdc0bf6c7c88b679f488,00498f4bab2bfeb17680113c7d9525ad5b0ad401,00a443baf550f4bbdd974ba73720abf2759166f3,0152fcbd02b172a874c75a57a913f0f0109ba272,01655ae6bc52e29c9cd100a7dde4e9eeae5e4031,019d0d1c7a01f8736ba59a124160e5fc70666db7,02c2dbc1119bca9513259f1b1903432538037d10,037167e01a2b265b8ee59694db943f9556876be2
SOAACPJ12A81C21360,fb2f2c0e39e233622c300c232a6287738007e34a,0007c0e74728ca9ef0fe4eb7f75732e8026a278b,0039bd8483d578997718cdc0bf6c7c88b679f488,00498f4bab2bfeb17680113c7d9525ad5b0ad401,00a443baf550f4bbdd974ba73720abf2759166f3,0152fcbd02b172a874c75a57a913f0f0109ba272,01655ae6bc52e29c9cd100a7dde4e9eeae5e4031,019d0d1c7a01f8736ba59a124160e5fc70666db7,02c2dbc1119bca9513259f1b1903432538037d10,037167e01a2b265b8ee59694db943f9556876be2
SOAACTC12AB0186A20,cd7baa603618c40b9290bd05112061afefdde8ef,0007c0e74728ca9ef0fe4eb7f75732e8026a278b,0039bd8483d578997718cdc0bf6c7c88b679f488,00498f4bab2bfeb17680113c7d9525ad5b0ad401,00a443baf550f4bbdd974ba73720abf2759166f3,0152fcbd02b172a874c75a57a913f0f0109ba272,01655ae6bc52e29c9cd100a7dde4e9eeae5e4031,019d0d1c7a01f8736ba59a124160e5fc70666db7,02c2dbc1119bca9513259f1b1903432538037d10,037167e01a2b265b8ee59694db943f9556876be2
SOAADCB12A81C22AFA,18dc95f354220b343e98ebbb7c8564291284ed9f,0007c0e74728ca9ef0fe4eb7f75732e8026a278b,0039bd8483d578997718cdc0bf6c7c88b679f488,00498f4bab2bfeb17680113c7d9525ad5b0ad401,00a443baf550f4bbdd974ba73720abf2759166f3,0152fcbd02b172a874c75a57a913f0f0109ba272,01655ae6bc52e29c9cd100a7dde4e9eeae5e4031,019d0d1c7a01f8736ba59a124160e5fc70666db7,02c2dbc1119bca9513259f1b1903432538037d10,037167e01a2b265b8ee59694db943f9556876be2
SOAAEJI12AB0188AB5,28b232e7ecb32c47c05b795a017786d4be96ef7e,0007c0e74728ca9ef0fe4eb7f75732e8026a278b,0039bd8483d578997718cdc0bf6c7c88b679f488,00498f4bab2bfeb17680113c7d9525ad5b0ad401,00a443baf550f4bbdd974ba73720abf2759166f3,0152fcbd02b172a874c75a57a913f0f0109ba272,01655ae6bc52e29c9cd100a7dde4e9eeae5e4031,019d0d1c7a01f8736ba59a124160e5fc70666db7,02c2dbc1119bca9513259f1b1903432538037d10,037167e01a2b265b8ee59694db943f9556876be2
SOAAEKX12A6D4F7E4E,087b949b3fde63ddab83d61d4ba82a6f162bd50c,0007c0e74728ca9ef0fe4eb7f75732e8026a278b,0039bd8483d578997718cdc0bf6c7c88b679f488,00498f4bab2bfeb17680113c7d9525ad5b0ad401,00a443baf550f4bbdd974ba73720abf2759166f3,0152fcbd02b172a874c75a57a913f0f0109ba272,01655ae6bc52e29c9cd100a7dde4e9eeae5e4031,019d0d1c7a01f8736ba59a124160e5fc70666db7,02c2dbc1119bca9513259f1b1903432538037d10,037167e01a2b265b8ee59694db943f9556876be2
SOAAFAC12A67ADF7EB,76235885b32c4e8c82760c340dc54f9b608d7d7e,7e2b716907a9a295d92c2f35d7e260aeee2b32cf,c24ec42f0e449ff39a95a01f0795f833b898f71b,fdb815231ee1d66f383b80d279bd58769dfe59ff,0007c0e74728ca9ef0fe4eb7f75732e8026a278b,0039bd8483d578997718cdc0bf6c7c88b679f488,00498f4bab2bfeb17680113c7d9525ad5b0ad401,00a443baf550f4bbdd974ba73720abf2759166f3,0152fcbd02b172a874c75a57a913f0f0109ba272,01655ae6bc52e29c9cd100a7dde4e9eeae5e4031
SOAAFYH12A8C13717A,5f07309f801582fc0fce1c176718d065fa6ea079,0007c0e74728ca9ef0fe4eb7f75732e8026a278b,0039bd8483d578997718cdc0bf6c7c88b679f488,00498f4bab2bfeb17680113c7d9525ad5b0ad401,00a443baf550f4bbdd974ba73720abf2759166f3,0152fcbd02b172a874c75a57a913f0f0109ba272,01655ae6bc52e29c9cd100a7dde4e9eeae5e4031,019d0d1c7a01f8736ba59a124160e5fc70666db7,02c2dbc1119bca9513259f1b1903432538037d10,037167e01a2b265b8ee59694db943f9556876be2
SOAAIJG12AAA15D821,32fe8a8c93847b8771d357411d3aa73413d72779,0007c0e74728ca9ef0fe4eb7f75732e8026a278b,0039bd8483d578997718cdc0bf6c7c88b679f488,00498f4bab2bfeb17680113c7d9525ad5b0ad401,00a443baf550f4bbdd974ba73720abf2759166f3,0152fcbd02b172a874c75a57a913f0f0109ba272,01655ae6bc52e29c9cd100a7dde4e9eeae5e4031,019d0d1c7a01f8736ba59a124160e5fc70666db7,02c2dbc1119bca9513259f1b1903432538037d10,037167e01a2b265b8ee59694db943f9556876be2
SOAAIWE12A8AE4706B,9dc8b0f000792df949a0a0ad8eba2724335568f7,0007c0e74728ca9ef0fe4eb7f75732e8026a278b,0039bd8483d578997718cdc0bf6c7c88b679f488,00498f4bab2bfeb17680113c7d9525ad5b0ad401,00a443baf550f4bbdd974ba73720abf2759166f3,0152fcbd02b172a874c75a57a913f0f0109ba272,01655ae6bc52e29c9cd100a7dde4e9eeae5e4031,019d0d1c7a01f8736ba59a124160e5fc70666db7,02c2dbc1119bca9513259f1b1903432538037d10,037167e01a2b265b8ee59694db943f9556876be2


In [70]:
# Tranform in the prefered format for the function 
c = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
t3['users'] = t3.iloc[:, :].apply(",".join, axis=1)

In [71]:
t3.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,users
songID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
SOAAAGQ12A8C1420C8,3ff7a31452eeabd7a4e07f0d243c674e3d0adf46,0007c0e74728ca9ef0fe4eb7f75732e8026a278b,0039bd8483d578997718cdc0bf6c7c88b679f488,00498f4bab2bfeb17680113c7d9525ad5b0ad401,00a443baf550f4bbdd974ba73720abf2759166f3,0152fcbd02b172a874c75a57a913f0f0109ba272,01655ae6bc52e29c9cd100a7dde4e9eeae5e4031,019d0d1c7a01f8736ba59a124160e5fc70666db7,02c2dbc1119bca9513259f1b1903432538037d10,037167e01a2b265b8ee59694db943f9556876be2,"3ff7a31452eeabd7a4e07f0d243c674e3d0adf46,0007c..."
SOAACPJ12A81C21360,fb2f2c0e39e233622c300c232a6287738007e34a,0007c0e74728ca9ef0fe4eb7f75732e8026a278b,0039bd8483d578997718cdc0bf6c7c88b679f488,00498f4bab2bfeb17680113c7d9525ad5b0ad401,00a443baf550f4bbdd974ba73720abf2759166f3,0152fcbd02b172a874c75a57a913f0f0109ba272,01655ae6bc52e29c9cd100a7dde4e9eeae5e4031,019d0d1c7a01f8736ba59a124160e5fc70666db7,02c2dbc1119bca9513259f1b1903432538037d10,037167e01a2b265b8ee59694db943f9556876be2,"fb2f2c0e39e233622c300c232a6287738007e34a,0007c..."
SOAACTC12AB0186A20,cd7baa603618c40b9290bd05112061afefdde8ef,0007c0e74728ca9ef0fe4eb7f75732e8026a278b,0039bd8483d578997718cdc0bf6c7c88b679f488,00498f4bab2bfeb17680113c7d9525ad5b0ad401,00a443baf550f4bbdd974ba73720abf2759166f3,0152fcbd02b172a874c75a57a913f0f0109ba272,01655ae6bc52e29c9cd100a7dde4e9eeae5e4031,019d0d1c7a01f8736ba59a124160e5fc70666db7,02c2dbc1119bca9513259f1b1903432538037d10,037167e01a2b265b8ee59694db943f9556876be2,"cd7baa603618c40b9290bd05112061afefdde8ef,0007c..."
SOAADCB12A81C22AFA,18dc95f354220b343e98ebbb7c8564291284ed9f,0007c0e74728ca9ef0fe4eb7f75732e8026a278b,0039bd8483d578997718cdc0bf6c7c88b679f488,00498f4bab2bfeb17680113c7d9525ad5b0ad401,00a443baf550f4bbdd974ba73720abf2759166f3,0152fcbd02b172a874c75a57a913f0f0109ba272,01655ae6bc52e29c9cd100a7dde4e9eeae5e4031,019d0d1c7a01f8736ba59a124160e5fc70666db7,02c2dbc1119bca9513259f1b1903432538037d10,037167e01a2b265b8ee59694db943f9556876be2,"18dc95f354220b343e98ebbb7c8564291284ed9f,0007c..."
SOAAEJI12AB0188AB5,28b232e7ecb32c47c05b795a017786d4be96ef7e,0007c0e74728ca9ef0fe4eb7f75732e8026a278b,0039bd8483d578997718cdc0bf6c7c88b679f488,00498f4bab2bfeb17680113c7d9525ad5b0ad401,00a443baf550f4bbdd974ba73720abf2759166f3,0152fcbd02b172a874c75a57a913f0f0109ba272,01655ae6bc52e29c9cd100a7dde4e9eeae5e4031,019d0d1c7a01f8736ba59a124160e5fc70666db7,02c2dbc1119bca9513259f1b1903432538037d10,037167e01a2b265b8ee59694db943f9556876be2,"28b232e7ecb32c47c05b795a017786d4be96ef7e,0007c..."


In [72]:
data2 = t3.drop(t3.columns[range(0,10)], axis=1)

In [73]:
data2

Unnamed: 0_level_0,users
songID,Unnamed: 1_level_1
SOAAAGQ12A8C1420C8,"3ff7a31452eeabd7a4e07f0d243c674e3d0adf46,0007c..."
SOAACPJ12A81C21360,"fb2f2c0e39e233622c300c232a6287738007e34a,0007c..."
SOAACTC12AB0186A20,"cd7baa603618c40b9290bd05112061afefdde8ef,0007c..."
SOAADCB12A81C22AFA,"18dc95f354220b343e98ebbb7c8564291284ed9f,0007c..."
SOAAEJI12AB0188AB5,"28b232e7ecb32c47c05b795a017786d4be96ef7e,0007c..."
...,...
SOZZXAO12A58A7D379,"4208d4ac45e7caab7167a4ea6d34e759a6b9a1fc,0007c..."
SOZZYAO12A6701FF36,"5a905f000fc1ff3df7ca807d57edb608863db05d,0007c..."
SOZZYDA12AB01824FB,"2c218a60b3d777e9e12d56c2e065a9644b5e5f41,0007c..."
SOZZYMH12AB0180A51,"9dc8b0f000792df949a0a0ad8eba2724335568f7,0007c..."


Now we are ready to use the LSH function

In [79]:
forest2 = get_forest(data2, permutations)

It took 58.46970200538635 seconds to build forest.


In [84]:
num_recommendations = 10
user_profile = [ '18dc95f354220b343e98ebbb7c8564291284ed9f'] # i made this random profile of an imaginary user manually, make it work with songID
result2 = predict(song_profile, data2, permutations, num_recommendations, forest2)
print('\n Top Recommendation(s) is(are) \n', result2)


 Top Recommendation(s) is(are) 
 None
