In [920]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from scipy.sparse import csr_matrix
import time

In [921]:
# Load the play-count table from disk.
play = pd.read_csv('../data/10kusers_100songs.csv', sep=',')
# Rename the three columns positionally so later cells can rely on
# 'user', 'song' and 'listen_count' regardless of the header in the file.
play.columns=['user','song','listen_count']

In [922]:
# Dataset size: non-null entries in 'user' (one per user-song row),
# then the number of distinct users and distinct songs.
print ("Users-Songs: ", play['user'].count())
print ("Unique Users: ", play['user'].nunique())
print ("Unique Songs: ", play['song'].nunique())

('Users-Songs: ', 110449)
('Unique Users: ', 9406)
('Unique Songs: ', 100)


In [923]:
def split_data_train_test_random(self, train_proportion, test_proportion, random_state=42):
    """Randomly split a dataset into a training part and a test part.

    Thin wrapper around ``train_test_split``.

    Parameters
    ----------
    self : the dataset to split. Despite the name this is an ordinary
        positional argument, not an instance; the notebook passes a DataFrame.
    train_proportion : forwarded as ``train_size``.
    test_proportion : forwarded as ``test_size``.
    random_state : seed forwarded to ``train_test_split`` (default 42).

    Returns
    -------
    (df_train, df_test) exactly as produced by ``train_test_split``.
    """
    # Forward the caller's seed; it used to be hard-coded to 42 here, which
    # silently ignored the ``random_state`` argument.
    df_train, df_test = train_test_split(self, train_size=train_proportion,
                                         test_size=test_proportion,
                                         random_state=random_state)
    return df_train, df_test


# 80% / 20% random split of the play-count rows; random_state is left at its default.
train_set,test_set = split_data_train_test_random(play,.8,.2)

In [924]:
# Row count, distinct users and distinct songs for each side of the split.
print ( "Training:", "user-song pairs -", train_set['user'].count(), "unique users -", train_set['user'].nunique() ,"unique songs -", train_set['song'].nunique())
print ( "Testing :", "user-song pairs -", test_set['user'].count(), "unique users -", test_set['user'].nunique() ,"unique songs -", test_set['song'].nunique())

('Training:', 'user-song pairs -', 88359, 'unique users -', 9211, 'unique songs -', 100)
('Testing :', 'user-song pairs -', 22090, 'unique users -', 7002, 'unique songs -', 100)


In [925]:
# Distinct user / song ids of the training set, in order of first appearance.
users = train_set['user'].unique()
songs = train_set['song'].unique()

# id -> dense 0-based index; these indices are the rows / columns of `data`.
songs_dict  = dict(zip(songs, range(len(songs))))
users_dict = dict(zip(users, range(len(users))))

# Dense (n_users, n_songs) matrix of listen counts; cells with no training
# row stay 0. Sized from the same arrays that define the index dicts so the
# shape can never disagree with the indices.
data = np.zeros((len(users), len(songs)))

# One vectorised assignment instead of a Python-level iterrows() loop.
# If a (user, song) pair were repeated, the last row wins, as before.
user_rows = train_set['user'].map(users_dict).values
song_cols = train_set['song'].map(songs_dict).values
data[user_rows, song_cols] = train_set['listen_count'].values


In [926]:
def baseline_prediction(df):
    """Return the mean of the ``listen_count`` column of ``df``.

    The notebook uses this global average as the fallback prediction.
    """
    return df['listen_count'].mean()

# Mean listen count over the training rows.
baseline = baseline_prediction(train_set)
print baseline

3.10388302267


In [927]:
# Song-by-user listen-count tables (rows: songs, columns: users) for the full
# data and for the training split; pairs without a row become 0.
play_matrix = (
    play
    .pivot(index='song', columns='user', values='listen_count')
    .fillna(0)
)
training_matrix = (
    train_set
    .pivot(index='song', columns='user', values='listen_count')
    .fillna(0)
)


In [1044]:
def create_hash_functions(data, r):
    """Draw ``r`` random projection vectors for one hash table.

    Parameters
    ----------
    data : 2-D array; only its number of columns is used.
    r : number of hash functions (projection vectors) to draw.

    Returns
    -------
    Array of shape ``(data.shape[1], r)`` with standard-normal entries; each
    column is one hash function.
    """
    # Unpacking (rather than indexing) keeps the requirement that data is 2-D.
    _, n_features = data.shape
    return np.random.standard_normal(size=(n_features, r))

def create_hash_buckets(data, hash_functions, r):
    """Group the rows of ``data`` by their r-bit hash value.

    Parameters
    ----------
    data : 2-D array whose rows are the items to hash.
    hash_functions : array of shape ``(data.shape[1], r)``.
    r : number of hash functions, i.e. bits per hash value.

    Returns
    -------
    dict mapping each hash value to the list of row indices (in increasing
    order) that produced it.
    """
    # One bit per hash function: is the projection non-negative?
    sign_bits = np.dot(data, hash_functions) >= 0
    # Powers of two, most significant bit first, turn the bit row into an integer.
    bit_weights = 2 ** np.array(range(r - 1, -1, -1))
    bucket_ids = np.dot(sign_bits, bit_weights)

    buckets = {}
    for row, bucket_id in enumerate(bucket_ids):
        buckets.setdefault(bucket_id, []).append(row)
    return buckets


def create_hash_tables(data, r, b):
    """Build ``b`` independent hash tables over the rows of ``data``.

    Parameters
    ----------
    data : 2-D (song, user) matrix.
    r : number of hash functions per table.
    b : number of tables to build.

    Returns
    -------
    (l_hash_functions, l_dict_hash): two lists of length ``b`` holding, for
    each table, its hash functions and its bucket dictionary.
    """
    # Unused values, but the unpacking rejects non-2-D input even when b == 0.
    n_rows, n_cols = data.shape

    l_hash_functions, l_dict_hash = [], []
    for _ in range(b):
        projections = create_hash_functions(data, r)
        l_hash_functions.append(projections)
        l_dict_hash.append(create_hash_buckets(data, projections, r))
    return l_hash_functions, l_dict_hash


def similar_items(data, idx, l_hash_functions, l_dict_hash, r , b  ):
    """Cosine similarity between row ``idx`` and its LSH bucket-mates.

    Parameters
    ----------
    data : 2-D (song, user) matrix.
    idx : row index of the query song.
    l_hash_functions : list of ``b`` arrays of hash functions.
    l_dict_hash : list of ``b`` dicts mapping hash value -> row indices.
    r : number of hash functions per table.
    b : number of tables to consult.

    Returns
    -------
    List of ``(row_index, cosine_similarity)`` tuples, sorted by row index,
    for every other row sharing a bucket with ``idx`` in at least one table.
    Empty when the query has no bucket-mates.
    """
    # Powers of two, most significant bit first (same encoding as the tables).
    bit_weights = 2 ** np.array(range(r - 1, -1, -1))
    query = data[idx, :]

    sim_items = set()
    for i in range(b):
        sign_bits = np.dot(query, l_hash_functions[i]) >= 0
        hash_val = np.dot(sign_bits, bit_weights)
        # A hash value missing from the table contributes no candidates.
        sim_items.update(l_dict_hash[i].get(hash_val, ()))

    # discard() tolerates tables in which idx is not in its own bucket.
    sim_items.discard(idx)
    if not sim_items:
        # Previously np.array([]) (a float array) was used as an index here,
        # which raised IndexError for songs without any bucket-mate.
        return []

    # Materialise the candidates once so indices and scores line up.
    candidates = sorted(sim_items)
    sims = cosine_similarity(csr_matrix(data[np.array(candidates), :]),
                             csr_matrix(query.reshape(1, -1)))[:, 0]
    return list(zip(candidates, sims))

def all_similar_items(data, idx, l_hash_functions, l_dict_hash, r , b  ):
    """Cosine similarity between row ``idx`` and every other row of ``data``.

    Exhaustive counterpart of ``similar_items``. The hash-table arguments
    (``l_hash_functions``, ``l_dict_hash``, ``r``, ``b``) are accepted only so
    both functions share a signature; they are not used.

    Parameters
    ----------
    data : 2-D (song, user) matrix.
    idx : row index of the query song, ``0 <= idx < data.shape[0]``.

    Returns
    -------
    List of ``(row_index, cosine_similarity)`` tuples in increasing row order
    for all rows except ``idx``. Empty when ``data`` has a single row.

    Raises
    ------
    KeyError : if ``idx`` is not a valid non-negative row index.
    """
    n_items = data.shape[0]
    if not 0 <= idx < n_items:
        # Same exception type the original set.remove(idx) produced.
        raise KeyError(idx)

    candidates = [i for i in range(n_items) if i != idx]
    if not candidates:
        # Indexing with an empty (float) array used to raise IndexError.
        return []

    sims = cosine_similarity(csr_matrix(data[np.array(candidates), :]),
                             csr_matrix(data[idx, :].reshape(1, -1)))[:, 0]
    return list(zip(candidates, sims))

In [1045]:
def songs_sim_matrix_rb(songs_rb, data):
    """Expand per-song neighbour lists into a dense song-by-song matrix.

    Parameters
    ----------
    songs_rb : sequence with one entry per song; entry ``i`` is a list of
        ``(other_song_index, similarity)`` tuples.
    data : array of shape (songs, users); only ``data.shape[0]`` is used.

    Returns
    -------
    ``(n_songs, n_songs)`` array with ``[i, j]`` set to the similarity listed
    for ``j`` in ``songs_rb[i]`` and 0 everywhere else.
    """
    n_songs = data.shape[0]
    matrix = np.zeros((n_songs, n_songs))

    for row in range(n_songs):
        for entry in songs_rb[row]:
            matrix[row, entry[0]] = entry[1]

    return matrix


def predict_individual(song_sim_matrix,data, user_idx, song_idx, baseline, K):
    """Predict one user's value for one song from the songs they listened to.

    The prediction is the similarity-weighted average of the user's values
    over (at most) the ``K`` listened songs most similar to ``song_idx``.

    Parameters
    ----------
    song_sim_matrix : item-item matrix of similarity scores.
    data : (user, song) matrix; non-zero entries mark listened songs.
    user_idx : row of ``data`` to predict for.
    song_idx : song (row of ``song_sim_matrix``) to predict.
    baseline : value returned when no prediction can be formed.
    K : maximum number of neighbours.

    Returns
    -------
    The weighted average, or ``baseline`` when the user has no non-zero
    entries, the selected similarities sum to zero, or the result is NaN.
    """
    user_items = data[user_idx].nonzero()[0]
    if len(user_items) == 0:
        return baseline

    # Listened songs ordered by decreasing similarity to the target, top K kept.
    neighbors = user_items[song_sim_matrix[song_idx, user_items].argsort()[::-1][0:K]]
    weights = song_sim_matrix[song_idx, neighbors]

    total_weight = weights.sum()
    if total_weight == 0:
        # No usable similarity mass: avoid the 0/0 division (and its
        # RuntimeWarning) that previously produced the NaN fallback.
        return baseline

    pred = data[user_idx, neighbors].dot(weights) / total_weight

    if np.isnan(pred):
        pred = baseline
    return pred

def predict_user_topNsongs(data,user_id,song_sim_matrix, K, N, users_dict,songs_dict,baseline, already_listened=False):
    """Score every song for one user and rank them.

    Parameters
    ----------
    data : (user, song) matrix.
    user_id : id of the user to predict for; must be a key of ``users_dict``.
    song_sim_matrix : item-item similarity matrix.
    K : number of neighbours passed to ``predict_individual``.
    N : length of the two top-N id lists.
    users_dict : user id -> row index of ``data``.
    songs_dict : song id -> column index of ``data``.
    baseline : fallback prediction passed to ``predict_individual``.
    already_listened : when False (default) songs the user has a non-zero
        entry for are excluded from the recommendation ranking.

    Returns
    -------
    (df_pred, df_listened, top_recommendation, top_listened)
        df_pred : DataFrame (pred, song_id, song_idx) of candidate songs,
            sorted by decreasing prediction.
        df_listened : same columns plus listen_count, for listened songs.
        top_recommendation / top_listened : song ids of the first ``N`` rows
            of the respective frame.

    Raises
    ------
    KeyError : if ``user_id`` is not in ``users_dict``.
    """
    n = song_sim_matrix.shape[0]

    # The original raised a bare string here, which is itself a TypeError,
    # and hid it behind a bare ``except``.
    if user_id not in users_dict:
        raise KeyError("Application ERROR: User Not Found")
    user_idx = users_dict[user_id]

    inverse_songs_dict = dict(zip(songs_dict.values(), songs_dict.keys()))
    listened_items = data[user_idx].nonzero()[0]

    preds = {}
    for j in range(n):
        preds[j] = predict_individual(song_sim_matrix,data,user_idx,j,baseline,K)

    # Move listened songs into their own ranking; they keep the *predicted*
    # score so it can be compared against the actual listen count.
    listened = {}
    for k in listened_items:
        listened[k] = preds[k]
        if not already_listened:
            del preds[k]

    sort_listened = sorted(listened.items(), key=lambda x: x[1], reverse = True)
    sort_pred = sorted(preds.items(), key=lambda x: x[1], reverse = True)

    top_recommendation = [inverse_songs_dict[song_idx] for song_idx, _ in sort_pred[:N]]
    top_listened = [inverse_songs_dict[song_idx] for song_idx, _ in sort_listened[:N]]

    df_pred = pd.DataFrame({ 'pred' : [score for _, score in sort_pred],
                            'song_id' : [inverse_songs_dict[song_idx] for song_idx, _ in sort_pred],
                            'song_idx' : [song_idx for song_idx, _ in sort_pred]
                           })

    df_listened = pd.DataFrame({ 'pred' : [score for _, score in sort_listened],
                            'song_id' : [inverse_songs_dict[song_idx] for song_idx, _ in sort_listened],
                            'song_idx' : [song_idx for song_idx, _ in sort_listened],
                            'listen_count' : [data[user_idx, song_idx] for song_idx, _ in sort_listened]
                           })

    return df_pred, df_listened, top_recommendation,  top_listened
    


In [1046]:
# Exhaustive (non-LSH) similarities of song index 5 against every other song.
# all_similar_items never reads its hash-table arguments, so placeholders are
# passed; the original referenced l_hash / l_dict / r / b, which are only
# created in a later cell and are undefined on a fresh kernel.
test1 = all_similar_items(data.T, 5, None, None, None, None)

In [1047]:
r = 3    # hash functions (bits) per table
b = 10   # number of hash tables

# The hyperplanes are drawn from NumPy's global RNG; seed it so the buckets,
# the similarity matrix and the recommendations are reproducible across runs.
np.random.seed(42)
l_hash, l_dict = create_hash_tables(data.T, r,b)
test1 = similar_items(data.T, 5, l_hash, l_dict, r , b  )

# LSH neighbours (with cosine similarity) for every song, timed.
songs_33 = []
start_time = time.time()
for i in range(len(data.T)):
    songs_33.append(similar_items(data.T, i, l_hash, l_dict, r , b  ))
end_time = time.time()
print(end_time-start_time)

song_sim_matrix = songs_sim_matrix_rb(songs_33, data.T)

# Predictions for one user with K=10 neighbours and top N=10 songs;
# songs the user already listened to are kept out of the recommendations.
df_pred, df_listened, top_recommendation,  top_listened = predict_user_topNsongs(data,
                                                         '6a944bfe30ae8d6b873139e8305ae131f1607d5f',
                                                          song_sim_matrix, 10, 10, users_dict, songs_dict,
                                                          baseline, False )



# 7c160d10387a7d1b84a2f6f3f318037ed91e9feb - phani
# 002b63a7e2247de6d62bc62f253474edc7dd044c - harish
# 6b2787bcf47cfea75b734729cdaddd700137ad5a - max count 900 - not in train
# 6a944bfe30ae8d6b873139e8305ae131f1607d5f 

1.41766095161


In [1048]:
# Song metadata, restricted to the id, artist and track-name columns.
song_info = pd.read_csv('../data/song_info.csv')[['song_id', 'artist_name', 'track_name']]


In [1049]:
# Listened songs: actual listen count next to the predicted score.
df_listened[['song_id', 'listen_count', 'pred']]

Unnamed: 0,song_id,listen_count,pred
0,SOSXLTC12AF72A7F54,15.0,53.207586
1,SOOFYTN12A6D4F9B35,6.0,44.120165
2,SOANQFY12AB0183239,17.0,38.017503
3,SOKUPAO12AB018D576,19.0,35.54246
4,SOAXGDH12A8C13F8A1,10.0,33.986667
5,SOUVTSM12AC468F6A7,32.0,18.271571
6,SOTWNDJ12A8C143984,4.0,18.227967
7,SOPUCYA12A8C13A694,6.0,16.186765
8,SOPPROJ12AB0184E18,3.0,16.17127
9,SOXWYZP12AF72A42A6,4.0,15.671209


In [1050]:
# Attach artist / track names to the listened songs. The inner join drops
# any song_id that has no row in song_info.
pd.merge(df_listened[['song_id', 'listen_count', 'pred']],
         song_info, how='inner', on='song_id').reset_index(drop=True)

Unnamed: 0,song_id,listen_count,pred,artist_name,track_name
0,SOSXLTC12AF72A7F54,15.0,53.207586,Kings Of Leon,Revelry
1,SOOFYTN12A6D4F9B35,6.0,44.120165,Alliance Ethnik,Représente
2,SOANQFY12AB0183239,17.0,38.017503,Muse,Uprising
3,SOKUPAO12AB018D576,19.0,35.54246,Paramore,The Only Exception (Album Version)
4,SOAXGDH12A8C13F8A1,10.0,33.986667,Florence + The Machine,Dog Days Are Over (Radio Edit)
5,SOUVTSM12AC468F6A7,32.0,18.271571,Lil Wayne / Eminem,Drop The World
6,SOTWNDJ12A8C143984,4.0,18.227967,Train,Marry Me
7,SOPUCYA12A8C13A694,6.0,16.186765,Five Iron Frenzy,Canada
8,SOPPROJ12AB0184E18,3.0,16.17127,Kid Cudi / MGMT / Ratatat,Pursuit Of Happiness (nightmare)
9,SOXWYZP12AF72A42A6,4.0,15.671209,Florence + The Machine,Cosmic Love


In [1008]:
# Attach artist / track names to the predicted songs. The inner join drops
# any song_id that has no row in song_info.
pd.merge(df_pred[['song_id', 'pred']],
         song_info, how='inner', on='song_id').reset_index(drop=True)

Unnamed: 0,song_id,pred,artist_name,track_name
0,SOBONKR12A58A7A7E0,15.160464,Dwight Yoakam,You're The One
1,SOOFYTN12A6D4F9B35,14.094696,Alliance Ethnik,Représente
2,SOUNZHU12A8AE47481,13.040409,Ron Carter,I CAN'T GET STARTED
3,SOMGIYR12AB0187973,10.410071,Panic At The Disco,Behind The Sea [Live In Chicago]
4,SOEGIYH12A6D4FC0E3,10.394962,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Horn Concerto No. 4 in E flat K495: II. Romanc...
5,SOPUCYA12A8C13A694,10.324873,Five Iron Frenzy,Canada
6,SOBOUPA12A6D4F81F1,10.087090,Alliance Ethnik,Sincerité Et Jalousie
7,SONNSYV12A8C146BEC,10.042086,Modest Mouse,Float On
8,SOTWNDJ12A8C143984,9.454203,Train,Marry Me
9,SOLRGVL12A8C143BC3,9.315622,La Roux,Bulletproof


In [1009]:
# Training rows ordered by listen count, highest first.
train_set.sort_values('listen_count',ascending = False)

Unnamed: 0,user,song,listen_count
5344,6a944bfe30ae8d6b873139e8305ae131f1607d5f,SOBONKR12A58A7A7E0,271
41529,7ffc14a55b6256c9fa73fc5c5761d210deb7f738,SOPUCYA12A8C13A694,199
42204,2cbbe7a3262dceea01bdc048b7a260de817c6436,SOPUCYA12A8C13A694,195
7510,d2ac268dd7f11d013c3ec96c6dc0937ba7fe731f,SOSXLTC12AF72A7F54,194
74691,410ffb81264fbd17afc68d2a671d22e1e463fc32,SOFCPOU12A8C13BF40,189
51524,4be305e02f4e72dad1b8ac78e630403543bab994,SOEGIYH12A6D4FC0E3,180
70320,6a8fb4968a5f3b0a5b3708258e2582f66001b15a,SOHTKMO12AB01843B0,165
92776,9b06094a34c6c421b86df33c879b8980d1038cd8,SOXWYZP12AF72A42A6,160
48597,a06317669f449e306a86ef0dcb6d53a073f6970c,SORAHAG12AB0182BD0,153
70817,4be305e02f4e72dad1b8ac78e630403543bab994,SOHTKMO12AB01843B0,145


In [887]:



# sim_songs_matrix_33(songs_33,data.T)
# for j in range(len(songs_33[0])):
#     print j, songs_33[0][j][0],songs_33[0][j][1]

# Neighbours of the test song ordered by increasing similarity, and the
# entry at position 10 of the decreasing order (needs at least 11 neighbours).
sorted_test = sorted(test1, key=lambda tup: tup[1]) #[::-10]
sorted_test1 = sorted(test1, key=lambda tup: tup[1])[::-1][10]

# The builder is named songs_sim_matrix_rb; the old songs_sim_matrix_33 name
# called here is not defined anywhere and raised NameError.
sim_songs_matrix_33 = songs_sim_matrix_rb(songs_33, data.T)

print(sim_songs_matrix_33[1])

# Manual walk-through of a prediction for user row 1 and song row 1.
listened_items = data[1].nonzero()[0]

# Similarities of song 1 to the listened songs, and the positions of the
# (up to) 10 largest of them.
aa = sim_songs_matrix_33[1,listened_items]
bb = aa.argsort()[::-1][0:10]
print(bb)

# Song indices of those neighbours, the user's counts and the similarities.
nb = listened_items[bb]
print(nb)
print(data[1,nb])
print(sim_songs_matrix_33[1,nb])

# Similarity-weighted average of the user's counts over the neighbours.
pred = (data[1,nb].dot(sim_songs_matrix_33[1,nb])/sum(sim_songs_matrix_33[1,nb]))
print(pred)

# bb = sim_songs_matrix_33[1,listened_items].argsort()[::-1][0:10]

# print aa
# print bb
# print sim_songs_matrix_33[1,bb]
# print sim_songs_matrix_33[1,0],sim_songs_matrix_33[1,5]
# print bb[0],bb[1]

# print listened_items
# print songs_33[0][0]
# print data.T.shape[0]

# items_for_comparision = listened_items[songs_33[0][listened_items].argsort()[::-1][1:10]]




def predict_user_song(data, user_idx,song_idx, K_neighbors):
    """Unimplemented placeholder: ignores its arguments and returns None."""
    return None



[0.         0.         0.         0.         0.         0.
 0.11009274 0.38706416 0.1492808  0.099211   0.52589397 0.
 0.         0.08532206 0.29276318 0.32531264 0.18588462 0.
 0.         0.         0.         0.18929515 0.23280467 0.
 0.         0.11254555 0.         0.         0.         0.
 0.01740221 0.15115692 0.         0.         0.30113825 0.03373719
 0.         0.31922435 0.07286421 0.07813215 0.         0.01706989
 0.07168326 0.         0.32633248 0.47629864 0.26313507 0.24603043
 0.36991906 0.06985408 0.         0.         0.27677936 0.
 0.41363093 0.02853713 0.         0.12315356 0.         0.
 0.26000054 0.38893532 0.         0.         0.         0.
 0.         0.25059937 0.38791776 0.34927051 0.08036496 0.27530062
 0.09637394 0.         0.25697272 0.11261024 0.38728307 0.
 0.         0.14391133 0.268133   0.         0.11210605 0.
 0.         0.         0.         0.         0.         0.34804223
 0.04301663 0.         0.         0.         0.         0.1929717
 0.      

In [774]:
# Inspect user row 760: listened song columns and their counts.
listened_items = data[760].nonzero()[0]
print(listened_items)
print(data[760,listened_items])
# Row index -> user id. Built here because the original relied on
# flip_users_dict from a later cell, which is undefined on a fresh kernel.
flip_users_dict = dict(zip(users_dict.values(), users_dict.keys()))
flip_users_dict[760]

[ 4 11 25 28 52 53 95]
[3. 4. 1. 1. 2. 1. 4.]


'7c160d10387a7d1b84a2f6f3f318037ed91e9feb'

In [775]:
# Column index assigned to this song id.
songs_dict['SOUFPNI12A8C142D19']

89

In [776]:
# Inverse lookups: song column index -> song id, user row index -> user id.
flip_songs_dict = dict(zip(songs_dict.values(), songs_dict.keys()))
flip_users_dict = dict(zip(users_dict.values(), users_dict.keys()))
# Song ids for the indices currently held in listened_items.
aab = [flip_songs_dict[i] for i in listened_items]
print aab
print flip_users_dict[760]

['SOERYLG12A6701F07F', 'SOXQYSC12A6310E908', 'SOXWYZP12AF72A42A6', 'SOPQLBY12A6310E992', 'SOEBOWM12AB017F279', 'SOCVTLJ12A6310F0FD', 'SOLLNTU12A6701CFDC']
7c160d10387a7d1b84a2f6f3f318037ed91e9feb


In [777]:
# def predict_user_topNsongs(data,user_id,song_sim_matrix, K, N, users_dict,songs_dict,baseline, already_listened=False):

song_sim_matrix = songs_sim_matrix_rb(songs_33, data.T)

# predict_user_topNsongs returns four values (df_pred, df_listened,
# top_recommendation, top_listened). Unpacking only two raised ValueError.
# The two frames are bound to throwaway names so that df_pred / df_listened
# from other cells are not overwritten.
_unused_pred, _unused_listened, top_recommendation,  top_listened = predict_user_topNsongs(
                                                          data,'7c160d10387a7d1b84a2f6f3f318037ed91e9feb',
                                                          song_sim_matrix, 20, 10, users_dict, songs_dict,
                                                          baseline, False )



<type 'list'>




In [None]:


def song_details(top_recommendation,top_listened):
    """Unfinished stub.

    NOTE(review): neither argument is used and nothing is assigned inside the
    function, so the returned names are resolved as globals at call time.
    ``df_rec`` is not defined in any visible cell, so a call would raise
    NameError unless it exists elsewhere. TODO implement or remove.
    """
    
    return df_rec, df_listened

In [778]:
# Scratch check: store one matrix cell (user row 760, song column 4) in a dict.
dic = {}
dic[4] = data[760,4]
print dic[4]

3.0


In [988]:
# Compare the standalone result held in test1 with the entry stored for
# song index 5 in songs_33.
print test1
print songs_33[5]

[(2, 0.4266608045519489), (3, 0.06163785494295232), (4, 0.038392385746377176), (6, 0.051884588117956265), (8, 0.05073766360594587), (11, 0.047501812367058936), (12, 0.07549321196412948), (13, 0.678096363220239), (14, 0.07785576945727175), (15, 0.056750387680542476), (16, 0.040653578794302175), (17, 0.13994826302367874), (18, 0.1740736714844466), (19, 0.03218874507956537), (20, 0.03587976124341174), (21, 0.06275134940267874), (22, 0.06688328579638526), (23, 0.04306148386098603), (24, 0.05589922385354356), (25, 0.08244440353001885), (26, 0.042837502097885255), (28, 0.07405159157708006), (29, 0.06650875819643555), (30, 0.031189726564067245), (31, 0.07169009603798077), (32, 0.22447437138616128), (33, 0.04078488267972729), (34, 0.06122159677072335), (35, 0.27941900102767886), (36, 0.08442656007103978), (37, 0.06661187976452704), (38, 0.05126474223962003), (40, 0.08446062262098486), (41, 0.323581142805657), (44, 0.12030368190495686), (45, 0.04739527869610347), (46, 0.04348464801565394), (47,

In [779]:
# Song ids of the top recommendations from the last prediction call.
print top_recommendation

['SOTWNDJ12A8C143984', 'SOIBLKQ12AB0183E85', 'SOFLJQZ12A6D4FADA6', 'SOKUPAO12AB018D576', 'SOMGIYR12AB0187973', 'SOOXJDU12A8AE47ECB', 'SOPPROJ12AB0184E18', 'SOAFTRR12AF72A8D4D', 'SOAKMDU12A8C1346A9', 'SOEGIYH12A6D4FC0E3']


In [780]:
# Song ids of the user's listened songs, ranked by predicted score.
print top_listened

['SOXQYSC12A6310E908', 'SOLLNTU12A6701CFDC', 'SOERYLG12A6701F07F', 'SOEBOWM12AB017F279', 'SOCVTLJ12A6310F0FD', 'SOXWYZP12AF72A42A6', 'SOPQLBY12A6310E992']


In [989]:
# Row index of one user id. `udict` is not defined in any visible cell;
# users_dict is the user-id -> row-index map built with the data matrix.
# NOTE(review): confirm udict was not meant to be a different mapping.
print(users_dict['e4b426ac0d1cb8ec2daf5370b106e270a7dd5b06'])
# Song columns with a non-zero count for user row 214.
print(data[214].nonzero()[0])
print(test1[0:2])

K = 10

# def top_N_songs(data)

214
[ 3  4  5  8 11 15 16 18 22 24 25 31 34 37 38 40 45 47 51 53 54 62 63 65
 67 70 71 74 76 77 78 85 87 91 99]
[(2, 0.4266608045519489), (3, 0.06163785494295232)]
