In [85]:
import scipy.io
from scipy.sparse import csr_matrix
import numpy
import math
import pandas

## Train / Valid / Test Split

In [126]:
# Load the tab-separated triplets file. header=None means the columns are
# labelled 0, 1, 2 (later cells select the user column as tripletable[0]).
# Presumably the fields are (user id, song id, play count) - confirm against the dataset docs.
tripletable= pandas.read_csv('train_triplets.txt',header=None,sep='\t')

In [127]:
tripletable.head()

Unnamed: 0,0,1,2
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


In [186]:
# Distinct values of column 0 (the user column), as a NumPy array.
users = tripletable[0].unique()

In [187]:
users

array(['b80344d063b5ccb3212f76538f3d9e43d87dca9e',
       '85c1f87fea955d09b4bec2e36aee110927aedf9a',
       'bd4c6e843f00bd476847fb75c47b4fb430a06856', ...,
       'ca99d94daa9d5231643a08aac2f3bfb645e73b09',
       'cf8289419383259189afe6bb50c5115fd84f1064',
       'b7815dbb206eb2831ce0fe040d0aa537e2e800f7'], dtype=object)

In [188]:
# Seed the global NumPy RNG so the in-place shuffle - and therefore the
# train/valid/test user split derived from it - is identical on every run.
numpy.random.seed(0)
numpy.random.shuffle(users)

In [189]:
users

array(['5aa89846fc956c7b9182dc0be83e4cf5d7c4f15c',
       '710f432a110099dc00385340df6c3ad1660b939a',
       'c273b31c03021e2278e207fdf40dfaf0869224d7', ...,
       '8999f199f8d04eb4f631f8653d563f6dcb831747',
       '514cb3f217fea724f37300e8dd7eb2ac08cd5788',
       'fcdb6fb8c555352b1e5e2edfde507deaccf77f70'], dtype=object)

In [190]:
# Cut points into the shuffled user array: the first 60% of users end at
# train_split, the next 20% end at valid_split.
train_split = int(0.6*len(users))
valid_split = int(0.2*len(users)) + train_split

In [191]:
# Slice the shuffled users at the two cut points: [0, train_split) is train,
# [train_split, valid_split) is valid, and the remainder is test.
train_users, valid_users, test_users = numpy.split(users, [train_split, valid_split])

In [203]:
# Keep every triplet row whose user (column 0) is one of the training users.
train_data = tripletable.loc[tripletable[0].isin(train_users)]

In [204]:
# Keep every triplet row whose user (column 0) is one of the validation users.
valid_data = tripletable.loc[tripletable[0].isin(valid_users)]

In [205]:
# Keep every triplet row whose user (column 0) is one of the test users.
test_data = tripletable.loc[tripletable[0].isin(test_users)]

In [211]:
# Persist the three splits as tab-separated files with no header row and no
# index column, i.e. the same layout pandas.read_csv was given for the input.
train_data.to_csv('train_data.txt',sep='\t',header=False,index=False)
valid_data.to_csv('valid_data.txt',sep='\t',header=False,index=False)
test_data.to_csv('test_data.txt',sep='\t',header=False,index=False)

In [213]:
# Renumber the validation rows 0..n-1 so the parity of the index can be used
# to split them. reset_index(drop=True) builds a new frame instead of
# assigning to the index of the filtered frame in place.
valid_data = valid_data.reset_index(drop=True)

In [215]:
valid_data.head()

Unnamed: 0,0,1,2
0,e006b1a48f466bf59feefed32bec6494495a4436,SOAUWYT12A81C206F1,2
1,e006b1a48f466bf59feefed32bec6494495a4436,SOAXGDH12A8C13F8A1,2
2,e006b1a48f466bf59feefed32bec6494495a4436,SOBFMHC12A6D4F9401,1
3,e006b1a48f466bf59feefed32bec6494495a4436,SOBONKR12A58A7A7E0,2
4,e006b1a48f466bf59feefed32bec6494495a4436,SOBTRCD12A6701E976,2


In [216]:
# Rows with an even index label form the "visible" half of the validation data.
valid_visible = valid_data.loc[valid_data.index%2==0]

In [217]:
# Rows with an odd index label form the held-out half that is to be predicted.
valid_predict = valid_data.loc[valid_data.index%2!=0]

In [221]:
# Write the visible validation rows as a headerless, index-free TSV.
valid_visible.to_csv('valid_visible.txt',sep='\t',header=False,index=False)

In [222]:
# Write the held-out validation rows as a headerless, index-free TSV.
valid_predict.to_csv('valid_predict.txt',sep='\t',header=False,index=False)

## create a mapping from a song ID to the number of times this song appears

In [63]:
def song2count(filename):
    """Count how many lines of a triplet file mention each song.

    Args:
        filename: path to a text file with exactly three tab-separated
            fields per line; the second field is used as the song ID.

    Returns:
        dict mapping each song ID to the number of lines it appears on.

    Raises:
        ValueError: if a line does not split into exactly three fields.
    """
    song_to_count = dict()
    # `with` closes the file even when a malformed line raises mid-loop.
    with open(filename, 'r') as f:
        for line in f:
            _, song, _ = line.strip().split('\t')
            song_to_count[song] = song_to_count.get(song, 0) + 1
    return song_to_count

In [64]:
# Build the song -> occurrence-count mapping; the chained assignment binds the
# same dict to both `song_to_count` and `count`.
# NOTE(review): this counts over the full 'train_triplets.txt', not the
# 'train_data.txt' split written above, so validation/test users contribute
# to the counts - confirm that is intended.
song_to_count=count = song2count('train_triplets.txt')

KeyboardInterrupt: 

## re-order the songs by decreasing popularity

In [6]:
# Song IDs sorted from most to least frequent according to song_to_count.
songs_ordered = sorted(song_to_count, key=song_to_count.get, reverse=True)

## create a mapping from users to songs

In [223]:
def user2song(filename):
    """Collect, for every user in a triplet file, the set of songs on their lines.

    Args:
        filename: path to a text file with exactly three tab-separated
            fields per line; the first field is used as the user ID and
            the second as the song ID.

    Returns:
        dict mapping each user ID to the set of song IDs seen with it.

    Raises:
        ValueError: if a line does not split into exactly three fields.
    """
    user_to_songs = dict()
    # `with` closes the file even when a malformed line raises mid-loop.
    with open(filename, 'r') as f:
        for line in f:
            user, song, _ = line.strip().split('\t')
            user_to_songs.setdefault(user, set()).add(song)
    return user_to_songs

In [224]:
# user id -> set of song ids, read from the training split file.
u2s= user2song('train_data.txt')

In [225]:
# user id -> set of song ids, read from the visible half of the validation split.
validu2s = user2song('valid_visible.txt')

In [228]:
# user id -> set of song ids, read from the held-out half of the validation split.
# NOTE(review): this rebinds `valid_predict`, which an earlier cell bound to a
# DataFrame, to a dict - re-running that earlier cell afterwards changes its type back.
valid_predict=user2song('valid_predict.txt')

## get user list

In [231]:
# Materialise the validation user ids as a list: the evaluation code indexes
# this sequence by position, which a Python 3 dict keys view does not support.
# On Python 2 this is the same list of keys in the same order as .keys().
users = list(validu2s)

## User Based Recommendation Score Calculation

In [232]:
def user_based_score(user_songs,u2s_matrix,alpha,Q):
    """Score songs for one target user from the song sets of other users.

    For every user ``v`` in ``u2s_matrix`` a weight is computed from the
    overlap between ``v``'s songs and ``user_songs``:

        w = (|overlap| / (|user_songs|**alpha * |songs(v)|**(1 - alpha))) ** Q

    with ``w = 0`` when there is no overlap. ``w`` is then added to the
    score of every song in ``songs(v)``.

    Args:
        user_songs: set of songs of the target user.
        u2s_matrix: mapping from user to that user's set of songs.
        alpha: exponent balancing the two set sizes in the normalisation.
        Q: exponent applied to the normalised weight.

    Returns:
        dict mapping every song found in ``u2s_matrix`` to its accumulated
        score; songs only held by non-overlapping users are present with 0.
    """
    s_scores = {}
    for user in u2s_matrix:
        neighbour_songs = u2s_matrix[user]
        w = float(len(neighbour_songs & user_songs))
        if w > 0:
            l1 = len(user_songs)
            l2 = len(neighbour_songs)
            w /= (math.pow(l1, alpha) * (math.pow(l2, (1.0 - alpha))))
            w = math.pow(w, Q)
        # Zero weights are still accumulated so that every song ends up as a
        # key of the result, exactly as before.
        for s in neighbour_songs:
            s_scores[s] = s_scores.get(s, 0) + w
    return s_scores

## Create User-based recommendation list

In [234]:
# Build one recommendation list per validation user, in the order of `users`.
l_rec_songs=[]
i=0
tau=500  # maximum number of songs recommended per user
for user in users:
    # Score candidate songs from the training users' song sets.
    s_scores = user_based_score(validu2s[user],u2s,0.15,3)
    songs_ordered = sorted(s_scores.keys(),key=lambda s: s_scores[s],reverse=True)
    # Take the top-scoring songs, skipping those already in the user's visible set.
    cleaned_songs = []
    for x in songs_ordered:
        if len(cleaned_songs)>=tau:
            break
        if x not in validu2s[user]:
            cleaned_songs.append(x)
    l_rec_songs.append(cleaned_songs)
    # Progress indicator; print(i) behaves the same on Python 2 and Python 3.
    if i%1000==0:
        print(i)
    i+=1


0
1000
2000


KeyboardInterrupt: 

In [239]:
len(l_rec_songs)

2051

## Precision Measurement

In [6]:
def AP(l_rec, sMu, tau):
    """Average precision of one ranked recommendation list, truncated at tau.

    Args:
        l_rec: ranked list of recommended songs (best first).
        sMu: collection of songs counted as hits; membership is tested
            with ``in`` and its length is used for normalisation.
        tau: only the first ``tau`` recommendations are considered.

    Returns:
        Sum over hit positions k (1-based) of (hits so far / k), divided by
        ``min(len(sMu), tau)``. Returns 0.0 when that divisor is not
        positive (empty ``sMu`` or ``tau <= 0``) instead of dividing by zero.
    """
    n_relevant = len(sMu)
    denom = min(n_relevant, tau)
    if denom <= 0:
        # Nothing can be hit, so the precision is defined as 0.
        return 0.0
    nc = 0.0
    mapr_user = 0.0
    for j, s in enumerate(l_rec):
        if j >= tau:
            break
        if s in sMu:
            nc += 1.0
            mapr_user += nc / (j + 1)
    return mapr_user / denom

In [7]:
def mAP(l_users, l_rec_songs, u2s, tau):
    """Mean average precision over a list of users.

    Args:
        l_users: sequence of users, indexable by position.
        l_rec_songs: list of recommendation lists; entry ``i`` belongs to
            ``l_users[i]``.
        u2s: mapping from user to the songs counted as hits for that user.
        tau: truncation depth passed to ``AP``.

    Returns:
        Sum of ``AP`` over the users present in ``u2s``, divided by
        ``len(l_users)``. Users missing from ``u2s`` add nothing to the sum
        but still count in the divisor. Returns 0.0 for an empty
        ``l_users`` instead of dividing by zero.
    """
    # Float accumulator so the result is a float even if no AP is added.
    mapr = 0.0
    n_users = len(l_users)
    for i, l_rec in enumerate(l_rec_songs):
        user = l_users[i]
        if user not in u2s:
            continue
        mapr += AP(l_rec, u2s[user], tau)
    if n_users == 0:
        return 0.0
    return mapr / n_users