In [185]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.sparse as sps
import time
%matplotlib inline

#train_final.csv - the training set of interactions
train_final = pd.read_csv('input/train_final.csv', delimiter = "\t");

#tracks_final.csv - supplementary information about the items
tracks_final = pd.read_csv('input/tracks_final.csv', delimiter = "\t");

#playlists_final.csv - supplementary information about the users
playlists_final = pd.read_csv('input/playlists_final.csv', delimiter = "\t");

#target_playlists.csv - the set of target playlists that will receive recommendations
target_playlists = pd.read_csv('input/target_playlists.csv');

#target_tracks.csv - the set of target items (tracks) to be recommended
target_tracks = pd.read_csv('input/target_tracks.csv');


In [186]:
import time

In [184]:
#This step is not needed yet, will make ratings worse! 

def get_relevant_tracks():
    #Now we want to remove some redundant stuff. 

    #We will remove all songs which are not occurring more than 10 times in train_final
    #Nevertheless, we still want to keep all tracks which are in the target tracks.  

    popularity = train_final.groupby(by="track_id").playlist_id.nunique().to_frame()

    #remove index name
    popularity.reset_index(level = 0, inplace = True)

    #Rename the columns
    popularity.columns = ['track_id','occurrences']

    #Remove all targeted tracks - TESTED, working as expected
    tracks_relevant = popularity[~popularity['track_id'].isin(target_tracks['track_id'])]

    #Remove tracks occurring less than 10 times
    tracks_relevant = tracks_relevant[tracks_relevant['occurrences'] > 4]

    #Add the targeteted tracks back again
    tracks_relevant = pd.concat([tracks_relevant, target_tracks])

    return(tracks_relevant)

    print("Removed %s redundant tracks which occured less than 10 times." %(tracks_final-tracks_relevant))

tracks_relevant = get_relevant_tracks()

#Remove irrelevant tracks from train_final and tracks_final
train_final = train_final[train_final['track_id'].isin(tracks_relevant['track_id'])]

print("Train_final now contains %s interactions. " %(train_final.shape[0]))

tracks_final = tracks_final[tracks_final['track_id'].isin(tracks_relevant['track_id'])]

print("Tracks_final now contains %s tracks. "%(tracks_final.shape[0]))

Train_final now contains 945579 interactions. 
Tracks_final now contains 74542 tracks. 


In [187]:
#Now lets take a look at the tags.
tracks_final.head()

Unnamed: 0,track_id,artist_id,duration,playcount,album,tags
0,2972914,144,224000,49.0,[7],"[54087, 1757, 1718, 116712, 189631]"
1,2750239,246,157000,1.0,[8],"[189631, 3424, 177424, 46208, 205245]"
2,1550729,144,217000,554.0,[9],"[54087, 109806, 46869, 183258, 54337]"
3,2169950,144,207000,200.0,[9],"[54087, 70618, 207003, 109806, 116712]"
4,1903709,144,198000,5.0,[None],"[54087, 81223, 116712, 215342, 71028]"


In [188]:
### Translating all content ids into indexes.

#We need to create buckets for the playcount and duration. 
#Lets create buckets and a help function for the duration. 

n_duration_buckets = 3
def duration_to_bucket(duration, alternative = 2):
    if (alternative == 1):
        n_duration_buckets = 8
        if duration <= 0:
            print("Null duration reached bucket function. ")
            return None
        elif duration < 90000: #not a song
            return 1
        elif duration < 140000: #short song
            return 2
        elif duration < 220000: #radio song
            return 3
        elif duration < 340000: #normal song
            return 4
        elif duration < 480000: #long song
            return 5
        elif duration < 720000: #really long
            return 6
        elif duration < 1200000: #super long
            return 7
        elif duration >= 1200000: #mixtape/compilation
            return 8
    elif(alternative == 2):
        n_duration_buckets = 3
        if duration <= 0:
            print("Null duration reached bucket function. ")
            return None
        elif duration <= 150000: #very short
            return 1
        elif duration > 150000 and duration < 720000: #very long
            return 2
        elif duration >= 720000: #mixtape/compilation
            return 3
        else: 
            return 0
        

n_playcount_buckets = 7
def playcount_to_bucket(playcount):
    if playcount <= 0 or playcount is None:
        print("Null playcount reached bucket function. ")
        return None
    elif playcount < 254: #0,4 percentile not popular
        return 1
    elif playcount < 881: #0,6 perc: known
        return 2
    elif playcount < 1560: #0,7 popular
        return 3
    elif playcount < 2808: #0,8 very popular
        return 4
    elif playcount < 5900: #0,9 hits
        return 5
    elif playcount < 10494: #0,95 super hits
        return 6
    elif playcount >= 10494: # mega hits
        return 7


In [189]:
tracks_final['tags'].head()

content_to_index = {}
content_to_id = {}
content_counter = 0

#Lets translate the tags to indexes.
for row in tracks_final['tags']:
    tags = row.strip('[ ]').split(', ')
    for tag in tags:
        if len(tag) > 0: 
            tag = "t"+tag
            if not(tag in content_to_index):
                content_to_index[tag] = content_counter
                content_to_id[content_counter] = tag
                content_counter += 1;
                
#Lets translate album into indexes
albumcount = 0 # 27607
for album in tracks_final['album']:
    album = album.strip('[ ]')
    if album != None and album != "None" and len(album) > 0: #None should not be considered content
        album = "al"+album
        if album == "alNone":
            print(album)
        if not(album in content_to_index):
            content_to_index[album] = content_counter
            content_to_id[content_counter] = album
            content_counter += 1
            albumcount += 1

#Lets translate artist_id into indexes 
artistcount = 0 #17537
for artist in tracks_final['artist_id']:
    artist = str(artist)
    if artist != None and artist != "None" and len(artist) > 0: #None should not be considered content
        artist = "ar"+artist
        if not(artist in content_to_index):
            content_to_index[artist] = content_counter
            content_to_id[content_counter] = artist
            content_counter += 1
            artistcount += 1
        
"""
#Lets translate the duration buckets into indexes. 
for bucket in range(n_duration_buckets): 
    bucket = "d"+str(bucket+1)
    content_to_index[bucket] = content_counter
    content_to_id[content_counter] = bucket
    print("added %s" %(bucket))
    content_counter += 1

#Lets translate the playcount buckets into indexes. 
for playcount in range(n_playcount_buckets): 
    playcount = "p"+str(playcount+1)
    content_to_index[playcount] = content_counter
    content_to_id[content_counter] = playcount
    
    content_counter += 1


## Alternative 2: Just one content type per continous variable. 
#Fun thing to try: can I add all duration/playcounts in one col, normalizing from 0-1? 


content_to_index["duration"] = content_counter
content_to_id[content_counter] = "duration"
content_counter += 1

content_to_index["playcount"] = content_counter
content_to_id[content_counter] = "playcount"
content_counter += 1
"""

print(len(content_to_index))
print("%s albums. 27607 expected." %albumcount)
print("%s artists. 17537 expected." %artistcount)

77040
27604 albums. 27607 expected.
17536 artists. 17537 expected.


In [190]:
#If we translate each track_id to a track_index which will serve as matrix index, we can save a lot of time. 


#We need a way to get from track_id to index in O(1).
#Let's create a dictionary

track_to_id = {}
track_to_index = {}
track_ids = tracks_final['track_id']

counter = 0;
for track_id in tracks_final['track_id']:
    track_id = int(track_id)
    track_to_index[track_id] = counter
    track_to_id[counter] = track_id
    counter += 1;
    
#and a way to get from playlist_id to index in O(1)


playlist_to_index = {}
playlist_to_id = {}
counter = 0; 
for playlist_id in playlists_final['playlist_id']:
    playlist_id = int(playlist_id)
    playlist_to_index[playlist_id] = counter
    playlist_to_id[counter] = playlist_id
    counter += 1;
    
print("We have {} playlists with {} unique tracks with {} unique content types. ".format(len(playlist_to_index), len(track_to_index), len(content_to_index)))

We have 57561 playlists with 100000 unique tracks with 77040 unique content types. 


In [191]:
#Now we can create an Item Content Matrix. 

#ICM_all = np.zeros((len(tracks_indexes), len(tags_indexes)), int)
#ICM_all = sps.coo_matrix((len(track_to_index), len(content_to_index)), int)
#print(ICM_all.shape)


In [192]:
tracks_final[:10]

Unnamed: 0,track_id,artist_id,duration,playcount,album,tags
0,2972914,144,224000,49.0,[7],"[54087, 1757, 1718, 116712, 189631]"
1,2750239,246,157000,1.0,[8],"[189631, 3424, 177424, 46208, 205245]"
2,1550729,144,217000,554.0,[9],"[54087, 109806, 46869, 183258, 54337]"
3,2169950,144,207000,200.0,[9],"[54087, 70618, 207003, 109806, 116712]"
4,1903709,144,198000,5.0,[None],"[54087, 81223, 116712, 215342, 71028]"
5,2256817,144,218000,2.0,[9],"[54087, 109806, 189631, 49166, 116712]"
6,2561768,928,223000,249.0,[26],"[50764, 4425, 11056, 205245, 81223]"
7,474864,928,193000,73.0,[22],"[205245, 81223, 11056, 267, 3982]"
8,1378455,928,304000,73.0,[22],"[11056, 205245, 81223, 189631, 84597]"
9,1523190,928,206000,10.0,[22],"[205245, 11056, 81223, 4425, 189631]"


In [193]:
#So let's fill the ICM with our data.
import math

def build_ICM():
    
    no_interactions = train_final.shape[0]
    
    tracks_matrix = tracks_final.as_matrix()
    rows = np.zeros((no_interactions,), dtype = int)
    cols = np.zeros((no_interactions,), dtype = int)
    val = np.zeros((no_interactions,), dtype = int)
    #val[i] = value of row[i] col[i]
    #val = []
    counter = 0
    starttime = time.time()
    lasttime = starttime
    trackno = 0
    addedalbums = {} #for testing
    addedartists = {} # for testing
    for track in tracks_matrix: 
        track_id, artist_id, duration, playcount, album, tags = np.split(track, 6)

        #Get track index
        track_index = track_to_index[int(track_id[0])]

        
        #add artist
        
        artist_index = content_to_index["ar"+str(artist_id[0])]
        addedartists[artist_index] = 1
        
        rows[counter] = track_index
        cols[counter] = artist_index
        val[counter] = 1
        counter += 1

        #add album
        album = album[0].strip("[ ]")

        if album != None and len(album) > 0 and not album == "None":
            album_index = content_to_index["al"+album]
            addedalbums[album_index] = 1 #testing
            
            rows[counter] = track_index
            cols[counter] = album_index
            val[counter] = 1
            counter += 1

        #add tags
        tags = tags[0].strip('[ ]').split(', ')

        for tag in tags: 
            if len(tag) > 0:
                tag = "t"+tag
                tag_index = content_to_index[tag]

                rows[counter] = track_index
                cols[counter] = tag_index
                val[counter] = 1
                
                counter+=1
        """
        ## ALT 1: Continuous variables in different content types. 
        
        #add duration
        duration = int(duration)
        if duration > 0:
            duration_bucket = duration_to_bucket(duration)
            if duration_bucket > 0:   
                duration_index = content_to_index["d"+str(duration_bucket)]

                rows[counter] = track_index
                cols[counter] = duration_index

                counter+=1
        
        #add playcount
        if playcount is not None and playcount != "None" and not math.isnan(playcount):
            playcount = int(playcount)
            if playcount > 0: 
                playcount_bucket = playcount_to_bucket(playcount)
                playcount_index = content_to_index["p"+str(playcount_bucket)]

                rows[counter] = track_index
                cols[counter] = playcount_index
                counter+=1
        """
        
        """
        ## ALT 2: Continuous variables in one content type. 
        
        #add duration
        duration = int(duration)
        if duration > 0:
            duration_bucket = duration_to_bucket(duration)
            duration_index = content_to_index["duration"]

            rows[counter] = track_index
            cols[counter] = duration_index
            val[counter] = duration_bucket/n_duration_buckets
            
            counter+=1

        #add playcount
        if playcount is not None and playcount != "None" and not math.isnan(playcount):
            playcount = int(playcount)
            if playcount > 0: 
                playcount_bucket = playcount_to_bucket(playcount)
                playcount_index = content_to_index["playcount"]

                rows[counter] = track_index
                cols[counter] = playcount_index
                val[counter] = playcount_bucket/n_playcount_buckets

                
                counter+=1
        """
        if trackno%5000 == 0:
            print("Track %s of %s. %s s sec." %(trackno, tracks_matrix.shape[0], round(time.time()-starttime, 2)))  
        trackno += 1

    #Implicit ratings: all ratings are 1.             
    
    rows = rows[:counter]
    cols = cols[:counter]
    val = val[:counter]
    #val = np.ones(rows.shape, dtype = int)

    #Build ICM matrix. 
    ICM_all = sps.coo_matrix((val, (rows, cols)), dtype = int)
    
    print("Built ICM matrix with %s content values." %(val.shape[0]))
    
    print("%s albums. 27607 expected." %len(addedalbums))
    print("%s artists. 17537 expected." %len(addedartists))
    
    return ICM_all


#Build new ICM
ICM_all = build_ICM()
print("Done!")



#Get old ICM

Track 0 of 100000. 0.0 s sec.
Track 5000 of 100000. 0.1 s sec.
Track 10000 of 100000. 0.22 s sec.
Track 15000 of 100000. 0.34 s sec.
Track 20000 of 100000. 0.46 s sec.
Track 25000 of 100000. 0.58 s sec.
Track 30000 of 100000. 0.69 s sec.
Track 35000 of 100000. 0.8 s sec.
Track 40000 of 100000. 0.92 s sec.
Track 45000 of 100000. 1.03 s sec.
Track 50000 of 100000. 1.15 s sec.
Track 55000 of 100000. 1.27 s sec.
Track 60000 of 100000. 1.39 s sec.
Track 65000 of 100000. 1.5 s sec.
Track 70000 of 100000. 1.61 s sec.
Track 75000 of 100000. 1.73 s sec.
Track 80000 of 100000. 1.84 s sec.
Track 85000 of 100000. 1.95 s sec.
Track 90000 of 100000. 2.08 s sec.
Track 95000 of 100000. 2.2 s sec.
Built ICM matrix with 656745 content values.
27604 albums. 27607 expected.
17536 artists. 17537 expected.
Done!


In [195]:
ICM_all.shape
"""
with open("output/content_tags.txt",'w') as f:
    for content in content_to_index: 
        f.write(content+"\n")
"""
print(content_to_index["alNone"])

KeyError: 'alNone'

In [196]:
#Save the ICM

sps.save_npz("Saved Matrixes/ICM_all_coo", ICM_all)
print("Saved ICM!")

Saved ICM!


In [197]:
#Let's convert to csr. 
ICM_all = ICM_all.tocsr()
print("Converted")

Converted


In [198]:
target_tracks.head()

Unnamed: 0,track_id
0,1316175
1,3885714
2,3091270
3,226759
4,230596


In [199]:
def get_target_item_filter(indices):
    target_filter = np.zeros((indices), dtype = bool)
    for track in target_tracks.values:
        track_id = track[0]
        track_index = track_to_index[track_id]
        target_filter[track_index] = True
    print("Created filter preserving %s out of %s " %(np.count_nonzero(target_filter),target_filter.shape[0]))
    return target_filter

In [200]:
a= np.array([1, 2, 3, 4])
f = [True, False, True, True]
a[f]

array([1, 3, 4])

In [201]:
a = sps.csr_matrix(np.random.rand(3,3))
a[0.5 >= a] = 0

#print(sps.csr_matrix(a.todense()))
print(a)

# Vi har en csr.

# om vi loopar igenom den och plockar bort noise, sedan skapar ny matrix. 


#print(sps.csr_matrix(a.toarray()))

  (0, 0)	0.0
  (0, 1)	0.619816950386
  (0, 2)	0.858783595106
  (1, 0)	0.649576808799
  (1, 1)	0.0
  (1, 2)	0.0
  (2, 0)	0.0
  (2, 1)	0.735067713547
  (2, 2)	0.88801706132




In [202]:
def build_URM(train_test_split = 0.80):
    #Builds urm 
    
    #train_test_split = 1
    
    numInteractions = train_final.shape[0]

    train_mask = np.random.choice(a = [True,False], size = numInteractions, p = [train_test_split, 1-train_test_split])
    
    playlistList = train_final['playlist_id'].values
    itemList = train_final['track_id'].values

    #Translate ids
    playlistList_translated = np.zeros(playlistList.shape)
    itemList_translated = np.zeros(itemList.shape)
    ratingList = np.ones((playlistList.shape), int)
    
    for i in range(train_final.shape[0]):
        playlistList_translated[i] = playlist_to_index[playlistList[i]]
        itemList_translated[i] = track_to_index[itemList[i]]
    print("Translated ids to indexes.")
    
    #Build URM matrix. 
    URM_train = sps.coo_matrix((ratingList[train_mask], (playlistList_translated[train_mask], itemList_translated[train_mask])))
    URM_train = URM_train.tocsr()
    print("Built URM_train with shape %s,%s" %(URM_train.shape[0],URM_train.shape[1]))
    
    if train_test_split < 1: 
        #Build URM_test
        test_mask = np.logical_not(train_mask)
        URM_test = sps.coo_matrix((ratingList[test_mask], (playlistList_translated[test_mask], itemList_translated[test_mask])))
        URM_test = URM_test.tocsr()
        print("Built URM_test")
        testsize = (test_mask[test_mask == True].shape[0])

    else: 
        URM_test = sps.csc_matrix((10, 10), dtype=np.int8)
        testsize = 0
    
    
    trainsize = train_mask[train_mask == True].shape[0]
    totsize = trainsize + testsize
    print("Total datapoints: %s. Expected: %s" %(totsize,numInteractions))

    
    print(URM_train.shape)
    print(URM_test.shape)
    
    return URM_train, URM_test

URM_train, URM_test = build_URM(0.8)

Translated ids to indexes.
Built URM_train with shape 57560,100000
Built URM_test
Total datapoints: 1040522. Expected: 1040522
(57560, 100000)
(57560, 100000)


In [203]:
#Testing the URM builder.
print("URM_train contains %s interactions. Expected 1040422" %URM_train.nnz)
testcount = 0
traincount = 0
itr = 10000
for playlist_id, track_id in train_final[0:itr].values: 
    if (URM_train[playlist_to_index[playlist_id],track_to_index[track_id]]) > 0: 
        #print("Playlist %s with index %s and track %s with index %s was not in URM_train." %(playlist_id, playlist_to_index[playlist_id],track_id, track_to_index[track_id]))
        traincount += 1
    elif (URM_test[playlist_to_index[playlist_id],track_to_index[track_id]]) > 0:
        testcount += 1
        
print("Train: %s. Test: %s"%(traincount/itr, testcount/itr))
    

URM_train contains 832279 interactions. Expected 1040422
Train: 0.7963. Test: 0.2037


In [204]:
#Evaluation functions

def precision(recommended_items, relevant_items):
    
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    
    precision_score = np.sum(is_relevant, dtype=np.float32) / len(is_relevant)
    
    return precision_score

def recall(recommended_items, relevant_items):
    
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    
    recall_score = np.sum(is_relevant, dtype=np.float32) / relevant_items.shape[0]
    
    return recall_score

def MAP(recommended_items, relevant_items):
   
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    
    # Cumulative sum: precision at 1, at 2, at 3 ...
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))
    
    map_score = np.sum(p_at_k) / np.min([relevant_items.shape[0], is_relevant.shape[0]])

    return map_score

def evaluate_algorithm(URM_test, recommendations, at=5):
    
    starttime = time.time()
    cumulative_precision = 0.0
    cumulative_recall = 0.0
    cumulative_MAP = 0.0
    
    num_eval = 0
    
    playlists = target_playlists['playlist_id']

    for i, playlist_id in enumerate(playlists):
        
        if i % 500 == 0:
            print("User %d of %d, %d sec." % (i, len(playlists), round(time.time()-starttime)))

        relevant_items = URM_test[playlist_to_index[playlist_id]].indices
        
        if len(relevant_items)>0:
            
            recommended_items = recommendations.iloc[i,1:6]
            num_eval+=1

            cumulative_precision += precision(recommended_items, relevant_items)
            cumulative_recall += recall(recommended_items, relevant_items)
            cumulative_MAP += MAP(recommended_items, relevant_items)


    cumulative_precision /= num_eval
    cumulative_recall /= num_eval
    cumulative_MAP /= num_eval
    
    print("Recommender performance is: Precision = {:.4f}, Recall = {:.4f}, MAP = {:.4f}".format(
        cumulative_precision, cumulative_recall, cumulative_MAP))



In [205]:
import time
class Recommender(object):
    def __init__(self, URM, target_items, item_ids, k=50, shrinkage=100, similarity='cosine'):
        self.dataset = URM
        self.target_items = target_items
        self.target_item_filter = get_target_item_filter(tracks_final.shape[0])
        self.item_ids = item_ids
        self.k = k
        self.shrinkage = shrinkage
        self.similarity_name = similarity
        
        self.UIM = None
        
        if similarity == 'cosine':
            self.distance = Cosine(shrinkage=self.shrinkage)
        elif similarity == 'pearson':
            self.distance = Pearson(shrinkage=self.shrinkage)
        elif similarity == 'adj-cosine':
            self.distance = AdjustedCosine(shrinkage=self.shrinkage)
        else:
            raise NotImplementedError('Distance {} not implemented'.format(similarity))

    def __str__(self):
        return "Recommender(similarity={},k={},shrinkage={})".format(self.similarity_name, self.k, self.shrinkage)

    def fit(self, X):
        cp = time.time()
        #Calculate cosine similarity
        ISM = self.distance.compute(X) 
        print("Computed Item-Item similarity matrix. %s " %(time.time()-cp))
        cp = time.time()
        
        
        URM = check_matrix(self.dataset, 'csr')
        
        print("Converted URM to csc %s " %(time.time()-cp))
        cp = time.time()
        
        ISM = check_matrix(ISM, 'csr')
        print("Converted ISM to csc %s " %(time.time()-cp))
        cp = time.time()
        
        #Filter not targeted tracks. (U x tI). self.target_item_filter has tracks_final.shape[0] as length. 
        print("Nnz: %s" %(URM.nnz))
        
        print(URM.shape)
        print(self.target_item_filter.shape)
        #URM[:,self.target_item_filter.T] = 0
        #ISM[self.target_item_filter,self.target_item_filter.T] = 0
        
        print("Filtered not target tracks. %" %(time.time()-cp))
        print("Nnz: %s" %(URM.nnz))
        
        
        
        #Multiply URM * ISM (U x I * I x I = U x I)
        print(URM.shape)
        print(ISM.shape)
        
        #UIM = URM * ISM
        #print("Computed UIM. %s" %(time.time()-cp))
        #cp = time.time()
        
        

        
        cp = time.time()
        
        
        
        #This U x tI UIM can now be used by the recommender function. 
        
        self.W_sparse = UIM

    def fit_new(self, X, noise = 0.1):
        
        ## GET ISM MATRIX (I X I)
        cp = time.time()
        #Calculate cosine similarity
        print("Lets compute distance.")
        ISM = self.distance.compute(X) 
        print("Computed Item-Item similarity matrix. %s " %(time.time()-cp))
        cp = time.time()
        
        ##GET URM (U X I)
        
        URM = self.dataset
        
        ## GET item_ids (1 x I)
        
        #self.item_ids
        
        ## FILTER item_ids INTO target_item_ids (1 x tI)
        
        self.target_item_ids = track_ids[self.target_item_filter]
        print(URM.nnz)
        print(ISM.nnz)
        
        ## FILTER TARGETED TRACKS
        #Maybe this is not working as expected - are we filtering the right tracks? 
        
        ISM = ISM[:,self.target_item_filter]
        print("Filtered targeted tracks in ISM. %s " %(time.time()-cp))
        cp = time.time()
        
        #self.ISM = sps.csr_matrix(self.ISM)
        
        cp = time.time()  
        print(URM.nnz)
        #ISM = sps.csr_matrix(ISM)
        print(ISM.nnz)
        
        ## CONVERT URM TO CSR
        URM = check_matrix(URM, 'csr')
        print("Checked URM csr %s " %(time.time()-cp))
        cp = time.time()
        
        ##Print dimension
        print(URM.shape)
        print(ISM.shape)
        
        ## MULTIPLY URM (U x I) * ISM (I x I)
        UIM = URM.dot(ISM)
        print("Computed URM * ISM %s " %(time.time()-cp))
        cp = time.time()
        

        
        ## MAKE NOT SPARSE
        #UIM_dense = UIM.todense()
        
        ## FILTER UIM into (U x tI) (not needed since I already filtered!)
        #UIM_dense = UIM_dense[:,self.target_item_filter]
        
        ## THIS IS OUR FITTED MODEL
        self.UIM = UIM
        
        return self.UIM

        
    def recommend_new(self, user_id, at = 5):
        ## GET USER_INDEX
        user_index = playlist_to_index[user_id]
        
        # Convert to np.array (why wasn't it before?!)
        self.target_item_ids = np.array(self.target_item_ids)
        
        ## GET ROW CORRESPONDING TO USER (1 x tI)
        user_weights = self.UIM[user_index,:].toarray()
             
        ## ARGSORT BASED ON AXIS = 0, GET [1,0:at]
        top_indexes = np.argsort(user_weights)#[-at:]
        top_k_indexes = top_indexes[0, -at:]

        ## Translate to indexes
        recommendations = self.target_item_ids[top_k_indexes]
        
        ## RETURN RECOMMENDATIONS
        return(recommendations)
    
    def recommend_dev(self, user_id, at = 5):
        print("Recommend %s items for user %s!" %(at, user_id))
        ## GET USER_INDEX
        user_index = playlist_to_index[user_id]
        
        # Convert to np.array (why wasn't it before?!)
        self.target_item_ids = np.array(self.target_item_ids)
        
        ## GET ROW CORRESPONDING TO USER (1 x tI)
        user_weights = self.UIM[user_index,:].toarray()
             
        ## ARGSORT BASED ON AXIS = 0, GET [1,0:at]
        top_indexes = np.argsort(user_weights)#[-at:]
        print(top_indexes.shape)
        top_k_indexes = top_indexes[0, -at:]
        print(top_k_indexes.shape)

        ## Translate to indexes
        recommendations = self.target_item_ids[top_k_indexes]
        
        ## RETURN RECOMMENDATIONS
        return(recommendations)
        
        
        

    def recommend(self, user_id, at=None, exclude_seen=True):
        # compute the scores using the dot product
        user_profile = self.dataset[user_id]
        print("User profile: %s" %(user_profile))
        scores = user_profile.dot(self.W_sparse).toarray().ravel()
        print("Scores: %s" %(scores))
        # rank items
        ranking = scores.argsort()[::-1]
        if exclude_seen:
            ranking = self._filter_seen(user_id, ranking)
        
        print("Ranking: %s" %(ranking))
        
        export = [0,0,0,0,0]
        for i in range(5):
            t_id = track_to_id[ranking[i]]
            export[i] = t_id
            
        return export
    def _filter_seen(self, user_id, ranking):
        user_profile = self.dataset[user_id]
        seen = user_profile.indices
        unseen_mask = np.in1d(ranking, seen, assume_unique=True, invert=True)
        return ranking[unseen_mask]

print("asd")

asd


In [206]:
##TESTING


user_weights = [1,2,4,3,7,6,1]
target_item_ids = np.array([1,2,4,3,7,6,1])
at = 3
## ARGSORT BASED ON AXIS = 0, GET [1,0:at]
top_indexes = np.argsort(user_weights)[-at:]
print(top_indexes)
## Translate to indexes
recommendations = target_item_ids[top_indexes]
print(recommendations)

[2 5 4]
[4 6 7]


In [207]:
target_item_filter = get_target_item_filter(tracks_final.shape[0])
print(target_item_filter.shape)
print(track_ids.shape)
target_item_ids = track_ids[target_item_filter]

#print(target_item_ids)

Created filter preserving 32195 out of 100000 
(100000,)
(100000,)


In [208]:
def check_matrix(X, format='csc', dtype=np.float32):
    if format == 'csc' and not isinstance(X, sps.csc_matrix):
        return X.tocsc().astype(dtype)
    elif format == 'csr' and not isinstance(X, sps.csr_matrix):
        return X.tocsr().astype(dtype)
    elif format == 'coo' and not isinstance(X, sps.coo_matrix):
        return X.tocoo().astype(dtype)
    elif format == 'dok' and not isinstance(X, sps.dok_matrix):
        return X.todok().astype(dtype)
    elif format == 'bsr' and not isinstance(X, sps.bsr_matrix):
        return X.tobsr().astype(dtype)
    elif format == 'dia' and not isinstance(X, sps.dia_matrix):
        return X.todia().astype(dtype)
    elif format == 'lil' and not isinstance(X, sps.lil_matrix):
        return X.tolil().astype(dtype)
    else:
        return X.astype(dtype)

In [209]:
import scipy
from sklearn.metrics.pairwise import cosine_similarity

class ISimilarity(object):
    """Abstract interface for the similarity metrics"""

    def __init__(self, shrinkage=10):
        self.shrinkage = shrinkage

    def compute(self, X):
        pass


class Cosine(ISimilarity):
    def compute(self, X):
        # convert to csc matrix for faster column-wise operations
        X = check_matrix(X, 'csc', dtype=np.float32)
        print("Converted to csc.")
        # 1) normalize the columns in X
        # compute the column-wise norm
        # NOTE: this is slightly inefficient. We must copy X to compute the column norms.
        # A faster solution is to  normalize the matrix inplace with a Cython function.
        Xsq = X.copy()
        Xsq.data **= 2
        norm = np.sqrt(Xsq.sum(axis=0))
        norm = np.asarray(norm).ravel()
        norm += 1e-6
        # compute the number of non-zeros in each column
        # NOTE: this works only if X is instance of sparse.csc_matrix
        col_nnz = np.diff(X.indptr)
        # then normalize the values in each column
        X.data /= np.repeat(norm, col_nnz)
        print("Normalized")

        # 2) compute the cosine similarity using the dot-product
        dist = X * X.T
        print("Computed")
        
        # zero out diagonal values
        dist = dist - sps.dia_matrix((dist.diagonal()[scipy.newaxis, :], [0]), shape=dist.shape)
        print("Removed diagonal")
        
        # and apply the shrinkage
        if self.shrinkage > 0:
            dist = self.apply_shrinkage(X, dist)
            print("Applied shrinkage")    
        
        return dist

    def apply_shrinkage(self, X, dist):
        # create an "indicator" version of X (i.e. replace values in X with ones)
        X_ind = X.copy()
        X_ind.data = np.ones_like(X_ind.data)
        # compute the co-rated counts
        co_counts = X_ind * X_ind.T
        # remove the diagonal
        co_counts = co_counts - sps.dia_matrix((co_counts.diagonal()[scipy.newaxis, :], [0]), shape=co_counts.shape)
        # compute the shrinkage factor as co_counts_ij / (co_counts_ij + shrinkage)
        # then multiply dist with it
        co_counts_shrink = co_counts.copy()
        co_counts_shrink.data += self.shrinkage
        co_counts.data /= co_counts_shrink.data
        dist.data *= co_counts.data
        return dist
    
    def remove_noise(self, X, noise):
        X = check_matrix(X, 'csc', dtype=np.float32)
        i = 0
        for row in X:
            r = row
            row[row > noise] = 1
            row[row <= noise] = 0

            X[i,:] = r[row]
            i += 1
        return X


In [210]:
def kkeep_k_largest(X, k):
    
    M = X.todense()
    for row in M: 
        top_k_idx = np.argsort(row)
        print(row)
        print(row[0,top_k_idx[0,-k]])
        
    
    
a = sps.csr_matrix(np.random.rand(3,3))
print(kkeep_k_largest(a,2))

[[ 0.64419709  0.48003901  0.67558476]]
0.644197087836
[[ 0.37399985  0.04801194  0.75656208]]
0.373999853518
[[ 0.57302005  0.36191736  0.4023458 ]]
0.402345796465
None


In [211]:
#TODO: Can we create a "better" URM by matrix factorization? 

In [212]:
#This is the main script! 


#1. Fitting the model. 

#If export is true, the recommendations will be written to file. 
#If false, evaluation method can be used. 
export = True

if export:
    train_rate = 1
else:
    train_rate = 0.8
print("Running with train_rate %s" %(train_rate))

URM_train, URM_test = build_URM(train_rate)

import time
starttime = time.time()
rec = Recommender(URM=URM_train, target_items = target_tracks, item_ids = track_ids, shrinkage=30.0)
#ICM_idf = ICM_add_IDF(ICM_all)
ISM = rec.fit_new(ICM_all) ##Saving outside for quicker restarts. 
#rec.fit_bad(ICM_all, k = 2000)
print("Fitted in %s seconds" %(time.time()-starttime))


Running with train_rate 1
Translated ids to indexes.
Built URM_train with shape 57560,100000
Total datapoints: 1040522. Expected: 1040522
(57560, 100000)
(10, 10)
Created filter preserving 32195 out of 100000 
Lets compute distance.
Converted to csc.
Normalized
Computed
Removed diagonal
Applied shrinkage
Computed Item-Item similarity matrix. 339.1293377876282 
1040522
1731644528
Filtered targeted tracks in ISM. 56.400501012802124 
1040522
608798909
Checked URM csr 0.09475302696228027 
(57560, 100000)
(100000, 32195)
Computed URM * ISM 75.79554510116577 
Fitted in 472.06158900260925 seconds


In [213]:
#2. Creating recommendations. 
zeros = np.zeros((target_playlists.size, 6), dtype = int)
recommendations = pd.DataFrame(zeros)
recommendations.columns = ['playlist_id', 1, 2, 3, 4, 5]

counter = 0
starttime = time.time()
for playlist_id in target_playlists['playlist_id']:

    if counter % 1000 == 0: 
        print ("%s out of 10000 playlists, %s sec." %(counter, time.time()-starttime))

    playlist_id_translated = playlist_to_index[int(playlist_id)]
    recommendations.iloc[counter, 1:6] = rec.recommend_new(playlist_id, 5)
    recommendations.iloc[counter, 0] = playlist_id
    counter += 1

if export:
    filename = "recommendations_6/11_"
    np.savetxt("output/recommendations_20nov_4.csv",recommendations, fmt = '%s,%s %s %s %s %s', header = "playlist_id,track_ids", newline = "\n")
    print("Saved to file: ")

#print(recommendations)
print("Done")

0 out of 10000 playlists, 0.002023935317993164 sec.
1000 out of 10000 playlists, 2.344505786895752 sec.
2000 out of 10000 playlists, 4.498498916625977 sec.
3000 out of 10000 playlists, 6.7699198722839355 sec.
4000 out of 10000 playlists, 9.017293930053711 sec.
5000 out of 10000 playlists, 11.37050175666809 sec.
6000 out of 10000 playlists, 13.708664894104004 sec.
7000 out of 10000 playlists, 16.11584782600403 sec.
8000 out of 10000 playlists, 18.46193289756775 sec.
9000 out of 10000 playlists, 20.63483476638794 sec.
Saved to file: 
Done


In [99]:
# 3. Want to evaluate? 
print(URM_test.shape)
print(URM_train.shape)

evaluate_algorithm(URM_test, recommendations)

(57556, 74541)
(57560, 74542)
User 0 of 10000, 0 sec.
User 500 of 10000, 0 sec.
User 1000 of 10000, 1 sec.
User 1500 of 10000, 1 sec.
User 2000 of 10000, 1 sec.
User 2500 of 10000, 1 sec.
User 3000 of 10000, 1 sec.
User 3500 of 10000, 2 sec.
User 4000 of 10000, 2 sec.
User 4500 of 10000, 2 sec.
User 5000 of 10000, 2 sec.


IndexError: index (57559) out of range

In [214]:
# Does the recommender rec just targeted tracks? 
def test_all_rec_in_target(recommendations):
    tt = target_tracks.values
    recommendations = recommendations.as_matrix()
    notcount = 0
    count = 0
    for row in recommendations: 
        for item in row[1:6]: 
            count += 1
            if item not in tt: 
                notcount += 1
                #print("Rec not in target! %s" %item)
    print("%s out of %s were not in the target." %(notcount, count))
    
test_all_rec_in_target(recommendations)

0 out of 50000 were not in the target.


In [None]:
## TESTING THE REC FUNCTION - SHOULD WORK
#Fitted in 172.8 seconds

rec_dev = Recommender(URM=URM_train, target_items = target_tracks, item_ids = track_ids, shrinkage=0.0)
rec_dev.UIM = rec.UIM
rec_dev.target_item_ids = rec.target_item_ids

zeros = np.zeros((1, 6), dtype = int)
recommendations = pd.DataFrame(zeros)
recommendations.columns = ['playlist_id', 1, 2, 3, 4, 5]
recommendations.iloc[counter, 1:6] = rec_dev.recommend_dev(playlist_to_id[30680], 5)
recommendations.iloc[counter, 0] = playlist_to_id[30680]

print(recommendations)



In [None]:
import time
starttime = time.time()
rec = Recommender(URM=URM_train, shrinkage=0.0)
rec.fit(ICM_all)
print("Done in %s seconds" %(time.time()-starttime))

In [31]:
def ICM_add_IDF(ICM): 
    num_tot_items = ICM_all.shape[0]

    # let's count how many items have a certain feature
    items_per_feature = (ICM_all > 0).sum(axis=0)

    IDF = np.array(np.log(num_tot_items / items_per_feature))[0]

    print(ICM_all.shape)
    print(IDF.shape)
    ICM_idf = sps.csr_matrix(ICM_all, dtype=np.float64)
    # compute the number of non-zeros in each col
    # NOTE: this works only if X is instance of sparse.csc_matrix
    col_nnz = np.diff(check_matrix(ICM_idf, 'csc').indptr)
    print(col_nnz.shape)
    print(ICM_idf.shape)
    print(IDF.shape)
    # then normalize the values in each col
    ICM_idf.data *= np.repeat(IDF, col_nnz)
    return ICM_idf

In [None]:
rec_idf = BasicItemKNNRecommender(URM=URM_train, shrinkage=0.0, k=50)
rec_idf.fit(ICM_idf)

In [None]:
evaluate_algorithm(URM_test, rec)

In [None]:
def save_to_file():
    #Saves the recommendations dataframe to the .csv-file. 
    np.savetxt("output/recommendations_more_content.csv",recommendations, fmt = '%s,%s %s %s %s %s', header = "playlist_id,track_ids", newline = "\n")
    
    
def test():
    #Do something
    print("Result: ")
    pass


save_to_file()
print(recommendations.head)