In [12]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.sparse as sps
import time
%matplotlib inline

#train_final.csv - the training set of interactions
train_final = pd.read_csv('input/train_final.csv', delimiter = "\t");

#tracks_final.csv - supplementary information about the items
tracks_final = pd.read_csv('input/tracks_final.csv', delimiter = "\t");

#playlists_final.csv - supplementary information about the users
playlists_final = pd.read_csv('input/playlists_final.csv', delimiter = "\t");

#target_playlists.csv - the set of target playlists that will receive recommendations
target_playlists = pd.read_csv('input/target_playlists.csv');

#target_tracks.csv - the set of target items (tracks) to be recommended
target_tracks = pd.read_csv('input/target_tracks.csv');
print("Loaded")

Loaded


In [13]:
albums = {}
for album in tracks_final['album']:
    album = album.strip('[ ]')
    if album != None and album != "None" and len(album) > 0: #None should not be considered content
        albums[album] = 1

print(len(albums))


27604


In [14]:
tracks_final['tags'].head()

content_to_index = {}
content_to_id = {}
content_counter = 0

#Lets translate the tags to indexes.
for row in tracks_final['tags']:
    tags = row.strip('[ ]').split(', ')
    for tag in tags:
        if len(tag) > 0: 
            tag = "ta"+tag
            if not(tag in content_to_index):
                content_to_index[tag] = content_counter
                content_to_id[content_counter] = tag
                content_counter += 1;

#Lets translate album into indexes
albumcount = 0 # 27604
for album in tracks_final['album']:
    album = album.strip('[ ]')
    if album != None and album != "None" and len(album) > 0: #None should not be considered content
        album = "al"+album
        if album == "alNone":
            print(album)
        if not(album in content_to_index):
            content_to_index[album] = content_counter
            content_to_id[content_counter] = album
            content_counter += 1
            albumcount += 1
#Lets translate artist_id into indexes 
artistcount = 0 #17537
for artist in tracks_final['artist_id']:
    artist = str(artist)
    if artist != None and artist != "None" and len(artist) > 0: #None should not be considered content
        artist = "ar"+artist
        if not(artist in content_to_index):
            content_to_index[artist] = content_counter
            content_to_id[content_counter] = artist
            content_counter += 1
            artistcount += 1
            
print(len(content_to_index))
print("%s albums. 27604 expected." %albumcount)
print("%s artists. 17537 expected." %artistcount)

77040
27604 albums. 27604 expected.
17536 artists. 17537 expected.


In [15]:
#If we translate each track_id to a track_index which will serve as matrix index, we can save a lot of time. 


#We need a way to get from track_id to index in O(1).
#Let's create a dictionary

track_to_id = {}
track_to_index = {}
track_ids = tracks_final['track_id']

counter = 0;
for track_id in tracks_final['track_id']:
    track_id = int(track_id)
    track_to_index[track_id] = counter
    track_to_id[counter] = track_id
    counter += 1;
    
#and a way to get from playlist_id to index in O(1)


playlist_to_index = {}
playlist_to_id = {}
counter = 0; 
for playlist_id in playlists_final['playlist_id']:
    playlist_id = int(playlist_id)
    playlist_to_index[playlist_id] = counter
    playlist_to_id[counter] = playlist_id
    counter += 1;
    
print("We have {} playlists with {} unique tracks with {} unique content types. ".format(len(playlist_to_index), len(track_to_index), len(content_to_index)))

We have 57561 playlists with 100000 unique tracks with 77040 unique content types. 


In [16]:
#So let's fill the ICM with our data.
import math

def build_ICM():
    
    no_interactions = train_final.shape[0]
    
    tracks_matrix = tracks_final.as_matrix()
    rows = np.zeros((no_interactions,), dtype = int)
    cols = np.zeros((no_interactions,), dtype = int)
    val = np.zeros((no_interactions,), dtype = int)
    #val[i] = value of row[i] col[i]
    #val = []
    counter = 0
    starttime = time.time()
    lasttime = starttime
    trackno = 0
    addedalbums = {} #for testing
    addedartists = {} # for testing
    for track in tracks_matrix: 
        track_id, artist_id, duration, playcount, album, tags = np.split(track, 6)

        #Get track index
        track_index = track_to_index[int(track_id[0])]

        
        #add artist
        
        artist_index = content_to_index["ar"+str(artist_id[0])]
        addedartists[artist_index] = 1
        
        rows[counter] = track_index
        cols[counter] = artist_index
        val[counter] = 1
        counter += 1

        #add album
        album = album[0].strip("[ ]")

        if album != None and len(album) > 0 and not album == "None":
            album_index = content_to_index["al"+album]
            addedalbums[album_index] = 1 #testing
            
            rows[counter] = track_index
            cols[counter] = album_index
            val[counter] = 1
            counter += 1

        #add tags
        tags = tags[0].strip('[ ]').split(', ')

        for tag in tags: 
            if len(tag) > 0:
                tag = "ta"+tag
                tag_index = content_to_index[tag]

                rows[counter] = track_index
                cols[counter] = tag_index
                val[counter] = 1
                
                counter+=1
                
        if trackno%5000 == 0:
            print("Track %s of %s. %s s sec." %(trackno, tracks_matrix.shape[0], round(time.time()-starttime, 2)))  
        trackno += 1
    
    rows = rows[:counter]
    cols = cols[:counter]
    val = val[:counter]
    
    print(rows[counter:])
    
    print(cols[counter:])
    
    print(val[counter:])
    #val = np.ones(rows.shape, dtype = int)

    #Build ICM matrix. 
    ICM_all = sps.coo_matrix((val, (rows, cols)), dtype = int)
    
    print("Built ICM matrix with %s content values." %(val.shape[0]))
    
    print("%s albums. 27604 expected." %len(addedalbums))
    print("%s artists. 17537 expected." %len(addedartists))
    
    return ICM_all


#Build new ICM
ICM_all = build_ICM()
print("Done!")

Track 0 of 100000. 0.0 s sec.
Track 5000 of 100000. 0.12 s sec.
Track 10000 of 100000. 0.24 s sec.
Track 15000 of 100000. 0.36 s sec.
Track 20000 of 100000. 0.49 s sec.
Track 25000 of 100000. 0.6 s sec.
Track 30000 of 100000. 0.72 s sec.
Track 35000 of 100000. 0.83 s sec.
Track 40000 of 100000. 0.94 s sec.
Track 45000 of 100000. 1.05 s sec.
Track 50000 of 100000. 1.16 s sec.
Track 55000 of 100000. 1.28 s sec.
Track 60000 of 100000. 1.4 s sec.
Track 65000 of 100000. 1.52 s sec.
Track 70000 of 100000. 1.63 s sec.
Track 75000 of 100000. 1.74 s sec.
Track 80000 of 100000. 1.86 s sec.
Track 85000 of 100000. 1.97 s sec.
Track 90000 of 100000. 2.09 s sec.
Track 95000 of 100000. 2.2 s sec.
[]
[]
[]
Built ICM matrix with 656745 content values.
27604 albums. 27604 expected.
17536 artists. 17537 expected.
Done!


In [17]:
def get_target_item_filter(indices):
    target_filter = np.zeros((indices), dtype = bool)
    for track in target_tracks.values:
        track_id = track[0]
        track_index = track_to_index[track_id]
        target_filter[track_index] = True
    print("Created filter preserving %s out of %s " %(np.count_nonzero(target_filter),target_filter.shape[0]))
    return target_filter

In [18]:
def build_URM(train_test_split = 0.80):
    #Builds urm 
    
    #train_test_split = 1
    
    numInteractions = train_final.shape[0]

    train_mask = np.random.choice(a = [True,False], size = numInteractions, p = [train_test_split, 1-train_test_split])
    
    playlistList = train_final['playlist_id'].values
    itemList = train_final['track_id'].values

    #Translate ids
    playlistList_translated = np.zeros(playlistList.shape)
    itemList_translated = np.zeros(itemList.shape)
    ratingList = np.ones((playlistList.shape), int)
    
    tru = train_mask[train_mask == True].shape[0]
    fal = (train_mask[train_mask == False].shape[0])
    
    print("True: %s. False: %s. Tot: %s" %(tru, fal, (tru+fal)))

    
    for i in range(train_final.shape[0]):
        playlistList_translated[i] = playlist_to_index[playlistList[i]]
        itemList_translated[i] = track_to_index[itemList[i]]
    #print("Translated ids to indexes.")
    
    #Build URM matrix. 
    URM_train = sps.coo_matrix((ratingList[train_mask], (playlistList_translated[train_mask], itemList_translated[train_mask])))
    URM_train = URM_train.tocsr()
    #print("Built URM_train with shape %s,%s" %(URM_train.shape[0],URM_train.shape[1]))
    
    if train_test_split < 1: 
        #Build URM_test
        test_mask = np.logical_not(train_mask)
        URM_test = sps.coo_matrix((ratingList[test_mask], (playlistList_translated[test_mask], itemList_translated[test_mask])))
        URM_test = URM_test.tocsr()
        print("Built URM_test")
        testsize = (test_mask[test_mask == True].shape[0])

    else: 
        URM_test = sps.csc_matrix((10, 10), dtype=np.int8)
        testsize = 0
    
    
    trainsize = train_mask[train_mask == True].shape[0]
    totsize = trainsize + testsize
    print("Total datapoints: %s. Expected: %s" %(totsize,numInteractions))

    
    print(URM_train.shape)
    print(URM_test.shape)
    
    return URM_train, URM_test

URM_train, URM_test = build_URM(0.8)

#Problem: The number of true/false values is not consistent.. Gives problems when testing. 

True: 832101. False: 208421. Tot: 1040522
Built URM_test
Total datapoints: 1040522. Expected: 1040522
(57560, 100000)
(57560, 99999)


In [20]:
URM_train, URM_test = build_URM(0.8)
print(URM_train.nnz)

True: 832263. False: 208259. Tot: 1040522
Built URM_test
Total datapoints: 1040522. Expected: 1040522
(57560, 100000)
(57560, 100000)
832263


In [None]:
import time
class Recommender(object):
    def __init__(self, URM, target_items, item_ids, k=50, shrinkage=100, similarity='cosine', filter_method = 'content', topK = 100):
        self.dataset = URM
        self.target_items = target_items
        self.target_item_filter = get_target_item_filter(tracks_final.shape[0])
        self.item_ids = item_ids
        self.k = k
        self.shrinkage = shrinkage
        self.similarity_name = similarity
        self.filter_method = filter_method
        self.topK = topK
        
        self.UIM = None
        
        if similarity == 'cosine':
            self.distance = Cosine(shrinkage=self.shrinkage)
        elif similarity == 'pearson':
            self.distance = Pearson(shrinkage=self.shrinkage)
        elif similarity == 'adj-cosine':
            self.distance = AdjustedCosine(shrinkage=self.shrinkage)
        else:
            raise NotImplementedError('Distance {} not implemented'.format(similarity))

    def __str__(self):
        return "Recommender(similarity={},k={},shrinkage={})".format(self.similarity_name, self.k, self.shrinkage)

    
    def fit_new(self, X, noise = 0.1, CF_ratio = 0.5):
        ## GET ISM MATRIX (I X I)
        cp = time.time()
        #Calculate cosine similarity
        print("Using %s filtering with TopK = %s to compute distance." %(self.filter_method, self.topK))
        
        cosine_cython = Cosine_Similarity(URM_train, TopK=self.topK)
        ISM_cf = cosine_cython.compute_similarity()
        
        print("Computed collaborative similarity matrix. %s " %(time.time()-cp))
        cp = time.time()
        
        ISM_cont = self.distance.compute(X)
        print("Computed content based similarity matrix. %s " %(time.time()-cp))
        cp = time.time()
        
        w1 = CF_ratio
        w2 = 1-CF_ratio
        ISM = w1 * ISM_cf + w2 * ISM_cont
        print("Combined similarity matrices with ratio %s. %s " %(CF_ratio, time.time()-cp))
        cp = time.time()
        
        ##GET URM (U X I)
        
        URM = self.dataset
        
        ## GET item_ids (1 x I)
        
        #self.item_ids
        
        ## FILTER item_ids INTO target_item_ids (1 x tI)
        
        self.target_item_ids = track_ids[self.target_item_filter]
        print(URM.nnz)
        print(ISM.nnz)
        
        ## FILTER TARGETED TRACKS
        #Maybe this is not working as expected - are we filtering the right tracks? 
        
        ISM = ISM[:,self.target_item_filter]
        print("Filtered targeted tracks in ISM. %s " %(time.time()-cp))
        cp = time.time()
        
        #self.ISM = sps.csr_matrix(self.ISM)
        
        cp = time.time()  
        print(URM.nnz)
        #ISM = sps.csr_matrix(ISM)
        print(ISM.nnz)
        
        ## CONVERT URM TO CSR
        URM = check_matrix(URM, 'csr')
        print("Checked URM csr %s " %(time.time()-cp))
        cp = time.time()
        
        ##Print dimension
        print(URM.shape)
        print(ISM.shape)
        
        ## MULTIPLY URM (U x I) * ISM (I x I)
        UIM = URM.dot(ISM)
        print("Computed URM * ISM %s " %(time.time()-cp))
        cp = time.time()
        

        
        ## MAKE NOT SPARSE
        #UIM_dense = UIM.todense()
        
        ## FILTER UIM into (U x tI) (not needed since I already filtered!)
        #UIM_dense = UIM_dense[:,self.target_item_filter]
        
        ## THIS IS OUR FITTED MODEL
        self.UIM = UIM
        
        return self.UIM

        
    def recommend_new(self, user_id, at = 5):
        ## GET USER_INDEX
        user_index = playlist_to_index[user_id]
        
        # Convert to np.array (why wasn't it before?!)
        self.target_item_ids = np.array(self.target_item_ids)
        
        ## GET ROW CORRESPONDING TO USER (1 x tI)
        user_weights = self.UIM[user_index,:].toarray()
             
        ## ARGSORT BASED ON AXIS = 0, GET [1,0:at]
        top_indexes = np.argsort(user_weights)#[-at:]
        top_k_indexes = top_indexes[0, -at:]

        ## Translate to indexes
        recommendations = self.target_item_ids[top_k_indexes]
        
        ## RETURN RECOMMENDATIONS
        return(recommendations)
    
    def recommend_dev(self, user_id, at = 5):
        print("Recommend %s items for user %s!" %(at, user_id))
        ## GET USER_INDEX
        user_index = playlist_to_index[user_id]
        
        # Convert to np.array (why wasn't it before?!)
        self.target_item_ids = np.array(self.target_item_ids)
        
        ## GET ROW CORRESPONDING TO USER (1 x tI)
        user_weights = self.UIM[user_index,:].toarray()
             
        ## ARGSORT BASED ON AXIS = 0, GET [1,0:at]
        top_indexes = np.argsort(user_weights)#[-at:]
        print(top_indexes.shape)
        top_k_indexes = top_indexes[0, -at:]
        print(top_k_indexes.shape)

        ## Translate to indexes
        recommendations = self.target_item_ids[top_k_indexes]
        
        ## RETURN RECOMMENDATIONS
        return(recommendations)
        
        
        

    def recommend(self, user_id, at=None, exclude_seen=True):
        # compute the scores using the dot product
        user_profile = self.dataset[user_id]
        print("User profile: %s" %(user_profile))
        scores = user_profile.dot(self.W_sparse).toarray().ravel()
        print("Scores: %s" %(scores))
        # rank items
        ranking = scores.argsort()[::-1]
        if exclude_seen:
            ranking = self._filter_seen(user_id, ranking)
        
        print("Ranking: %s" %(ranking))
        
        export = [0,0,0,0,0]
        for i in range(5):
            t_id = track_to_id[ranking[i]]
            export[i] = t_id
            
        return export
    def _filter_seen(self, user_id, ranking):
        user_profile = self.dataset[user_id]
        seen = user_profile.indices
        unseen_mask = np.in1d(ranking, seen, assume_unique=True, invert=True)
        return ranking[unseen_mask]

print("asd")