In [3]:
# Do all relevant imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.sparse as sps
import time
%matplotlib inline
print("Libs imported.")

Libs imported.


In [9]:
import matplotlib.pyplot as plt
def plot_coo_matrix(m):
    if not isinstance(m, sps.coo_matrix):
        m = sps.coo_matrix(m)
    fig = plt.figure()
    ax = fig.add_subplot(111, facecolor='green')
    ax.plot(m.col, m.row, 's', color='black', ms=0.005)
    ax.set_xlim(0, m.shape[1])
    ax.set_ylim(0, m.shape[0])
    ax.set_aspect('equal')
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.invert_yaxis()
    ax.set_aspect('equal')
    ax.set_xticks([])
    ax.set_yticks([])
    return ax

def check_matrix(X, format='csc', dtype=np.float32):
    if format == 'csc' and not isinstance(X, sps.csc_matrix):
        return X.tocsc().astype(dtype)
    elif format == 'csr' and not isinstance(X, sps.csr_matrix):
        return X.tocsr().astype(dtype)
    elif format == 'coo' and not isinstance(X, sps.coo_matrix):
        return X.tocoo().astype(dtype)
    elif format == 'dok' and not isinstance(X, sps.dok_matrix):
        return X.todok().astype(dtype)
    elif format == 'bsr' and not isinstance(X, sps.bsr_matrix):
        return X.tobsr().astype(dtype)
    elif format == 'dia' and not isinstance(X, sps.dia_matrix):
        return X.todia().astype(dtype)
    elif format == 'lil' and not isinstance(X, sps.lil_matrix):
        return X.tolil().astype(dtype)
    else:
        return X.astype(dtype)

In [10]:
class Translator(object):
    def __init__(self, tracks_final = [], playlists_final = []): 
        self.track_to_id = {}
        self.track_to_idx = {}
        self.content_to_id = {}
        self.content_to_idx = {}
        self.playlist_to_id = {}
        self.playlist_to_idx = {}     
        
    def create_content_translations(self, tracks_final):
        c_count = 0
        c_idxcount = 0
        
        t_count = 0
        t_idxcount = 0
        for track_id, artist_id, duration, playcount, album_id, tags in tracks_final.values:

            if (track_id > 0): 
                if track_id not in self.track_to_idx: 
                    self.track_to_idx[track_id] = t_idxcount
                    self.track_to_id[t_idxcount] = track_id
                    t_idxcount += 1
            t_count += 1
            
            
            
            if (artist_id > 0): 
                artist = 'ar'+str(artist_id)
                if artist not in self.content_to_idx: 
                    self.content_to_idx[artist] = c_idxcount
                    self.content_to_id[c_idxcount] = artist
                    c_idxcount += 1
            c_count += 1

                    
                    
            album_id.strip('[ ]')
            if (len(album_id) > 0 and album_id is not None and album_id != 'None'): 
                album = 'al'+str(album_id)
                if album not in self.content_to_idx: 
                    self.content_to_idx[album] = c_idxcount
                    self.content_to_id[c_idxcount] = album
                    c_idxcount += 1
            c_count += 1
            
            tags = tags.strip('[ ]').split(', ')
            for tag in tags:
                if (len(tag) > 0 and tag is not None and tag != 'None'): 
                    tag = 'ta'+str(tag)
                    if tag not in self.content_to_idx: 
                        self.content_to_idx[tag] = c_idxcount
                        self.content_to_id[c_idxcount] = tag
                        c_idxcount += 1
                c_count += 1
        try:
            #content_to_idx['alNone']
            #content_to_idx['taNone']
            #content_to_idx['arNone']
            #content_to_idx['ar']
            #content_to_idx['ta']
            #content_to_idx['al']
            pass
            
        except:
            print("Test passed!")

            
        print("Created %s indexes for %s tracks."%(len(self.track_to_idx), tracks_final.shape[0]))
        print("Created %s indexes for %s contents."%(len(self.content_to_idx), c_count))
        print("Total content size: %s" %c_idxcount)

    def create_playlist_translations(self, playlists_final):
        # It would be easy to also create translations for users in this method. Choosing not to implement now. 
        p_idxcount = 0
        p_count = 0
        for playlist_id in playlists_final['playlist_id'].values:
            if (playlist_id > 0): 
                if playlist_id not in self.playlist_to_idx: 
                    self.playlist_to_idx[playlist_id] = p_idxcount
                    self.playlist_to_id[p_idxcount] = playlist_id
                    p_idxcount += 1
            p_count += 1
        
        print("Created %s indexes for %s playlists."%(len(self.playlist_to_idx), playlists_final.shape[0]))
        
        
        
        
    #
    def get_track_id(self, idx):
        return self.track_to_id[idx]#
    def get_track_idx(self, id):
        return self.track_to_idx[id]#
    def get_content_id(self, idx):
        return self.content_to_id[idx]#
    def get_content_idx(self, id):
        return self.content_to_idx[id]#
    def get_playlist_id(self, idx):
        return self.playlist_to_id[idx]#
    def get_playlist_idx(self, id):
        return self.playlist_to_idx[id]
T = Translator()

In [12]:
class Data(object): 
    def __init__(self): 
        #train_final.csv - the training set of interactions
        self.train_final = pd.read_csv('input/train_final.csv', delimiter = "\t");

        #tracks_final.csv - supplementary information about the items
        self.tracks_final = pd.read_csv('input/tracks_final.csv', delimiter = "\t");

        #playlists_final.csv - supplementary information about the users
        self.playlists_final = pd.read_csv('input/playlists_final.csv', delimiter = "\t");

        #target_playlists.csv - the set of target playlists that will receive recommendations
        self.target_playlists = pd.read_csv('input/target_playlists.csv');

        #target_tracks.csv - the set of target items (tracks) to be recommended
        self.target_tracks = pd.read_csv('input/target_tracks.csv');
                
        T.create_content_translations(self.tracks_final)
        T.create_playlist_translations(self.playlists_final)
        
    def build_target_filter(self): 
        target_filter = np.ones((self.tracks_final.shape[0],1), dtype = bool)
        
        for track_id in self.target_tracks['track_id'].values: 
            track_idx = T.get_track_idx(track_id)
            target_filter[track_idx] = False
        self.ttf = target_filter
        return target_filter

    def build_URMs(self,k = 4): 
        # Creates a self.URM_train and self.URM_test    
        playlistList = self.train_final['playlist_id'].values
        itemList = self.train_final['track_id'].values

        #Translate ids
        playlistList_translated = np.zeros(playlistList.shape)
        itemList_translated = np.zeros(itemList.shape)
        ratingList = np.ones((playlistList.shape), int)
        filter_train = np.ones((playlistList.shape), bool)
        filter_test = np.zeros((playlistList.shape), bool)
        filter_test[0] = True # little workaround
        playlist_counter = {}
        
        
        for i, p_id in enumerate(playlistList):
            p_idx = T.get_playlist_idx(p_id)
            playlistList_translated[i] = p_idx 
            
            i_idx = T.get_track_idx(itemList[i])
            itemList_translated[i] = i_idx
            
            if p_idx not in playlist_counter:
                playlist_counter[p_idx] = 0
                
            if playlist_counter[p_idx] < k:
                filter_train[i] = False # Removes the rating for this particular rating.
                filter_test[i] = True
                playlist_counter[p_idx] += 1 # When k is reached we will remove no more from this pl.                     

        
        ## Build URM_train. 
        URM_train = sps.coo_matrix((ratingList[filter_train], (playlistList_translated[filter_train], itemList_translated[filter_train])))
        URM_train = URM_train.tocsr()

        ## Build URM_train & URM_test
        URM_test = sps.coo_matrix((ratingList[filter_test], (playlistList_translated[filter_test], itemList_translated[filter_test])))
        URM_test= URM_test.tocsr()
         
        self.URM_train = URM_train
        self.URM_test = URM_test
        
        print(URM_test.nnz)
        print(URM_train.nnz)
        print("%s, %s" %(URM_test.nnz + URM_train.nnz, self.train_final.shape[0]))
        print("Removed %s from %s playlists. %s " %(len(filter_test.nonzero()[0]), len(playlist_counter),len(filter_test.nonzero()[0])/ len(playlist_counter) ))
        print(playlist_counter[45648])
        
    def build_ICM(self):
        i = 0
        track_count = 0
        track_index_list = np.zeros([1000000,])
        content_index_list = np.zeros([1000000,])
        for track_id, artist_id, duration, playcount, album_id, tags in self.tracks_final.values:
            track_index_list[i] = T.get_track_idx(track_id)
            content_index_list[i] = T.get_content_idx('ar'+str(artist_id))
            i += 1
            
            track_index_list[i] = T.get_track_idx(track_id)
            content_index_list[i] = T.get_content_idx('al'+str(album_id))
            i += 1
            
            tags = tags.strip('[ ]').split(', ')
            for tag in tags:
                if (len(tag) > 0 and tag is not None and tag != 'None'): 
                    track_index_list[i] = T.get_track_idx(track_id)
                    content_index_list[i] = T.get_content_idx('ta'+str(tag))
                i += 1
                    
                
            #if track_count % 10000 == 0: 
            #    print("Track %s out of 100k" %track_count)
            #    print(i)
            track_count += 1
        
        self.ICM = sps.coo_matrix((np.ones(track_index_list.shape, int), (track_index_list, content_index_list)))
        self.ICM = self.ICM.tocsr()

        
        print("Built ICM with dimensions: Item (%s) x Content (%s) " %self.ICM.shape)
d = Data()
ttf = d.build_target_filter()
print(ttf.shape)
d.build_ICM()
sps.save_npz("Saved Matrixes/ICM_perfect", d.ICM)
print("Saved ICM!")

Created 100000 indexes for 100000 tracks.
Created 77042 indexes for 686290 contents.
Total content size: 0
Created 57561 indexes for 57561 playlists.
(100000, 1)
Built ICM with dimensions: Item (100000) x Content (77042) 
Saved ICM!


In [None]:
def plot_matrices():
    d = Data()
    d.build_URMs(k = 5)
    d.build_ICM()


    ax = plot_coo_matrix(d.ICM)
    ax.figure.show()

    ax = plot_coo_matrix(d.URM_train)
    ax.figure.show()

    ax = plot_coo_matrix(d.URM_test)
    ax.figure.show()
plot_matrices()

In [7]:
class Recommender(object):
    def __init__(self, ICM, URM_train, URM_test, target_tracks_filter, shrinkage = 30, export = False):
        self.URM_train = URM_train
        self.ICM = ICM
        self.ttf = target_tracks_filter
        self.shrinkage = shrinkage
        self.URM_test = URM_test
        self.export = export

        
    def fit(self): 
        print("Fitting..")
        starttime = time.time()
        # Compute ISM (I x I)
        cp = time.time()
        self.ISM = self.ICM * self.ICM.T
        print("Computed ISM %s sec"%(time.time()-cp))
        cp = time.time()
        
        # zero out diagonal values
        dist = dist - sps.dia_matrix((dist.diagonal()[scipy.newaxis, :], [0]), shape=dist.shape)
        print("Removed diagonal")
        
        # and apply the shrinkage
        if self.shrinkage > 0:
            dist = self.apply_shrinkage(self.ICM, self.ISM)
            print("Applied shrinkage %s sec. "%(time.time()-cp))
            cp = time.time()
        
        
        # Filter: only targeted tracks ISM_target (I x tI)
        
        self.ISM_target = self.ISM[:,self.ttf]
        
        print("Filtered untargeted tracks. %s sec" %(time.time()-cp))
        print("ISM_target: I x tI %s %s. " %ISM_target.shape)
        cp = time.time()
        
        # Compute URM (U x I) x ISM_target (I x tI) = pred (U x tI)
        
        self.URM_pred = URM_train * ISM_target
        print("Computed predictions. %s sec" %(time.time()-cp))
        print("URM_pred: U x tI %s %s. " %ISM_target.shape)
        
        
        # Model is now fitted
    
        print("Model fitted in %s sec."%(time.time-starttime))
    
    
    def apply_shrinkage(self, X, dist):
        # create an "indicator" version of X (i.e. replace values in X with ones)
        X_ind = X.copy()
        X_ind.data = np.ones_like(X_ind.data)
        # compute the co-rated counts
        co_counts = X_ind * X_ind.T
        # remove the diagonal
        co_counts = co_counts - sps.dia_matrix((co_counts.diagonal()[scipy.newaxis, :], [0]), shape=co_counts.shape)
        # compute the shrinkage factor as co_counts_ij / (co_counts_ij + shrinkage)
        # then multiply dist with it
        co_counts_shrink = co_counts.copy()
        co_counts_shrink.data += self.shrinkage
        co_counts.data /= co_counts_shrink.data
        dist.data *= co_counts.data
        return dist
    
    def recommend(self, playlist_idx, k):        

        # Filter: no already seen tracks
        seen = URM_train[playlist_idx,:]
        prob = W[playlist_idx,:][seen == False]
        
        # get indices of k largest values
        
        # translate indices to ids and
            
    def get_recommendations(self): 
        zeros = np.zeros((target_playlists.size, 6), dtype = int)
        recommendations = pd.DataFrame(zeros)
        recommendations.columns = ['playlist_id', 1, 2, 3, 4, 5]
        counter = 0
        starttime = time.time()
        for playlist_id in target_playlists['playlist_id']:

            if counter % 1000 == 0: 
                print ("%s out of 10000 playlists, %s sec." %(counter, time.time()-starttime))

            playlist_idx = T.get_playlist_idx[int(playlist_id)]
            recommendations.iloc[counter, 1:6] = rec.recommend_new(playlist_idx, 5)
            recommendations.iloc[counter, 0] = playlist_id
            counter += 1
            
    def evaluate(self): 
        pass

In [8]:
d = Data()
d.build_URMs()
d.build_ICM()
d.build_target_filter()
r = Recommender(d.ICM, d.URM_train, d.URM_test, d.ttf)




Created 100000 indexes for 100000 tracks.
Created 77042 indexes for 686290 contents.
Total content size: 0
Created 57561 indexes for 57561 playlists.
162965
877557
1040522, 1040522
Removed 162965 from 45649 playlists. 3.5699577208701174 
4
Built ICM with dimensions: Item (100000) x Content (77042) 


In [2]:
r.fit()

NameError: name 'r' is not defined

In [1]:
print("hej")

hej


In [None]:
#Evaluation functions

def precision(recommended_items, relevant_items):
    
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    
    precision_score = np.sum(is_relevant, dtype=np.float32) / len(is_relevant)
    
    return precision_score

def recall(recommended_items, relevant_items):
    
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    
    recall_score = np.sum(is_relevant, dtype=np.float32) / relevant_items.shape[0]
    
    return recall_score

def MAP(recommended_items, relevant_items):
   
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    
    # Cumulative sum: precision at 1, at 2, at 3 ...
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))
    
    map_score = np.sum(p_at_k) / np.min([relevant_items.shape[0], is_relevant.shape[0]])

    return map_score

def evaluate_algorithm(URM_test, recommendations, at=5):
    
    starttime = time.time()
    cumulative_precision = 0.0
    cumulative_recall = 0.0
    cumulative_MAP = 0.0
    
    num_eval = 0
    
    playlists = target_playlists['playlist_id']
    
    for i, playlist_id in enumerate(playlists):
        relevant_items = URM_test[playlist_to_index[playlist_id]].indices
        
        for j, item_id in enumerate(relevant_items):
            relevant_items[j] = track_to_id[item_id]
            
        if i % 500 == 0:
            print("User %d of %d, %d sec." % (i, len(playlists), round(time.time()-starttime)))
            print(relevant_items)
            print(recommendations.iloc[i,1:6])
        
        if len(relevant_items)>0:
            
            recommended_items = recommendations.iloc[i,1:6]
            num_eval+=1

            cumulative_precision += precision(recommended_items, relevant_items)
            cumulative_recall += recall(recommended_items, relevant_items)
            cumulative_MAP += MAP(recommended_items, relevant_items)


    cumulative_precision /= num_eval
    cumulative_recall /= num_eval
    cumulative_MAP /= num_eval
    
    print("Recommender performance is: Precision = {:.4f}, Recall = {:.4f}, MAP = {:.4f}".format(
        cumulative_precision, cumulative_recall, cumulative_MAP))

