# Content based Similarity approach

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import scipy.sparse as sps
import scipy.io as io
import time
import json
from scipy.sparse.linalg import svds
from sklearn import preprocessing

In [13]:
class RecommenderSystem(object):
    
    def __init__(self, interactions_file = '../input/train_final.csv', 
                       target_playlists = '../input/target_playlists.csv', 
                       target_tracks = '../input/target_tracks.csv',
                       meta_track = '../input/tracks_final.csv'):
        # read interactions file
        train_final = pd.read_csv(interactions_file, sep = '\t')
        train_final['interaction'] = 1
        self.df_interactions = train_final.sort_values(['playlist_id', 'track_id'], ascending=[True, True])
        self.numInteractions = train_final.shape[0]
        print("Number of interactions (numInteractions): " + str(self.numInteractions))
        
        # read target playlists which should receive a recommendation
        self.df_target_playlists = pd.read_csv(target_playlists, sep = '\t')
        self.list_target_playlists = list(self.df_target_playlists['playlist_id'])
        print("Size of df_target_playlists: " + str(self.df_target_playlists.shape))
        
        # read target tracks
        self.df_target_tracks = pd.read_csv(target_tracks, sep = '\t')
        self.list_target_tracks = list(self.df_target_tracks['track_id'])
        print("Size of df_target_tracks file: " + str(self.df_target_tracks.shape))
        print("Size of list_target_tracks file: " + str(len(self.df_target_tracks)))
        print("\n")
        
        # separate each column in list
        playlist_id = list(self.df_interactions['playlist_id'])
        track_id = list(self.df_interactions['track_id'])
        interaction = list(self.df_interactions['interaction'])
        
        playlist_id_unique = list(set(playlist_id))
        self.df_playlist_id_unique = pd.DataFrame(playlist_id_unique)
        self.df_playlist_id_unique.reset_index(level=0, inplace=True)
        self.df_playlist_id_unique.columns = ['index_playlist', 'playlist_id']
        
        track_id_unique = list(set(track_id))
        self.df_track_id_unique = pd.DataFrame(track_id_unique)
        self.df_track_id_unique.reset_index(level=0, inplace=True)
        self.df_track_id_unique.columns = ['index_track', 'track_id']
        print("Track_id translated to indexes: ")
        print(self.df_track_id_unique.head())
        print("\n")
        print("Playlist_id translated to indexes: ")
        print(self.df_playlist_id_unique.head())
        print("\n")
        
        self.numPlaylists = len(self.df_playlist_id_unique)
        self.numTracks = len(self.df_track_id_unique)
        print("Number of Playlists: " + str(self.numPlaylists))
        print("Number of Tracks: " + str(self.numTracks))
        print("\n")
        
        self.df_interactions = self.df_interactions.merge(self.df_playlist_id_unique, how='inner', on='playlist_id')
        self.df_interactions = self.df_interactions.merge(self.df_track_id_unique, how='inner', on='track_id')
        self.df_interactions = self.df_interactions.sort_values(['playlist_id', 'track_id'], ascending=[True, True])
        print("Interactions-file with IDs translated to indexes (saved in df_interactions): ")
        print(self.df_interactions.head())
        print("\n")
        
        self.list_index_playlist = np.array(self.df_interactions['index_playlist'])
        self.list_index_track = np.array(self.df_interactions['index_track'])
        self.list_interactions = np.array(self.df_interactions['interaction'])
        
        self.df_tracks = pd.read_csv(meta_track, sep = '\t')
        self.df_tracks = self.df_tracks.merge(self.df_track_id_unique, how='inner', on='track_id')
        self.df_tracks['tags'] = self.df_tracks.tags.apply(json.loads)
        print('Meta information about tracks read (df_tracks): ')
        print(self.df_tracks.head())
        print(self.df_tracks.shape)
        
    def target_structure(self):
        # filter interaction dataframe, to retain only target playlists
        train = self.df_interactions.merge(self.df_target_playlists, how='inner', on='playlist_id')
        
        # aggregate to playlist level and coerce tracks in that playlist to list
        train_agg1 = train.groupby(by='playlist_id').track_id.apply(list).to_frame()
        train_agg1.reset_index(level=0, inplace=True)
        train_agg2 = train.groupby(by='playlist_id').index_track.apply(list).to_frame()
        train_agg2.reset_index(level=0, inplace=True)
        train_agg = train_agg1.merge(train_agg2, how='inner', on='playlist_id')
        self.df_target = train_agg.merge(self.df_playlist_id_unique, how='inner', on='playlist_id')
        self.df_target['recommend'] = np.empty((len(train_agg), 0)).tolist()
        print("Data structure for final prediction was created (df_target): ")
        print(self.df_target.head())
        print(self.df_target.shape)
        
    def interaction_aggregation(self):
        
        agg1 = self.df_interactions.groupby(by='playlist_id').track_id.apply(list).to_frame()
        agg1.reset_index(level=0, inplace=True)
        agg2 = self.df_interactions.groupby(by='playlist_id').index_track.apply(list).to_frame()
        agg2.reset_index(level=0, inplace=True)
        agg3 = self.df_interactions.groupby(by='playlist_id').nunique()
        agg3.reset_index(level=0, inplace=True)
        agg = agg1.merge(agg2, how='inner', on='playlist_id')
        agg = agg.merge(agg3, how='inner', on='playlist_id')
        print(agg[:10])
        
    def create_uim(self, sparse_mode="coo", create_testset = True, split = 0.8):
        if sparse_mode.lower() == "coo" or sparse_mode.lower() == "csr":
            self.UIM = sps.coo_matrix((self.list_interactions, (self.list_index_playlist, self.list_index_track)))
            if create_testset:
                self.split_traintest(train_test_split = split)
            if sparse_mode.lower() == "csr" and create_testset != True:
                self.UIM = self.UIM.tocsr()
            elif sparse_mode.lower() == "csr" and create_testset == True:
                self.UIM = self.UIM.tocsr()
                self.UIM_train = self.UIM_train.tocsr()
                self.UIM_test = self.UIM_test.tocsr()
                
        else:
            raise NotImplementedError('Sparse mode not implemented'.format(sparse_mode))
            
    def split_traintest(self, train_test_split):
        train_mask = np.random.choice([True,False], self.numInteractions, p=[train_test_split, 1-train_test_split])
        test_mask = np.logical_not(train_mask)
        self.UIM_train = sps.coo_matrix((self.list_interactions[train_mask], 
                                        (self.list_index_playlist[train_mask], 
                                         self.list_index_track[train_mask])))
        self.UIM_test = sps.coo_matrix((self.list_interactions, (self.list_index_playlist, self.list_index_track)))
        print("UIM successfully created in csr format.")
        
    def create_icm(self):
        tags_list = []
        for index, row in self.df_tracks.iterrows():
            if len(row['tags']) != 0:
                for i in row['tags']:
                    tags_list.append([row['index_track'], i, 1])
        tags_list = pd.DataFrame(tags_list)
        tags_list.columns = ['index_track', 'tag', 'interaction']
        track_list = list(tags_list['index_track'])
        tag_list = list(tags_list['tag'])
        interaction_list = list(tags_list['interaction'])
        le = preprocessing.LabelEncoder()
        le.fit(tag_list)
        taglist_icm = le.transform(tag_list)
        self.ICM = sps.coo_matrix((interaction_list, (track_list, taglist_icm)))
        self.ICM = self.ICM.tocsr()
        print("ICM successfully created in csr format.")

## Define functions to calculate quality metrics:

In [120]:
def MAP(recommended_items, relevant_items):
   
    is_relevant = np.isin(recommended_items, relevant_items, assume_unique=True)
    
    # Cumulative sum: precision at 1, at 2, at 3 ...
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))
    
    map_score = np.sum(p_at_k) / np.min([relevant_items.shape[0], is_relevant.shape[0]])

    return map_score

def evaluate_algorithm(URM_test, recommender_object, at=5):

    cumulative_MAP = 0.0
    
    num_eval = 0

    start_time = time.time()
    for i, user_id in  enumerate(index_playlist_unique):
        
        if i % 5000 == 0:
            print("User %d of %d" % (i, len(index_playlist_unique)))
            print("--- %s seconds ---" % (time.time() - start_time))

        relevant_items = URM_test[user_id].indices
        
        if len(relevant_items)>0:
            
            recommended_items = recommender_object.recommend(user_id, at=at)
            num_eval+=1

            cumulative_MAP += MAP(recommended_items, relevant_items)
        
    cumulative_MAP /= num_eval
    
    print("Recommender performance is: MAP = {:.4f}".format(cumulative_MAP))

In [121]:
class BasicItemKNNRecommender(RecommenderSystem):
       
    def __str__(self):
        return "ItemKNN(similarity={},k={},shrinkage={})".format(self.similarity_name, self.k, self.shrinkage)
    
    def fit(self, k=50, shrinkage=100, similarity='cosine'):
        self.k = k
        self.shrinkage = shrinkage
        self.similarity_name = similarity
        if similarity == 'cosine':
            self.distance = Cosine(shrinkage=self.shrinkage)
        elif similarity == 'pearson':
            self.distance = Pearson(shrinkage=self.shrinkage)
        elif similarity == 'adj-cosine':
            self.distance = AdjustedCosine(shrinkage=self.shrinkage)
        else:
            raise NotImplementedError('Distance {} not implemented'.format(similarity))
        
        self.create_uim(sparse_mode = 'csr')
        self.create_icm()
        
        # ok
        item_weights = self.distance.compute(self.ICM)
        
        item_weights = check_matrix(item_weights, 'csr') # nearly 10 times faster
        print("Converted to csr")
        
        # for each column, keep only the top-k scored items
        # THIS IS THE SLOW PART, FIND A BETTER SOLUTION        
        values, rows, cols = [], [], []
        nitems = self.UIM_train.shape[1]
        for i in range(nitems):
            if (i % 10000 == 0):
                print("Item %d of %d" % (i, nitems))
                
            this_item_weights = item_weights[i,:].toarray()[0]
            top_k_idx = np.argsort(this_item_weights) [-self.k:]
                        
            values.extend(this_item_weights[top_k_idx])
            rows.extend(np.arange(nitems)[top_k_idx])
            cols.extend(np.ones(self.k) * i)
        self.W_sparse = sps.csc_matrix((values, (rows, cols)), shape=(nitems, nitems), dtype=np.float32)

    def recommend(self, user_id, at=5):
        return 0

def check_matrix(X, format='csc', dtype=np.float32):
    if format == 'csc' and not isinstance(X, sps.csc_matrix):
        return X.tocsc().astype(dtype)
    elif format == 'csr' and not isinstance(X, sps.csr_matrix):
        return X.tocsr().astype(dtype)
    elif format == 'coo' and not isinstance(X, sps.coo_matrix):
        return X.tocoo().astype(dtype)
    elif format == 'dok' and not isinstance(X, sps.dok_matrix):
        return X.todok().astype(dtype)
    elif format == 'bsr' and not isinstance(X, sps.bsr_matrix):
        return X.tobsr().astype(dtype)
    elif format == 'dia' and not isinstance(X, sps.dia_matrix):
        return X.todia().astype(dtype)
    elif format == 'lil' and not isinstance(X, sps.lil_matrix):
        return X.tolil().astype(dtype)
    else:
        return X.astype(dtype)
    
class ISimilarity(object):
    """Abstract interface for the similarity metrics"""

    def __init__(self, shrinkage=10):
        self.shrinkage = shrinkage

    def compute(self, X):
        pass


class Cosine(ISimilarity):
    def compute(self, X):
        # convert to csc matrix for faster column-wise operations
        X = check_matrix(X, 'csc', dtype=np.float32)

        # 1) normalize the columns in X
        # compute the column-wise norm
        # NOTE: this is slightly inefficient. We must copy X to compute the column norms.
        # A faster solution is to  normalize the matrix inplace with a Cython function.
        Xsq = X.copy()
        Xsq.data **= 2
        norm = np.sqrt(Xsq.sum(axis=0))
        norm = np.asarray(norm).ravel()
        norm += 1e-6
        # compute the number of non-zeros in each column
        # NOTE: this works only if X is instance of sparse.csc_matrix
        col_nnz = np.diff(X.indptr)
        # then normalize the values in each column
        X.data /= np.repeat(norm, col_nnz)
        print("Normalized")

        # 2) compute the cosine similarity using the dot-product
        dist = X * X.T
        print("Computed")
        
        # zero out diagonal values
        dist = dist - sps.dia_matrix((dist.diagonal()[sp.newaxis, :], [0]), shape=dist.shape)
        print("Removed diagonal")
        
        # and apply the shrinkage
        if self.shrinkage > 0:
            dist = self.apply_shrinkage(X, dist)
            print("Applied shrinkage")    
        
        return dist

    def apply_shrinkage(self, X, dist):
        # create an "indicator" version of X (i.e. replace values in X with ones)
        X_ind = X.copy()
        X_ind.data = np.ones_like(X_ind.data)
        # compute the co-rated counts
        co_counts = X_ind * X_ind.T
        # remove the diagonal
        co_counts = co_counts - sps.dia_matrix((co_counts.diagonal()[sp.newaxis, :], [0]), shape=co_counts.shape)
        # compute the shrinkage factor as co_counts_ij / (co_counts_ij + shrinkage)
        # then multiply dist with it
        co_counts_shrink = co_counts.copy()
        co_counts_shrink.data += self.shrinkage
        co_counts.data /= co_counts_shrink.data
        dist.data *= co_counts.data
        return dist

In [14]:
test = RecommenderSystem()
test.target_structure()

Number of interactions (numInteractions): 1040522
Size of df_target_playlists: (10000, 1)
Size of df_target_tracks file: (32195, 1)
Size of list_target_tracks file: 32195


Track_id translated to indexes: 
   index_track  track_id
0            0   1048594
1            1   2359314
2            2   1835030
3            3   3670041
4            4   1048604


Playlist_id translated to indexes: 
   index_playlist  playlist_id
0               0     10485762
1               1      5767174
2               2      7077894
3               3     11534344
4               4      1179658


Number of Playlists: 45649
Number of Tracks: 99999


Interactions-file with IDs translated to indexes (saved in df_interactions): 
     playlist_id  track_id  interaction  index_playlist  index_track
0           7569    162463            1            2425        62358
87          7569    421750            1            2425        60999
116         7569    795606            1            2425         3009
125        

In [67]:
blabla = test.df_interactions.merge(test.df_target_playlists, how='inner', on='playlist_id')

agg1 = blabla.groupby(by='playlist_id').track_id.apply(list).to_frame()
agg1.reset_index(level=0, inplace=True)
agg2 = blabla.groupby(by='playlist_id').index_track.apply(list).to_frame()
agg2.reset_index(level=0, inplace=True)
agg3 = blabla.groupby(by='playlist_id').track_id.nunique().to_frame()
agg3.reset_index(level=0, inplace=True)
agg3.columns = ['playlist_id', 'nrTracks']
agg = agg1.merge(agg2, how='inner', on='playlist_id')
agg = agg.merge(agg3, how='inner', on='playlist_id')

agg[agg['nrTracks'] >= 10]


Unnamed: 0,playlist_id,track_id,index_track,nrTracks
0,7614,"[415173, 1384962, 1609224, 1614974, 1714787, 2...","[58038, 27294, 13601, 15634, 53615, 16590, 739...",11
1,7692,"[88210, 266898, 280844, 302730, 384386, 551534...","[32883, 1416, 6262, 15120, 46326, 9696, 41315,...",43
2,7816,"[126414, 245217, 513821, 611201, 767305, 84510...","[47911, 94663, 96789, 32334, 93906, 21551, 971...",30
3,8225,"[13881, 261448, 311923, 500672, 676393, 906185...","[4523, 99812, 18243, 92598, 57550, 45423, 5806...",40
6,8446,"[161823, 276258, 287155, 307055, 331475, 35788...","[61924, 4656, 8678, 16694, 25495, 35633, 44462...",62
8,8636,"[87720, 98950, 355175, 410709, 433616, 434786,...","[32678, 36887, 34538, 56084, 66720, 67104, 866...",67
9,9344,"[497514, 693228, 1075400, 1452231, 1529980, 18...","[91737, 65530, 9466, 53495, 86380, 20679, 2067...",17
10,9444,"[221278, 259211, 351144, 367068, 396216, 41224...","[87103, 99031, 33212, 39101, 50759, 56762, 841...",94
11,10050,"[63128, 140114, 199134, 209196, 644412, 853629...","[23288, 53020, 78649, 82811, 45616, 24713, 444...",32
12,10732,"[143874, 150092, 222086, 300102, 335105, 33561...","[54515, 56760, 87476, 14125, 26726, 26945, 422...",61


In [62]:
greater10 = agg[agg['nrTracks'] >= 10]
mask = np.random.choice([True, False], greater10.shape[0], p=[10000/greater10.shape[0], 1-(10000/greater10.shape[0])])
greater10[mask].shape

(10040, 4)

In [56]:
distance = Cosine(shrinkage = 10).compute(test.ICM)

NameError: name 'Cosine' is not defined

In [128]:
item_weights = check_matrix(distance, 'csr')

In [130]:
values, rows, cols = [], [], []
nitems = test.numTracks
i = 0

In [132]:
this_item_weights = item_weights[i,:].toarray()[0]
this_item_weights.shape

(99999,)

In [134]:
top_k_idx = np.argsort(this_item_weights) [-50:]
top_k_idx

array([10968, 46466, 77946, 49462, 42596, 82668, 30275, 72729, 89633,
       56618, 30621, 42102,  5861, 66825, 15856, 36324,  8495,  9849,
       29486, 90849, 27381, 78505, 65078, 27818, 45161, 98501, 37273,
       53495, 67208, 27286, 29714, 56555, 37488, 56563, 99854, 56585,
       81296,  2656, 56608, 90744, 45764, 63989, 59019, 78805, 85044,
       40249, 50469, 93721, 42021, 35636])

In [138]:
values.extend(this_item_weights[top_k_idx])
rows.extend(np.arange(nitems)[top_k_idx])
cols.extend(np.ones(50) * i)
W_sparse = sps.csc_matrix((values, (rows, cols)), shape=(nitems, nitems), dtype=np.float32)

In [137]:
print(values)
print(rows)
print(cols)

[0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.00082847819, 0.0008911246, 0.0008911246, 0.0008911246, 0.0010522697, 0.0010522697, 0.0010522697, 0.0010522697, 0.0010522697, 0.0010522697]
[10968, 46466, 77946, 49462, 42596, 82668, 30275, 72729, 89633, 56618, 30621, 42102, 5861, 66825, 15856, 36324, 8495, 9849, 29486, 90849, 27381, 78505, 65078, 27818, 45161, 98501, 37273, 53495, 67208, 27286, 29714, 56555, 37488, 56563, 99854, 56585, 81296, 2

In [139]:
W_sparse

<99999x99999 sparse matrix of type '<class 'numpy.float32'>'
	with 50 stored elements in Compressed Sparse Column format>