In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import scipy.sparse as sps
import scipy.io as io
import time
import json
from scipy.sparse.linalg import svds
from sklearn import preprocessing

In [2]:
class RecommenderSystem(object):
    """Load the playlist/track data and build the sparse matrices used for recommending.

    Raw ``playlist_id`` / ``track_id`` values are translated to zero-based
    positions (``index_playlist`` / ``index_track``) so that they can serve as
    row/column coordinates of the user-item matrix (UIM) and as row
    coordinates of the item-content matrix (ICM).
    """

    def __init__(self, interactions_file = '../input/train_final.csv',
                       target_playlists = '../input/target_playlists.csv',
                       target_tracks = '../input/target_tracks.csv',
                       meta_track = '../input/tracks_final.csv'):
        """Read the four tab-separated input files and build the ID -> index maps.

        Parameters
        ----------
        interactions_file : path of the playlist/track interaction file
            (needs the columns ``playlist_id`` and ``track_id``).
        target_playlists : path of the file listing the ``playlist_id`` values
            that should receive a recommendation.
        target_tracks : path of the file listing the ``track_id`` values that
            may be recommended.
        meta_track : path of the track meta data file (needs the columns
            ``track_id``, ``tags``, ``album`` and, for ``create_icm`` with
            ``include_artist=True``, ``artist_id``).
        """
        # read interactions file; every listed (playlist, track) pair gets the implicit rating 1
        train_final = pd.read_csv(interactions_file, sep = '\t')
        train_final['interaction'] = 1
        self.df_interactions = train_final.sort_values(['playlist_id', 'track_id'], ascending=[True, True])
        self.numInteractions = train_final.shape[0]
        print("Number of interactions (numInteractions): " + str(self.numInteractions))

        # read target playlists which should receive a recommendation
        self.df_target_playlists = pd.read_csv(target_playlists, sep = '\t')
        self.list_target_playlists = list(self.df_target_playlists['playlist_id'])
        print("Size of df_target_playlists: " + str(self.df_target_playlists.shape))

        # read target tracks
        self.df_target_tracks = pd.read_csv(target_tracks, sep = '\t')
        self.list_target_tracks = list(self.df_target_tracks['track_id'])
        print("Size of df_target_tracks file: " + str(self.df_target_tracks.shape))
        print("Size of list_target_tracks file: " + str(len(self.list_target_tracks)))
        print("\n")

        # separate the ID columns into plain lists
        playlist_id = list(self.df_interactions['playlist_id'])
        track_id = list(self.df_interactions['track_id'])

        # the position of an ID inside the de-duplicated list becomes its matrix index
        playlist_id_unique = list(set(playlist_id))
        self.df_playlist_id_unique = pd.DataFrame(playlist_id_unique)
        self.df_playlist_id_unique.reset_index(level=0, inplace=True)
        self.df_playlist_id_unique.columns = ['index_playlist', 'playlist_id']

        track_id_unique = list(set(track_id))
        self.df_track_id_unique = pd.DataFrame(track_id_unique)
        self.df_track_id_unique.reset_index(level=0, inplace=True)
        self.df_track_id_unique.columns = ['index_track', 'track_id']
        print("Track_id translated to indexes (df_track_id_unique): ")
        print(self.df_track_id_unique.head())
        print("\n")
        print("Playlist_id translated to indexes (df_playlist_id_unique): ")
        print(self.df_playlist_id_unique.head())
        print("\n")

        self.numPlaylists = len(self.df_playlist_id_unique)
        self.numTracks = len(self.df_track_id_unique)
        print("Number of Playlists: " + str(self.numPlaylists))
        print("Number of Tracks: " + str(self.numTracks))
        print("\n")

        # attach both index columns to every interaction row
        self.df_interactions = self.df_interactions.merge(self.df_playlist_id_unique, how='inner', on='playlist_id')
        self.df_interactions = self.df_interactions.merge(self.df_track_id_unique, how='inner', on='track_id')
        self.df_interactions = self.df_interactions.sort_values(['playlist_id', 'track_id'], ascending=[True, True])
        print("Interactions-file with IDs translated to indexes (saved in df_interactions): ")
        print(self.df_interactions.head())
        print("\n")

        # coordinate arrays used to build the sparse user-item matrix
        self.list_index_playlist = np.array(self.df_interactions['index_playlist'])
        self.list_index_track = np.array(self.df_interactions['index_track'])
        self.list_interactions = np.array(self.df_interactions['interaction'])

        # track meta data, restricted (inner merge) to tracks that occur in the interactions
        self.df_tracks = pd.read_csv(meta_track, sep = '\t')
        self.df_tracks = self.df_tracks.merge(self.df_track_id_unique, how='inner', on='track_id')
        # tags are stored as a JSON list string
        self.df_tracks['tags'] = self.df_tracks.tags.apply(json.loads)
        # strip the surrounding brackets of the album field and add the suffix "a";
        # "[None]" and "[]" are mapped to the sentinel "-10a" (= no album)
        self.df_tracks['album'] = self.df_tracks.album.apply(lambda x: (str(x[1:-1]) + "a") if x != "[None]" and x != "[]" else "-10a")
        print('Meta information about tracks read (df_tracks): ')
        print(self.df_tracks.head())
        print(self.df_tracks.shape)

    def target_structure(self):
        """Create ``self.df_target``: one row per target playlist.

        Each row holds the playlist's ``track_id`` list, the matching
        ``index_track`` list, its ``index_playlist`` and an empty ``recommend``
        list that is filled later.
        """
        # filter interaction dataframe, to retain only target playlists
        train = self.df_interactions.merge(self.df_target_playlists, how='inner', on='playlist_id')

        # aggregate to playlist level and coerce tracks in that playlist to list
        train_agg1 = train.groupby(by='playlist_id').track_id.apply(list).to_frame()
        train_agg1.reset_index(level=0, inplace=True)
        train_agg2 = train.groupby(by='playlist_id').index_track.apply(list).to_frame()
        train_agg2.reset_index(level=0, inplace=True)
        train_agg = train_agg1.merge(train_agg2, how='inner', on='playlist_id')
        self.df_target = train_agg.merge(self.df_playlist_id_unique, how='inner', on='playlist_id')
        # one independent empty list per row; sized from the frame it is assigned to
        self.df_target['recommend'] = np.empty((len(self.df_target), 0)).tolist()
        print("Data structure for final prediction was created (df_target): ")
        print(self.df_target.head())
        print(self.df_target.shape)

    def interaction_aggregation(self):
        """Print the first ten rows of a per-playlist aggregation of the interactions."""
        agg1 = self.df_interactions.groupby(by='playlist_id').track_id.apply(list).to_frame()
        agg1.reset_index(level=0, inplace=True)
        agg2 = self.df_interactions.groupby(by='playlist_id').index_track.apply(list).to_frame()
        agg2.reset_index(level=0, inplace=True)
        agg3 = self.df_interactions.groupby(by='playlist_id').nunique()
        agg3.reset_index(level=0, inplace=True)
        agg = agg1.merge(agg2, how='inner', on='playlist_id')
        agg = agg.merge(agg3, how='inner', on='playlist_id')
        print(agg[:10])

    def create_uim(self, sparse_mode="coo", create_testset = False, split = 0.8):
        """Build the user-item matrix ``self.UIM`` (playlists x tracks).

        Parameters
        ----------
        sparse_mode : "coo" or "csr" (case-insensitive) - format of the result.
        create_testset : if True, additionally create ``self.UIM_train`` and
            ``self.UIM_test`` via ``split_traintest``.
        split : probability of an interaction ending up in the train part.

        Raises
        ------
        NotImplementedError
            If ``sparse_mode`` is neither "coo" nor "csr".
        """
        mode = sparse_mode.lower()
        if mode not in ("coo", "csr"):
            raise NotImplementedError('Sparse mode {} not implemented'.format(sparse_mode))

        self.UIM = sps.coo_matrix((self.list_interactions,
                                   (self.list_index_playlist, self.list_index_track)),
                                  shape=(self.numPlaylists, self.numTracks))
        if create_testset:
            self.split_traintest(train_test_split = split)
        if mode == "csr":
            self.UIM = self.UIM.tocsr()
            if create_testset:
                self.UIM_train = self.UIM_train.tocsr()
                self.UIM_test = self.UIM_test.tocsr()

    def split_traintest(self, train_test_split):
        """Randomly split the interactions into ``self.UIM_train`` / ``self.UIM_test``.

        Every interaction goes to the train matrix with probability
        ``train_test_split`` and to the test matrix otherwise, so the two
        matrices are disjoint. Both are COO matrices with the full
        (numPlaylists, numTracks) shape. The split uses numpy's global random
        state; seed it beforehand for reproducible results.
        """
        train_mask = np.random.choice([True,False], self.numInteractions, p=[train_test_split, 1-train_test_split])
        test_mask = np.logical_not(train_mask)
        # explicit shape: a random subset may miss the highest playlist/track index
        full_shape = (self.numPlaylists, self.numTracks)
        self.UIM_train = sps.coo_matrix((self.list_interactions[train_mask],
                                        (self.list_index_playlist[train_mask],
                                         self.list_index_track[train_mask])),
                                        shape=full_shape)
        # the test matrix holds exactly the interactions that were held out of training
        self.UIM_test = sps.coo_matrix((self.list_interactions[test_mask],
                                       (self.list_index_playlist[test_mask],
                                        self.list_index_track[test_mask])),
                                       shape=full_shape)
        print("UIM successfully created in csr format.")

    def create_icm(self, include_tags = True, include_album = True, include_artist = True):
        """Build the item-content matrix ``self.ICM`` (tracks x features, CSR).

        Features are the track's tags, its album (string with suffix "a") and
        its artist (``artist_id`` with suffix "b"); each can be switched off.
        All feature labels are encoded to column indexes with a LabelEncoder.
        The flat list of labels is kept in ``self.final_taglist``.

        Raises
        ------
        ValueError
            If no feature at all was collected (e.g. all ``include_*`` flags False).
        """
        tags_list = []
        for index, row in self.df_tracks.iterrows():
            if len(row['tags']) != 0 and include_tags:
                for i in row['tags']:
                    tags_list.append([row['index_track'], i, 1])
            # "-10a" is the sentinel written by __init__ for tracks without album
            if row['album'] != "-10a" and include_album:
                tags_list.append([row['index_track'], row['album'], 1])
            if include_artist:
                tags_list.append([row['index_track'], str(row['artist_id']) + "b", 1])
        if not tags_list:
            raise ValueError('No track features collected; the ICM cannot be built')
        tags_list = pd.DataFrame(tags_list, columns=['index_track', 'tag', 'interaction'])
        track_list = list(tags_list['index_track'])
        tag_list = list(tags_list['tag'])
        self.final_taglist = list(tags_list['tag'])
        interaction_list = list(tags_list['interaction'])
        le = preprocessing.LabelEncoder()
        le.fit(tag_list)
        taglist_icm = le.transform(tag_list)
        # explicit shape: keeps one row per known track even if the tracks with
        # the highest indexes have no features
        self.ICM = sps.coo_matrix((interaction_list, (track_list, taglist_icm)),
                                  shape=(self.numTracks, len(le.classes_)))
        self.ICM = self.ICM.tocsr()
        print("ICM successfully created in csr format.")

In [3]:
class BasicItemKNNRecommender(RecommenderSystem):
    """Content-based item-similarity recommender.

    ``fit`` computes an item-item similarity matrix ``W`` from the
    item-content matrix and the estimated scores ``UIM_estm = UIM . W``;
    ``recommend`` picks the best-scored target tracks for each target playlist.
    """

    def __str__(self):
        # similarity_name / shrinkage only exist after fit() and k is never set
        # by this class, so fall back to None instead of raising AttributeError
        return "ItemKNN(similarity={},k={},shrinkage={})".format(
            getattr(self, 'similarity_name', None),
            getattr(self, 'k', None),
            getattr(self, 'shrinkage', None))

    def fit(self, shrinkage=100, similarity='cosine'):
        """Build UIM and ICM, compute the item similarity and the estimated scores.

        Parameters
        ----------
        shrinkage : shrinkage term handed to the similarity object.
        similarity : 'cosine', 'pearson' or 'adj-cosine'.

        Raises
        ------
        NotImplementedError
            For any other ``similarity`` value.
        """
        self.shrinkage = shrinkage
        self.similarity_name = similarity
        # NOTE(review): Pearson and AdjustedCosine are not defined in the visible
        # part of this notebook - confirm they exist before selecting them.
        # The chain is kept lazy so that 'cosine' works without them.
        if similarity == 'cosine':
            self.distance = Cosine(shrinkage=self.shrinkage)
        elif similarity == 'pearson':
            self.distance = Pearson(shrinkage=self.shrinkage)
        elif similarity == 'adj-cosine':
            self.distance = AdjustedCosine(shrinkage=self.shrinkage)
        else:
            raise NotImplementedError('Distance {} not implemented'.format(similarity))

        self.create_uim(sparse_mode = 'csr')
        self.create_icm()

        # item-item similarity computed from the item-content matrix
        item_weights = self.distance.compute(self.ICM)

        item_weights = check_matrix(item_weights, 'csr') # nearly 10 times faster
        print("Converted to csr")

        self.W = item_weights
        # estimated score of every track for every playlist
        self.UIM_estm = self.UIM.dot(self.W)
        print('UIM_estm calculated')

    def recommend(self, at=5):
        """Fill ``self.df_target['recommend']`` with up to ``at`` track_ids per playlist.

        Candidates are the target tracks that are not already part of the
        playlist; they are ranked by their estimated score (descending, ties
        keep ascending index order). ``fit`` must have been called before.
        """
        self.target_structure()
        start_time = time.time()

        n_items = self.UIM_estm.shape[1]
        n_rec = max(int(at), 0)

        # lookup arrays addressed by index_track, built once instead of
        # merging a DataFrame for every single playlist
        idx = np.asarray(self.df_track_id_unique['index_track'])
        tids = np.asarray(self.df_track_id_unique['track_id'])
        target_flags = np.asarray(self.df_track_id_unique['track_id'].isin(self.list_target_tracks))
        known = idx < n_items
        track_id_of = np.zeros(n_items, dtype=tids.dtype)
        track_id_of[idx[known]] = tids[known]
        is_target = np.zeros(n_items, dtype=bool)
        is_target[idx[known]] = target_flags[known]

        rows = zip(self.df_target['index_playlist'],
                   self.df_target['index_track'],
                   self.df_target['recommend'])
        for playlist_idx, seen, recommended in rows:
            # get row from UIM_estm as a dense 1-d score vector
            scores = self.UIM_estm[playlist_idx, :].toarray().ravel()
            # tracks which are already in the playlist can't be recommended
            candidates = is_target.copy()
            candidates[np.asarray(seen, dtype=np.intp)] = False
            cand_idx = np.flatnonzero(candidates)
            # stable sort keeps the ranking deterministic for equal scores
            best = cand_idx[np.argsort(-scores[cand_idx], kind='stable')[:n_rec]]
            # translate track index back to track_id; the list object lives in df_target
            recommended.extend(int(t) for t in track_id_of[best])
        print("--- %s minutes ---" % ((time.time() - start_time)/60))

def check_matrix(X, format='csc', dtype=np.float32):
    """Return ``X`` in the requested sparse format, cast to ``dtype``.

    ``format`` is one of 'csc', 'csr', 'coo', 'dok', 'bsr', 'dia', 'lil'.
    The matrix is converted only when it is not already an instance of the
    matching scipy class; an unrecognised ``format`` leaves the layout
    untouched. The result is always passed through ``astype(dtype)``.
    """
    # format name -> (expected class, name of the conversion method)
    converters = {
        'csc': (sps.csc_matrix, 'tocsc'),
        'csr': (sps.csr_matrix, 'tocsr'),
        'coo': (sps.coo_matrix, 'tocoo'),
        'dok': (sps.dok_matrix, 'todok'),
        'bsr': (sps.bsr_matrix, 'tobsr'),
        'dia': (sps.dia_matrix, 'todia'),
        'lil': (sps.lil_matrix, 'tolil'),
    }
    wanted = converters.get(format)
    result = X
    if wanted is not None:
        matrix_type, method_name = wanted
        if not isinstance(X, matrix_type):
            result = getattr(X, method_name)()
    return result.astype(dtype)
    
class ISimilarity(object):
    """Abstract interface for the similarity metrics"""

    def __init__(self, shrinkage=10):
        """Store the shrinkage term that concrete metrics read from ``self.shrinkage``."""
        self.shrinkage = shrinkage

    def compute(self, X):
        """Placeholder to be overridden by subclasses; this base version returns None."""
        return None


class Cosine(ISimilarity):
    """Cosine-style similarity between the rows of a sparse matrix, with optional shrinkage."""

    @staticmethod
    def _without_diagonal(mat):
        """Return ``mat`` minus its own main diagonal (i.e. with a zero diagonal)."""
        # np.newaxis instead of the scipy root-namespace alias, which newer
        # SciPy releases no longer provide
        diagonal = sps.dia_matrix((mat.diagonal()[np.newaxis, :], [0]), shape=mat.shape)
        return mat - diagonal

    def compute(self, X):
        """Compute the row-by-row similarity matrix ``X * X.T`` of ``X``.

        ``X`` is first copied into a float32 CSC matrix and each of its columns
        is divided by its Euclidean norm. The diagonal of the product is zeroed
        and, if ``self.shrinkage > 0``, the shrinkage factor is applied.
        NOTE(review): the normalisation is per column while the similarity is
        taken between rows - confirm this weighting is intended.
        """
        # convert to csc matrix for faster column-wise operations
        X = check_matrix(X, 'csc', dtype=np.float32)

        # 1) normalize the columns in X
        # compute the column-wise norm
        # NOTE: this is slightly inefficient. We must copy X to compute the column norms.
        # A faster solution is to  normalize the matrix inplace with a Cython function.
        Xsq = X.copy()
        Xsq.data **= 2
        norm = np.sqrt(Xsq.sum(axis=0))
        norm = np.asarray(norm).ravel()
        # small offset so that empty columns do not cause a division by zero
        norm += 1e-6
        # compute the number of non-zeros in each column
        # NOTE: this works only if X is instance of sparse.csc_matrix
        col_nnz = np.diff(X.indptr)
        # then normalize the values in each column
        X.data /= np.repeat(norm, col_nnz)
        print("Normalized")

        # 2) compute the similarity using the dot-product
        dist = X * X.T
        print("Computed")

        # zero out diagonal values
        dist = self._without_diagonal(dist)
        print("Removed diagonal")

        # and apply the shrinkage
        if self.shrinkage > 0:
            dist = self.apply_shrinkage(X, dist)
            print("Applied shrinkage")

        return dist

    def apply_shrinkage(self, X, dist):
        """Scale ``dist`` element-wise by ``co_counts / (co_counts + shrinkage)``.

        ``co_counts[i, j]`` is the number of columns in which both row ``i``
        and row ``j`` of ``X`` have a stored entry. Returns the scaled matrix.
        """
        # create an "indicator" version of X (i.e. replace values in X with ones)
        X_ind = X.copy()
        X_ind.data = np.ones_like(X_ind.data)
        # compute the co-rated counts
        co_counts = X_ind * X_ind.T
        # remove the diagonal
        co_counts = self._without_diagonal(co_counts)
        # compute the shrinkage factor as co_counts_ij / (co_counts_ij + shrinkage)
        co_counts.data = co_counts.data / (co_counts.data + self.shrinkage)
        # element-wise sparse product: does not rely on both matrices storing
        # their entries in exactly the same order
        return dist.multiply(co_counts)

In [4]:
# Instantiate the recommender. The inherited RecommenderSystem constructor reads
# the input CSV files from their default '../input/' paths and prints a summary.
cbf = BasicItemKNNRecommender()

Number of interactions (numInteractions): 1040522
Size of df_target_playlists: (10000, 1)
Size of df_target_tracks file: (32195, 1)
Size of list_target_tracks file: 32195


Track_id translated to indexes (df_track_id_unique): 
   index_track  track_id
0            0   1048594
1            1   2359314
2            2   1835030
3            3   3670041
4            4   1048604


Playlist_id translated to indexes (df_playlist_id_unique): 
   index_playlist  playlist_id
0               0     10485762
1               1      5767174
2               2      7077894
3               3     11534344
4               4      1179658


Number of Playlists: 45649
Number of Tracks: 99999


Interactions-file with IDs translated to indexes (saved in df_interactions): 
     playlist_id  track_id  interaction  index_playlist  index_track
0           7569    162463            1            2425        62358
87          7569    421750            1            2425        60999
116         7569    795606         

In [5]:
# Build UIM and ICM, compute the item similarity (default 'cosine') with
# shrinkage 50 and the estimated score matrix cbf.UIM_estm.
cbf.fit(shrinkage=50)

ICM successfully created in csr format.
Normalized
Computed
Removed diagonal
Applied shrinkage
Converted to csr
UIM_estm calculated


In [6]:
# Sanity check: the estimated score matrix should be (number of playlists, number of tracks).
cbf.UIM_estm.shape

(45649, 99999)

In [7]:
# Create cbf.df_target and fill its 'recommend' lists (default: 5 tracks per target playlist).
cbf.recommend()

Data structure for final prediction was created (df_target): 
   playlist_id                                           track_id  \
0         7614  [415173, 1384962, 1609224, 1614974, 1714787, 2...   
1         7692  [88210, 266898, 280844, 302730, 384386, 551534...   
2         7816  [126414, 245217, 513821, 611201, 767305, 84510...   
3         8225  [13881, 261448, 311923, 500672, 676393, 906185...   
4         8337  [451881, 1157460, 1205536, 1210884, 3131838, 3...   

                                         index_track  index_playlist recommend  
0  [58038, 27294, 13601, 15634, 53615, 16590, 739...            2447        []  
1  [32883, 1416, 6262, 15120, 46326, 9696, 41315,...            2478        []  
2  [47911, 94663, 96789, 32334, 93906, 21551, 971...            2519        []  
3  [4523, 99812, 18243, 92598, 57550, 45423, 5806...            2679        []  
4          [74517, 40765, 59706, 62219, 95577, 7177]            2715        []  
(10000, 5)
--- 8.073137716452282 minu

In [8]:
# Inspect the target structure including the recommended track_ids.
cbf.df_target

Unnamed: 0,playlist_id,track_id,index_track,index_playlist,recommend
0,7614,"[415173, 1384962, 1609224, 1614974, 1714787, 2...","[58038, 27294, 13601, 15634, 53615, 16590, 739...",2447,"[3690005, 1839528, 2550706, 2368011, 2828149]"
1,7692,"[88210, 266898, 280844, 302730, 384386, 551534...","[32883, 1416, 6262, 15120, 46326, 9696, 41315,...",2478,"[3139217, 2820855, 2692619, 338400, 1471766]"
2,7816,"[126414, 245217, 513821, 611201, 767305, 84510...","[47911, 94663, 96789, 32334, 93906, 21551, 971...",2519,"[932946, 639543, 1162917, 1347178, 2621594]"
3,8225,"[13881, 261448, 311923, 500672, 676393, 906185...","[4523, 99812, 18243, 92598, 57550, 45423, 5806...",2679,"[3412271, 3160513, 3500909, 1573721, 1115491]"
4,8337,"[451881, 1157460, 1205536, 1210884, 3131838, 3...","[74517, 40765, 59706, 62219, 95577, 7177]",2715,"[3167821, 3423162, 867355, 3615514, 2553688]"
5,8369,"[701941, 826559, 1042548, 1886070, 2165768, 26...","[69103, 14978, 98013, 18680, 25190, 28878, 307...",2728,"[2801398, 3842207, 3251559, 920338, 1797896]"
6,8446,"[161823, 276258, 287155, 307055, 331475, 35788...","[61924, 4656, 8678, 16694, 25495, 35633, 44462...",2757,"[2398951, 3212294, 3502746, 2347248, 1971374]"
7,8559,"[396062, 949534, 1188811, 1831605, 2142207, 34...","[50715, 62735, 53064, 98863, 16753, 17606]",2792,"[2454612, 2848029, 360800, 3670757, 152898]"
8,8636,"[87720, 98950, 355175, 410709, 433616, 434786,...","[32678, 36887, 34538, 56084, 66720, 67104, 866...",2809,"[162658, 1214452, 2370998, 2146850, 2821432]"
9,9344,"[497514, 693228, 1075400, 1452231, 1529980, 18...","[91737, 65530, 9466, 53495, 86380, 20679, 2067...",3051,"[2788956, 1156133, 3602562, 1998474, 2247568]"


In [9]:
# Convert list to string with spaces between track_ids.
# The column is overwritten in place, so values that are already strings are
# kept as they are; otherwise re-running this cell would split the joined
# string into single characters.
cbf.df_target['recommend'] = cbf.df_target['recommend'].apply(
    lambda x: x if isinstance(x, str) else " ".join(map(str, x)))

In [10]:
# keep only the two submission columns and give the recommendation column its submission name
final = cbf.df_target[['playlist_id','recommend']].rename(columns={'recommend': 'track_ids'})

In [11]:
# Preview the first rows of the submission frame.
print(final.head())

   playlist_id                                track_ids
0         7614  3690005 1839528 2550706 2368011 2828149
1         7692   3139217 2820855 2692619 338400 1471766
2         7816    932946 639543 1162917 1347178 2621594
3         8225  3412271 3160513 3500909 1573721 1115491
4         8337   3167821 3423162 867355 3615514 2553688


In [12]:
# export file: write the submission as CSV without the DataFrame index
# (the '../submission/' directory must already exist)
final.to_csv('../submission/005_cbf_s50_albumartist.csv', index=False)

In [None]:
# Scratch experiment (not part of the pipeline): feed LabelEncoder a list that
# mixes ints with suffixed strings, like the feature labels built in create_icm.
test = [1,2,3,4,2,4,2,'10a','12a','14a']
lenc = preprocessing.LabelEncoder()
lenc.fit(test)
listtag = lenc.transform(test)

In [None]:
# Show the encoded labels produced by the scratch experiment above.
listtag

In [None]:
# Scratch experiment (not part of the pipeline): [1:-1] drops the first and last
# character, the same slice used to strip the brackets of the album field.
test = '0123456'
test[1:-1]