In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import scipy.sparse as sps
import scipy.io as io
import time
import json
from scipy.sparse.linalg import svds
from sklearn import preprocessing

In [2]:
# function to save a csr sparse matrix
def save_sparse_csr(filename,array):
    """Persist the four CSR components of ``array`` in one ``.npz`` archive.

    The archive holds the keys ``data``, ``indices``, ``indptr`` and
    ``shape``, which is exactly what ``load_sparse_csr`` reads back.
    """
    csr_parts = {
        'data': array.data,
        'indices': array.indices,
        'indptr': array.indptr,
        'shape': array.shape,
    }
    np.savez(filename, **csr_parts)

# function to read written csr sparse matrix
def load_sparse_csr(filename):
    """Load a CSR matrix from an ``.npz`` archive written by ``save_sparse_csr``.

    Parameters
    ----------
    filename : str or file-like
        Path of the archive. ``np.savez`` appends ``.npz`` when the name it
        is given lacks that suffix, so if ``filename`` does not exist but
        ``filename + '.npz'`` does, the latter is loaded. This lets the same
        name be passed to both the save and the load helper.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix rebuilt from the stored ``data``, ``indices``, ``indptr`` and
        ``shape`` arrays.
    """
    import os

    path = filename
    if isinstance(path, str) and not os.path.exists(path) and os.path.exists(path + '.npz'):
        path = path + '.npz'

    # NpzFile keeps the underlying zip file open; the context manager closes it.
    with np.load(path) as loader:
        return sps.csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                              shape=tuple(loader['shape']))

In [3]:
%load_ext Cython

In [4]:
class RecommenderSystem(object):
    """Base class that loads the playlist/track data and builds the matrices.

    The constructor reads the interaction, target and track-metadata CSV
    files, maps the raw ``playlist_id`` / ``track_id`` values onto dense
    0-based indexes and keeps the resulting frames and index arrays on the
    instance. The remaining methods build the user-item matrix (UIM), the
    item-content matrix (ICM) and a few derived structures from them.
    """

    def __init__(self, interactions_file = '../input/train_final.csv', 
                       target_playlists = '../input/target_playlists.csv', 
                       target_tracks = '../input/target_tracks.csv',
                       meta_track = '../input/tracks_final.csv'):
        """Read all input files (tab separated) and build the id -> index maps.

        Parameters
        ----------
        interactions_file : str
            CSV with ``playlist_id`` and ``track_id`` columns.
        target_playlists : str
            CSV with the ``playlist_id`` values that need a recommendation.
        target_tracks : str
            CSV with the ``track_id`` values that may be recommended.
        meta_track : str
            CSV with track metadata; the ``tags`` and ``album`` columns are
            parsed here, further columns are used by ``create_icm``.
        """
        # read interactions file
        train_final = pd.read_csv(interactions_file, sep = '\t')
        train_final['interaction'] = 1.0
        self.df_interactions = train_final.sort_values(['playlist_id', 'track_id'], ascending=[True, True])
        self.numInteractions = train_final.shape[0]
        print("Number of interactions (numInteractions): " + str(self.numInteractions))
        print("\n")

        # separate the id columns into lists
        playlist_id = list(self.df_interactions['playlist_id'])
        track_id = list(self.df_interactions['track_id'])

        # one row per distinct playlist; the row position becomes its index
        playlist_id_unique = list(set(playlist_id))
        self.df_playlist_id_unique = pd.DataFrame(playlist_id_unique)
        self.df_playlist_id_unique.reset_index(level=0, inplace=True)
        self.df_playlist_id_unique.columns = ['index_playlist', 'playlist_id']

        # one row per distinct track; the row position becomes its index
        track_id_unique = list(set(track_id))
        self.df_track_id_unique = pd.DataFrame(track_id_unique)
        self.df_track_id_unique.reset_index(level=0, inplace=True)
        self.df_track_id_unique.columns = ['index_track', 'track_id']
        print("Track_id translated to indexes (df_track_id_unique): ")
        print(self.df_track_id_unique.head())
        print("\n")
        print("Playlist_id translated to indexes (df_playlist_id_unique): ")
        print(self.df_playlist_id_unique.head())
        print("\n")

        # read target playlists which should receive a recommendation
        self.df_target_playlists = pd.read_csv(target_playlists, sep = '\t')
        self.list_target_playlists = list(self.df_target_playlists['playlist_id'])
        self.df_target_playlists = self.df_target_playlists.merge(self.df_playlist_id_unique, how='inner', on='playlist_id')
        print("Size of df_target_playlists: " + str(self.df_target_playlists.shape))

        # read target tracks
        self.df_target_tracks = pd.read_csv(target_tracks, sep = '\t')
        self.list_target_tracks = list(self.df_target_tracks['track_id'])
        self.df_target_tracks = self.df_target_tracks.merge(self.df_track_id_unique, how='inner', on='track_id')
        print("Size of df_target_tracks file: " + str(self.df_target_tracks.shape))
        # report the length of the list that the label names (the merged frame
        # can be shorter because of the inner join above)
        print("Size of list_target_tracks file: " + str(len(self.list_target_tracks)))
        print("\n")

        self.numPlaylists = len(self.df_playlist_id_unique)
        self.numTracks = len(self.df_track_id_unique)
        print("Number of Playlists: " + str(self.numPlaylists))
        print("Number of Tracks: " + str(self.numTracks))
        print("\n")

        # attach the dense indexes to every interaction
        self.df_interactions = self.df_interactions.merge(self.df_playlist_id_unique, how='inner', on='playlist_id')
        self.df_interactions = self.df_interactions.merge(self.df_track_id_unique, how='inner', on='track_id')
        self.df_interactions = self.df_interactions.sort_values(['playlist_id', 'track_id'], ascending=[True, True])
        print("Interactions-file with IDs translated to indexes (saved in df_interactions): ")
        print(self.df_interactions.head())
        print("\n")

        # parallel arrays used to build the sparse matrices
        self.list_index_playlist = np.array(self.df_interactions['index_playlist'])
        self.list_index_track = np.array(self.df_interactions['index_track'])
        self.list_interactions = np.array(self.df_interactions['interaction'])

        # track metadata, restricted to tracks that occur in the interactions
        self.df_tracks = pd.read_csv(meta_track, sep = '\t')
        self.df_tracks = self.df_tracks.merge(self.df_track_id_unique, how='inner', on='track_id')
        self.df_tracks['tags'] = self.df_tracks.tags.apply(json.loads)
        # strip the surrounding brackets and add an "a" suffix; missing albums become "-10a"
        self.df_tracks['album'] = self.df_tracks.album.apply(lambda x: (str(x[1:-1]) + "a") if x != "[None]" and x != "[]" else "-10a")
        print('Meta information about tracks read (df_tracks): ')
        print(self.df_tracks.head())
        print(self.df_tracks.shape)

    def target_structure(self):
        """Build ``self.df_target``: one row per target playlist.

        Each row carries the playlist's track ids, the matching track
        indexes, the playlist index and an empty ``recommend`` list that is
        filled later by the recommender.
        """
        # filter interaction dataframe, to retain only target playlists
        train = self.df_interactions.merge(self.df_target_playlists, how='inner', on='playlist_id')

        # aggregate to playlist level and coerce tracks in that playlist to list
        train_agg1 = train.groupby(by='playlist_id').track_id.apply(list).to_frame()
        train_agg1.reset_index(level=0, inplace=True)
        train_agg2 = train.groupby(by='playlist_id').index_track.apply(list).to_frame()
        train_agg2.reset_index(level=0, inplace=True)
        train_agg = train_agg1.merge(train_agg2, how='inner', on='playlist_id')
        self.df_target = train_agg.merge(self.df_playlist_id_unique, how='inner', on='playlist_id')
        # one independent empty list per row
        self.df_target['recommend'] = np.empty((len(train_agg), 0)).tolist()
        print("Data structure for final prediction was created (df_target): ")
        print(self.df_target.head())
        print(self.df_target.shape)

    def interaction_aggregation(self):
        """Print the first ten rows of a per-playlist aggregation of the interactions."""
        agg1 = self.df_interactions.groupby(by='playlist_id').track_id.apply(list).to_frame()
        agg1.reset_index(level=0, inplace=True)
        agg2 = self.df_interactions.groupby(by='playlist_id').index_track.apply(list).to_frame()
        agg2.reset_index(level=0, inplace=True)
        agg3 = self.df_interactions.groupby(by='playlist_id').nunique()
        agg3.reset_index(level=0, inplace=True)
        agg = agg1.merge(agg2, how='inner', on='playlist_id')
        agg = agg.merge(agg3, how='inner', on='playlist_id')
        print(agg[:10])

    def create_uim(self, sparse_mode="coo", create_testset = False, split = 0.8):
        """Build the user-item matrix ``self.UIM`` (playlists x tracks).

        Parameters
        ----------
        sparse_mode : str
            ``"coo"`` or ``"csr"`` (case-insensitive); storage format of the
            resulting matrices.
        create_testset : bool
            When true, ``self.UIM_train`` and ``self.UIM_test`` are created
            as well via ``split_traintest``.
        split : float
            Probability that an interaction goes to the training part.

        Raises
        ------
        NotImplementedError
            For any other ``sparse_mode``.
        """
        mode = sparse_mode.lower()
        if mode not in ("coo", "csr"):
            raise NotImplementedError('Sparse mode {} not implemented'.format(sparse_mode))

        self.UIM = sps.coo_matrix((self.list_interactions, (self.list_index_playlist, self.list_index_track)),
                                  shape=(self.numPlaylists, self.numTracks))
        if create_testset:
            self.split_traintest(train_test_split = split)

        if mode == "csr":
            self.UIM = self.UIM.tocsr()
            if create_testset:
                self.UIM_train = self.UIM_train.tocsr()
                self.UIM_test = self.UIM_test.tocsr()

    def split_traintest(self, train_test_split):
        """Randomly split the interactions into ``UIM_train`` and ``UIM_test``.

        Every interaction is assigned to the training part with probability
        ``train_test_split`` and to the test part otherwise, so the two
        matrices are disjoint and together contain every interaction. Both
        are created in COO format with the full (playlists x tracks) shape,
        which keeps them aligned with ``self.UIM`` even if the highest index
        happens to land in only one of the parts.
        """
        train_mask = np.random.choice([True,False], self.numInteractions, p=[train_test_split, 1-train_test_split])
        test_mask = np.logical_not(train_mask)
        full_shape = (self.numPlaylists, self.numTracks)
        self.UIM_train = sps.coo_matrix((self.list_interactions[train_mask], 
                                        (self.list_index_playlist[train_mask], 
                                         self.list_index_track[train_mask])),
                                        shape=full_shape)
        # the held-out interactions only (previously all interactions were used here)
        self.UIM_test = sps.coo_matrix((self.list_interactions[test_mask],
                                       (self.list_index_playlist[test_mask],
                                        self.list_index_track[test_mask])),
                                       shape=full_shape)
        print("UIM successfully created in csr format.")

    def create_icm(self, include_tags = True, include_album = True, include_artist = True, include_playcount = False, include_duration = False, playcount_bins = 50, duration_bins = 3):
        """Build the item-content matrix ``self.ICM`` (tracks x features, CSR).

        Every selected attribute of a track becomes a binary feature. The
        feature labels are turned into column indexes with a
        ``LabelEncoder``; the raw labels are kept in ``self.final_taglist``.

        Parameters
        ----------
        include_tags, include_album, include_artist : bool
            Whether the tags, the album and the artist of a track are used.
        include_playcount, include_duration : bool
            Whether the quantile bin of the playcount / duration is used.
            Tracks whose duration is ``-1`` get no duration feature.
        playcount_bins, duration_bins : int
            Number of quantile bins requested from ``pd.qcut``.
        """
        # operate on this instance's frame (the code used to reach for a
        # global named ``cbf`` here)
        if include_playcount:
            self.df_tracks['playcount'] = self.df_tracks['playcount'].fillna(0)
            self.df_tracks['playcount_bin'] = pd.qcut(self.df_tracks['playcount'], playcount_bins, duplicates = 'drop').astype('str')
        if include_duration:
            self.df_tracks['duration_bin'] = pd.qcut(self.df_tracks['duration'], duration_bins, duplicates = 'drop').astype('str')

        # (index_track, feature label, weight) triples
        tags_list = []
        for index, row in self.df_tracks.iterrows():
            if len(row['tags']) != 0 and include_tags:
                for i in row['tags']:
                    tags_list.append([row['index_track'], i, 1.0])
            if row['album'] != "-10a" and include_album:
                tags_list.append([row['index_track'], row['album'], 1.0])
            if include_artist:
                # suffixes keep labels of different attribute kinds apart
                tags_list.append([row['index_track'], str(row['artist_id']) + "b", 1.0])
            if include_playcount:
                tags_list.append([row['index_track'], row['playcount_bin'] + "x", 1.0])
            if include_duration and row['duration'] != -1:
                tags_list.append([row['index_track'], row['duration_bin'] + "z", 1.0])
        tags_list = pd.DataFrame(tags_list, columns=['index_track', 'tag', 'interaction'])
        track_list = list(tags_list['index_track'])
        tag_list = list(tags_list['tag'])
        self.final_taglist = list(tags_list['tag'])
        interaction_list = list(tags_list['interaction'])
        le = preprocessing.LabelEncoder()
        le.fit(tag_list)
        taglist_icm = le.transform(tag_list)
        # explicit shape: one row per known track even if the tracks with the
        # highest indexes have no feature at all
        self.ICM = sps.coo_matrix((interaction_list, (track_list, taglist_icm)),
                                  shape=(self.numTracks, len(le.classes_)))
        self.ICM = self.ICM.tocsr()
        print("ICM successfully created in csr format.")

    def td_idf(self, ICM):
        """Return a copy of ``ICM`` whose columns are scaled by their IDF weight.

        The weight of feature ``f`` is ``log(n_items / n_items_with_f)``,
        where ``n_items`` is the number of rows of ``ICM``. Each stored value
        is multiplied by the weight of its own column; the input matrix is
        left untouched and the result has the same sparse format.

        Parameters
        ----------
        ICM : scipy.sparse matrix
            Item-content matrix (items x features).

        Returns
        -------
        scipy.sparse matrix
            IDF-weighted copy of ``ICM``.
        """
        num_tot_items = ICM.shape[0]

        # let's count how many items have a certain feature
        items_per_feature = (ICM > 0).sum(axis=0)

        IDF = np.asarray(np.log(num_tot_items / items_per_feature)).ravel()

        print("Shape of IDF")
        print(IDF.shape)

        # In COO form every stored value carries its column index, so each
        # value can be paired with the IDF of its column regardless of the
        # storage order of the input format.
        ICM_coo = ICM.tocoo(copy=True)
        print("Shape of ICM_idf")
        print(ICM_coo.shape)
        weighted = ICM_coo.data * IDF[ICM_coo.col]
        if np.issubdtype(ICM_coo.data.dtype, np.floating):
            # keep the floating dtype of the input, as in-place scaling would
            weighted = weighted.astype(ICM_coo.data.dtype)
        ICM_coo.data = weighted

        return ICM_coo.asformat(ICM.format)

    def svd(self, matrix, k = 100):
        """Return the dense rank-``k`` reconstruction of ``matrix`` from a truncated SVD.

        ``svds`` yields ``U``, ``s`` and ``Vt``; the result is
        ``U . diag(s) . Vt`` as a dense ``numpy`` array.
        """
        U, s, Vt = svds(matrix, k)
        # scaling the columns of U by s equals multiplying with diag(s)
        Us = U * s
        return np.dot(Us, Vt)

In [5]:
%%cython

import time

import numpy as np
cimport numpy as np
from cpython.array cimport array, clone

import scipy.sparse as sps

cdef class Cosine_Similarity:
    """Item-item similarity computed from a sparse user-item matrix.

    For every pair of distinct items the weight is the sum, over the users
    that have both items, of the product of the two stored values. No
    normalisation by the item vector norms is applied inside this class.
    Either a dense matrix (TopK == 0) or a sparse matrix keeping TopK
    entries per item is produced by ``compute_similarity``.
    """

    # Number of entries kept per item (0 selects the dense result)
    cdef int TopK
    # Number of columns of the input matrix
    cdef long n_items

    # Arrays containing the sparse data
    # CSR view (row = user) and CSC view (column = item) of the same matrix
    cdef int[:] user_to_item_row_ptr, user_to_item_cols
    cdef int[:] item_to_user_rows, item_to_user_col_ptr
    cdef double[:] user_to_item_data, item_to_user_data

    # In case you select no TopK
    cdef double[:,:] W_dense

    
    def __init__(self, URM, TopK = 100):
        """
        Store CSR and CSC copies of the input matrix.

        :param URM: sparse matrix with users as rows and items as columns
                    (NOTE(review): the int[:] buffers presumably require
                    32-bit index arrays - confirm for very large inputs)
        :param TopK: number of entries kept per item, capped at the number
                     of items; 0 selects the dense output
        """

        super(Cosine_Similarity, self).__init__()

        self.n_items = URM.shape[1]

        self.TopK = min(TopK, self.n_items)

        # Row-wise access: which items does a user have
        URM = URM.tocsr()
        self.user_to_item_row_ptr = URM.indptr
        self.user_to_item_cols = URM.indices
        self.user_to_item_data = np.array(URM.data, dtype=np.float64)

        # Column-wise access: which users have an item
        URM = URM.tocsc()
        self.item_to_user_rows = URM.indices
        self.item_to_user_col_ptr = URM.indptr
        self.item_to_user_data = np.array(URM.data, dtype=np.float64)

        # The dense n_items x n_items buffer is only allocated when no TopK is used
        if self.TopK == 0:
            self.W_dense = np.zeros((self.n_items,self.n_items))



    # Slice of the CSC row indices belonging to column item_id
    cdef int[:] getUsersThatRatedItem(self, long item_id):
        return self.item_to_user_rows[self.item_to_user_col_ptr[item_id]:self.item_to_user_col_ptr[item_id+1]]

    # Slice of the CSR column indices belonging to row user_id
    cdef int[:] getItemsRatedByUser(self, long user_id):
        return self.user_to_item_cols[self.user_to_item_row_ptr[user_id]:self.user_to_item_row_ptr[user_id+1]]

    
    
    cdef double[:] computeItemSimilarities(self, long item_id_input):
        """
        Return an array of length n_items holding the weight of every item
        with respect to item_id_input; the entry of item_id_input itself
        stays zero.

        The weight of two items depends on the users they have in common:
        every common user adds the product of the two stored values.
        
        The basic implementation is:
        - Select the first item
        - Loop through all other items
        -- Given the two items, get the users they have in common
        -- Update the similarity considering all common users
        
        That is VERY slow due to the common user part, in which a long data structure is looped multiple times.
        
        A better way is to use the data structure in a different way skipping the search part, getting directly
        the information we need.
        
        The implementation used here is:
        - Select the first item
        - Initialize a zero valued array for the similarities
        - Get the users who rated the first item
        - Loop through the users
        -- Given a user, get the items he rated (second item)
        -- Update the similarity of the items he rated
        
        
        """

        # Create template used to initialize an array with zeros
        # Much faster than np.zeros(self.n_items)
        cdef array[double] template_zero = array('d')
        cdef array[double] result = clone(template_zero, self.n_items, zero=True)


        cdef long user_index, user_id, item_index, item_id_second

        cdef int[:] users_that_rated_item = self.getUsersThatRatedItem(item_id_input)
        cdef int[:] items_rated_by_user

        cdef double rating_item_input, rating_item_second

        # Get users that rated the items
        for user_index in range(len(users_that_rated_item)):

            user_id = users_that_rated_item[user_index]
            # Value of (user_id, item_id_input): same offset inside the CSC column as user_index
            rating_item_input = self.item_to_user_data[self.item_to_user_col_ptr[item_id_input]+user_index]

            # Get all items rated by that user
            items_rated_by_user = self.getItemsRatedByUser(user_id)

            for item_index in range(len(items_rated_by_user)):

                item_id_second = items_rated_by_user[item_index]

                # Do not compute the similarity on the diagonal
                if item_id_second != item_id_input:
                    # Increment similarity
                    # Value of (user_id, item_id_second): same offset inside the CSR row as item_index
                    rating_item_second = self.user_to_item_data[self.user_to_item_row_ptr[user_id]+item_index]

                    result[item_id_second] += rating_item_input*rating_item_second

        return result


    def compute_similarity(self):
        """
        Compute the weights of all items and return the item-item matrix.

        With TopK == 0 a dense (n_items, n_items) numpy array is returned.
        Otherwise a float32 CSR matrix of that shape is returned in which
        column i holds the TopK largest weights of item i (row = neighbour
        item). The TopK positions are chosen regardless of their value, so
        stored entries can be zero when an item has fewer than TopK
        neighbours. Progress is printed every 10000 items and at the end.
        """

        cdef int itemIndex, innerItemIndex
        cdef long long topKItemIndex

        cdef long long[:] top_k_idx

        # Declare numpy data type to use vector indexing and simplify the topK selection code
        # NOTE(review): ``long`` must match the integer type numpy returns from
        # argpartition/argsort on the platform in use - confirm when porting
        cdef np.ndarray[long, ndim=1] top_k_partition, top_k_partition_sorting
        cdef np.ndarray[np.float64_t, ndim=1] this_item_weights_np

        #cdef long[:] top_k_idx
        cdef double[:] this_item_weights

        cdef long processedItems = 0

        # Data structure to incrementally build sparse matrix
        # Preinitialize max possible length
        cdef double[:] values = np.zeros((self.n_items*self.TopK))
        cdef int[:] rows = np.zeros((self.n_items*self.TopK,), dtype=np.int32)
        cdef int[:] cols = np.zeros((self.n_items*self.TopK,), dtype=np.int32)
        # Next free position in values / rows / cols
        cdef long sparse_data_pointer = 0


        start_time = time.time()

        # Compute all similarities for each item
        for itemIndex in range(self.n_items):

            processedItems += 1

            if processedItems % 10000==0 or processedItems==self.n_items:

                itemPerSec = processedItems/(time.time()-start_time)

                print("Similarity item {} ( {:2.0f} % ), {:.2f} item/sec, required time {:.2f} min".format(
                    processedItems, processedItems*1.0/self.n_items*100, itemPerSec, (self.n_items-processedItems) / itemPerSec / 60))

            this_item_weights = self.computeItemSimilarities(itemIndex)

            if self.TopK == 0:

                # Dense mode: copy the whole weight vector into column itemIndex
                for innerItemIndex in range(self.n_items):
                    self.W_dense[innerItemIndex,itemIndex] = this_item_weights[innerItemIndex]

            else:

                # Sort indices and select TopK
                # Using numpy implies some overhead, unfortunately the plain C qsort function is even slower
                # top_k_idx = np.argsort(this_item_weights) [-self.TopK:]

                # Sorting is done in three steps. Faster than plain np.argsort for higher number of items
                # because we avoid sorting elements we already know we don't care about
                # - Partition the data to extract the set of TopK items, this set is unsorted
                # - Sort only the TopK items, discarding the rest
                # - Get the original item index

                # Negated so that ascending order puts the largest weights first
                this_item_weights_np = - np.array(this_item_weights)
                
                # Get the unordered set of topK items
                top_k_partition = np.argpartition(this_item_weights_np, self.TopK-1)[0:self.TopK]
                # Sort only the elements in the partition
                top_k_partition_sorting = np.argsort(this_item_weights_np[top_k_partition])
                # Get original index
                top_k_idx = top_k_partition[top_k_partition_sorting]



                # Incrementally build sparse matrix
                for innerItemIndex in range(len(top_k_idx)):

                    topKItemIndex = top_k_idx[innerItemIndex]

                    # Entry (neighbour item, current item) = weight
                    values[sparse_data_pointer] = this_item_weights[topKItemIndex]
                    rows[sparse_data_pointer] = topKItemIndex
                    cols[sparse_data_pointer] = itemIndex

                    sparse_data_pointer += 1


        if self.TopK == 0:

            return np.array(self.W_dense)

        else:

            # Keep only the part of the preallocated buffers that was filled
            values = np.array(values[0:sparse_data_pointer])
            rows = np.array(rows[0:sparse_data_pointer])
            cols = np.array(cols[0:sparse_data_pointer])

            W_sparse = sps.csr_matrix((values, (rows, cols)),
                                    shape=(self.n_items, self.n_items),
                                    dtype=np.float32)

            return W_sparse

In [6]:
class BasicCFKNNRecommender(RecommenderSystem):
    """Item-based collaborative-filtering KNN recommender.

    ``fit`` builds an item-item weight matrix ``self.W`` from the user-item
    matrix and the estimated scores ``self.UIM_estm = UIM . W``;
    ``recommend`` turns those scores into track recommendations for the
    target playlists.
    """

    def __str__(self):
        return "ItemKNN(similarity={},k={},shrinkage={})".format(self.similarity_name, self.k, self.shrinkage)

    def apply_shrinkage(self, X, dist):
        """Damp item-item weights that are supported by few common users.

        Every entry ``dist[i, j]`` is multiplied by
        ``c_ij / (c_ij + self.shrinkage)``, where ``c_ij`` is the number of
        rows of ``X`` that contain both item ``i`` and item ``j``. The
        diagonal co-counts are removed first, so diagonal entries of
        ``dist`` end up as zero.

        Parameters
        ----------
        X : scipy.sparse matrix
            User-item matrix (users x items).
        dist : scipy.sparse matrix
            Item-item weights (items x items).

        Returns
        -------
        scipy.sparse matrix
            The shrunk item-item weights (a new matrix).
        """
        # create an "indicator" version of X (i.e. replace values in X with ones)
        X_ind = X.copy()
        X_ind.data = np.ones_like(X_ind.data)
        # compute the co-rated counts between ITEMS: (items x users) . (users x items)
        co_counts = X_ind.T.dot(X_ind).tocsr()
        # remove the diagonal
        co_counts = (co_counts - sps.diags(co_counts.diagonal())).tocsr()
        co_counts.eliminate_zeros()
        # compute the shrinkage factor as co_counts_ij / (co_counts_ij + shrinkage)
        shrink_factor = co_counts
        shrink_factor.data = shrink_factor.data / (shrink_factor.data + self.shrinkage)
        # element-wise product aligns the two matrices by (row, col); their
        # sparsity patterns differ, so the raw .data arrays cannot be
        # multiplied with each other
        return dist.multiply(shrink_factor)

    def fit(self, shrinkage=100, similarity='cosine', k=100, remove_diag = False):
        """Compute ``self.W`` and the estimated user-item scores ``self.UIM_estm``.

        Parameters
        ----------
        shrinkage : float
            Shrinkage term; values ``<= 0`` skip ``apply_shrinkage``.
        similarity : str
            Only ``'cosine'`` is implemented.
        k : int
            Number of entries kept per item, handed to ``Cosine_Similarity``.
        remove_diag : bool
            When true the diagonal of the weight matrix is zeroed.

        Raises
        ------
        NotImplementedError
            For any other ``similarity``.
        """
        self.shrinkage = shrinkage
        self.similarity_name = similarity
        self.k = k
        self.create_uim(sparse_mode = 'csr')

        if similarity == 'cosine':
            self.distance = Cosine_Similarity(self.UIM, k)
        else:
            raise NotImplementedError('Distance {} not implemented'.format(similarity))

        start_time = time.time()

        item_weights = self.distance.compute_similarity()

        print("Similarity computed in {:.2f} seconds".format(time.time()-start_time))

        # zero out diagonal values
        if remove_diag:
            # np.newaxis instead of the scipy alias, which is not part of current SciPy's namespace
            item_weights = item_weights - sps.dia_matrix((item_weights.diagonal()[np.newaxis, :], [0]), shape=item_weights.shape)
            print("Removed diagonal")

        # and apply the shrinkage
        if self.shrinkage > 0:
            item_weights = self.apply_shrinkage(self.UIM, item_weights)
            print("Applied shrinkage") 

        item_weights = check_matrix(item_weights, 'csr') # nearly 10 times faster
        print("Converted to csr")

        self.W = item_weights
        self.UIM_estm = self.UIM.dot(self.W)

        print('UIM_estm calculated')

    def recommend(self, at=5):
        """Fill ``self.df_target['recommend']`` with up to ``at`` track ids per playlist.

        For every target playlist the estimated scores are read from
        ``self.UIM_estm``, tracks already in the playlist and tracks outside
        the target track set are discarded, and the ids of the highest
        scoring remaining tracks are appended to that playlist's
        ``recommend`` list. The elapsed time is printed at the end.
        """
        self.target_structure()
        start_time = time.time()

        # index -> id lookup restricted to the target tracks; it does not
        # depend on the playlist, so it is built once
        is_target = self.df_track_id_unique['track_id'].isin(self.list_target_tracks)
        target_lookup = self.df_track_id_unique[is_target]
        n_wanted = max(at, 0)

        for index, row in self.df_target.iterrows():          
            # get row from UIM_estm as a (n_tracks, 1) column
            estm = pd.DataFrame(self.UIM_estm[row['index_playlist'],:].T.toarray())
            estm.reset_index(level=0, inplace=True)
            estm.columns = ['index_track','pred']
            # filter tracks which are already in the playlist, so they can't be recommended
            estm = estm[~estm["index_track"].isin(row['index_track'])]
            # translate track index back to track_id, keeping target tracks only
            estm = estm.merge(target_lookup, how='inner', on='index_track')
            estm = estm.sort_values('pred',ascending=False)
            # the list stored in the frame is shared with ``row``, so extending
            # it here updates self.df_target
            row['recommend'].extend(int(track_id) for track_id in estm['track_id'].head(n_wanted))
        print("--- %s minutes ---" % ((time.time() - start_time)/60))
        
def check_matrix(X, format='csc', dtype=np.float32):
    """Return ``X`` cast to ``dtype``, converted to sparse ``format`` if needed.

    Recognised formats are ``csc``, ``csr``, ``coo``, ``dok``, ``bsr``,
    ``dia`` and ``lil``. When ``X`` already has the requested format, or the
    format name is not recognised, only the dtype cast is applied.
    """
    known_formats = {
        'csc': (sps.csc_matrix, 'tocsc'),
        'csr': (sps.csr_matrix, 'tocsr'),
        'coo': (sps.coo_matrix, 'tocoo'),
        'dok': (sps.dok_matrix, 'todok'),
        'bsr': (sps.bsr_matrix, 'tobsr'),
        'dia': (sps.dia_matrix, 'todia'),
        'lil': (sps.lil_matrix, 'tolil'),
    }
    target = known_formats.get(format)
    if target is not None:
        matrix_class, converter_name = target
        if not isinstance(X, matrix_class):
            converted = getattr(X, converter_name)()
            return converted.astype(dtype)
    return X.astype(dtype)

In [7]:
# Instantiate the recommender. The inherited constructor reads the four CSV
# inputs from their default ../input paths and prints a summary of what it loaded.
rs = BasicCFKNNRecommender()

Number of interactions (numInteractions): 1040522


Track_id translated to indexes (df_track_id_unique): 
   index_track  track_id
0            0   1048594
1            1   2359314
2            2   1835030
3            3   3670041
4            4   1048604


Playlist_id translated to indexes (df_playlist_id_unique): 
   index_playlist  playlist_id
0               0     10485762
1               1      5767174
2               2      7077894
3               3     11534344
4               4      1179658


Size of df_target_playlists: (10000, 2)
Size of df_target_tracks file: (32194, 2)
Size of list_target_tracks file: 32194


Number of Playlists: 45649
Number of Tracks: 99999


Interactions-file with IDs translated to indexes (saved in df_interactions): 
     playlist_id  track_id  interaction  index_playlist  index_track
0           7569    162463          1.0            2425        62358
87          7569    421750          1.0            2425        60999
116         7569    795606       

In [8]:
# Build the item-item weights keeping 200 entries per item and the estimated
# UIM (rs.W, rs.UIM_estm); shrinkage = 0 skips the shrinkage step.
rs.fit(k = 200, shrinkage = 0)

Similarity item 10000 ( 10 % ), 1809.67 item/sec, required time 0.83 min
Similarity item 20000 ( 20 % ), 1820.73 item/sec, required time 0.73 min
Similarity item 30000 ( 30 % ), 1833.97 item/sec, required time 0.64 min
Similarity item 40000 ( 40 % ), 1821.53 item/sec, required time 0.55 min
Similarity item 50000 ( 50 % ), 1818.73 item/sec, required time 0.46 min
Similarity item 60000 ( 60 % ), 1826.39 item/sec, required time 0.37 min
Similarity item 70000 ( 70 % ), 1796.24 item/sec, required time 0.28 min
Similarity item 80000 ( 80 % ), 1800.61 item/sec, required time 0.19 min
Similarity item 90000 ( 90 % ), 1802.65 item/sec, required time 0.09 min
Similarity item 99999 ( 100 % ), 1805.16 item/sec, required time 0.00 min
Similarity computed in 64.05 seconds
Converted to csr
UIM_estm calculated


In [9]:
rs.UIM

<45649x99999 sparse matrix of type '<class 'numpy.float64'>'
	with 1040522 stored elements in Compressed Sparse Row format>

In [10]:
# calculate full item similarity matrix from UIM
# normalize UIM first: axis=0 scales every column (track) to unit L2 norm,
# so the dot product of two columns is their cosine similarity
from sklearn.preprocessing import normalize
cf_UIM_norm = normalize(rs.UIM, norm='l2', axis=0)

In [11]:
cf_UIM_norm[:10,:10].todense()

matrix([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [12]:
rs.UIM[:10,:10].todense()

matrix([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [13]:
print(rs.UIM[1,:].max())
print(cf_UIM_norm[1,:].max())

1.0
0.57735026919


In [14]:
# Full (tracks x tracks) cosine similarity between the normalised track
# columns; no top-k pruning is applied here.
S_cf = cf_UIM_norm.T.dot(cf_UIM_norm)

In [15]:
S_cf[:10,:10].todense()

matrix([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]])

In [19]:
S_cf.max()
#print(rs.W.max())

1.0000000000000069

In [20]:
rs.W[:10,:10].todense()

matrix([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]], dtype=float32)

In [21]:
S_cf

<99999x99999 sparse matrix of type '<class 'numpy.float64'>'
	with 60319471 stored elements in Compressed Sparse Row format>

In [22]:
# Estimated scores from the full similarity matrix: (playlists x tracks) . (tracks x tracks)
UIM_estm = rs.UIM.dot(S_cf)

In [23]:
UIM_estm[:10,:10].todense()

matrix([[ 0.10063697,  0.09161872,  0.        ,  0.        ,  0.14433757,
          0.19692853,  0.07302967,  0.        ,  0.31310148,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.39027346,  0.        ,
          0.        ,  0.        ,  0.        ,  0.04287465,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.09534626,  0.        ,  0.        ],
        [ 0.04116935,  0.74914235,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.12844645,  0.        ,  0.        ],
        [ 0.03028913,  0.01333926,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.01446846,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.       

In [34]:
UIM_estm[1,:].nonzero()[1]

array([99599, 99508, 99411, ...,   271,    96,    63], dtype=int32)

In [35]:
rs.UIM_estm[1,:].nonzero()[1]

array([99950, 99599, 99508, ...,   502,   357,    96], dtype=int32)

In [24]:
rs.UIM_estm[:10,:10].todense()

matrix([[  1.,   0.,   0.,   0.,   1.,   1.,   2.,   0.,   5.,   0.],
        [  0.,   0.,   0.,   2.,   0.,   0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   1.,   0.,   0.],
        [  0.,  16.,   0.,   0.,   0.,   0.,   0.,   2.,   0.,   0.],
        [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
        [  0.,   1.,   0.,   0.,   0.,   0.,   2.,   1.,   5.,   1.],
        [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
        [  0.,   4.,   0.,   0.,   0.,   2.,   3.,   0.,   8.,   2.]])

In [36]:
# Replace the estimate produced by rs.fit with the one built from the full
# similarity matrix S_cf, so that rs.recommend() reads these scores.
rs.UIM_estm = UIM_estm

In [37]:
# Fill rs.df_target['recommend'] with the default of 5 tracks per target playlist
rs.recommend()

Data structure for final prediction was created (df_target): 
   playlist_id                                           track_id  \
0         7614  [415173, 1384962, 1609224, 1614974, 1714787, 2...   
1         7692  [88210, 266898, 280844, 302730, 384386, 551534...   
2         7816  [126414, 245217, 513821, 611201, 767305, 84510...   
3         8225  [13881, 261448, 311923, 500672, 676393, 906185...   
4         8337  [451881, 1157460, 1205536, 1210884, 3131838, 3...   

                                         index_track  index_playlist recommend  
0  [58038, 27294, 13601, 15634, 53615, 16590, 739...            2447        []  
1  [32883, 1416, 6262, 15120, 46326, 9696, 41315,...            2478        []  
2  [47911, 94663, 96789, 32334, 93906, 21551, 971...            2519        []  
3  [4523, 99812, 18243, 92598, 57550, 45423, 5806...            2679        []  
4          [74517, 40765, 59706, 62219, 95577, 7177]            2715        []  
(10000, 5)
--- 6.214323882261912 minu

In [39]:
# Convert list to string with spaces between track_ids
# NOTE: this overwrites the column in place, so the cell is not idempotent -
# running it a second time would join the characters of the string instead.
rs.df_target['recommend'] = rs.df_target['recommend'].apply(lambda x: " ".join(map(str, x)))

In [40]:
# rename columns for submission: keep only the playlist id and its
# recommendation string, exposed as 'playlist_id' and 'track_ids'
final = rs.df_target[['playlist_id','recommend']]
final.columns = ['playlist_id','track_ids']

In [41]:
print(final.head())

   playlist_id                                track_ids
0         7614  1401050 2556767 2828149 2510128 3319136
1         7692  2537327 2289940 2610496 3545635 3422265
2         7816   2762406 2097922 2542321 544173 1882174
3         8225   2579225 2828149 2141167 112767 3687848
4         8337   1156143 2866519 3615514 3167821 656735


In [42]:
# export file: write the submission as CSV without the DataFrame index
final.to_csv('../submission/008_cf_kFULL.csv', index=False)