In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.sparse as sps
%matplotlib inline

#train_final.csv - the training set of interactions
train_final = pd.read_csv('input/train_final.csv', delimiter = "\t");

#tracks_final.csv - supplementary information about the items
tracks_final = pd.read_csv('input/tracks_final.csv', delimiter = "\t");

#playlists_final.csv - supplementary information about the users
playlists_final = pd.read_csv('input/playlists_final.csv', delimiter = "\t");

#target_playlists.csv - the set of target playlists that will receive recommendations
target_playlists = pd.read_csv('input/target_playlists.csv');

#target_tracks.csv - the set of target items (tracks) to be recommended
target_tracks = pd.read_csv('input/target_tracks.csv');


(41756, 2)

In [3]:
#Now we want to remove some redundant stuff. 

#We will remove all songs which are not occurring more than 10 times in train_final
#Nevertheless, we still want to keep all tracks which are in the target tracks.  

popularity = train_final.groupby(by="track_id").playlist_id.nunique().to_frame()

#remove index name
popularity.reset_index(level = 0, inplace = True)

#Rename the columns
popularity.columns = ['track_id','occurrences']

#Remove all targeted tracks - TESTED, working as expected
tracks_relevant = popularity[~popularity['track_id'].isin(target_tracks['track_id'])]

#Remove tracks occurring less than 10 times
tracks_relevant = tracks_relevant[tracks_relevant['occurrences'] > 10]

#Add the targeteted tracks back again
tracks_relevant = pd.concat([tracks_relevant, target_tracks])

tracks_relevant.shape


(41756, 2)

In [4]:
#Now lets take a look at the tags.
tracks_final.head()

Unnamed: 0,track_id,artist_id,duration,playcount,album,tags
0,2972914,144,224000,49.0,[7],"[54087, 1757, 1718, 116712, 189631]"
1,2750239,246,157000,1.0,[8],"[189631, 3424, 177424, 46208, 205245]"
2,1550729,144,217000,554.0,[9],"[54087, 109806, 46869, 183258, 54337]"
3,2169950,144,207000,200.0,[9],"[54087, 70618, 207003, 109806, 116712]"
4,1903709,144,198000,5.0,[None],"[54087, 81223, 116712, 215342, 71028]"


In [5]:
#Lets translate the tags to indexes. 
tracks_final['tags'].head()


tags_indexes = {}
counter = 1; #We will start at 1, reserving col 0 for indexes. 
for row in tracks_final['tags']:
    tags = row.strip('[ ]').split(', ')
    for tag in tags:
        if len(tag) > 0: 
            tag = int(tag)
            if not(tag in tags_indexes):
                tags_indexes[tag] = counter #Lets make it int to make it easier
                counter += 1;

print(len(tags_indexes))

31900


In [6]:
#If we translate each track_id to a track_index which will serve as matrix index, we can save a lot of time. 


#We need a way to get from track_id to index in O(1).
#Let's create a dictionary

tracks_indexes = {}
index_to_item = {}
counter = 1; #We will start at 1, reserving col 0 for indexes.  
for track_id in tracks_relevant['track_id']:
    tracks_indexes[track_id] = counter
    counter += 1;
    
    
print("We have {} unique tracks with {} unique tags".format(len(tracks_indexes), len(tags_indexes)))


We have 41756 unique tracks with 31900 unique tags


In [7]:
#Now we can create an Item Content Matrix. 

ICM_all = np.zeros((len(tracks_indexes)+1, len(tags_indexes)+1), int)
print(ICM_all.shape)



(41757, 31901)


In [8]:
#tracks_relevant.reset_index(level = 0, inplace = True)

tracks = pd.merge(tracks_relevant, tracks_final, how='inner', on='track_id')

print(tracks.head())

tracks = tracks.as_matrix()

   occurrences  track_id  artist_id  duration  playcount     album  \
0         18.0       360     169649    194000      522.0   [77662]   
1         11.0      1376     381303    270000      629.0  [180587]   
2         12.0      2623      36019    329000     7081.0   [17295]   
3         15.0      2891     310373    194000      196.0  [142650]   
4         14.0      2901     163720    293000      322.0   [74384]   

                                      tags  
0      [70618, 70251, 70625, 25307, 11056]  
1  [205245, 254105, 11056, 209598, 189631]  
2    [122769, 23214, 48976, 117167, 90254]  
3     [189631, 54087, 70618, 94794, 61837]  
4     [193464, 205245, 46208, 92324, 3982]  


In [9]:
counter = 1
for tag in tags_indexes:
    ICM_all[0,counter] = tag
    counter += 1
    
counter = 1
for track_id in tracks_indexes: 
    ICM_all[counter, 0] = track_id
    counter += 1
    
print(ICM_all)

[[      0   54087    1757 ...,   87957  275594  276430]
 [    360       0       0 ...,       0       0       0]
 [   1376       0       0 ...,       0       0       0]
 ..., 
 [2739213       0       0 ...,       0       0       0]
 [2228646       0       0 ...,       0       0       0]
 [2265463       0       0 ...,       0       0       0]]


In [12]:
#We will put the track_id/tag in the [0]-column.

#So let's fill it with our data.


for row in tracks: 
    track_id = row[1]
    tags = row[6].strip('[ ]').split(', ')
    for tag in tags:
        if tag == '': 
            print(row)
        tag_index = tags_indexes[int(tag)]
        track_index = tracks_indexes[track_id]
        ICM_all[track_index,tag_index] = 1
        #print("Added tag {} to track {}".format(tag, track_id))
            

print(ICM_all)

ValueError: invalid literal for int() with base 10: ''

In [176]:
ICM_all_sparse = sps.csc_matrix(ICM_all)

In [178]:
ICM_all_sparse

<41757x31901 sparse matrix of type '<class 'numpy.int64'>'
	with 74121 stored elements in Compressed Sparse Column format>

In [180]:
#Lets take a look at the data. 

features_per_item = (ICM_all_sparse > 0).sum(axis=1)
items_per_feature = (ICM_all_sparse > 0).sum(axis=0)

features_per_item = np.sort(features_per_item)
items_per_feature = np.sort(items_per_feature)

print(features_per_item.shape)
print(items_per_feature.shape)

(41757, 1)
(1, 31901)


In [None]:
#Let's Split the Data and create evluation functions. 

train_test_split = 0.80

numInteractions = URM_all.nnz


train_mask = np.random.choice([True,False], numInteractions, [train_test_split, 1-train_test_split])

userList = np.array(userList)
itemList = np.array(itemList)
ratingList = np.array(ratingList)


URM_train = sps.coo_matrix((ratingList[train_mask], (userList[train_mask], itemList[train_mask])))
URM_train = URM_train.tocsr()

test_mask = np.logical_not(train_mask)

URM_test = sps.coo_matrix((ratingList[test_mask], (userList[test_mask], itemList[test_mask])))
URM_test = URM_test.tocsr()

def precision(recommended_items, relevant_items):
    
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    
    precision_score = np.sum(is_relevant, dtype=np.float32) / len(is_relevant)
    
    return precision_score

def recall(recommended_items, relevant_items):
    
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    
    recall_score = np.sum(is_relevant, dtype=np.float32) / relevant_items.shape[0]
    
    return recall_score

def MAP(recommended_items, relevant_items):
   
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    
    # Cumulative sum: precision at 1, at 2, at 3 ...
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))
    
    map_score = np.sum(p_at_k) / np.min([relevant_items.shape[0], is_relevant.shape[0]])

    return map_score

def evaluate_algorithm(URM_test, recommender_object, at=5):
    
    cumulative_precision = 0.0
    cumulative_recall = 0.0
    cumulative_MAP = 0.0
    
    num_eval = 0


    for i,user_id in  enumerate(userList_unique):
        
        if i % 500 == 0:
            print("User %d of %d" % (i, len(userList_unique)))

        relevant_items = URM_test[user_id].indices
        
        if len(relevant_items)>0:
            
            recommended_items = recommender_object.recommend(user_id, at=at)
            num_eval+=1

            cumulative_precision += precision(recommended_items, relevant_items)
            cumulative_recall += recall(recommended_items, relevant_items)
            cumulative_MAP += MAP(recommended_items, relevant_items)


    cumulative_precision /= num_eval
    cumulative_recall /= num_eval
    cumulative_MAP /= num_eval
    
    print("Recommender performance is: Precision = {:.4f}, Recall = {:.4f}, MAP = {:.4f}".format(
        cumulative_precision, cumulative_recall, cumulative_MAP))

