In [211]:
# By Miguel Maricalva and Philip Claesson

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.sparse as sps
%matplotlib inline

# Interactions used for training: one (playlist_id, track_id) pair per row, tab-separated.
train_final = pd.read_csv('input/train_final.csv', sep='\t')

# Supplementary information about the items (tracks), tab-separated.
tracks_final = pd.read_csv('input/tracks_final.csv', sep='\t')

# Supplementary information about the users (playlists), tab-separated.
playlists_final = pd.read_csv('input/playlists_final.csv', sep='\t')

# Playlists that must receive recommendations.
target_playlists = pd.read_csv('input/target_playlists.csv')

# Tracks that are eligible to be recommended.
target_tracks = pd.read_csv('input/target_tracks.csv')

# Preview the training interactions.
train_final.head()

Unnamed: 0,playlist_id,track_id
0,3271849,2801526
1,5616275,727878
2,11267488,2805283
3,10103900,1515105
4,3836898,2945623


In [212]:
# Preview the first rows of the track metadata loaded from tracks_final.csv.
tracks_final.head()

Unnamed: 0,track_id,artist_id,duration,playcount,album,tags
0,2972914,144,224000,49.0,[7],"[54087, 1757, 1718, 116712, 189631]"
1,2750239,246,157000,1.0,[8],"[189631, 3424, 177424, 46208, 205245]"
2,1550729,144,217000,554.0,[9],"[54087, 109806, 46869, 183258, 54337]"
3,2169950,144,207000,200.0,[9],"[54087, 70618, 207003, 109806, 116712]"
4,1903709,144,198000,5.0,[None],"[54087, 81223, 116712, 215342, 71028]"


In [213]:
# Give every track an `occurence_count` column, initialised to 0.
tracks_final = tracks_final.assign(occurence_count=0)

tracks_final.head()



Unnamed: 0,track_id,artist_id,duration,playcount,album,tags,occurence_count
0,2972914,144,224000,49.0,[7],"[54087, 1757, 1718, 116712, 189631]",0
1,2750239,246,157000,1.0,[8],"[189631, 3424, 177424, 46208, 205245]",0
2,1550729,144,217000,554.0,[9],"[54087, 109806, 46869, 183258, 54337]",0
3,2169950,144,207000,200.0,[9],"[54087, 70618, 207003, 109806, 116712]",0
4,1903709,144,198000,5.0,[None],"[54087, 81223, 116712, 215342, 71028]",0


In [214]:
# Popularity table: for each track, the number of distinct playlists that
# contain it, with the most frequent tracks first.
popularity = (
    train_final
    .groupby(by="track_id")["playlist_id"]
    .nunique()                      # distinct playlists per track
    .rename('occurences')           # name the count column
    .reset_index()                  # track_id back to a regular column
    .sort_values('occurences', ascending=False)
)

# Optional: plot the distribution of the occurrence counts.
#sns.distplot(popularity['occurences'], bins = 100);

In [215]:
# Let's have a look at the playlist data: summary statistics per column.
playlists_final.describe()

Unnamed: 0,created_at,playlist_id,numtracks,duration,owner
count,57561.0,57561.0,57561.0,57561.0,57561.0
mean,1272374000.0,6505517.0,30.738816,8079.101075,27341.217022
std,53712200.0,3049124.0,42.731612,11852.823672,14431.376628
min,1169657000.0,7569.0,0.0,0.0,3.0
25%,1232933000.0,4218783.0,5.0,1260.0,14717.0
50%,1267043000.0,6596396.0,13.0,3359.0,29811.0
75%,1307556000.0,8650848.0,36.0,9224.0,41162.0
max,1423787000.0,11766360.0,200.0,439332.0,45169.0


In [None]:
# Create a playlist/track matrix where every row describes one playlist:
# column 0 holds the playlist_id and columns 1..max_tracks_per_playlist hold
# the track_ids of that playlist in the order they appear in train_final.
# Unused slots stay 0.

# 200 is the largest `numtracks` value reported by playlists_final.describe().
max_tracks_per_playlist = 200

playlist_ids = playlists_final['playlist_id'].values

playlist_item_matrix = np.zeros((playlist_ids.size, max_tracks_per_playlist + 1), dtype = int)
playlist_item_matrix[:, 0] = playlist_ids

# Next free column for every row; starts at 1 because column 0 is the playlist_id.
playlist_index_counter = np.ones(playlist_ids.size, dtype = int)

# playlist_id -> row index, built once. Looking the row up with np.where on the
# whole matrix for every interaction is quadratic and can also hit a cell that
# holds a track_id with the same numeric value as the playlist_id.
# setdefault keeps the first row if a playlist_id were listed more than once.
playlist_row_by_id = {}
for row_index, known_playlist_id in enumerate(playlist_ids.tolist()):
    playlist_row_by_id.setdefault(known_playlist_id, row_index)

# Select the columns by name instead of relying on their position.
# (.values works on old and new pandas; as_matrix() was removed in pandas 1.0.)
train_final_np = train_final[['playlist_id', 'track_id']].values

for playlist_id, track_id in train_final_np.tolist():
    if playlist_id not in playlist_row_by_id:
        raise ValueError("playlist_id %s from train_final is missing in playlists_final" % playlist_id)
    playlist_index = playlist_row_by_id[playlist_id]

    next_column = playlist_index_counter[playlist_index]
    if next_column > max_tracks_per_playlist:
        raise ValueError("playlist_id %s has more than %d tracks" % (playlist_id, max_tracks_per_playlist))

    # Store the track in the next free slot of this playlist's row.
    playlist_item_matrix[playlist_index, next_column] = track_id
    playlist_index_counter[playlist_index] += 1

#convert to a sparse matrix
sparse_playlist_item_matrix = sps.csr_matrix(playlist_item_matrix)

#Save as a compressed file. 
sps.save_npz("sparse_playlist_item_matrix", sparse_playlist_item_matrix)


print(pd.DataFrame(playlist_item_matrix))

In [157]:
def remove_seen_items(recommended_items, playlist_id):
    """Drop the candidate tracks that already occur in a playlist.

    Parameters
    ----------
    recommended_items : pandas.DataFrame
        Candidate recommendations with a 'track_id' column, e.g. a slice of
        the `popularity` table. Row order is preserved.
    playlist_id : int
        Playlist whose known tracks (looked up in the global `train_final`
        interactions) must be removed from the candidates.

    Returns
    -------
    pandas.DataFrame
        The rows of `recommended_items` whose track is not in the playlist.
        It can be shorter than the input, so callers that need n results
        should pass more than n candidates.
    """
    # Only the track ids of this playlist count as "seen". Comparing against
    # the whole interaction frame would also treat the playlist_id values as
    # track ids. This step could be made more efficient.
    seen_track_ids = train_final.loc[train_final['playlist_id'] == playlist_id, 'track_id']

    # True = item not seen yet, False = item already in the playlist.
    # assume_unique is left at its default: a playlist may list a track twice.
    unseen_item_mask = np.isin(recommended_items['track_id'], seen_track_ids, invert = True)

    # Filter the candidates that were passed in, not the global popularity table.
    unseen_items = recommended_items[unseen_item_mask]

    return unseen_items
    

In [164]:
def recommend(playlist_id, n, remove_seen = False): 
    """Return n top-popular track recommendations for a playlist.

    Parameters
    ----------
    playlist_id : int
        Playlist to recommend for.
    n : int
        Number of recommendations wanted.
    remove_seen : bool, default False
        When True, tracks already in the playlist are filtered out through
        `remove_seen_items` before the top n are taken.

    Returns
    -------
    numpy.ndarray
        Up to n track ids taken from the global `popularity` table, which is
        sorted by descending occurrence count.
    """
    # Test the flag itself. The function object `remove_seen_items` is always
    # truthy, so testing it made remove_seen=False impossible to honour.
    if remove_seen:
        # Pass the full ranking rather than a fixed top-20 window, so that n
        # unseen tracks remain available whatever n and the playlist size are.
        unseen_items = remove_seen_items(popularity, playlist_id)

        # .values works on old and new pandas; as_matrix() was removed in pandas 1.0.
        return unseen_items.iloc[0:n]['track_id'].values

    # Simple top popular recommender system
    return popularity.iloc[0:n]['track_id'].values


In [165]:

### Calling the recommend function and filling the results into a DataFrame. ###
### This part should not be changed ##

# One row per target playlist. len() counts rows, whereas DataFrame.size is
# rows * columns and would create extra all-zero rows if target_playlists
# had more than one column.
n_target_playlists = len(target_playlists)

# Empty frame: the playlist id followed by five recommendation slots.
recommendations = pd.DataFrame(
    np.zeros((n_target_playlists, 6), dtype = int),
    columns = ['playlist_id', 1, 2, 3, 4, 5],
)

#Fill the recommendations matrix through calling the recommend-function
for counter, playlist_id in enumerate(target_playlists['playlist_id']):
    #Add the playlist ids as first col
    recommendations.iloc[counter, 0] = playlist_id

    #Fill the recommendations to col 1-5 for each playlist
    recommendations.iloc[counter, 1:6] = recommend(playlist_id, 5, True)

recommendations





Unnamed: 0,playlist_id,1,2,3,4,5
0,10024884,1563309,1363985,3705881,1595978,3166665
1,10624787,1563309,1363985,3705881,1595978,3166665
2,4891851,1563309,1363985,3705881,1595978,3166665
3,4267369,1563309,1363985,3705881,1595978,3166665
4,65078,1563309,1363985,3705881,1595978,3166665
5,10637124,1563309,1363985,3705881,1595978,3166665
6,3223162,1563309,1363985,3705881,1595978,3166665
7,7541503,1563309,1363985,3705881,1595978,3166665
8,6189367,1563309,1363985,3705881,1595978,3166665
9,8459943,1563309,1363985,3705881,1595978,3166665


In [156]:
def save_to_file():
    """Write the global `recommendations` frame to recommendations.csv.

    The file starts with the header line ``playlist_id,track_ids``. Every
    following line is the playlist id, a comma, and the five recommended
    track ids separated by single spaces.
    """
    # comments='' is required: by default np.savetxt prefixes the header with
    # "# ", which would write "# playlist_id,track_ids" as the first line.
    np.savetxt(
        "recommendations.csv",
        recommendations,
        fmt = '%s,%s %s %s %s %s',
        header = "playlist_id,track_ids",
        comments = '',
        newline = "\n",
    )
    
    
def test():
    """Placeholder for an evaluation routine; currently only prints a label."""
    # Do something
    print("Result: ")


# Write the current `recommendations` frame to recommendations.csv.
save_to_file()