In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.sparse as sps
%matplotlib inline

#train_final.csv - the training set of interactions
train_final = pd.read_csv('input/train_final.csv', delimiter = "\t");

#tracks_final.csv - supplementary information about the items
tracks_final = pd.read_csv('input/tracks_final.csv', delimiter = "\t");

#playlists_final.csv - supplementary information about the users
playlists_final = pd.read_csv('input/playlists_final.csv', delimiter = "\t");

#target_playlists.csv - the set of target playlists that will receive recommendations
target_playlists = pd.read_csv('input/target_playlists.csv');

#target_tracks.csv - the set of target items (tracks) to be recommended
target_tracks = pd.read_csv('input/target_tracks.csv');

#Let's have a look at the train data. 
train_final.head()

Unnamed: 0,playlist_id,track_id
0,3271849,2801526
1,5616275,727878
2,11267488,2805283
3,10103900,1515105
4,3836898,2945623


In [3]:
#Now we need to remove some redundant stuff. 

#We will remove all song which are not: 1. occurring more than 10 times in train_final and 2. not in the target_tracks. 

popularity = train_final.groupby(by="track_id").playlist_id.nunique().to_frame()

#remove index name
popularity.reset_index(level = 0, inplace = True)

#Rename the columns
popularity.columns = ['track_id','occurrences']

#Remove all targeted tracks - TESTED, working as expected
tracks_relevant = popularity[~popularity['track_id'].isin(target_tracks['track_id'])]

#Remove tracks occurring less than 10 times
tracks_relevant = tracks_relevant[tracks_relevant['occurrences'] > 10]

#Add the targeteted tracks back again
tracks_relevant = pd.concat([tracks_relevant, target_tracks])

tracks_relevant.shape


(41756, 2)

In [4]:
#We will remove all playlists which are not: 1. containing more than 5 tracks and 2. not in the target_playlists.

playlists_sizes = train_final.groupby(by="playlist_id").track_id.nunique().to_frame()

#remove index name
playlists_sizes.reset_index(level = 0, inplace = True)

#Rename the columns
playlists_sizes.columns = ['playlist_id','size']

print(playlists_sizes.shape)

#Remove all targeted playlists TESTED works
playlists_relevant = playlists_sizes[~playlists_sizes['playlist_id'].isin(target_playlists['playlist_id'])]

#Remove playlists of size less than 10
playlists_relevant = playlists_relevant[playlists_relevant['size'] > 10]

#Add the targeteted playlists back again
playlists_relevant = pd.concat([playlists_relevant, target_playlists])

print(playlists_relevant.shape)


#WORKING! 

(45649, 2)
(23618, 2)


In [5]:
#Now we have to create a set of the relevant train data. 


print(train_final.shape)

train_relevant = train_final[train_final['track_id'].isin(tracks_relevant['track_id'])]

print(train_relevant.shape)

train_relevant = train_relevant[train_final['playlist_id'].isin(playlists_relevant['playlist_id'])]

print(train_relevant.shape)


(1040522, 2)
(731373, 2)
(667033, 2)


  # Remove the CWD from sys.path while we load stuff.


In [41]:
item_playlist_matrix = np.zeros([playlists_relevant.shape[0], tracks_relevant.shape[0]],int) 

In [42]:
#Very large matrix filled with zeros.
#Old size before removing used to be 5.756.100.000
#New size: 986.193.208
item_playlist_matrix.size

986258583

In [44]:
#If we translate each track_id to a track_index which will serve as matrix index, we can save a lot of time. 
#Same goes for playlist_id --> playlist_index. 


#We need a way to get from track_id to index in O(1).
#Let's create a dictionary

track_indexes = {}
index_to_item = {}
counter = 0; 
for track_id in tracks_relevant['track_id']:
    item_playlist_matrix[0][counter] = track_id
    track_indexes[track_id] = counter
    counter += 1;
    
#and a way to get from playlist_id to index in O(1)


playlist_indexes = {}
index_to_playlist = {}
counter = 0; 
for playlist_id in playlists_relevant['playlist_id']:
    item_playlist_matrix[counter][0] = playlist_id
    playlist_indexes[playlist_id] = counter
    counter += 1;

#felsökning
#print(playlists_relevant[playlists_relevant['playlist_id']==1515105])
print("hej1")

hej1


In [45]:
#Lets build that matrix. 

interactions = train_relevant.as_matrix()
for row in interactions:
    #Lets get the info
    playlist_id = row[0]
    track_id = row[1]
    
    #Now lets get the proper indexes. 
    playlist_index = playlist_indexes[playlist_id]
    track_index = track_indexes[track_id]
    
    #And now lets add it to the matrix
    item_playlist_matrix[playlist_index][track_index] = 1
    

print(item_playlist_matrix)
print("hej2")

[[   7912    1376    2623 ..., 2228646 2265463       0]
 [   8268       0       0 ...,       0       0       0]
 [   8900       0       0 ...,       0       0       0]
 ..., 
 [7939535       0       0 ...,       0       0       0]
 [ 297021       0       0 ...,       0       0       0]
 [1502409       0       0 ...,       0       0       0]]
hej2


In [46]:
#Lets just extract the subset of the matrix that does not contain ids. 
print(item_playlist_matrix[1:,1:])

[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


In [47]:
#Now we have a item_playlist_matrix! Nice. lets save. 

sparse_matrix = sps.csr_matrix(item_playlist_matrix[1:,1:])

#sps.save_npz("sparse_item_playlist", sparse_matrix)

print(sparse_matrix.shape)
print("tje2")


(23618, 41756)
tje2


In [48]:
#If we multiply the matrix with its transposition, we will get an item similarity matrix. 


playlist_similarities = sparse_matrix.dot(sparse_matrix.transpose())
print(playlist_similarities)
print("lgt2")


  (0, 22915)	1
  (0, 22534)	1
  (0, 22415)	1
  (0, 22188)	1
  (0, 21644)	1
  (0, 21128)	1
  (0, 21109)	1
  (0, 20702)	1
  (0, 20290)	1
  (0, 20219)	1
  (0, 20162)	1
  (0, 20117)	1
  (0, 19755)	1
  (0, 19461)	1
  (0, 19185)	1
  (0, 18725)	1
  (0, 18162)	1
  (0, 17924)	1
  (0, 16332)	1
  (0, 14892)	1
  (0, 14586)	1
  (0, 14258)	1
  (0, 14191)	1
  (0, 13811)	1
  (0, 12855)	1
  :	:
  (23616, 3063)	1
  (23616, 2413)	1
  (23616, 1728)	5
  (23616, 799)	2
  (23616, 23616)	80
  (23616, 23507)	1
  (23616, 22621)	1
  (23616, 20823)	3
  (23616, 19421)	1
  (23616, 18593)	2
  (23616, 14495)	1
  (23616, 13701)	1
  (23616, 13015)	3
  (23616, 12460)	1
  (23616, 12003)	1
  (23616, 11615)	3
  (23616, 11220)	4
  (23616, 6310)	1
  (23616, 5075)	6
  (23616, 4405)	1
  (23616, 4118)	1
  (23616, 3842)	1
  (23616, 2388)	2
  (23616, 1866)	1
  (23616, 708)	1
lgt2


In [49]:
#Let's normalize the matrix
#playlist_similarities = playlist_similarities.multiply(1000/playlist_similarities.max())
print(playlist_similarities.shape)

#(23619, 23619)

(23618, 23618)


In [38]:
#Here we can get the similarities between two playlists.   
print(np.asarray(playlist_similarities.getrow(50).todense())[0][100])
print(np.asarray(playlist_similarities.getrow(100).todense())[0][50])

0.0
0.0


In [50]:
def playlist_similarity(playlist_id1, playlist_id2):    
    similarity = np.asarray(playlist_similarities.getrow(playlist_indexes[playlist_id1]).todense())[0][playlist_indexes[playlist_id2]]
    return similarity

In [65]:
#How many tracks do we want to work with? 
tracks = popularity.sort_values(by='occurrences', ascending=False)[:100]
#remove index name
tracks.reset_index(level = 0, inplace = True)
#Rename the columns
tracks.columns = ['relevance','track_id','occurrences']

In [76]:
def recommend(target_playlist_id, tracks = []):
    playlist_index = playlist_indexes[target_playlist_id]
    
    #Output vector
    recommendations = np.zeros([1,5], int)
    #Datastructure for relevance. 
    relevance = np.zeros([tracks.shape[0],1], float)
    

    
    track_counter = 0
    #For each song not in the playlist (we will start with top 100 popular)
    tracks = tracks.as_matrix()
    for track in tracks:
        sum = 0
        track_id = track[1]
        track_index = track_indexes[track_id]

        #Get all playlists containing this track. 
        playlists_with_track = (item_playlist_matrix[item_playlist_matrix.T[track_index][:]==1])

        playlists_with_track = playlists_with_track[:-1, :-1]
        
        #for each playlist containing the song
        for playlist in playlists_with_track:
            playlist_id = playlist[0]
            if(playlist_id > 1): #weird workaround... 
                sum += playlist_similarity(target_playlist_id, playlist_id)
        relevance[track_counter] = sum/track[2]  #Normalize. track[2] is the number of playlists containing the song. 
        track_counter += 1


        #relevance = sum/num of playlists containing the song
    found = 0
    while found < 5: 
        maxindex = np.argmax(relevance, axis = 0)[0]
        #print(tracks[maxindex][1])
        recommendations[0, found] = tracks[maxindex, 1]
        relevance[maxindex] *= -1
        found += 1
        
    #print(tracks)
    #print(relevance)

    return recommendations
print(recommend(10024884, tracks))


[[1193299 2158207 1209729  853629 2609171]]


#Get all playlists which contain a certain track:
playlists = (item_playlist_matrix[item_playlist_matrix.T[1666][:]==1])

In [78]:
#How many tracks do we want to work with? 
tracks = popularity.sort_values(by='occurrences', ascending=False)[:100]
#remove index name
tracks.reset_index(level = 0, inplace = True)
#Rename the columns
tracks.columns = ['relevance','track_id','occurrences']


###Callin Recommend function, filling it into a DataFrame. ###
###This part should not be changed ##

zeros = np.zeros((target_playlists.size, 6), dtype = int)

#Create empty dataframe
recommendations = pd.DataFrame(zeros)

#Rename the first col
recommendations.columns = ['playlist_id', 1, 2, 3, 4, 5]

#recommendations.iloc[:, 0] = target_playlists['playlist_id']


print(target_playlists[1:5]['playlist_id'])

import time
starttime = time.time()

#Fill the recommendations matrix through calling the recommend-function
counter = 0; 
for playlist_id in target_playlists[1:10]['playlist_id']:
    #Add the playlist ids as first col
    recommendations.iloc[counter, 0] = playlist_id
    #print(playlist_id)
    #Fill the recommendations to col 1-5 for each playlist
    recommendations.iloc[counter, 1:6] = recommend(playlist_id, tracks)
    counter += 1

#print(recommendations)
runtime = time.time()-starttime

hours = 1000*runtime/3600

print("Ten recommendations took ",runtime," seconds. 10000 would take ", hours, " hours... ")

def save_to_file():
    #Saves the recommendations dataframe to the .csv-file. 
    np.savetxt("recommendations.csv",recommendations, fmt = '%s,%s %s %s %s %s', header = "playlist_id,track_ids", newline = "\n")
    
save_to_file()

1    10624787
2     4891851
3     4267369
4       65078
Name: playlist_id, dtype: int64
Ten recommendations took  64.15566205978394  seconds. 10000 would take  17.821017238828873  hours... 


In [None]:
def save_to_file():
    #Saves the recommendations dataframe to the .csv-file. 
    np.savetxt("recommendations.csv",recommendations, fmt = '%s,%s %s %s %s %s', header = "playlist_id,track_ids", newline = "\n")
    
    
def test():
    #Do something
    print("Result: ")
    pass


save_to_file()