In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.sparse as sps
%matplotlib inline

#train_final.csv - the training set of interactions
train_final = pd.read_csv('input/train_final.csv', delimiter = "\t");

#tracks_final.csv - supplementary information about the items
tracks_final = pd.read_csv('input/tracks_final.csv', delimiter = "\t");

#playlists_final.csv - supplementary information about the users
playlists_final = pd.read_csv('input/playlists_final.csv', delimiter = "\t");

#target_playlists.csv - the set of target playlists that will receive recommendations
target_playlists = pd.read_csv('input/target_playlists.csv');

#target_tracks.csv - the set of target items (tracks) to be recommended
target_tracks = pd.read_csv('input/target_tracks.csv');

#Let's have a look at the train data. 
train_final.head()

Unnamed: 0,playlist_id,track_id
0,3271849,2801526
1,5616275,727878
2,11267488,2805283
3,10103900,1515105
4,3836898,2945623


In [3]:
target_tracks.shape[0]

32195

In [41]:
item_playlist_matrix = np.zeros([playlists_final.shape[0], tracks_final.shape[0]]) 

In [43]:
#The matrix has playlists as rows and items as columns
item_playlist_matrix.size

5756100000

In [52]:
#This is too big.. We need to remove some items! 

#Lets remove all tracks whith less than 10 in popularity, and add the rest to a set called relevant_tracks
#Popularity is now a dataframe
popularity = train_final.groupby(by="track_id").playlist_id.nunique().to_frame()

#remove index name
popularity.reset_index(level = 0, inplace = True)

#Rename the columns
popularity.columns = ['track_id','occurrences']

In [123]:
#filtering all tracks occurring less than 10 times gives 23298 rows

#filtering all tracks occurring less than 20 times gives 9489 rows

tracks_relevant = popularity[popularity['occurrences'] > 20]

In [124]:
item_playlist_matrix = np.zeros([playlists_final.shape[0], tracks_relevant.shape[0]]) 

In [125]:
#This saves us a factor of 10, but we still have a huge matrix. 
item_playlist_matrix.shape

(57561, 9489)

In [126]:
# Now we should be able to filter away interactions with all tracks that we removed. 
train_relevant = train_final[train_final['track_id'].isin(tracks_relevant['track_id'])]
train_relevant.describe()

Unnamed: 0,playlist_id,track_id
count,436862.0,436862.0
mean,6530891.0,1992278.0
std,3079244.0,1141203.0
min,7569.0,1450.0
25%,4184420.0,1056176.0
50%,6663308.0,2020026.0
75%,8691742.0,2924245.0
max,11766360.0,4936513.0


In [127]:
#Let's not throw away playlists!  see if we can throw away some playlists. 

#playlist_occurrences = train_relevant.groupby(by="playlist_id").track_id.nunique().to_frame()

#remove index name
#playlist_occurrences.reset_index(level = 0, inplace = True)

#Rename the columns
#playlist_occurrences.columns = ['playlist_id','occurrences']

In [128]:
#Playlists with more than 5 songs: 17173

#playlists_relevant = playlist_occurrences[playlist_occurrences['occurrences'] > 5]
#playlists_relevant.describe()

In [131]:
#Let's try again.  

#We have to add +1 because row/col 0 will be taken. 
item_playlist_matrix = np.zeros([playlists_final.shape[0]+1, tracks_relevant.shape[0]+1]) 
print(item_playlist_matrix.size)
print(item_playlist_matrix.shape)

546263380
(57562, 9490)


In [135]:
#Now, we should filter the train_final so it only contains relevant stuff.

playlists_relevant = pd.concat([playlists_relevant, target_playlists])

train_relevant = train_final[train_final['playlist_id'].isin(playlists_relevant['playlist_id'])]

#Easy. Now it is down to 436862 interactions, should not be too slow. 
#train_relevant.head()

train_relevant.describe()

Unnamed: 0,playlist_id,track_id
count,882670.0,882670.0
mean,6526798.0,1987094.0
std,3086458.0,1136636.0
min,7614.0,252.0
25%,4181885.0,1053281.0
50%,6647196.0,2015358.0
75%,8693782.0,2911595.0
max,11766360.0,5018274.0


In [147]:
#If we translate each track_id to a track_index which will serve as matrix index, we can save a lot of time. 
#Same goes for playlist_id --> playlist_index. 


#We need a way to get from track_id to index in O(1).
#Let's create a dictionary

track_indexes = {}
counter = 1; #Index 0 is reserved for playlist_ids
for track_id in tracks_relevant['track_id']:
    track_indexes[track_id] = counter
    counter += 1;
    
#and a way to get from playlist_id to index in O(1)

playlists_relevant

playlist_indexes = {}
counter = 1; #Index 0 is reserved for track_ids
for playlist_id in playlists_relevant['playlist_id']:
    playlist_indexes[playlist_id] = counter
    counter += 1;

#felsökning
print(playlists_relevant[playlists_relevant['playlist_id']==1515105])

#Now, in order to get a playlist_index we just go: playlist_index = playlist_indexes[playlist_id]
    
#How do we get it back in the end? We simply keep the tracks_relevant and playlist_relevant and access them by index in the end. 
# tracks_relevant[playlist_index] = playlist_id

Empty DataFrame
Columns: [occurrences, playlist_id]
Index: []


In [143]:
train_relevant = train_final[train_final['playlist_id'].isin(playlists_relevant['playlist_id'])]
train_relevant.shape

(882670, 2)

In [144]:
#Lets build that matrix. 

interactions = train_relevant.as_matrix()
for row in interactions:
    #Lets get the info
    playlist_id = row[0]
    track_id = row[1]
    
    #Now lets get the proper indexes. 
    playlist_index = playlist_indexes[playlist_id]
    track_index = track_indexes[track_id]
    
    #And now lets add it to the matrix
    item_playlist_matrix[playlist_index][track_index] = 1
    

item_playlist_matrix
    


KeyError: 1515105