In [52]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.sparse as sps
%matplotlib inline

# Training interactions: one (playlist_id, track_id) pair per row, tab-separated.
train_final = pd.read_csv('input/train_final.csv', sep='\t')

# Supplementary information about the items (tracks), tab-separated.
tracks_final = pd.read_csv('input/tracks_final.csv', sep='\t')

# Supplementary information about the users (playlists), tab-separated.
playlists_final = pd.read_csv('input/playlists_final.csv', sep='\t')

# Playlists that must receive recommendations (read with the default separator).
target_playlists = pd.read_csv('input/target_playlists.csv')

# Tracks that are allowed to be recommended (read with the default separator).
target_tracks = pd.read_csv('input/target_tracks.csv')

# Preview the first rows of the interaction table.
train_final.head()

Unnamed: 0,playlist_id,track_id
0,3271849,2801526
1,5616275,727878
2,11267488,2805283
3,10103900,1515105
4,3836898,2945623


In [53]:
# Prune the track catalogue. A track is kept when it is either
#   1. present in more than 10 distinct playlists of train_final, or
#   2. listed in target_tracks (these are always kept, whatever their count).

# One row per track: in how many distinct playlists does it occur?
popularity = (
    train_final.groupby('track_id')['playlist_id']
    .nunique()
    .rename('occurrences')
    .reset_index()
)

# Non-target tracks must pass the popularity threshold (strictly more than 10).
track_is_target = popularity['track_id'].isin(target_tracks['track_id'])
track_is_frequent = popularity['occurrences'] > 10

# Append the target tracks after the surviving non-target tracks. The rows that
# come from target_tracks do not get an 'occurrences' value from this step.
tracks_relevant = pd.concat([popularity[~track_is_target & track_is_frequent], target_tracks])

tracks_relevant.shape


(41756, 2)

In [54]:
# Prune the playlists. A playlist is kept when it is either
#   1. made of more than 10 distinct tracks in train_final, or
#   2. listed in target_playlists (these are always kept, whatever their size).

# One row per playlist: how many distinct tracks does it contain?
playlists_sizes = (
    train_final.groupby('playlist_id')['track_id']
    .nunique()
    .rename('size')
    .reset_index()
)

print(playlists_sizes.shape)

# Non-target playlists must pass the size threshold (strictly more than 10).
# The masks are built from `playlists_sizes`, the frame defined just above;
# the earlier spelling `playlist_sizes` is not defined anywhere in this
# notebook and raises NameError on a fresh kernel.
playlist_is_target = playlists_sizes['playlist_id'].isin(target_playlists['playlist_id'])
playlist_is_large = playlists_sizes['size'] > 10

# Append the target playlists after the surviving non-target playlists. The rows
# that come from target_playlists do not get a 'size' value from this step.
playlists_relevant = pd.concat(
    [playlists_sizes[~playlist_is_target & playlist_is_large], target_playlists]
)

print(playlists_relevant.shape)

(45649, 2)
(23618, 2)


In [55]:
# Restrict the interactions to the relevant tracks and the relevant playlists.
# The shape is printed after every stage so the effect of each filter is visible.

print(train_final.shape)

# Stage 1: keep interactions whose track survived the track pruning.
keeps_track = train_final['track_id'].isin(tracks_relevant['track_id'])
train_track_filtered = train_final[keeps_track]

print(train_track_filtered.shape)

# Stage 2: keep interactions whose playlist survived the playlist pruning.
# The mask is computed from the frame being filtered, so mask and frame share
# the same index. Building it from train_final instead forces pandas to
# re-align a longer boolean Series and emits a reindexing warning.
keeps_playlist = train_track_filtered['playlist_id'].isin(playlists_relevant['playlist_id'])
train_relevant = train_track_filtered[keeps_playlist]

print(train_relevant.shape)


(1040522, 2)
(731373, 2)
(667033, 2)


  # Remove the CWD from sys.path while we load stuff.


In [56]:
# Dense zero-filled interaction matrix: one row per relevant playlist and one
# column per relevant track.
# NOTE(review): np.zeros defaults to float64 (8 bytes per cell), so this
# allocation is large; confirm it fits in memory before running.
item_playlist_matrix = np.zeros((len(playlists_relevant), len(tracks_relevant)))

In [57]:
# Total number of cells in the dense matrix (rows x columns).
# Before pruning tracks and playlists: 5,756,100,000 cells.
# After pruning: 986,193,208 cells.
n_matrix_rows, n_matrix_cols = item_playlist_matrix.shape
n_matrix_rows * n_matrix_cols

986193208

In [58]:
# The matrix is addressed by position, not by id, so every track_id and every
# playlist_id is translated once into a matrix index. Dictionaries give that
# translation in O(1) per lookup.

# track_id -> column index, numbered in the row order of tracks_relevant.
track_indexes = {
    track_id: index
    for index, track_id in enumerate(tracks_relevant['track_id'])
}

# playlist_id -> row index, numbered in the row order of playlists_relevant.
playlist_indexes = {
    playlist_id: index
    for index, playlist_id in enumerate(playlists_relevant['playlist_id'])
}

# Debugging aid:
# print(playlists_relevant[playlists_relevant['playlist_id'] == 1515105])

# Forward lookup:  playlist_index = playlist_indexes[playlist_id]
#                  track_index = track_indexes[track_id]
#
# Reverse lookup: keep tracks_relevant / playlists_relevant around and read the
# id back by position (the frames come out of pd.concat without a fresh index,
# so use .iloc rather than label-based access):
#   playlist_id = playlists_relevant['playlist_id'].iloc[playlist_index]
#   track_id = tracks_relevant['track_id'].iloc[track_index]



In [59]:
# Fill the interaction matrix: cell (playlist row, track column) becomes 1 for
# every (playlist_id, track_id) pair in train_relevant.

# Select the two columns by name so the result does not depend on column order.
# `.values` replaces DataFrame.as_matrix(), which is deprecated and no longer
# available in current pandas releases.
interactions = train_relevant[['playlist_id', 'track_id']].values

# Translate ids to matrix positions. An id missing from a dictionary raises
# KeyError here, exactly as a per-row lookup would.
playlist_rows = [playlist_indexes[playlist_id] for playlist_id in interactions[:, 0]]
track_cols = [track_indexes[track_id] for track_id in interactions[:, 1]]

# One integer-array assignment sets all interaction cells at once instead of
# one chained `matrix[row][col] = 1` per interaction.
item_playlist_matrix[playlist_rows, track_cols] = 1

print(item_playlist_matrix)


[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [60]:
# The dense matrix is filled; convert it to SciPy's CSR sparse format.
sparse_matrix = sps.csr_matrix(item_playlist_matrix)

# Saving to disk is currently switched off:
# sps.save_npz("sparse_item_playlist", sparse_matrix)

# Report (rows, columns) of the sparse matrix.
n_sparse_rows, n_sparse_cols = sparse_matrix.shape
print((n_sparse_rows, n_sparse_cols))



(23618, 41756)


In [61]:
# Multiply the matrix by its own transpose. Rows of sparse_matrix are playlists,
# so the product is a square playlist-by-playlist matrix: entry (i, j) is the
# dot product of the rows of playlists i and j.
sparse_matrix_transposed = sparse_matrix.T
playlist_similarities = sparse_matrix.dot(sparse_matrix_transposed)

print(playlist_similarities.shape)


(23618, 23618)


In [97]:
# Goal of this cell: look up the similarity between two playlists. For now it
# only reports the dimensions of `playlist_similarities`.
# NOTE(review): the NameError recorded under this cell names
# `playlists_similarities`, which this code does not reference, so that output
# looks stale - re-run the cell to refresh it.
print(playlist_similarities.shape)
    
# Disabled experiment: extract one row of the similarity matrix in dense form.
#playlist_similarities.getrow(1).todense(out = a)


NameError: name 'playlists_similarities' is not defined

In [78]:
# Collect the rows (playlists) of the dense matrix that hold a 1 in column 1666.
# 1666 is a column position in the matrix (a track index), not a raw track_id.
# NOTE(review): the cell ends in an assignment and displays nothing itself; the
# recorded "(14, 41756)" presumably came from inspecting the result's shape.
has_track_1666 = item_playlist_matrix[:, 1666] == 1
playlists = item_playlist_matrix[has_track_1666]

(14, 41756)
