In [31]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.sparse as sps
import time
%matplotlib inline

# --- Tab-separated input files ---------------------------------------------
# train_final.csv: the training set of interactions.
train_final = pd.read_csv('input/train_final.csv', sep="\t")

# tracks_final.csv: supplementary information about the items (tracks).
tracks_final = pd.read_csv('input/tracks_final.csv', sep="\t")

# playlists_final.csv: supplementary information about the users (playlists).
playlists_final = pd.read_csv('input/playlists_final.csv', sep="\t")

# --- Target lists (read with pandas' default separator) ----------------------
# target_playlists.csv: the set of playlists that will receive recommendations.
target_playlists = pd.read_csv('input/target_playlists.csv')

# target_tracks.csv: the set of items (tracks) that may be recommended.
target_tracks = pd.read_csv('input/target_tracks.csv')

print("Loaded")

Loaded


In [32]:
# Collect the distinct album values; the dictionary is used as a set
# (every value is 1) so that len() gives the number of unique albums.
albums = {}
for raw_album in tracks_final['album']:
    album = raw_album.strip('[ ]')
    # Empty entries and the literal "None" carry no content information.
    if not album or album == "None":
        continue
    albums[album] = 1

print(len(albums))


27604


In [33]:
# Every distinct tag, album and artist becomes one content feature.
# Keys carry a two-letter prefix ("ta", "al", "ar") so that equal raw values
# from different columns do not collide.
content_to_index = {}   # prefixed content key -> dense 0-based index
content_to_id = {}      # dense 0-based index  -> prefixed content key


def _register_content(key):
    """Give `key` the next free content index unless it already has one.

    Returns True when a new index was allocated and False when `key` was
    already registered, so callers can count newly seen values.
    """
    if key in content_to_index:
        return False
    # Indexes are handed out densely (0, 1, 2, ...), so the next free index
    # always equals the number of keys registered so far.
    index = len(content_to_index)
    content_to_index[key] = index
    content_to_id[index] = key
    return True


# Tags: each row is stripped of brackets/spaces and split on ", ".
for row in tracks_final['tags']:
    for tag in row.strip('[ ]').split(', '):
        if len(tag) > 0:
            _register_content("ta" + tag)

# Albums: empty values and the literal "None" are not considered content.
albumcount = 0
for album in tracks_final['album']:
    album = album.strip('[ ]')
    if len(album) > 0 and album != "None":
        if _register_content("al" + album):
            albumcount += 1

# Artists: ids are stringified before prefixing.
artistcount = 0
for artist in tracks_final['artist_id']:
    artist = str(artist)
    if len(artist) > 0 and artist != "None":
        if _register_content("ar" + artist):
            artistcount += 1

# Kept for backward compatibility: total number of content indexes handed out.
content_counter = len(content_to_index)

print(len(content_to_index))
# NOTE(review): the "expected" figures below are hard-coded; the recorded run
# reports 17536 artists against 17537 expected -- confirm which one is right.
print("%s albums. 27604 expected." %albumcount)
print("%s artists. 17537 expected." %artistcount)

77040
27604 albums. 27604 expected.
17536 artists. 17537 expected.


In [34]:
# Matrices are addressed by dense 0-based indexes instead of raw ids.
# The dictionaries below translate in both directions in O(1); an index is
# simply the row position of the id in its source table.

track_ids = tracks_final['track_id']

# position in tracks_final -> track_id, and the inverse mapping
track_to_id = {
    position: int(raw_id)
    for position, raw_id in enumerate(tracks_final['track_id'])
}
track_to_index = {track: position for position, track in track_to_id.items()}

# position in playlists_final -> playlist_id, and the inverse mapping
playlist_to_id = {
    position: int(raw_id)
    for position, raw_id in enumerate(playlists_final['playlist_id'])
}
playlist_to_index = {playlist: position for position, playlist in playlist_to_id.items()}

summary = "We have {} playlists with {} unique tracks with {} unique content types. "
print(summary.format(len(playlist_to_index), len(track_to_index), len(content_to_index)))

We have 57561 playlists with 100000 unique tracks with 77040 unique content types. 


In [35]:
#So let's fill the ICM with our data.
import math

def build_ICM():
    """Build the item-content matrix (ICM) from `tracks_final`.

    Every track gets a 1 in the column of its artist, of its album (unless the
    album is empty or the literal "None") and of each non-empty tag. Row
    indexes are looked up in `track_to_index`; column indexes are looked up in
    `content_to_index` under the "ar" / "al" / "ta" prefixed keys.

    Progress and two sanity counts (distinct albums / artists used) are
    printed while building.

    Returns:
        scipy.sparse.coo_matrix with integer values and shape
        (number of rows in tracks_final, len(content_to_index)).

    Raises:
        KeyError: if a track id or a content key is missing from the lookup
            dictionaries.
    """
    # Coordinates are collected in lists: the number of content entries is not
    # known in advance and is unrelated to the number of training interactions.
    rows = []
    cols = []
    added_albums = set()    # distinct album columns used (sanity check)
    added_artists = set()   # distinct artist columns used (sanity check)

    n_tracks = tracks_final.shape[0]
    starttime = time.time()

    # Columns are addressed by name, like the cells that built the indexes.
    track_columns = zip(tracks_final['track_id'], tracks_final['artist_id'],
                        tracks_final['album'], tracks_final['tags'])

    for trackno, (track_id, artist_id, album, tags) in enumerate(track_columns):
        track_index = track_to_index[int(track_id)]

        # Artist
        artist_index = content_to_index["ar" + str(artist_id)]
        added_artists.add(artist_index)
        rows.append(track_index)
        cols.append(artist_index)

        # Album ("None" and empty values are not content)
        album = album.strip("[ ]")
        if len(album) > 0 and album != "None":
            album_index = content_to_index["al" + album]
            added_albums.add(album_index)
            rows.append(track_index)
            cols.append(album_index)

        # Tags
        for tag in tags.strip('[ ]').split(', '):
            if len(tag) > 0:
                rows.append(track_index)
                cols.append(content_to_index["ta" + tag])

        if trackno % 5000 == 0:
            print("Track %s of %s. %s s sec." %(trackno, n_tracks, round(time.time()-starttime, 2)))

    rows = np.array(rows, dtype=int)
    cols = np.array(cols, dtype=int)
    val = np.ones(rows.shape[0], dtype=int)

    # The shape is stated explicitly so the matrix always spans every track and
    # every content index, even if the highest ones happen to be unused.
    ICM_all = sps.coo_matrix((val, (rows, cols)),
                             shape=(n_tracks, len(content_to_index)),
                             dtype=int)

    print("Built ICM matrix with %s content values." %(val.shape[0]))

    print("%s albums. 27604 expected." %len(added_albums))
    print("%s artists. 17537 expected." %len(added_artists))

    return ICM_all


# Build the item-content matrix and bind it to ICM_all.
# build_ICM() reads the tables and lookup dictionaries created by the cells above,
# so those cells must have been run first.
ICM_all = build_ICM()
print("Done!")

Track 0 of 100000. 0.0 s sec.
Track 5000 of 100000. 0.11 s sec.
Track 10000 of 100000. 0.21 s sec.
Track 15000 of 100000. 0.33 s sec.
Track 20000 of 100000. 0.45 s sec.
Track 25000 of 100000. 0.57 s sec.
Track 30000 of 100000. 0.68 s sec.
Track 35000 of 100000. 0.8 s sec.
Track 40000 of 100000. 0.91 s sec.
Track 45000 of 100000. 1.02 s sec.
Track 50000 of 100000. 1.14 s sec.
Track 55000 of 100000. 1.25 s sec.
Track 60000 of 100000. 1.37 s sec.
Track 65000 of 100000. 1.48 s sec.
Track 70000 of 100000. 1.6 s sec.
Track 75000 of 100000. 1.71 s sec.
Track 80000 of 100000. 1.82 s sec.
Track 85000 of 100000. 1.94 s sec.
Track 90000 of 100000. 2.05 s sec.
Track 95000 of 100000. 2.17 s sec.
[]
[]
[]
Built ICM matrix with 656745 content values.
27604 albums. 27604 expected.
17536 artists. 17537 expected.
Done!


In [36]:
def get_target_item_filter(indices):
    """Return a boolean mask that is True at the index of every target track.

    Args:
        indices: length of the mask to create (used as the np.zeros shape).

    Returns:
        Boolean numpy array of that length, True at `track_to_index[track_id]`
        for each track id found in the first column of `target_tracks`.
    """
    mask = np.zeros((indices), dtype = bool)
    # Translate every target track id to its matrix index, then flag them all at once.
    target_positions = np.array(
        [track_to_index[entry[0]] for entry in target_tracks.values],
        dtype = np.intp,
    )
    mask[target_positions] = True
    kept, total = np.count_nonzero(mask), mask.shape[0]
    print("Created filter preserving %s out of %s " %(kept, total))
    return mask

In [1]:
def split_URM(k = 5, seed = None):
    """Build the playlist x track matrix and hold out k tracks per playlist.

    The interactions in `train_final` are translated to matrix indexes through
    `playlist_to_index` / `track_to_index`. For every playlist, k of its tracks
    are chosen at random, removed from the training matrix and marked with a 1
    in the test matrix.

    Args:
        k: number of tracks to hold out per playlist. A playlist with fewer
            than k tracks gives up all of them. With k <= 0 nothing is held
            out and the test matrix is empty.
        seed: optional seed for a private numpy RandomState, making the split
            reproducible. When None the global np.random state is used.

    Returns:
        (URM_train, URM_test): two CSR matrices of shape
        (len(playlist_to_index), len(track_to_index)).

    Raises:
        KeyError: if `train_final` contains a playlist or track id that is
            missing from the lookup dictionaries.
    """
    playlist_ids = train_final['playlist_id'].values
    track_ids = train_final['track_id'].values
    n_interactions = playlist_ids.shape[0]

    # Translate ids to integer matrix indexes.
    playlist_idx = np.fromiter((playlist_to_index[p] for p in playlist_ids),
                               dtype=np.int64, count=n_interactions)
    track_idx = np.fromiter((track_to_index[t] for t in track_ids),
                            dtype=np.int64, count=n_interactions)
    ratings = np.ones(n_interactions, dtype=int)

    # Explicit shape: the matrix spans every known playlist and track even if
    # the highest-indexed ones have no interactions.
    shape = (len(playlist_to_index), len(track_to_index))
    URM_full = sps.coo_matrix((ratings, (playlist_idx, track_idx)), shape=shape).tocsr()

    if k <= 0:
        return URM_full.copy(), sps.csr_matrix(shape, dtype=int)

    rng = np.random if seed is None else np.random.RandomState(seed)

    # Work directly on the CSR structure: the stored entries of playlist p are
    # the positions indptr[p] .. indptr[p + 1] - 1 of `indices` / `data`.
    indptr = URM_full.indptr
    held_out = np.zeros(URM_full.nnz, dtype=bool)
    for playlist in range(shape[0]):
        start, end = int(indptr[playlist]), int(indptr[playlist + 1])
        n_tracks = end - start
        n_removed = min(k, n_tracks)   # never remove more than the playlist has
        if n_removed > 0:
            picked = rng.choice(n_tracks, size=n_removed, replace=False)
            held_out[start + picked] = True

    # Rebuild both matrices from coordinates instead of assigning single
    # elements of a CSR matrix (slow) or allocating a dense array (huge).
    entry_rows = np.repeat(np.arange(shape[0]), np.diff(indptr))
    entry_cols = URM_full.indices
    kept = ~held_out

    URM_train = sps.csr_matrix(
        (URM_full.data[kept], (entry_rows[kept], entry_cols[kept])), shape=shape)
    URM_test = sps.csr_matrix(
        (np.ones(int(held_out.sum()), dtype=int),
         (entry_rows[held_out], entry_cols[held_out])), shape=shape)

    return URM_train, URM_test

In [2]:
# split_URM returns (URM_train, URM_test): despite its name, URM_full here holds
# the training matrix, i.e. the interactions left after the hold-out.
# NOTE(review): the recorded output of this cell is a NameError for train_final,
# and the execution counts restart at 1 here -- the data-loading and index cells
# must be run in the same kernel session before this one.
URM_full, URM_test = split_URM(5)
print(URM_full)

NameError: name 'train_final' is not defined