In [None]:
# . env/bin/activate
import json
import os
import numpy as np
import sklearn
import sklearn.metrics
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# import networkx as nx
import datetime
import playlist
reload(playlist)


In [None]:

# Load the list of playlists through the project-local `playlist` module.
playlist_list = playlist.get_playlist_list()


In [None]:

# Sanity check: number of entries returned by playlist.get_playlist_list().
len(playlist_list)

In [None]:
def get_context_list(tracks_in_playlist, j, context_size=2):
    """Return the tracks surrounding position ``j`` in a playlist.

    Up to ``context_size`` tracks before ``j`` and up to ``context_size``
    tracks after it are returned, in playlist order.  The track at ``j``
    itself is excluded and the window is clipped at both ends of the
    playlist.  The input sequence is not modified.
    """
    window_start = max(0, j - context_size)
    preceding = tracks_in_playlist[window_start:j]
    following = tracks_in_playlist[j + 1:j + 1 + context_size]
    return preceding + following


# Count, over all playlists, how often each track appears as a context
# (a neighbour inside the get_context_list window) of another track.
context_mapping = {}
for playlist_json in playlist_list:
    # Module-qualified call: a bare `get_tracks` is never defined or imported
    # in this notebook, so it would raise NameError on a fresh kernel.
    tracks_in_playlist = list(playlist.get_tracks(playlist_json))
    for j in range(len(tracks_in_playlist)):
        for context in get_context_list(tracks_in_playlist, j):
            # dict.get does a single hash lookup instead of testing
            # membership against `.keys()` (a full list on Python 2).
            context_mapping[context] = context_mapping.get(context, 0) + 1

# One "<track> <count>" line per context track; this file is later passed to
# word2vecf as its -cvocab argument.
with open("context-mapping.txt", "w") as text_file:
    for context, count in context_mapping.items():
        text_file.write("%s %d\n" % (context, count))

In [None]:
# Count how many times each track occurs across all playlists.
track_mapping = {}
for playlist_json in playlist_list:
    # Module-qualified call: a bare `get_tracks` is never defined or imported
    # in this notebook, so it would raise NameError on a fresh kernel.
    for item in playlist.get_tracks(playlist_json):
        # dict.get does a single hash lookup instead of testing membership
        # against `.keys()` (a full list on Python 2).
        track_mapping[item] = track_mapping.get(item, 0) + 1

# One "<track> <count>" line per track; this file is later passed to
# word2vecf as its -wvocab argument.
with open("track-mapping.txt", "w") as text_file:
    for track_uri, count in track_mapping.items():
        text_file.write("%s %d\n" % (track_uri, count))

In [None]:

# Emit one "<track> <context> " line for every (track, neighbouring track)
# pair of every playlist; this file is the -train input of word2vecf.
with open("training-data.txt", "w") as text_file:
    for playlist_json in playlist_list:
        # Module-qualified call: a bare `get_tracks` is never defined or
        # imported in this notebook.
        tracks_in_playlist = list(playlist.get_tracks(playlist_json))
        for j, track in enumerate(tracks_in_playlist):
            # Compute the context window once per track instead of once per
            # emitted pair.
            for context in get_context_list(tracks_in_playlist, j):
                # The trailing space before the newline is kept so the file
                # stays byte-identical to what the previous version produced.
                text_file.write("%s %s \n" % (track, context))

In [None]:
# Build the word2vecf tool inside its source checkout, then run it on the
# files written above: training-data.txt as -train, track-mapping.txt as
# -wvocab and context-mapping.txt as -cvocab.  The result is written to
# word2vecf-features.txt, which the next cell reads.
!cd yoavgo-word2vecf-0d8e19d2f2c6; make
!cd yoavgo-word2vecf-0d8e19d2f2c6; ./word2vecf -train ../training-data.txt -wvocab ../track-mapping.txt -cvocab ../context-mapping.txt -output ../word2vecf-features.txt -size 200 -negative 15 -threads 4

In [None]:
# Parse the word2vecf output: the first line is skipped and each remaining
# space-separated row becomes one record, indexed by its first field.
track_df = pd.read_csv(
    "word2vecf-features.txt",
    sep=' ',
    header=None,
    skiprows=1,
).set_index(0)
# After set_index(0) the remaining columns are labelled 1..shape[1], so this
# removes the last column of the file (presumably empty because of a trailing
# separator - TODO confirm against the file).
track_df = track_df.drop(columns=[track_df.shape[1]])
track_df.columns

In [None]:
# (number of rows read from the feature file, number of remaining columns)
track_df.shape

In [None]:
# Preview the first rows of the parsed feature table.
track_df.head()

In [None]:
# Column-wise mean over all rows of track_df.  get_playlist_df uses it as the
# stand-in vector for tracks that have no row in the feature table.
mean_embedding = track_df.mean()
mean_embedding.shape

In [None]:
# Raw value matrix of the feature table, plus a lookup from each index label
# of track_df to the position of its row in that matrix.
track_array = track_df.values
track_row_names = {label: row for row, label in enumerate(track_df.index)}

# Running total of track lookups that found no row; updated by get_playlist_df.
count_missing_embeddings = 0

def get_playlist_df(playlist_json):
    """Build a DataFrame holding one embedding row per track of a playlist.

    Rows are labelled with the track URIs returned by
    ``playlist.get_tracks`` (one row per distinct URI, since rows are
    collected in a dict); columns are positional (0..n-1).

    A track with no entry in ``track_row_names`` receives the values of
    ``mean_embedding`` instead, and each such lookup increments the
    module-level ``count_missing_embeddings`` counter.
    """
    global count_missing_embeddings
    # Use a plain ndarray for the fallback, like the rows taken from
    # track_array.  Passing the labelled Series itself makes pandas take the
    # Series' labels as the frame's axis, so the column labels of the result
    # would differ depending on whether any track was missing.
    fallback_vec = np.asarray(mean_embedding)
    playlist_dict = {}
    for track_uri in playlist.get_tracks(playlist_json):
        row = track_row_names.get(track_uri)
        if row is None:
            count_missing_embeddings += 1
            track_vec = fallback_vec
        else:
            track_vec = track_array[row, :]
        playlist_dict[track_uri] = track_vec
    # The dict maps URI -> vector (one column per URI); transpose so that
    # each URI becomes a row.
    return pd.DataFrame(playlist_dict).T

In [None]:
from tqdm import tqdm_notebook as tqdm

In [None]:
from collections import defaultdict
def make_track_playlists(playlist_list):
    """Map every track URI to the set of playlist positions containing it.

    Returns a ``defaultdict(set)`` keyed by track URI; each value holds the
    positions (within ``playlist_list``) of the playlists in which
    ``playlist.get_tracks`` reports that track.
    """
    playlists_by_track = defaultdict(set)
    for position, entry in enumerate(playlist_list):
        track_uris = playlist.get_tracks(entry)
        for uri in track_uris:
            playlists_by_track[uri].add(position)
    return playlists_by_track

# Index which playlists contain each track.
track_playlists = make_track_playlists(playlist_list)
# Start the missing-embedding counter from zero before building the frames.
count_missing_embeddings = 0
# One embedding DataFrame per playlist, in playlist order, with a progress bar.
playlist_df_list = [get_playlist_df(playlist_json)
                    for playlist_json in tqdm(playlist_list)]
print('num missing embedding:', count_missing_embeddings)

In [None]:
# Alias (not a copy): training_playlist refers to the same list object.
training_playlist = playlist_df_list

In [None]:
# Number of rows in the first playlist's embedding DataFrame.
len(training_playlist[0])

In [None]:
# fit in scikit-learn needs a 2-dim matrix, with rows being samples and columns being features.
# training_playlist is a list of 2-dim matrices, so if their rows are stacked on top of each other
# the 2-dim matrix is built. Solution: concat
# track_df = pd.concat(training_playlist).drop_duplicates()
# track_array = track_df.to_matrix()

In [None]:
def make_mediod_features(mediod, investigated_playlist):
    """Return the values of the row at position ``mediod`` as an array.

    ``mediod`` is a positional (``iloc``) row index into
    ``investigated_playlist``.
    """
    return investigated_playlist.iloc[mediod].values

def make_query_mediod(investigated_playlist, index_to_remove):
    """Return the medoid feature vector of a playlist with one track held out.

    The row at position ``index_to_remove`` is dropped, pairwise cosine
    distances are computed among the remaining rows, and the feature values
    of the remaining row with the smallest summed distance to the others
    (the medoid) are returned as an array.
    """
    held_out_label = investigated_playlist.index[index_to_remove]
    query = investigated_playlist.drop(held_out_label)
    query_distance_matrix = sklearn.metrics.pairwise.cosine_distances(query)
    mediod = np.argmin(query_distance_matrix.sum(axis=0))
    # `mediod` is a row position within `query`, so it has to be resolved
    # against `query`.  Resolving it against the full playlist is off by one
    # for every row at or after the held-out position and can even return the
    # held-out track itself.
    return make_mediod_features(mediod, query)


In [None]:
from sklearn.neighbors import NearestNeighbors
from tqdm import tnrange
import random

def cal_knn(k, track_array, track_df, tracks_dict, seed=None):
    """Hold-one-out nearest-neighbour evaluation over the playlists.

    For every entry of the module-level ``playlist_df_list`` (iterated over
    ``len(playlist_list)``) that has at least 5 rows, one track is picked at
    random and held out.  The medoid of the remaining tracks
    (``make_query_mediod``) is then used to query the ``k`` nearest rows of
    ``track_array`` under the cosine metric.

    Parameters
    ----------
    k : number of neighbours requested from ``NearestNeighbors``.
    track_array : matrix the neighbour index is fitted on.
    track_df : frame whose index labels identify the rows of ``track_array``.
    tracks_dict : mapping from a track label to the id that is reported.
    seed : optional seed for choosing the held-out track.  With the default
        ``None`` the shared ``random`` module state is used, as before.

    Returns
    -------
    tuple ``(all_true_ids, all_distances, all_suggested_ids)`` with one entry
    per evaluated playlist.
    """
    # A dedicated generator makes the evaluation repeatable when a seed is
    # given, without touching the global random state.
    rng = random if seed is None else random.Random(seed)

    nbrs = NearestNeighbors(n_neighbors=k, metric='cosine', n_jobs=1).fit(track_array)
    row_labels = track_df.index
    all_true_ids = []
    all_distances = []
    all_suggested_ids = []

    for i in tnrange(len(playlist_list)):
        playlist_df = playlist_df_list[i]
        tracks = playlist_df.index
        # Too few tracks to hold one out and still form a meaningful query.
        if len(tracks) < 5:
            continue
        dropped_track_index = rng.randrange(len(tracks))
        true_track_uri = tracks[dropped_track_index]
        all_true_ids.append(tracks_dict[true_track_uri])

        query = make_query_mediod(playlist_df, dropped_track_index)
        # kneighbors expects a 2-D input, hence the reshape to a single row.
        distances, suggested_index_array = nbrs.kneighbors(query.reshape(1, -1))
        suggested_ids = [tracks_dict[row_labels[index]]
                         for index in suggested_index_array.flatten()]
        all_distances.append(distances)
        all_suggested_ids.append(suggested_ids)
    return all_true_ids, all_distances, all_suggested_ids

In [None]:
# Build the track mappings through the project helper; cal_knn looks tracks up
# in tracks_dict by their label to obtain the id it reports.
tracks_dict, tracks_id_dict = playlist.make_tracks_dict(playlist_list)
# Select the feature rows for exactly the keys of tracks_dict (in that key
# order) and keep the raw matrix for fitting the neighbour index.
# NOTE(review): .loc needs every key to exist in track_df.index - confirm.
my_track_df = track_df.loc[tracks_dict.keys()]
my_track_array = my_track_df.values

In [None]:
# Run the hold-one-out evaluation, requesting k=500 neighbours per query.
all_true_ids, all_distances, all_suggested_ids = cal_knn(500, my_track_array, my_track_df, tracks_dict)

In [None]:
# Score the suggestions with the project helper and save the string form of
# the result; the file name embeds the number of playlists.
results = playlist.cal_results(playlist_list, all_true_ids, all_suggested_ids)
filename = 'results%d_W2Vf.txt' % (len(playlist_list),)
with open(filename, 'w') as output:
    output.write(str(results))