In [1]:
import datetime
import pickle
import sys
import time

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score

sys.path.append('../')
sys.path.append('../Models')

import wrmf_helpers
from feature_extraction import compute_df
from spotify_api_database import SpotifyAuth
from settings import PATH_TO_SPARSE_MATRIX

In [18]:
# Directory holding the songs{i}.csv shards, relative to this notebook like the
# other '../data' and '../Models' paths used below.
# NOTE(review): assumes the notebook runs one level below the repository root,
# so this resolves to the same data/Songs folder as before -- confirm.
SONGS_DIR = '../data/Songs'
TRAIN_FILE_IDS = range(1, 10)   # songs1.csv .. songs9.csv
VAL_FILE_IDS = range(10, 12)    # songs10.csv, songs11.csv


def load_playlists(file_ids, songs_dir=SONGS_DIR):
    """Load playlists from the numbered song CSV shards.

    Parameters
    ----------
    file_ids : iterable of int
        Shard numbers; shard ``i`` is read from ``{songs_dir}/songs{i}.csv``.
    songs_dir : str
        Directory containing the shards.

    Returns
    -------
    list of pandas.Series
        One ``track_uri`` Series per distinct ``pid`` in each shard, in shard
        order and then in ``groupby('pid')`` order.
    """
    playlists = []
    for file_id in file_ids:
        # Only these two columns are ever used, so skip parsing the rest
        # instead of reading everything and dropping it afterwards.
        songs = pd.read_csv(f'{songs_dir}/songs{file_id}.csv', usecols=['pid', 'track_uri'])
        playlists.extend(tracks for _, tracks in songs.groupby('pid')['track_uri'])
    return playlists


train_playlists = load_playlists(TRAIN_FILE_IDS)
val_playlists = load_playlists(VAL_FILE_IDS)

# Only the second element of the returned 5-tuple is kept here.
_, tid_to_idx, _, _, _ = wrmf_helpers.get_user_item_sparse_matrix(PATH_TO_SPARSE_MATRIX)

In [3]:
# The pickle stores a pair that unpacks as (playlist factors, song factors).
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project.
with open('../Models/wrmf_factors.pickle', 'rb') as factors_file:
    playlist_factors, song_factors = pickle.load(factors_file)

# Report the shapes, song factors first and playlist factors second.
for factors in (song_factors, playlist_factors):
    print(factors.shape)

(2029866, 20)
(800000, 20)


In [7]:
# Build the stage-2 training frame from the first `n` training playlists,
# printing a single self-overwriting progress line as it goes.
n = 10
train_subset = train_playlists[:n]   # slicing tolerates fewer than n playlists
total = len(train_subset)

train_dfs = []
start_time = time.time()
for done, playlist in enumerate(train_subset, start=1):
    # Playlists with two or fewer tracks are skipped.
    if len(playlist) > 2:
        train_dfs.append(compute_df(playlist, song_factors, playlist_factors))

    elapsed_time = time.time() - start_time
    # Linear extrapolation from the average time per playlist so far.
    time_left = max(total * elapsed_time / done - elapsed_time, 0.0)

    # Truncate to whole seconds before formatting. Slicing the microsecond
    # suffix off str(timedelta) wipes the whole string when the value has no
    # fractional part (e.g. exactly 0 remaining on the last playlist), which
    # left stale characters from the previous progress line on screen.
    elapsed_str = datetime.timedelta(seconds=int(elapsed_time))
    remaining_str = datetime.timedelta(seconds=int(time_left))
    sys.stdout.write(
        f"\rParsed playlist {done}/{total}. Time elapsed: {elapsed_str}. Time remaining: {remaining_str}")
    sys.stdout.flush()

if not train_dfs:
    raise ValueError("No training playlist with more than 2 tracks was found")
train_df = pd.concat(train_dfs)

Parsed playlist 10/10. Time elapsed: 0:02:56. Time remaining: :00:17

In [13]:
# Persist the stage-2 training frame so it can be reloaded from disk later.
with open('../data/stage_2_train.pickle', 'wb') as train_pickle:
    pickle.dump(train_df, train_pickle)

In [10]:
import xgboost as xgb

In [15]:
# Split the training frame into features (all columns except the label) and
# the label column. 'relevence' is the column name as spelled in train_df.
Xtrain = train_df.drop(columns='relevence')
ytrain = train_df['relevence']

In [20]:
# Gradient-boosted classifier for the stage-2 relevance label.
# xgboost and accuracy_score are imported in the notebook's top import cell,
# so this cell no longer carries its own import.
xgb_params = dict(n_estimators=150, max_depth=10, learning_rate=0.1, subsample=0.5)
model = xgb.XGBClassifier(**xgb_params)
model.fit(Xtrain, ytrain)

# Predictions on the same rows the model was fitted on; scoring these gives
# training accuracy, not an estimate of performance on unseen playlists.
pred_train = model.predict(Xtrain)

In [21]:
# Training-set accuracy expressed as a percentage.
train_accuracy_pct = accuracy_score(ytrain, pred_train) * 100
print("Accuracy for model 1: %.2f" % train_accuracy_pct)

Accuracy for model 1: 100.00


In [19]:
# Build the validation frame from the first `n` validation playlists, with the
# same self-overwriting progress line used for the training set.
n = 2
val_subset = val_playlists[:n]   # slicing tolerates fewer than n playlists
total = len(val_subset)

val_dfs = []
start_time = time.time()
for done, playlist in enumerate(val_subset, start=1):
    # Playlists with two or fewer tracks are skipped.
    if len(playlist) > 2:
        val_dfs.append(compute_df(playlist, song_factors, playlist_factors))

    elapsed_time = time.time() - start_time
    # Linear extrapolation from the average time per playlist so far.
    time_left = max(total * elapsed_time / done - elapsed_time, 0.0)

    # Format whole seconds only. Slicing the microsecond suffix off
    # str(timedelta) yields an empty string when there is no fractional part
    # (exactly 0 remaining on the final playlist), leaving the previous
    # estimate visible after the carriage-return overwrite.
    elapsed_str = datetime.timedelta(seconds=int(elapsed_time))
    remaining_str = datetime.timedelta(seconds=int(time_left))
    sys.stdout.write(
        f"\rParsed playlist {done}/{total}. Time elapsed: {elapsed_str}. Time remaining: {remaining_str}")
    sys.stdout.flush()

if not val_dfs:
    raise ValueError("No validation playlist with more than 2 tracks was found")
val_df = pd.concat(val_dfs)


Parsed playlist 2/2. Time elapsed: 0:00:21. Time remaining: 0:00:10

In [22]:
# Separate validation features from the 'relevence' label, then predict with
# the already-fitted model.
Xval = val_df.drop(columns='relevence')
yval = val_df['relevence']
pred_val = model.predict(Xval)

In [23]:
# Validation-set accuracy expressed as a percentage.
val_accuracy_pct = accuracy_score(yval, pred_val) * 100
print("Accuracy for model 1: %.2f" % val_accuracy_pct)

Accuracy for model 1: 47.92
