In [None]:
#imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display
import datetime
import threading
import logging
import time
import itertools
import smtplib
from email.mime.text import MIMEText
from collections import Counter
import csv
import spotipy
from spotipy.oauth2 import SpotifyOAuth
import os
import spotify_utils as spu
from operator import itemgetter
from os.path import join as pj

from sklearn.linear_model import LinearRegression,LogisticRegression

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,f1_score

from imblearn.over_sampling import RandomOverSampler


from math import sqrt

import logging
# Send INFO-and-above records to experiments.log. The keyword is `filename`;
# `file` is not an argument basicConfig accepts (it raises ValueError for
# unrecognised keywords when it configures the root logger).
# NOTE(review): basicConfig does nothing if the root logger already has
# handlers, which some notebook front-ends install — confirm the file is written.
logging.basicConfig(level=logging.INFO, filename='experiments.log')
logger = logging.getLogger(__name__)

In [None]:
# constants
# When True, the cell guarded by this flag reconnects through spotify_utils,
# re-fetches the three playlists below and overwrites the CSV caches
# (seen_tracks.csv, played_tracks.csv, review_tracks.csv). When False, the
# notebook relies on CSV files that already exist on disk.
REFRESH_CACHE = False
# Playlist identifiers passed to spu.get_playlist_tracks. The trailing comments
# appear to be the playlists' display names — TODO confirm.
seen_playlist_id = "5al4jEBoq01LPmFDuGDnq4"      # Automated: Reviewed Items
played_playlist_id = "7EHT9D4ygqDlyGfqcFvkUv"    # 5 Esh Played
inbox_playlist_id = "1xsuqA0HU4bSosdaPyVlWG"     # 1 Esh Review


In [None]:
if REFRESH_CACHE:
    # Rebuild the on-disk caches: fetch all three playlists first, and only
    # then write them out, so a failed fetch leaves the existing CSVs untouched.
    sp = spu.spotify_connect()
    seen_tracks = spu.get_playlist_tracks(sp, seen_playlist_id, audio_features=True)
    played_tracks = spu.get_playlist_tracks(sp, played_playlist_id)
    review_tracks = spu.get_playlist_tracks(sp, inbox_playlist_id)
    for _frame, _csv_path in (
        (seen_tracks, 'seen_tracks.csv'),
        (played_tracks, 'played_tracks.csv'),
        (review_tracks, 'review_tracks.csv'),
    ):
        _frame.to_csv(_csv_path, index=False)

In [None]:
# Build the modelling table from the cached CSVs: one row per seen track,
# labelled 1 if its id also occurs in the played list and 0 otherwise.
seen_tracks = pd.read_csv('seen_tracks.csv')
# Only the id column is needed from the played list. De-duplicate it: if the
# same id is listed more than once, the left merge below would emit one copy of
# the matching seen_tracks row per occurrence and inflate the positive class.
played_tracks = pd.read_csv('played_tracks.csv')["id"].drop_duplicates()
# indicator="played" adds a column holding "both" for ids found in played_tracks
# and "left_only" for the rest.
seen_tracks = seen_tracks.merge(played_tracks,how='left',on=["id"],indicator="played")
# Vectorised conversion of the merge indicator into a 0/1 integer label.
seen_tracks["played"] = (seen_tracks["played"] == "both").astype(int)
# Drops every row that has a missing value in any column, not only the features.
seen_tracks = seen_tracks.dropna()
X = seen_tracks[spu.audio_features_to_use]
y = seen_tracks["played"]

In [None]:
# Generic model evaluator
def evaluate_model(model,splitter,transformer,X,y):
    """Fit one estimator under one split/resampling strategy and score it.

    Parameters
    ----------
    model : sequence
        ``(name, estimator)``. The estimator must provide ``fit``, ``predict``
        and ``score``; it is fitted in place.
    splitter : sequence
        ``(name, callable)``. The callable is invoked as
        ``callable(X, y, random_state=0)`` and must return
        ``X_train, X_test, y_train, y_test``.
    transformer : sequence
        ``(name, callable)``. The callable is invoked as
        ``callable(X_train, y_train)`` and must return the training pair to fit
        on. It only ever sees the training split, so the test split is scored
        as produced by the splitter.
    X, y
        Feature table and labels handed to the splitter.

    Returns
    -------
    tuple
        ``(accuracy, f1_macro)``: the estimator's ``score`` on the test split
        and the macro-averaged F1 of its test predictions.

    Side effects
    ------------
    Logs ``"<model>,<splitter>,<transformer>,<accuracy>,<f1_macro>"`` at INFO
    and the training time at DEBUG.
    """
    estimator = model[1]
    name = f'{model[0]},{splitter[0]},{transformer[0]}'

    X_train, X_test, y_train, y_test = splitter[1](X,y,random_state=0)
    X_train, y_train = transformer[1](X_train,y_train)

    # perf_counter is a monotonic clock intended for measuring intervals;
    # time.time() can jump if the system clock is adjusted.
    training_start = time.perf_counter()
    estimator.fit(X_train, y_train)
    training_duration = time.perf_counter() - training_start
    # Reported separately at DEBUG so the INFO record keeps its existing layout.
    logger.debug(f'{name},training_duration={training_duration:.3f}s')

    y_test_pred = estimator.predict(X_test)
    accuracy = estimator.score(X_test,y_test)
    f1_macro = f1_score(y_test,y_test_pred,average="macro")
    logger.info(f'{name},{accuracy},{f1_macro}')
    return (accuracy,f1_macro)
def stratified_split(X,y,random_state):
    """Return a 75/25 train/test split of ``X`` and ``y`` stratified on ``y``.

    Thin wrapper around ``train_test_split`` with the same
    ``(X, y, random_state=...)`` call shape, so it can be used interchangeably
    with ``train_test_split`` itself as a splitter.
    """
    split_options = {
        "stratify": y,
        "test_size": 0.25,
        "random_state": random_state,
    }
    return train_test_split(X, y, **split_options)

In [None]:
# Experiment grid: every (transformer, splitter, model) combination is
# evaluated once. Each entry is a (label, object) pair, the shape
# evaluate_model expects.
models = [("Dummy",DummyClassifier(random_state=0,strategy="constant",constant=0)),
          ("KNN",KNeighborsClassifier()),
          ("Logistic Regression",LogisticRegression()),
          ("Decision tree",DecisionTreeClassifier(random_state=0)),
          ("Random forests",RandomForestClassifier(random_state=0))
         ]
splitters = [("RandSplit",train_test_split),
             ("StratSplit",stratified_split)
            ]
# The original, misspelled name is kept as an alias in case other cells use it.
spliiters = splitters
# Transformers receive only the training split (see evaluate_model), so
# over-sampling never touches the test data.
transformers = [("Identity",lambda X,y: (X,y)),
                ("Over-sample",RandomOverSampler(random_state=0).fit_resample)
               ]

# One row per run: model, splitter, transformer, accuracy, macro-F1.
results = []
# product() varies the right-most iterable fastest, i.e. the same order as
# nested loops over transformers -> splitters -> models.
for transformer, splitter, model in itertools.product(transformers, spliiters, models):
    accuracy, f1_macro = evaluate_model(model,splitter,transformer,X,y)
    results.append((model[0], splitter[0], transformer[0], accuracy, f1_macro))
print('\n'.join([','.join(map(str,res)) for res in results]))