In [None]:
#imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display
import datetime
import threading
import logging
import time
import itertools
import smtplib
from email.mime.text import MIMEText
from collections import Counter
import csv
import spotipy
from spotipy.oauth2 import SpotifyOAuth
import os
import spotify_utils as spu
from operator import itemgetter
from os.path import join as pj

from sklearn.linear_model import LinearRegression,LogisticRegression

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,f1_score

from imblearn.over_sampling import RandomOverSampler,SMOTE
from imblearn.under_sampling import RandomUnderSampler,TomekLinks
from imblearn.pipeline import Pipeline


from math import sqrt

In [None]:
# constants
# When True, the next cell re-downloads the playlists through spotify_utils and
# overwrites the CSV caches; when False, the notebook works from the CSV files
# already on disk.
REFRESH_CACHE = True
# Spotify playlist ids; the trailing comment on each line is the playlist's name.
seen_playlist_id = "5al4jEBoq01LPmFDuGDnq4"      # Automated: Reviewed Items
played_playlist_id = "7EHT9D4ygqDlyGfqcFvkUv"    # 5 Esh Played
inbox_playlist_id = "1xsuqA0HU4bSosdaPyVlWG"     # 1 Esh Review


In [None]:
if REFRESH_CACHE:
    # Download all three playlists first; the CSV caches are only rewritten
    # once every fetch has succeeded.
    sp = spu.spotify_connect()
    seen_tracks = spu.get_playlist_tracks(sp, seen_playlist_id, audio_features=True)
    played_tracks = spu.get_playlist_tracks(sp, played_playlist_id)
    review_tracks = spu.get_playlist_tracks(sp, inbox_playlist_id, audio_features=True)

    # Each frame is written without its index so the files round-trip cleanly
    # through pd.read_csv in the following cells.
    _cache_targets = (
        (seen_tracks, 'seen_tracks.csv'),
        (played_tracks, 'played_tracks.csv'),
        (review_tracks, 'review_tracks.csv'),
    )
    for _frame, _csv_path in _cache_targets:
        _frame.to_csv(_csv_path, index=False)

In [None]:
# Load the cached exports and label every seen track with played = 1 when its
# id also occurs in the played playlist, 0 otherwise.
seen_tracks = pd.read_csv('seen_tracks.csv')
# De-duplicate the ids first: an id that occurs more than once in the played
# playlist would otherwise fan out into duplicate seen rows in the left merge.
played_tracks = pd.read_csv('played_tracks.csv')["id"].drop_duplicates()
# indicator="played" adds a column holding "both" for ids found in both inputs
# and "left_only" for ids found only in seen_tracks.
seen_tracks = seen_tracks.merge(played_tracks,how='left',on=["id"],indicator="played")
# Vectorised replacement for the per-row lambda: "both" -> 1, anything else -> 0.
seen_tracks["played"] = (seen_tracks["played"] == "both").astype(int)
# Drop rows with any missing value so the model only sees complete rows.
seen_tracks = seen_tracks.dropna()

In [None]:
# Set up the SQL cell magic against an in-memory DuckDB database.
# NOTE(review): pandas is already imported in the first cell; consider moving
# these imports up there so all dependencies are declared in one place.
import duckdb
import pandas as pd
import sqlalchemy
# No need to import duckdb_engine
#  SQLAlchemy will auto-detect the driver needed based on your connection string!

# Import ipython-sql Jupyter extension to create SQL cells
%load_ext sql
# Later cells index the query results like DataFrames (e.g. t1["played"]),
# which relies on autopandas being enabled here.
%config SqlMagic.autopandas = True
# Presumably these two switches silence the extension's status and connection
# output -- confirm against the ipython-sql documentation.
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False
# Connect to an in-memory database, then run a trivial query as a smoke test.
%sql duckdb:///:memory:
%sql SELECT 'Off and flying!' as a_duckdb_column

In [None]:
# Query the seen_tracks DataFrame through the SQL magic and store the result
# as t0, the starting table for the SQL cells that follow.
%sql t0 << select * from seen_tracks
# Row count of t0, as a reference point for the filtering steps below.
%sql select count(*) from t0

In [None]:
%%sql t1 << 
select * 
from (
    -- Number the rows inside every (artist_id, track_name) group in
    -- playlist_offset order, so the outer filter keeps only the first
    -- occurrence of each track and drops later duplicates.
    select *,row_number() over (partition by artist_id,track_name order by playlist_offset) as duplicate_index
    from t0
)
where duplicate_index = 1

In [None]:
# Class balance of the played label after removing duplicate tracks.
t1["played"].value_counts()

In [None]:
%%sql t2 <<
select * from (
    -- artist_played is 1 when at least one of the artist's tracks in t1 has
    -- played = 1, and 0 otherwise.
    select *,max(played) over (partition by artist_id) as artist_played from t1
)
-- Keep every played track, plus all tracks of artists that were never played.
-- Unplayed tracks of artists with at least one played track are dropped.
where played = 1 or artist_played = 0

In [None]:
# Class balance of the played label after the artist-level filter in t2.
t2["played"].value_counts()

In [None]:
%%sql t3 <<
select *,coalesce(sum(played) over (
    partition by artist_id
    order by playlist_offset
    -- The frame ends one row before the current row, so only the artist's
    -- earlier rows (by playlist_offset) are summed.
    rows between unbounded preceding and 1 preceding 
    -- The first row of each artist has an empty frame, which sums to NULL,
    -- so coalesce maps it to 0.
    ),0) as previous_artist_plays
from t2
-- NOTE(review) t2 keeps only played rows for any artist with at least one
-- play, so every row with a nonzero previous_artist_plays also has played = 1.
-- Confirm this is intended before relying on the column as a model feature.

In [None]:
# Distribution of the previous_artist_plays feature computed in t3.
t3["previous_artist_plays"].value_counts()

In [None]:

# Model inputs: track duration, the artist-history feature from t3 and the
# audio features listed in spotify_utils. The target is the played flag.
feature_columns = ["duration","previous_artist_plays"] + spu.audio_features_to_use
X = t3[feature_columns]
y = t3["played"]

In [None]:

# Hold out a stratified 25% test split; the grid search only ever sees the
# training part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=0
)

# The active grid is a single empty dict, so the search cross-validates the
# pipeline exactly as configured below. The settings explored earlier are kept
# as comments for reference.
param_grid = [
    {
        # 'classifier': [RandomForestClassifier(random_state=0)],
        # 'classifier__max_features': [1, int(sqrt(len(spu.audio_features_to_use)+1)), len(spu.audio_features_to_use)+1],
        # 'classifier__n_estimators': [1, 10, 100],
        # 'classifier__min_samples_split': [2, 5, 10],
        # 'oversampler': [RandomOverSampler(random_state=0),SMOTE()],
        # 'oversampler__sampling_strategy': [0.1,0.3,0.5],
        # 'undersampler': [RandomUnderSampler(random_state=0)],
        # 'undersampler__sampling_strategy': [0.5,0.7,0.9],
    },
    # {
    #     # 'classifier': [RandomForestClassifier(random_state=0)],
    #     # 'classifier__max_features': [1, int(sqrt(len(spu.audio_features_to_use)+1)), len(spu.audio_features_to_use)+1],
    #     # 'classifier__n_estimators': [1, 10, 100],
    #     # 'classifier__min_samples_split': [2, 5, 10],
    #     'oversampler': [RandomOverSampler(random_state=0),SMOTE()],
    #     'oversampler__sampling_strategy': [0.1,0.3,0.5],
    #     'undersampler': [TomekLinks(sampling_strategy="majority")],
    # },
]

# One-step imblearn pipeline: a 10-tree random forest with a fixed seed.
pipe = Pipeline(
    steps=[('classifier', RandomForestClassifier(n_estimators=10, random_state=0))],
    memory="cache_folder",
)

# 5-fold cross-validation scored with macro-averaged F1.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="f1_macro",
    cv=5,
    verbose=3,
)
grid_search.fit(X_train, y_train)

print("Best params:\n{}\n".format(grid_search.best_params_))
print("Best cross-validation f1_macro score: {:.2f}".format(grid_search.best_score_))

# Later cells predict through the fitted search object itself.
chosen_model = grid_search

In [None]:
# Score the chosen model on the held-out split and print the per-class
# precision / recall / F1 report.
y_pred = chosen_model.predict(X_test)
y_true = y_test
report_text = classification_report(y_true, y_pred)
print(report_text)

In [None]:
# Score the review inbox with the chosen model and rank the tracks by their
# estimated probability of being played.
review_tracks = pd.read_csv("review_tracks.csv").dropna()

# review_tracks.csv is written straight from spu.get_playlist_tracks and no
# cell in this notebook adds a `played` column to it, so a window sum over
# review_tracks.played has nothing to read. For an unreviewed track every row
# of the training table counts as "previous", so the feature is the artist's
# total number of played tracks in t3 (0 for artists not present in t3).
artist_play_counts = t3.groupby("artist_id")["played"].sum()
review_tracks2 = review_tracks.reset_index(drop=True)
review_tracks2["previous_artist_plays"] = (
    review_tracks2["artist_id"].map(artist_play_counts).fillna(0)
)

# Same columns, in the same order, as the training matrix X.
X_pred = review_tracks2[["duration","previous_artist_plays"] + spu.audio_features_to_use]

# predict_proba returns one column per class in chosen_model.classes_ order.
# Label the columns by class so the probability of class 1 (played) is picked
# explicitly instead of by position.
y_predict_proba = pd.DataFrame(
    chosen_model.predict_proba(X_pred),
    columns=chosen_model.classes_,
    index=review_tracks2.index,
)
# "prediction" holds the predicted class label (it previously held the
# probability of the first class under a misleading name).
review_tracks2["prediction"] = chosen_model.predict(X_pred)
review_tracks2["probability"] = y_predict_proba[1]

# Most promising tracks first; only the URIs are exported.
res = review_tracks2.sort_values("probability",ascending=False)
res["uri"].to_csv("review_ranked_output.csv",index=False)
res