In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

from svm_helper import *

import warnings; warnings.simplefilter('ignore') 

In [2]:
x_cols = ["Is_Exp","Danceability","Energy","Key","Loudness","Mode","Speechiness","Acousticness","Instrumentalness","Liveness","Valence","Tempo","Time_Signature"]                                                                                           
y_cols = ["Genre"]                                                                                                       
meta_cols = ["Id","Popularity","Name","Artist"] 

p_metrics = ["accuracy", "f1_score", "auroc", "precision", "sensitivity", "specificity"]

In [3]:
df = pd.read_csv("data/lyrical_genius.csv")
df = df[((df["Genre"] == "classical") | (df["Genre"] ==  "country"))]

X,y,meta = df[x_cols],df[y_cols].iloc[:,0],df[meta_cols]

y_dict={"classical":0,"country":1}
y=np.array([y_dict[x] for x in y])

scaler   = StandardScaler()
scaled_X = scaler.fit_transform(X)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(scaled_X,y, test_size=.2, stratify=y)

In [5]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=np.random.randint(1234))

In [None]:
lin_best_params = select_param_linear(X_train, y_train, skf, metrics=p_metrics,plot=True)
print("Best linear params: ", lin_best_params)


In [None]:
lin_svm = SVC(10, kernel='linear')
lin_svm.fit(X_train, y_train)

lin_performance = [performance(y_test, lin_svm.predict(X_test), metric=m, test=True) for m in p_metrics]

lin_performance

In [None]:
rbf_best_params = select_param_rbf(X_train, y_train, skf, p_metrics, plot=False)
print("Best rbf params: ", rbf_best_params)

In [12]:
rbf_svm = SVC(100, kernel='rbf', gamma=0.005, verbose=True)
rbf_svm.fit(X_train, y_train)

rbf_performance = [performance(y_test, rbf_svm.predict(X_test), metric=m, test=True) for m in p_metrics]

rbf_performance

[LibSVM]

[0.8535353535353535,
 0.9131736526946108,
 0.6936677631578947,
 0.8764367816091954,
 0.953125,
 0.4342105263157895]

In [6]:
def gen_df(genre_a, genre_b, df):
    new_df = df[((df["Genre"] == genre_a) | (df["Genre"] ==  genre_b))]
    X,y,meta = new_df[x_cols],new_df[y_cols].iloc[:,0],new_df[meta_cols]
    y_dict={genre_a:0, genre_b:1}
    y=np.array([y_dict[x] for x in y])
    
    scaler = StandardScaler()
    scaled_X = scaler.fit_transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X,y, test_size=.2, stratify=y)
    
    return X_train, X_test, y_train, y_test

In [7]:
df = pd.read_csv("data/lyrical_genius.csv")
genres = df["Genre"].unique()

# create a 2d table of best scores for each pair of genres
lin_best_C = [[-1 for j in genres] for i in genres]
rbf_best_CG = [[-1 for j in genres] for i in genres]


In [8]:
for i in range(len(genres)):
    for j in range(i+1, len(genres)):
        print("Optimizing ", genres[i], genres[j])
        X_train, X_test, y_train, y_test = gen_df(genres[i], genres[j], df)
        lin_best_params = select_param_linear(X_train, y_train, skf, metrics=p_metrics,plot=False)
        rbf_best_params = select_param_rbf(X_train, y_train, skf, p_metrics, plot=False)
        
        lin_best_C[i][j] = lin_best_params[0] # accuracy score
        rbf_best_CG[i][j] = rbf_best_params[0]
        

Optimizing  rnb hiphop
Optimizing  rnb country
Optimizing  rnb classical
Optimizing  rnb edm_dance
Optimizing  rnb rock
Optimizing  rnb pop
Optimizing  hiphop country
Optimizing  hiphop classical
Optimizing  hiphop edm_dance
Optimizing  hiphop rock
Optimizing  hiphop pop
Optimizing  country classical
Optimizing  country edm_dance
Optimizing  country rock
Optimizing  country pop
Optimizing  classical edm_dance
Optimizing  classical rock
Optimizing  classical pop
Optimizing  edm_dance rock
Optimizing  edm_dance pop
Optimizing  rock pop


In [11]:
from collections import Counter
print(Counter([i for j in lin_best_C for i in j]))
print(Counter([i for j in rbf_best_CG for i in j]))

Counter({-1: 28, 10.0: 7, 0.01: 6, 0.1: 3, 100.0: 2, 0.001: 2, 1.0: 1})
Counter({-1: 28, (10.0, 1.0): 2, (1.0, 1.0): 2, (100.0, 0.0017094017094017094): 1, (100.0, 0.0010214504596527069): 1, (100.0, 0.00040096230954290296): 1, (100.0, 0.0010683760683760685): 1, (100.0, 0.0017793594306049821): 1, (100.0, 0.0009587727708533077): 1, (10.0, 0.00039093041438623924): 1, (100.0, 0.001): 1, (100.0, 2.5518276189406855e-06): 1, (100.0, 0.0006234413965087282): 1, (100.0, 0.00033875338753387534): 1, (1.0, 0.0007173601147776184): 1, (1.0, 0.000980392156862745): 1, (100.0, 0.000500751126690035): 1, (0.1, 1.0): 1, (100.0, 0.0005115089514066496): 1, (100.0, 0.0006329113924050633): 1})


In [None]:
rbf_svm = SVC(1, kernel='rbf', gamma=0.4, verbose=True)
rbf_svm.fit(X_train, y_train)

In [None]:
df = pd.read_csv("")