## Support Vector Machine

### Import Data

In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 300)
pd.set_option('display.width', 1000)

# Import data to Pandas DataFrame
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
test_final = pd.read_csv('data/test_final.csv')

# Split X & y for train, test, and test_final
X_train = train.drop('target', axis=1)
y_train = train['target']

X_test = test.drop('target', axis=1)
y_test = test['target']

X_test_final = test_final.drop('target', axis=1)
y_test_final = test_final['target']

# Preivew data
print(train.info())
train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13988 entries, 0 to 13987
Data columns (total 28 columns):
acousticness          13988 non-null float64
danceability          13988 non-null float64
duration_ms           13988 non-null float64
energy                13988 non-null float64
instrumentalness      13988 non-null float64
liveness              13988 non-null float64
loudness              13988 non-null float64
mode_feat             13988 non-null float64
speechiness           13988 non-null float64
tempo                 13988 non-null float64
valence               13988 non-null float64
time_signature_1.0    13988 non-null int64
time_signature_3.0    13988 non-null int64
time_signature_4.0    13988 non-null int64
time_signature_5.0    13988 non-null int64
key_0.0               13988 non-null int64
key_1.0               13988 non-null int64
key_2.0               13988 non-null int64
key_3.0               13988 non-null int64
key_4.0               13988 non-null int64
key_5.0  

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,mode_feat,speechiness,tempo,valence,time_signature_1.0,time_signature_3.0,time_signature_4.0,time_signature_5.0,key_0.0,key_1.0,key_2.0,key_3.0,key_4.0,key_5.0,key_6.0,key_7.0,key_8.0,key_9.0,key_10.0,key_11.0,target
0,0.62,0.465,146494.0,0.469,0.0,0.118,-4.256,1.0,0.0319,69.22,0.518,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,7
1,0.00598,0.489,211885.0,0.641,0.0201,0.122,-7.011,0.0,0.032,108.901,0.587,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1
2,0.249,0.594,283733.0,0.635,0.0,0.454,-4.259,0.0,0.47,75.035,0.552,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,2
3,0.000667,0.674,191250.0,0.869,0.604,0.0651,-4.758,0.0,0.0417,127.802,0.071,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,7
4,0.0288,0.675,231133.0,0.76,2.2e-05,0.0585,-4.435,1.0,0.0513,109.619,0.611,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,4


### Scale Train & Test Data

In [2]:
# Standardize with MinMax Scaler
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
X_train_mmscaled = min_max_scaler.fit_transform(X_train)
X_test_mmscaled = min_max_scaler.transform(X_test)

### Fit SVM Model

In [54]:
from sklearn import svm
svm_clf = svm.SVC(probability=True, random_state=11)
svm_clf.fit(X_train_mmscaled, y_train)

print('Train Score:', svm_clf.score(X_train_mmscaled, y_train))
print('Test Score:', svm_clf.score(X_test_mmscaled, y_test))

Train Score: 0.4789104947097512
Test Score: 0.4792679439519588


### GridSearch to Optimize Parameters

In [40]:
from sklearn.model_selection import GridSearchCV

# Set grid search params
param_grid_svm = [
    #{'C': [12], 'kernel': ['linear']},
    #{'C': [10], 'gamma': [0.1], 'kernel': ['rbf']},
    {'coef0': [25], 'gamma': [0.03], 'degree': [3], 'kernel': ['poly']},
    #{'coef0': [0.01], 'gamma': [0.2], 'kernel': ['sigmoid']}
]

# Construct grid search
gs_svm = GridSearchCV(estimator=svm_clf,
           param_grid=param_grid_svm,
           scoring='accuracy',
           cv=3, verbose=2, return_train_score=True,
                     n_jobs=5)

# Fit using grid search
gs_svm.fit(X_train_mmscaled, y_train)
gs_svm.fit(X_test_mmscaled, y_test)

# Best accuracy
print('Best accuracy: %.3f' % gs_svm.best_score_)

# Best params
print('\nBest params:\n', gs_svm.best_params_)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   3 out of   3 | elapsed:  1.1min finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   3 out of   3 | elapsed:    3.7s finished


Best accuracy: 0.508

Best params:
 {'coef0': 25, 'degree': 3, 'gamma': 0.03, 'kernel': 'poly'}


### Accuracy Score

In [41]:
pred_gs_svm_train = gs_svm.predict(X_train_mmscaled)
pred_gs_svm_test = gs_svm.predict(X_test_mmscaled)

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
def print_metrics(labels, preds):
    print("Accuracy Score: {}".format(accuracy_score(labels, preds)))

print_metrics(y_train, pred_gs_svm_train)
print_metrics(y_test, pred_gs_svm_test)

Accuracy Score: 0.4929225050042894
Accuracy Score: 0.536745782098942


### Omit Key and Time Signature Features from X

In [3]:
# Split X & y for train, test, and test_final
# Omit columns related to time signature and 
X2_train = train.drop(['target', 'mode_feat', 'liveness', 'time_signature_1.0', 'time_signature_3.0', 'time_signature_4.0', 'time_signature_5.0', 'key_0.0', 'key_1.0', 'key_2.0', 'key_3.0', 'key_4.0', 'key_5.0', 'key_6.0', 'key_7.0', 'key_8.0', 'key_9.0', 'key_10.0', 'key_11.0'], axis=1)
y2_train = train['target']

X2_test = test.drop(['target', 'mode_feat', 'liveness', 'time_signature_1.0', 'time_signature_3.0', 'time_signature_4.0', 'time_signature_5.0', 'key_0.0', 'key_1.0', 'key_2.0', 'key_3.0', 'key_4.0', 'key_5.0', 'key_6.0', 'key_7.0', 'key_8.0', 'key_9.0', 'key_10.0', 'key_11.0'], axis=1)
y2_test = test['target']

X2_test_final = test_final.drop(['target', 'mode_feat', 'liveness', 'time_signature_1.0', 'time_signature_3.0', 'time_signature_4.0', 'time_signature_5.0', 'key_0.0', 'key_1.0', 'key_2.0', 'key_3.0', 'key_4.0', 'key_5.0', 'key_6.0', 'key_7.0', 'key_8.0', 'key_9.0', 'key_10.0', 'key_11.0'], axis=1)
y2_test_final = test_final['target']

# Preivew data
print(X2_train.info())
X2_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13988 entries, 0 to 13987
Data columns (total 9 columns):
acousticness        13988 non-null float64
danceability        13988 non-null float64
duration_ms         13988 non-null float64
energy              13988 non-null float64
instrumentalness    13988 non-null float64
loudness            13988 non-null float64
speechiness         13988 non-null float64
tempo               13988 non-null float64
valence             13988 non-null float64
dtypes: float64(9)
memory usage: 983.6 KB
None


Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,loudness,speechiness,tempo,valence
0,0.62,0.465,146494.0,0.469,0.0,-4.256,0.0319,69.22,0.518
1,0.00598,0.489,211885.0,0.641,0.0201,-7.011,0.032,108.901,0.587
2,0.249,0.594,283733.0,0.635,0.0,-4.259,0.47,75.035,0.552
3,0.000667,0.674,191250.0,0.869,0.604,-4.758,0.0417,127.802,0.071
4,0.0288,0.675,231133.0,0.76,2.2e-05,-4.435,0.0513,109.619,0.611


### Scale Train & Test Data

In [4]:
# Standardize with MinMax Scaler
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
X2_train_mmscaled = min_max_scaler.fit_transform(X2_train)
X2_test_mmscaled = min_max_scaler.transform(X2_test)

### Fit SVM Model

In [57]:
from sklearn import svm
svm_clf2 = svm.SVC(probability=True, random_state=11)
svm_clf2.fit(X2_train_mmscaled, y2_train)

print('Train Score:', svm_clf2.score(X2_train_mmscaled, y2_train))
print('Test Score:', svm_clf2.score(X2_test_mmscaled, y2_test))

Train Score: 0.4859879897054618
Test Score: 0.4872748069774092


### GridSearch to Optimize Parameters

In [47]:
from sklearn.model_selection import GridSearchCV

# Set grid search params
param_grid_svm2 = [
    #{'C': [12], 'kernel': ['linear']},
    #{'C': [10], 'gamma': [0.1], 'kernel': ['rbf']},
    {'coef0': [25], 'gamma': [0.03], 'degree': [3], 'kernel': ['poly']},
    #{'coef0': [0.01], 'gamma': [0.2], 'kernel': ['sigmoid']}
]

# Construct grid search
gs_svm2 = GridSearchCV(estimator=svm_clf2,
           param_grid=param_grid_svm2,
           scoring='accuracy',
           cv=3, verbose=2, return_train_score=True,
                     n_jobs=5)

# Fit using grid search
gs_svm2.fit(X2_train_mmscaled, y2_train)
gs_svm2.fit(X2_test_mmscaled, y2_test)

# Best accuracy
print('Best accuracy: %.3f' % gs_svm2.best_score_)

# Best params
print('\nBest params:\n', gs_svm2.best_params_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  10 out of  12 | elapsed:  1.1min remaining:   13.3s
[Parallel(n_jobs=5)]: Done  12 out of  12 | elapsed:  1.4min finished


Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  10 out of  12 | elapsed:    3.8s remaining:    0.8s
[Parallel(n_jobs=5)]: Done  12 out of  12 | elapsed:    4.9s finished


Best accuracy: 0.492

Best params:
 {'coef0': 25, 'degree': 3, 'gamma': 0.03, 'kernel': 'poly'}


### Accuracy Score

In [48]:
pred_gs_svm2_train = gs_svm2.predict(X2_train_mmscaled)
pred_gs_svm2_test = gs_svm2.predict(X2_test_mmscaled)

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
def print_metrics(labels, preds):
    print("Accuracy Score: {}".format(accuracy_score(labels, preds)))

print_metrics(y2_train, pred_gs_svm2_train)
print_metrics(y2_test, pred_gs_svm2_test)

Accuracy Score: 0.49306548470117245
Accuracy Score: 0.5018587360594795


### PCA

In [5]:
from sklearn.decomposition import PCA
def find_PCA_components(X,n_start=2,target=0.8,skip=1):
    curr_target = 0
    n = n_start
    while curr_target < target:
        pca = PCA(n_components=n)
        transformed = pca.fit_transform(X)
        curr_target = np.sum(pca.explained_variance_ratio_)
        n += skip
    print(f"n_component={n}, variance ={curr_target}")
find_PCA_components(X_train_mmscaled,n_start=2,target=0.9,skip=1)

n_component=16, variance =0.912107928212611


In [7]:
pca = PCA(n_components=16)
X_train_transformed = pca.fit_transform(X_train_mmscaled)
X_test_transformed = pca.fit_transform(X_test_mmscaled)

In [8]:
from sklearn import svm
svm_clf3 = svm.SVC(probability=True, random_state=11)
svm_clf3.fit(X_train_transformed, y_train)

print('Train Score:', svm_clf3.score(X_train_transformed, y_train))
print('Test Score:', svm_clf3.score(X_test_transformed, y_test))

Train Score: 0.37782384901344007
Test Score: 0.3228481555619102


In [9]:
from sklearn.model_selection import GridSearchCV

# Set grid search params
param_grid_svm3 = [
    #{'C': [12], 'kernel': ['linear']},
    #{'C': [10], 'gamma': [0.1], 'kernel': ['rbf']},
    {'coef0': [25], 'gamma': [0.03], 'degree': [3], 'kernel': ['poly']},
    #{'coef0': [0.01], 'gamma': [0.2], 'kernel': ['sigmoid']}
]

# Construct grid search
gs_svm3 = GridSearchCV(estimator=svm_clf3,
           param_grid=param_grid_svm3,
           scoring='accuracy',
           cv=3, verbose=2, return_train_score=True,
                     n_jobs=5)

# Fit using grid search
gs_svm3.fit(X_train_transformed, y_train)
gs_svm3.fit(X_test_transformed, y_test)

# Best accuracy
print('Best accuracy: %.3f' % gs_svm3.best_score_)

# Best params
print('\nBest params:\n', gs_svm3.best_params_)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   3 out of   3 | elapsed:   46.4s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   3 out of   3 | elapsed:    4.1s finished


Best accuracy: 0.427

Best params:
 {'coef0': 25, 'degree': 3, 'gamma': 0.03, 'kernel': 'poly'}


In [11]:
pred_gs_svm3_train = gs_svm3.predict(X_train_transformed)
pred_gs_svm3_test = gs_svm3.predict(X_test_transformed)

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
def print_metrics(labels, preds):
    print("Accuracy Score: {}".format(accuracy_score(labels, preds)))

print_metrics(y_train, pred_gs_svm3_train)
print_metrics(y_test, pred_gs_svm3_test)

Accuracy Score: 0.32506434086359737
Accuracy Score: 0.4501000857878181


### PCA -features

In [6]:
from sklearn.decomposition import PCA
def find_PCA_components(X,n_start=2,target=0.8,skip=1):
    curr_target = 0
    n = n_start
    while curr_target < target:
        pca = PCA(n_components=n)
        transformed = pca.fit_transform(X)
        curr_target = np.sum(pca.explained_variance_ratio_)
        n += skip
    print(f"n_component={n}, variance ={curr_target}")
find_PCA_components(X2_train_mmscaled,n_start=2,target=0.8,skip=1)

n_component=5, variance =0.8310453663005558


In [12]:
pca = PCA(n_components=5)
X2_train_transformed = pca.fit_transform(X2_train_mmscaled)
X2_test_transformed = pca.fit_transform(X2_test_mmscaled)

In [13]:
from sklearn import svm
svm_clf4 = svm.SVC(probability=True, random_state=11)
svm_clf4.fit(X2_train_transformed, y2_train)

print('Train Score:', svm_clf4.score(X2_train_transformed, y2_train))
print('Test Score:', svm_clf4.score(X2_test_transformed, y2_test))

Train Score: 0.4047040320274521
Test Score: 0.27194738347154707


In [14]:
from sklearn.model_selection import GridSearchCV

# Set grid search params
param_grid_svm4 = [
    #{'C': [12], 'kernel': ['linear']},
    #{'C': [10], 'gamma': [0.1], 'kernel': ['rbf']},
    {'coef0': [25], 'gamma': [0.03], 'degree': [3], 'kernel': ['poly']},
    #{'coef0': [0.01], 'gamma': [0.2], 'kernel': ['sigmoid']}
]

# Construct grid search
gs_svm4 = GridSearchCV(estimator=svm_clf4,
           param_grid=param_grid_svm4,
           scoring='accuracy',
           cv=3, verbose=2, return_train_score=True,
                     n_jobs=5)

# Fit using grid search
gs_svm4.fit(X2_train_transformed, y2_train)
gs_svm4.fit(X2_test_transformed, y2_test)

# Best accuracy
print('Best accuracy: %.3f' % gs_svm4.best_score_)

# Best params
print('\nBest params:\n', gs_svm4.best_params_)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   3 out of   3 | elapsed:   31.2s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   3 out of   3 | elapsed:    1.5s finished


Best accuracy: 0.432

Best params:
 {'coef0': 25, 'degree': 3, 'gamma': 0.03, 'kernel': 'poly'}


In [15]:
pred_gs_svm4_train = gs_svm4.predict(X2_train_transformed)
pred_gs_svm4_test = gs_svm4.predict(X2_test_transformed)

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
def print_metrics(labels, preds):
    print("Accuracy Score: {}".format(accuracy_score(labels, preds)))

print_metrics(y2_train, pred_gs_svm4_train)
print_metrics(y2_test, pred_gs_svm4_test)

Accuracy Score: 0.25421790105804976
Accuracy Score: 0.43408635973691734
