In [1]:
import numpy as np
import pandas as pd
import os

os.chdir("..")
data = np.loadtxt(('data/preprocessed-dataset.csv'), delimiter=',', skiprows=1)
os.chdir("methods")
x = data[:,1:35]
y = data[:,35]

#print(x, y)

In [2]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=0.2)

#print(x_test, y_test)

In [3]:
from sklearn import svm

model_linear = svm.SVC(kernel='linear')
model_linear.fit(x_train, y_train)

linear_pred = model_linear.predict(x_test)

In [4]:
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

def model_info(model_pred):
    print("Accuracy:", metrics.accuracy_score(y_test, model_pred))
    print("Precision:", metrics.precision_score(y_test, model_pred))
    print("Recall:", metrics.recall_score(y_test, model_pred), end="\n\n")
    print(classification_report(y_test, model_pred))

model_info(linear_pred)

Accuracy: 0.9050966608084359
Precision: 0.8913043478260869
Recall: 0.9379084967320261

              precision    recall  f1-score   support

         0.0       0.92      0.87      0.89       263
         1.0       0.89      0.94      0.91       306

    accuracy                           0.91       569
   macro avg       0.91      0.90      0.90       569
weighted avg       0.91      0.91      0.90       569



In [5]:
# training the model with the RBF kernel
model_rbf = svm.SVC(kernel='rbf')
model_rbf.fit(x_train, y_train)

rbf_pred = model_rbf.predict(x_test)

model_info(rbf_pred)

Accuracy: 0.8804920913884007
Precision: 0.8695652173913043
Recall: 0.9150326797385621

              precision    recall  f1-score   support

         0.0       0.89      0.84      0.87       263
         1.0       0.87      0.92      0.89       306

    accuracy                           0.88       569
   macro avg       0.88      0.88      0.88       569
weighted avg       0.88      0.88      0.88       569



In [6]:
# TODO
#feature importance https://stackoverflow.com/questions/41592661/determining-the-most-contributing-features-for-svm-classifier-in-sklearn
#scaling https://stats.stackexchange.com/questions/60514/data-processing-before-applying-svm
#hyperparameter tuning https://www.geeksforgeeks.org/svm-hyperparameter-tuning-using-gridsearchcv-ml/
#model tuning https://stackoverflow.com/questions/39001936/techniques-to-improve-the-accuracy-of-svm-classifier
#model tuning https://www.datacamp.com/tutorial/svm-classification-scikit-learn-python

In [7]:
from sklearn.model_selection import GridSearchCV

# Applying grid search for the linear model
# This block is reall resource heavy as we have a quite big grid to search
parameters = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear', 'rbf']} 

model_grid = GridSearchCV(svm.SVC(), parameters, refit = True, verbose = 3)
model_grid.fit(x_train, y_train)
grid_pred = model_grid.predict(x_test)

model_info(grid_pred)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.927 total time=   0.2s
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.901 total time=   0.1s
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.892 total time=   0.1s
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.894 total time=   0.1s
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.896 total time=   0.1s
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.510 total time=   0.5s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.510 total time=   0.6s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.510 total time=   0.5s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.509 total time=   0.6s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.509 total time=   0.5s
[CV 1/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.927 total time=   0.1s
[CV 2/5] END ...C=0.1, gamma=0.1, kernel=linear

In [None]:
'''
NOTES on 

The C parameter tells the SVM optimization how much you want to avoid misclassifying each training example.
For large values of C, the optimization will choose a smaller-margin hyperplane if that hyperplane does a better job of 
getting all the training points classified correctly. Conversely, a very small value of C will cause the optimizer to look
for a larger-margin separating hyperplane, even if that hyperplane misclassifies more points. For very tiny values of C,
you should get misclassified examples, often even if your training data is linearly separable.

The gamma parameter defines how far the influence of a single training example reaches, with low values meaning 'far' 
and high values meaning 'close'. The gamma parameters can be seen as the inverse of the radius of influence of samples
selected by the model as support vectors.

'''

In [None]:
'''
NOTES on scaling

The main advantage of scaling is to avoid attributes in greater numeric
ranges dominating those in smaller numeric ranges. Another advantage is to avoid
numerical difficulties during the calculation. Because kernel values usually depend on
the inner products of feature vectors, e.g. the linear kernel and the polynomial kernel, 
large attribute values might cause numerical problems.
'''

In [None]:
model_grid.best_params_

In [None]:
# See if scaling has any benefit:
from sklearn.preprocessing import MinMaxScaler

#by default MinMaxScaler scale the features between 0 and 1
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.fit_transform(x_test)
#print(x_train_scaled, y_train)

In [None]:
# refit both the linear model and the rfb mode 

model_linear_scale = svm.SVC(kernel='linear')
model_linear_scale.fit(x_train_scaled, y_train)

model_rbf_scale = svm.SVC(kernel='rbf')
model_rbf_scale.fit(x_train_scaled, y_train)

#linear_scale_pred = model_linear_scale(x_test_scaled)
rbf_scale_pred = model_rbf_scale.predict(x_test_scaled)

#model_info(linear_scale_pred)
model_info(rbf_scale_pred)

Accuracy: 0.8980667838312829
Precision: 0.8734939759036144
Recall: 0.9477124183006536

              precision    recall  f1-score   support

         0.0       0.93      0.84      0.88       263
         1.0       0.87      0.95      0.91       306

    accuracy                           0.90       569
   macro avg       0.90      0.89      0.90       569
weighted avg       0.90      0.90      0.90       569

