# Support vector machine
Tentativo di fare la previsione con una support vector machine

In [176]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd
from matplotlib import pyplot as plt
from statistics import mean
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
#pipelines for scaling data
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

#setting up labels for dataset
labels = ['class', 'spec_num', 'eccentr', 'asp_ratio', 'elong', 'solidity', 'stoch_conv', 'iso_factor', 'max_ind_depth', 'lobedness', 'av_intensity', 'av_contr', 'smooth', 'third_mom', 'unif', 'entropy']
#importing data
df = pd.read_csv(r'./leaf/leaf.csv', header = None, names = labels)
#display(df)
#separating y from x
X = df.iloc[:, 2:16]
y = df.iloc[:, 0]

In [177]:
#declaring classifier for 'one versus one' case
clf = svm.SVC(decision_function_shape='ovo')
print(type(clf))

<class 'sklearn.svm._classes.SVC'>


In [178]:
#fitting classifier
clf.fit(X, y)

## Static test

In [179]:
#dividing the dataset for a static test
from sklearn import preprocessing

X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 2:16], df.iloc[:, 0], test_size=0.3) # 70% training and 30% test

Nota: se vuoi ottenere sempre lo stesso split (ad esempio per fare delle prove), aggiungi il parametro "random_state = num" dove num è un intero costante
Nota: to_frame() è un metodo delle variabili di tipo "Series" per convertirle al tipo DataFrame. La "train_test_split", infatti, mi restituisce DataFrame per le X e Series per le Y, visto che quest'ultime sono colonne singole. Le funzioni successive non funzionano con parametri di tipo Series, quindi è necessario aggiungere to_frame(). Altre info qua: https://stackoverflow.com/questions/26047209/what-is-the-difference-between-a-pandas-series-and-a-single-column-dataframe 

In [180]:
scaled_clf = make_pipeline(StandardScaler(), svm.SVC(kernel='linear'))
scaled_clf.fit(X_train, y_train)

stat_clf = svm.SVC(kernel='linear')    #remember that the default kernel is rbf. 
#Try changing kernel between: {‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’} 
print(type(y_train))
stat_clf.fit(X_train, y_train)

<class 'pandas.core.series.Series'>


In [181]:
#let's make a prediction on the test data with both classifiers
y_pred = stat_clf.predict(X_test)
y_pred_scaled = scaled_clf.predict(X_test)
y_pred - y_pred_scaled

array([  0,   0,  15,   0,  14,  14,   0,  -7, -20,  16,   0,   0,   4,
       -21,  23,   0,  28,  20,  28,   0,   0,   0,   0,   0,   0,   0,
         0,  -2,  30,   0,   0,   0,   0,   0,   0,   0,  14,  30,  11,
       -15,   0,   0,  28, -21,   0,   0,   6,   0,   0,   0,   0, -16,
         0,   0, -21,   0,   0,  -2,  14,   5,   6,   0, -20,  -6,  16,
         0,  17,  14,   0,   0,   0,   0,   0,   5, -21,   0,   1,   0,
        -2,   0,  30,   9,  20,  28,   0,   0,   0,   0, -25,   0, -20,
         6,  28,   0,  -1,  -5,  16,   0,  16,  14,   0,  -7])

Possiamo vedere che le previsioni sono diverse. Proviamo a confrontare i parametri di stat_clf e scaled_clf, per vedere se con lo scaling cambiano solo i dati o anche i classificatori.

### Secondo me
stat_clf e what_is_this sono probabilmente lo stesso classifier SVM, perchè hanno sia lo stesso "type" che le stesse previsioni nel test set.

### Valutiamo il nostro modello statico

In [182]:
print("Accuracy clf:", metrics.accuracy_score(y_test, y_pred))
print("Accuracy scaled_clf:", metrics.accuracy_score(y_test, y_pred_scaled))

Accuracy clf: 0.4411764705882353
Accuracy scaled_clf: 0.7941176470588235


## Cross validation
Facciamo learning tramite cross validation su un terzo modello "clf_cv"

In [183]:
#print(metrics.get_scorer_names())

In [184]:
clf_cv = svm.SVC(decision_function_shape='ovo')
k = 5
effect_cv = cross_validate(clf_cv, X, y, cv=k, scoring=('accuracy'))
effect_cv
print("Mean test_score:", mean(effect_cv['test_score']))

{'fit_time': array([0.00744104, 0.00763512, 0.00631475, 0.00648713, 0.00655746]),
 'score_time': array([0.00231051, 0.00226092, 0.00267792, 0.00200844, 0.00266194]),
 'test_score': array([0.29411765, 0.30882353, 0.27941176, 0.27941176, 0.29411765])}

Mean test_score: 0.2911764705882353


In [185]:
#pipelines for scaling data
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

pipe = Pipeline([('scaling', StandardScaler()),
                 ('SVM', svm.SVC(decision_function_shape='ovo'))])

# building the range of the regularization parameter (C)
exp = np.arange(-10, 12)
reg_param = 10.**exp

grid_param = {'SVM__C': reg_param,
              'SVM__kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 
              'SVM__degree': np.arange(2, 5),
              'SVM__decision_function_shape': ['ovo', 'ovr']}

clf_cv = GridSearchCV(pipe, grid_param, verbose = 4 ,scoring='balanced_accuracy', n_jobs=2)
clf_cv.fit(X,y)

predictions = clf_cv.predict(X_test)
print("Accuracy on predictions: ", accuracy_score(y_test, predictions))
print("Best params: ", clf_cv.best_params_)
print("Accuracy scores: ", clf_cv.best_score_)

Fitting 5 folds for each of 528 candidates, totalling 2640 fits
[CV 2/5] END SVM__C=1e-10, SVM__decision_function_shape=ovo, SVM__degree=2, SVM__kernel=linear;, score=0.033 total time=   0.0s
[CV 1/5] END SVM__C=1e-10, SVM__decision_function_shape=ovo, SVM__degree=2, SVM__kernel=linear;, score=0.033 total time=   0.0s
[CV 3/5] END SVM__C=1e-10, SVM__decision_function_shape=ovo, SVM__degree=2, SVM__kernel=linear;, score=0.033 total time=   0.0s
[CV 4/5] END SVM__C=1e-10, SVM__decision_function_shape=ovo, SVM__degree=2, SVM__kernel=linear;, score=0.033 total time=   0.0s
[CV 5/5] END SVM__C=1e-10, SVM__decision_function_shape=ovo, SVM__degree=2, SVM__kernel=linear;, score=0.033 total time=   0.0s
[CV 2/5] END SVM__C=1e-10, SVM__decision_function_shape=ovo, SVM__degree=2, SVM__kernel=poly;, score=0.033 total time=   0.0s
[CV 1/5] END SVM__C=1e-10, SVM__decision_function_shape=ovo, SVM__degree=2, SVM__kernel=poly;, score=0.033 total time=   0.0s
[CV 3/5] END SVM__C=1e-10, SVM__decision_fun

Accuracy on predictions:  1.0
Best params:  {'SVM__C': 100.0, 'SVM__decision_function_shape': 'ovo', 'SVM__degree': 2, 'SVM__kernel': 'linear'}
Accuracy scores:  0.7988888888888888


**Io penso che nel blocco precedente manchi la parte in cui lo scaler viene fittato sulla k-fold**