# Support vector machine
Tentativo di fare la previsione con una support vector machine

In [1]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd
from matplotlib import pyplot as plt
from statistics import mean
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
#preprocessing
from sklearn import preprocessing 

#setting up labels for dataset
labels = ['class', 'spec_num', 'eccentr', 'asp_ratio', 'elong', 'solidity', 'stoch_conv', 'iso_factor', 'max_ind_depth', 'lobedness', 'av_intensity', 'av_contr', 'smooth', 'third_mom', 'unif', 'entropy']
#importing data
df = pd.read_csv(r'./leaf/leaf.csv', header = None, names = labels)
#display(df)
#separating y from x
X = df.iloc[:, 2:16]
y = df.iloc[:, 0]

In [2]:
#declaring classifier for 'one versus one' case
clf = svm.SVC(decision_function_shape='ovo')
print(type(clf))

<class 'sklearn.svm._classes.SVC'>


In [3]:
#fitting classifier
clf.fit(X, y)

In [4]:
#Possiamo vedere i support vector se vogliamo
clf.support_vectors_

array([[7.2694e-01, 1.4742e+00, 3.2396e-01, ..., 5.2323e-03, 2.7477e-04,
        1.1756e+00],
       [7.4173e-01, 1.5257e+00, 3.6116e-01, ..., 2.7080e-03, 7.4846e-05,
        6.9659e-01],
       [7.6722e-01, 1.5725e+00, 3.8998e-01, ..., 9.2068e-04, 3.7886e-05,
        4.4348e-01],
       ...,
       [3.5344e-01, 1.0329e+00, 7.8147e-01, ..., 1.6123e-02, 4.5288e-04,
        1.6935e+00],
       [5.9988e-01, 1.1427e+00, 7.1532e-01, ..., 6.1900e-03, 2.6454e-04,
        1.1526e+00],
       [4.7195e-01, 1.0901e+00, 8.5409e-01, ..., 1.3487e-02, 3.2855e-04,
        1.5623e+00]])

## Static test e scaling

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 2:16], df.iloc[:, 0], test_size=0.3) # 70% training and 30% test

scaler_xtr = preprocessing.StandardScaler().fit(X_train)
Xtr_scaled = scaler_xtr.transform(X_train)

scaler_xte = preprocessing.StandardScaler().fit(X_test)
Xte_scaled = scaler_xte.transform(X_test)

#Xtr_scaled.mean(axis=0)
#Xtr_scaled.std(axis=0)
#Xte_scaled.mean(axis=0)
#Xte_scaled.std(axis=0)

Nota: se vuoi ottenere sempre lo stesso split (ad esempio per fare delle prove), aggiungi il parametro "random_state = num" dove num è un intero costante
Nota: to_frame() è un metodo delle variabili di tipo "Series" per convertirle al tipo DataFrame. La "train_test_split", infatti, mi restituisce DataFrame per le X e Series per le Y, visto che quest'ultime sono colonne singole. Le funzioni successive non funzionano con parametri di tipo Series, quindi è necessario aggiungere to_frame(). Altre info qua: https://stackoverflow.com/questions/26047209/what-is-the-difference-between-a-pandas-series-and-a-single-column-dataframe 

In [6]:
stat_clf = svm.SVC(kernel='linear')    #remember that the default kernel is rbf. 
#Try changing kernel between: {‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’} 
stat_clf.fit(X_train, y_train)

scaled_clf = svm.SVC(kernel='linear')
scaled_clf.fit(Xtr_scaled, y_train)

## Fase di prediction

In [7]:
#let's make a prediction on the test data
y_pred = stat_clf.predict(X_test)
y_pred2 = scaled_clf.predict(Xte_scaled)
y_pred - y_pred2

array([  9,   0,  30, -10,   2,   0,  14,   0,   0,  -2,   0, -14,   0,
         0,   0,   0,   0,   0,   0,  -2,   0,   0, -23, -21, -14,   0,
         0,  23,  31,  10,   0,   0,   0,   0,   0, -22,   0,   0,   2,
         6,   0, -15,   0,   0,  -2,   2,   0, -26,  28,  -3,   0, -21,
         0,   5,  25,  18,   0,   0,  14, -21,   0, -21,   6, -21,   0,
       -14,   0,   3,   0,   0,   7,   5,  -5,  25,   0,   0,   0,   0,
         0,  28,   0,  -2,  28,  25,  -2,   0,  -1,  30,  -2,  -2,  25,
         0,   0, -21,   0,   0, -14, -20,  20,  15,   0, -14])

### Valutiamo il nostro modello statico

In [11]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Accuracy with scaling: ", metrics.accuracy_score(y_test, y_pred2))

Accuracy: 0.4215686274509804
Accuracy with scaling:  0.7941176470588235


## Cross validation
Facciamo learning tramite cross validation su un terzo modello "clf_cv".

In [13]:
#print(metrics.get_scorer_names())

<class 'sklearn.svm._classes.SVC'>
<class 'sklearn.svm._classes.SVC'>


In [10]:
clf_cv = svm.SVC(decision_function_shape='ovo')
k = 5
effect_cv = cross_validate(clf_cv, X, y, cv=k, scoring=('accuracy'))
effect_cv
print("Mean test_score:", mean(effect_cv['test_score']))

{'fit_time': array([0.00713038, 0.00707889, 0.00613594, 0.00682688, 0.00631928]),
 'score_time': array([0.00275135, 0.00216913, 0.00234079, 0.00220132, 0.00268078]),
 'test_score': array([0.29411765, 0.30882353, 0.27941176, 0.27941176, 0.29411765])}

Mean test_score: 0.2911764705882353


Proviamo anche a modificare scaled_clf con la CV. Per ogni fold della CV bisogna ricordarsi di riscalare i dati.

In [None]:
scaled_effect_cv = cross_validate(scaled_clf, X, y, cv=k, scoring=('accuracy'))