# Support vector machine
Tentativo di fare la previsione con una support vector machine

In [None]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd
from matplotlib import pyplot as plt
from statistics import mean
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
# Notebook-wide display option: with 'all', every top-level expression
# statement in a cell is echoed as output, not only the last one. Bare
# expressions further down (e.g. `clf.fit(X, y)`, `effect_cv`) therefore
# all produce visible output.
InteractiveShell.ast_node_interactivity = 'all'
#pipelines for scaling data
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Column names for the leaf dataset; they are supplied explicitly because
# the CSV is read with header=None.
labels = ['class', 'spec_num', 'eccentr', 'asp_ratio', 'elong', 'solidity', 'stoch_conv', 'iso_factor', 'max_ind_depth', 'lobedness', 'av_intensity', 'av_contr', 'smooth', 'third_mom', 'unif', 'entropy']

# Load the raw table.
df = pd.read_csv(r'./leaf/leaf.csv', header = None, names = labels)

# Shuffle the rows. The seed is fixed so that "Restart Kernel -> Run All"
# always produces the same row order; an unseeded shuffle makes every
# downstream score change from run to run.
# reset_index(drop=True) discards the old index directly instead of adding
# it as an extra column that then has to be sliced away by position.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Separate target and features by column name rather than by position.
# The features are every column after 'class' and 'spec_num' (the same 14
# columns the positional slice 2:16 selected); the target is 'class'.
X = df[labels[2:]]
y = df['class']

In [None]:
# Support vector classifier exposing the one-versus-one decision function.
# No kernel is given, so the library default is used.
clf = svm.SVC(decision_function_shape='ovo')
# Train on the complete dataset (no hold-out yet).
clf.fit(X=X, y=y)

## Static test

In [None]:
# Hold out 30% of the samples for the static test (70% training / 30% test).
# X and y are the feature/target frames built when the data was loaded, so
# the column selection is not repeated here.
# random_state pins the partition: without it the split, and every accuracy
# reported below, changes on each kernel restart.
# NOTE(review): consider adding stratify=y so every class is represented in
# both partitions -- confirm first that each class has at least two samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
### Fit scaled and non-scaled classifiers for the training data

In [None]:
from sklearn.pipeline import Pipeline

# Baseline model: linear-kernel SVC fitted on the raw (unscaled) features.
# The kernel is named explicitly because the default kernel is rbf.
stat_clf = svm.SVC(kernel='linear')
stat_clf.fit(X=X_train, y=y_train)

# Scaled model: a StandardScaler step feeds a linear-kernel SVC, so the only
# preprocessing difference from the baseline is the scaling step.
scaled_clf = Pipeline(steps=[
    ('scaling', StandardScaler()),
    ('SVM', svm.SVC(decision_function_shape='ovo', kernel='linear')),
])
scaled_clf.fit(X=X_train, y=y_train)

In [None]:
# Predict the held-out samples with both fitted models, unscaled first and
# scaled second, so the two prediction vectors can be compared.
y_pred, y_pred_scaled = (
    model.predict(X_test) for model in (stat_clf, scaled_clf)
)
# Uncomment to inspect the element-wise difference between the two vectors:
#y_pred - y_pred_scaled

Possiamo vedere che le previsioni sono diverse. Proviamo a confrontare i parametri di stat_clf e scaled_clf, per vedere se con lo scaling cambiano solo i dati o anche i classificatori.

### Valutiamo il nostro modello statico

In [None]:
# Plain accuracy of the two static models on the held-out split.
print("Accuracy clf:", metrics.accuracy_score(y_test, y_pred))
print("Accuracy scaled_clf:", metrics.accuracy_score(y_test, y_pred_scaled))

# Classes present in the test split versus classes actually predicted;
# keep this in mind when reading the warnings below.
print(y_test.sort_values().unique())
# to_numpy has to be called: without the parentheses the bound method
# itself was stored instead of the array of true labels.
y_compare = y_test.to_numpy()
# np.unique already returns its values sorted, so no separate sort is
# needed. The former bare `y_pred.sort` was only an attribute access: it
# sorted nothing and just echoed a method repr into the cell output.
print(np.unique(y_pred))

## Cross validation
Facciamo learning tramite cross validation su un terzo modello "clf_cv"

In [None]:
#per stampare tutti i nomi degli indici di valutazione
#print(metrics.get_scorer_names())

In [None]:
# Number of cross-validation folds.
k = 5
# Third model: an SVC with the one-versus-one decision function, evaluated
# by k-fold cross validation on the whole dataset.
clf_cv = svm.SVC(decision_function_shape='ovo')
effect_cv = cross_validate(estimator=clf_cv, X=X, y=y, scoring='accuracy', cv=k)
# Show the full result dictionary.
effect_cv
# Average the per-fold accuracies into one summary number.
fold_scores = effect_cv['test_score']
print("Mean test_score:", mean(fold_scores))

In [None]:
#pipelines for scaling data
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Scaling and classifier are bundled in one pipeline so that the grid search
# tunes the SVM step through the 'SVM__' parameter prefix.
pipe = Pipeline([('scaling', StandardScaler()),
                 ('SVM', svm.SVC(decision_function_shape='ovo'))])

# Range of the regularization parameter C: powers of ten from 1e-10 to 1e11.
exp = np.arange(-10, 12)
reg_param = 10.**exp

# The grid is a list of sub-grids so that only meaningful combinations are
# evaluated:
#   * 'degree' is read by the polynomial kernel only, so it is searched
#     exclusively together with kernel='poly';
#   * 'decision_function_shape' is no longer searched: it only changes the
#     shape of decision_function's output, not the fitted model nor (with the
#     default break_ties=False) the predictions, so 'ovo' and 'ovr'
#     candidates always scored identically.
# This cuts the search from 528 candidates to 132 without dropping any
# distinct model.
grid_param = [
    {'SVM__C': reg_param,
     'SVM__kernel': ['linear', 'rbf', 'sigmoid']},
    {'SVM__C': reg_param,
     'SVM__kernel': ['poly'],
     'SVM__degree': np.arange(2, 5)},
]

# Model selection is driven by balanced accuracy, on the training split only.
clf_cv = GridSearchCV(pipe, grid_param, scoring='balanced_accuracy', n_jobs=4)
clf_cv.fit(X_train, y_train)

# Evaluate the refitted best estimator on the held-out test split.
predictions = clf_cv.predict(X_test)
print("Accuracy on predictions: ", accuracy_score(y_test, predictions))
print("Best params: ", clf_cv.best_params_)
# best_score_ is the mean cross-validated value of the `scoring` metric
# (balanced accuracy), not plain accuracy, so the label says so.
print("Best mean CV balanced accuracy: ", clf_cv.best_score_)