## 2. Classifier Training

### 2.0 Introduction
In this notebook we have the code necessary to train three different classifiers, as well as to display metrics and graphs about them. The data we use is created by the notebook "1DataProcess".

In [None]:
import numpy as np
from numpy import asarray
import matplotlib.pyplot as plt
import glob
import os
import pandas as pd
import cv2
from skimage.filters import sobel
from skimage.feature import hog
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import time
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import joblib
import urllib.request
import validators

#### 2.1 Metric Functions

In this section we define several functions used to create different accuracy-measuring graphs, like ROC curves, confusion matrices, precision-recall curves, etc. At the end there is also a function for plotting the PCA-reduced feature matrix and functions for evaluating models.

In [None]:
# ROC curve display function
def createRocCurve(y_scores, y_onehot):
    """Build a Plotly figure holding one ROC curve per class.

    Parameters
    ----------
    y_scores : 2-D array, indexed as ``y_scores[:, i]``
        Per-class scores; column ``i`` is paired with column ``i`` of
        ``y_onehot``.
    y_onehot : pandas.DataFrame
        Indicator columns of the true labels. The column names are used
        for the legend entries.

    Returns
    -------
    The Plotly figure. It is returned, not shown.
    """
    # NOTE(review): this resets NumPy's *global* RNG although nothing in this
    # function draws random numbers; kept to preserve existing behaviour.
    np.random.seed(0)

    roc_fig = go.Figure()
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
    roc_fig.add_shape(type='line', line=dict(dash='dash'), x0=0, x1=1, y0=0, y1=1)

    n_classes = y_scores.shape[1]
    for class_idx in range(n_classes):
        class_truth = y_onehot.iloc[:, class_idx]
        class_score = y_scores[:, class_idx]

        false_pos, true_pos, _ = roc_curve(class_truth, class_score)
        auc_value = roc_auc_score(class_truth, class_score)

        legend = f"{y_onehot.columns[class_idx]} (AUC={auc_value:.2f})"
        roc_fig.add_trace(
            go.Scatter(x=false_pos, y=true_pos, name=legend, mode='lines')
        )

    roc_fig.update_layout(
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        yaxis=dict(scaleanchor="x", scaleratio=1),
        xaxis=dict(constrain='domain'),
        width=700, height=500
    )

    return roc_fig


### Confusion matrix function
def conf_m(cm, classes_dict):
    """Render a confusion matrix as an annotated Plotly heatmap.

    Parameters
    ----------
    cm : 2-D array
        Confusion matrix; its rows are reversed before plotting.
    classes_dict : sequence of class labels
        Used as-is for the x axis and reversed for the y axis. Despite the
        name, ``display_metrics`` passes a plain list.

    Returns
    -------
    The Plotly figure. It is returned, not shown.
    """
    # Rows and y labels are both reversed so they stay aligned.
    rows = cm[::-1]
    x_labels = classes_dict
    y_labels = classes_dict[::-1].copy()

    # Cell annotations must be strings.
    cell_text = [[str(value) for value in row] for row in rows]

    heatmap = ff.create_annotated_heatmap(
        rows,
        x=x_labels,
        y=y_labels,
        annotation_text=cell_text,
        colorscale='Viridis',
    )

    # Axis titles are placed as paper-anchored annotations.
    title_style = dict(
        font=dict(color="black", size=14),
        showarrow=False,
        xref="paper",
        yref="paper",
    )
    heatmap.add_annotation(dict(title_style, x=0.5, y=-0.15, text="Predicted value"))
    heatmap.add_annotation(
        dict(title_style, x=-0.35, y=0.5, text="Real value", textangle=-90)
    )

    # Extra left margin leaves room for the rotated y-axis title.
    heatmap.update_layout(
        margin=dict(t=50, l=200),
        width=700, height=500
    )

    # Show the colour bar next to the heatmap.
    heatmap.data[0].showscale = True
    return heatmap


# Precision-recall curves
def pr_rec_curve(y_onehot, y_scores):
    """Build a Plotly figure holding one precision-recall curve per class.

    Parameters
    ----------
    y_onehot : pandas.DataFrame
        Indicator columns of the true labels. The column names are used
        for the legend entries.
    y_scores : 2-D array, indexed as ``y_scores[:, i]``
        Per-class scores; column ``i`` is paired with column ``i`` of
        ``y_onehot``.

    Returns
    -------
    The Plotly figure. It is returned, not shown.
    """
    # NOTE(review): this resets NumPy's *global* RNG although nothing in this
    # function draws random numbers; kept to preserve existing behaviour.
    np.random.seed(0)

    pr_fig = go.Figure()
    # Dashed anti-diagonal from (0, 1) to (1, 0) as a visual reference.
    pr_fig.add_shape(type='line', line=dict(dash='dash'), x0=0, x1=1, y0=1, y1=0)

    n_classes = y_scores.shape[1]
    for class_idx in range(n_classes):
        class_truth = y_onehot.iloc[:, class_idx]
        class_score = y_scores[:, class_idx]

        precision, recall, _ = precision_recall_curve(class_truth, class_score)
        avg_precision = average_precision_score(class_truth, class_score)

        legend = f"{y_onehot.columns[class_idx]} (AP={avg_precision:.2f})"
        pr_fig.add_trace(
            go.Scatter(x=recall, y=precision, name=legend, mode='lines')
        )

    pr_fig.update_layout(
        xaxis_title='Recall',
        yaxis_title='Precision',
        yaxis=dict(scaleanchor="x", scaleratio=1),
        xaxis=dict(constrain='domain'),
        width=700, height=500
    )

    return pr_fig

def plot_pca(feature_matrix_pca):
    """Return a 2-D scatter of the 'pca_1' and 'pca_2' columns, coloured by 'painter'.

    ``feature_matrix_pca`` must provide the columns 'pca_1', 'pca_2' and
    'painter'. 'painter' is also added to the hover data.
    """
    return px.scatter(
        feature_matrix_pca,
        x="pca_1",
        y="pca_2",
        color="painter",
        hover_data=['painter'],
    )

### evaluate model function, prints inference time and accuracy
def evaluate_model(model, test_labels, y_true=None):
    """Time ``model.predict`` on the test set and report its accuracy.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    test_labels : array-like
        The test *feature matrix*. The name is historical: every call site in
        this notebook passes ``X_test`` here. The previous implementation
        ignored this argument and read the global ``X_test`` instead.
    y_true : array-like, optional
        Ground-truth labels for ``test_labels``. Defaults to the notebook-level
        ``y_test`` so existing two-argument calls keep working.

    Returns
    -------
    float
        Accuracy on the test set, in percent.
    """
    features = test_labels
    labels = y_test if y_true is None else y_true

    start = time.time()
    prediction = model.predict(features)
    elapsed = time.time() - start

    # Compute the score once and reuse it for both the print and the return.
    accuracy = accuracy_score(labels, prediction) * 100

    print(f"Total inference time: {round(elapsed, 2)}s")
    print(f"Inference time per example: {round(elapsed/len(labels),5)}s")
    print(f"Test Set Accuracy : {accuracy} %\n\n")

    return accuracy


# Convenience wrapper: use this to display ALL relevant graphs and metrics for a model
def display_metrics(model, X_test, y_test):
    """Print the classification report and show the confusion matrix, ROC and PR figures.

    Parameters
    ----------
    model : fitted classifier exposing ``predict``, ``predict_proba`` and ``classes_``
    X_test : array-like
        Test feature matrix.
    y_test : array-like
        Ground-truth labels for ``X_test``.

    Returns
    -------
    None. The report is printed and the three figures are shown.
    """
    prediction = model.predict(X_test)
    y_scores = model.predict_proba(X_test)

    # predict_proba columns follow model.classes_. Force the one-hot frame into
    # the same column order (adding an all-zero column for any class missing
    # from y_test) so column i of both objects always refers to the same class.
    y_onehot = pd.get_dummies(y_test).reindex(columns=model.classes_, fill_value=0)

    cm = confusion_matrix(y_test, prediction, labels=model.classes_)

    print(classification_report(y_test, prediction))

    conf_matrix = conf_m(cm, list(model.classes_))
    conf_matrix.show()
    roc_figure = createRocCurve(y_scores, y_onehot)
    roc_figure.show()
    prec_recall = pr_rec_curve(y_onehot, y_scores)
    prec_recall.show()

#### 2.2 Load Data & display PCA

In this section we load the feature matrix dataset as it was created in notebook "1DataProcess". We display the pca reduced graph.

In [None]:
### Read the full and the PCA-reduced feature matrices from their .csv files
feature_matrix_pca = pd.read_csv('feature_matrix_pca.csv')
feature_matrix = pd.read_csv('feature_matrix.csv')

In [None]:
# Interactive 3-D scatter of the first three PCA components, coloured by painter
fig = px.scatter_3d(
    feature_matrix_pca,
    x='pca_1',
    y='pca_2',
    z='pca_3',
    color='painter',
)
fig.show()

### 2.3 Data Splitting and Model Fitting
In this segment we start creating/fitting and optimizing different classifiers on our data.

1) Random Forest Classifier
    \- Random Forest Tuning
    \- Metrics Extraction
2) SVM
    \- SVM Tuning
    \- Metrics Extraction
3) KNN
    \- KNN Tuning
    \- Metrics Extraction

### 2.4 Random Forest

In [None]:
# Separate the 'painter' target from the feature columns (as NumPy arrays)
# and hold out 20 % of the rows as a fixed test set.
y = feature_matrix['painter'].values
X = feature_matrix.drop(columns='painter').values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

### 2.4.1 Basic Fit and Evaluation 

In [None]:
### Baseline: fit an untuned random forest and time training and inference

rfc = RandomForestClassifier()

# --- training ---
start = time.time()
rfc.fit(X_train, y_train)
stop = time.time()
duration = round(stop - start, 2)
n_train = len(y_train)
print(f"Total training time: {duration}s")
print(f"Training time per example: {round(duration/n_train,5)}s")

# --- inference ---
start = time.time()
prediction = rfc.predict(X_test)
stop = time.time()
n_test = len(y_test)
print(f"Total inference time: {round(stop - start, 2)}s")
print(f"Inference time per example: {round((stop - start)/n_test,5)}s")

test_accuracy = accuracy_score(y_test, prediction) * 100
print(f"Test Set Accuracy : {test_accuracy} %\n\n")

### 2.4.2 RF Optimization

In [None]:
### Tune a fresh random forest with a randomized hyperparameter search

rfc = RandomForestClassifier()

# Candidate values for every hyperparameter the search may sample
n_estimators = list(map(int, np.linspace(start = 100, stop = 400, num = 3)))
max_features = ['sqrt']
max_depth = list(map(int, np.linspace(10, 100, num = 3))) + [None]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
print(random_grid)

# 30 sampled configurations, each scored by 5-fold cross-validated accuracy
rf_random = RandomizedSearchCV(
    estimator = rfc,
    scoring='accuracy',
    param_distributions = random_grid,
    n_iter = 30,
    cv = 5,
    verbose=2,
    random_state=42,
    n_jobs = 1,
)
rf_random.fit(X_train, y_train)

In [None]:
# Take the best estimator found by the search and report its parameters
best_random = rf_random.best_estimator_
print(rf_random.best_params_)

# Persist the tuned classifier so later cells/sessions can reload it
joblib.dump(best_random, 'best_rf_clf.joblib')

### 2.4.3 Compare and Visualize

In [None]:
# Reload the tuned forest from disk and compare it with an untuned baseline

best_random = joblib.load('best_rf_clf.joblib')
print(best_random.get_params())

# fit() returns the estimator itself, so construction and fitting can be chained
base_model = RandomForestClassifier().fit(X_train, y_train)

# Evaluate both models on the held-out test set
print('Optimized model accuracy: ')
best_score = evaluate_model(best_random, X_test)
print('Base model accuracy: ')
base_score = evaluate_model(base_model, X_test)

# Difference in test accuracy (percentage points): tuned minus baseline
print('Accuracy gain: ', best_score - base_score, sep='\n')

In [None]:
# Show every performance figure for the tuned random forest
# (classification report, confusion matrix, ROC and precision-recall curves)
display_metrics(model=best_random, X_test=X_test, y_test=y_test)

### 2.5  SVM 

In [None]:
# Separate the 'painter' target from the feature columns (as NumPy arrays)
# and hold out 20 % of the rows as a fixed test set.
y = feature_matrix['painter'].values
X = feature_matrix.drop(columns='painter').values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

### 2.5.1 Basic Fit and Evaluation 

In [None]:
### Get a basic estimation of the basic SVM performance on our dataset

svc = SVC()

# Fit and time the SVM itself. The earlier version of this cell fitted and
# predicted with `rfc`, so it reported random-forest numbers instead.
start = time.time()
svc.fit(X_train, y_train)
stop = time.time()
duration = round(stop - start, 2)
print(f"Total training time: {duration}s")
print(f"Training time per example: {round(duration/len(y_train),5)}s")

start = time.time()
prediction = svc.predict(X_test)
stop = time.time()
print(f"Total inference time: {round(stop - start, 2)}s")
print(f"Inference time per example: {round((stop - start)/len(y_test),5)}s")

print(f"Test Set Accuracy : {accuracy_score(y_test, prediction) * 100} %\n\n")

### 2.5.2 SVM Optimization 

In [None]:
### Create a new SVM classifier and this time perform tuning

# probability=True is required: display_metrics() calls predict_proba(), which
# SVC only provides when probability estimates are enabled. This makes fitting
# slower. random_state fixes the shuffling used for those estimates.
svc = SVC(probability=True, random_state=0)

# 3 x 3 x 1 = 9 combinations; with n_iter = 9 the search covers all of them.
rand_list = {'C': [1, 10, 100],
              'gamma': [1, 0.1, 'scale'],
              'kernel': ['rbf']}

rand_search = RandomizedSearchCV(svc, param_distributions = rand_list, n_iter = 9, cv = 3, scoring='accuracy', verbose = 10, n_jobs = 1)
rand_search.fit(X_train, y_train)

In [None]:
# Take the best estimator found by the search and report its parameters
best_svm = rand_search.best_estimator_
print(rand_search.best_params_)

# Persist the tuned classifier so later cells/sessions can reload it
joblib.dump(best_svm, 'best_svm_clf.joblib')

### 2.5.3 Compare and Visualize

In [None]:
# Reload the tuned SVM from disk and compare it with an untuned baseline

best_svm = joblib.load('best_svm_clf.joblib')
print(best_svm.get_params())

# fit() returns the estimator itself, so construction and fitting can be chained
base_model = SVC().fit(X_train, y_train)

# Evaluate both models on the held-out test set
print('Optimized model accuracy: ')
best_score = evaluate_model(best_svm, X_test)
print('Base model accuracy: ')
base_score = evaluate_model(base_model, X_test)

# Difference in test accuracy (percentage points): tuned minus baseline
print('Accuracy gain: ', best_score - base_score, sep='\n')

In [None]:
# Show every performance figure for the tuned SVM
# (classification report, confusion matrix, ROC and precision-recall curves)
display_metrics(model=best_svm, X_test=X_test, y_test=y_test)

### 2.6 KNN 

In [None]:
# Reload the feature matrix and rebuild the train/test split for the KNN section
feature_matrix = pd.read_csv('feature_matrix.csv')

X = feature_matrix.drop(columns = 'painter')
y = feature_matrix['painter']
X = X.values
y = y.values

# Use the same names (X_train / X_test) as every later KNN cell. The earlier
# version stored the result in lower-case x_train / x_test, so the standardised
# data was never used and KNN ran on the unscaled split left over from the SVM
# section.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=0)

# Standardise the features. The scaler is fitted on the training split only
# and then applied to the test split.
# NOTE(review): the scaler is not saved with the model; anything that reloads
# 'best_knn_clf.joblib' later must apply the same scaling to its inputs.
st_x = StandardScaler()
X_train = st_x.fit_transform(X_train)
X_test = st_x.transform(X_test)

### 2.6.1 Basic Fit and Evaluation

In [None]:
### Baseline: fit an untuned KNN classifier and time training and inference
clf = KNeighborsClassifier()

# --- training ---
start = time.time()
clf.fit(X_train, y_train)
stop = time.time()
duration = round(stop - start, 2)
n_train = len(y_train)
print(f"Total training time: {duration}s")
print(f"Training time per example: {round(duration/n_train,5)}s")

# --- inference ---
start = time.time()
prediction = clf.predict(X_test)
stop = time.time()
n_test = len(y_test)
print(f"Total inference time: {round(stop - start, 2)}s")
print(f"Inference time per example: {round((stop - start)/n_test,5)}s")

test_accuracy = accuracy_score(y_test, prediction) * 100
print(f"Test Set Accuracy : {test_accuracy} %\n\n")

### 2.6.2 KNN Optimization 

In [None]:
### Create a new KNN classifier and this time perform tuning

clf = KNeighborsClassifier()

# Hyperparameters that we want to tune
leaf_size = list(range(1,50))
n_neighbors = list(range(1,30))
p = [1,2]  # Minkowski power parameter: 1 = Manhattan, 2 = Euclidean

# Collect them in the dictionary expected by RandomizedSearchCV
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)

# Randomized search with 10-fold CV (n_iter is left at its default).
# random_state is fixed, as in the random forest search, so that the sampled
# candidates are the same on every run.
random_search = RandomizedSearchCV(clf, hyperparameters, cv=10, verbose = 10, scoring='accuracy', n_jobs = 1, random_state=42)

# Fit the search
random_search.fit(X_train,y_train)

In [None]:
# Get the best classifier from the KNN search.
# `random_search` is the KNN search; `rand_search` belongs to the SVM section.
# The earlier version read from `rand_search`, so the tuned SVM was saved and
# evaluated as if it were the KNN model.
best_knn = random_search.best_estimator_
print(random_search.best_params_)

# Save the best classifier to a file
joblib.dump(best_knn, 'best_knn_clf.joblib')

### 2.6.3 Compare and Visualize

In [None]:
# Reload the tuned KNN model from disk and compare it with an untuned baseline

best_knn = joblib.load('best_knn_clf.joblib')
print(best_knn.get_params())

# fit() returns the estimator itself, so construction and fitting can be chained
base_model = KNeighborsClassifier().fit(X_train, y_train)

# Evaluate both models on the held-out test set
print('Optimized model accuracy: ')
best_score = evaluate_model(best_knn, X_test)
print('Base model accuracy: ')
base_score = evaluate_model(base_model, X_test)

# Difference in test accuracy (percentage points): tuned minus baseline
print('Accuracy gain: ', best_score - base_score, sep='\n')

In [None]:
# Show every performance figure for the tuned KNN classifier
# (classification report, confusion matrix, ROC and precision-recall curves)
display_metrics(model=best_knn, X_test=X_test, y_test=y_test)