In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV

In [3]:
# Feature matrix and labels live in two separate CSVs under the "oversampling"
# dataset folder; in both files the first column is used as the row index.
X_df = pd.read_csv(
    'data/dataset/oversampling/dfu_features_dataset.csv',
    index_col=0,
)
y_df = pd.read_csv(
    'data/dataset/oversampling/dfu_labels_dataset.csv',
    index_col=0,
)

In [None]:
def find_parameters(features_df: pd.DataFrame, n_features:int) -> pd.DataFrame:
    """Randomized search of SVC hyper-parameters on a leading subset of features.

    The function reads the notebook-level ``X_df`` (features) and ``y_df``
    (labels); both must be loaded before it is called.

    Parameters
    ----------
    features_df : pd.DataFrame
        Table with a ``'Features'`` column whose values are column names of
        ``X_df``. Only the first ``n_features`` rows are used (presumably the
        table is sorted by importance -- confirm against the file's producer).
    n_features : int
        Number of leading rows of ``features_df`` to select. Must be >= 1.

    Returns
    -------
    pd.DataFrame
        One-row frame (index ``0``) with one column per entry of the search's
        ``best_params_``.

    Raises
    ------
    ValueError
        If ``n_features`` is smaller than 1. A non-positive value would
        otherwise slice the ranking incorrectly (e.g. ``iloc[:-5]`` means
        "all but the last five") or produce an empty feature matrix.
    """
    if n_features < 1:
        raise ValueError(f'n_features must be >= 1, got {n_features}')
    if n_features > len(features_df):
        # Slicing past the end silently truncates; make that visible because
        # callers typically encode n_features in the output file name.
        import warnings
        warnings.warn(
            f'n_features={n_features} exceeds the {len(features_df)} ranked '
            'features available; using all of them.'
        )

    features = features_df.iloc[:n_features]['Features'].values
    X = X_df[features].to_numpy().astype(np.float32)
    y = y_df.to_numpy().ravel()

    # Search space: C in [1, 100] and gamma in [1e-3, 1] on log scales,
    # polynomial degree 3..7, three kernels.
    # NOTE(review): 'probability' and 'random_state' are single-value entries,
    # i.e. fixed settings that end up in best_params_, not searched ones.
    param_grid = {'C': np.logspace(0, 2, 25), 'gamma': np.logspace(-3, 0, 25), 'degree':np.arange(3, 8, 1),
         'kernel': ['rbf', 'linear', 'poly'], 'probability': [True], 'random_state': [42]}

    # 500 random draws, each scored with 5-fold CV, using every available core.
    grid = RandomizedSearchCV(SVC(), param_grid, refit=True, verbose=2, cv=5, n_jobs=-1, n_iter=500, random_state=42)
    grid.fit(X, y)

    return pd.DataFrame(grid.best_params_, index=[0])

# Lasso

In [None]:
# Tune an SVC on the first 10, 25 and 50 rows of the LASSO feature ranking and
# store each winning parameter set as a one-row CSV.
features_df = pd.read_csv('data/features_importance/oversampling/lasso.csv')
n_parameters = [10, 25, 50]

# to_csv does not create missing directories; make sure the target exists up
# front so a long search is not lost to an OSError at save time.
os.makedirs('data/best_params', exist_ok=True)

for i in n_parameters:
    df = find_parameters(features_df, i)
    # save best params to csv file
    df.to_csv('data/best_params/svm_lasso_{}.csv'.format(i), index=False)

# Concrete Dropout

In [None]:
# Tune an SVC on the first 10, 25 and 50 rows of the Concrete Dropout feature
# ranking and store each winning parameter set as a one-row CSV.
features_df = pd.read_csv('data/features_importance/oversampling/concrete_dropout.csv')

n_parameters = [10, 25, 50]

# to_csv does not create missing directories; make sure the target exists up
# front so a long search is not lost to an OSError at save time.
os.makedirs('data/best_params', exist_ok=True)

for i in n_parameters:
    df = find_parameters(features_df, i)
    # save best params to csv file
    df.to_csv('data/best_params/svm_concrete_{}.csv'.format(i), index=False)

# All features

In [None]:
# Baseline: tune an SVC on every column of X_df (no feature selection).
X = X_df.to_numpy().astype(np.float32)
y = y_df.to_numpy().ravel()

# Coarser search than find_parameters(): 10 C values, 12 gamma values,
# degree 3..6 and 100 random draws. 'probability' and 'random_state' are
# single-value entries, i.e. fixed settings rather than searched ones.
param_grid = {'C': np.logspace(0, 2, 10), 'gamma': np.logspace(-3, 0, 12), 'degree':[3,4,5,6], 'kernel': ['rbf', 'linear', 'poly'],
        'probability': [True], 'random_state': [42]}

grid = RandomizedSearchCV(SVC(), param_grid, refit=True, verbose=2, cv=5, n_jobs=-1, n_iter=100, random_state=42)
grid.fit(X, y)

# to_csv does not create missing directories; create the target first so the
# finished search is not lost to an OSError.
os.makedirs('data/best_params', exist_ok=True)

# Persist the best parameter set as a one-row CSV.
df = pd.DataFrame(grid.best_params_, index=[0])
df.to_csv('data/best_params/svm_all.csv', index=False)

# Testing

In [47]:
# Folder that receives the evaluation artefacts (metrics table, figures).
testing_results_save_dir = 'data/best_params/results/'

# Two-level header: every metric gets a 'Mean' and a 'Std' column, in the
# order Accuracy, Precision, Recall, F1-Score.
columns = pd.MultiIndex.from_tuples([
    (metric, statistic)
    for metric in ('Accuracy', 'Precision', 'Recall', 'F1-Score')
    for statistic in ('Mean', 'Std')
])

# Starts empty; one row is added per evaluated feature subset.
metrics_df = pd.DataFrame(columns=columns)

In [33]:
def test(X, y, **svc_params):
    """Evaluate an SVC with shuffled 5-fold cross-validation.

    A fresh ``SVC(**svc_params)`` is fitted on the training part of every fold
    and used to predict the held-out part. The accuracy of each fold is
    printed as ``Acc. Fold <k>: <value>``.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample. Converted with ``np.asarray`` so
        that pandas objects are indexed by position rather than by label.
    y : array-like
        Labels, one per row of ``X``; flattened to 1-D.
    **svc_params
        Keyword arguments forwarded unchanged to ``sklearn.svm.SVC``.

    Returns
    -------
    tuple[list, list]
        ``(y_true, y_predict)``: two lists with one array per fold holding the
        held-out labels and the corresponding predictions.
    """
    from sklearn.svm import SVC
    from sklearn.model_selection import KFold
    from sklearn.metrics import accuracy_score

    # Positional fancy indexing below requires plain arrays.
    X = np.asarray(X)
    y = np.asarray(y).ravel()

    k_folds = 5
    kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42)

    y_true = []
    y_predict = []

    for fold, (train_ids, test_ids) in enumerate(kfold.split(X)):
        X_train, X_test = X[train_ids], X[test_ids]
        y_train, y_test = y[train_ids], y[test_ids]
        svc = SVC(**svc_params)
        svc.fit(X_train, y_train)

        # Predict once and reuse the result for both the returned predictions
        # and the printed accuracy (svc.score would predict a second time).
        fold_predictions = svc.predict(X_test)
        y_true.append(y_test)
        y_predict.append(fold_predictions)

        print(f'Acc. Fold {fold}: {accuracy_score(y_test, fold_predictions)}')

    return y_true, y_predict

def compute_metrics(y_true: list, y_predict: list):
    """Score every cross-validation fold with four classification metrics.

    Parameters
    ----------
    y_true : list
        One sequence of reference labels per fold.
    y_predict : list
        One sequence of predicted labels per fold, aligned with ``y_true``.

    Returns
    -------
    tuple of np.ndarray
        ``(acc, prec, rec, f1)``; each array has ``len(y_true)`` float entries,
        one per fold. Precision, recall and F1 are computed with
        scikit-learn's default arguments.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    n_folds = len(y_true)

    def _per_fold(metric):
        # Apply one metric to each (reference, prediction) pair in fold order.
        return np.array(
            [metric(y_true[k], y_predict[k]) for k in range(n_folds)],
            dtype=float,
        )

    acc = _per_fold(accuracy_score)
    prec = _per_fold(precision_score)
    rec = _per_fold(recall_score)
    f1 = _per_fold(f1_score)

    return acc, prec, rec, f1

def _pool_folds(values):
    """Flatten a list/tuple of per-fold label arrays into one 1-D array.

    Input that is already flat (an array, or a list of scalars) is returned as
    an array without being regrouped.
    """
    if isinstance(values, (list, tuple)) and len(values) > 0 and all(np.ndim(v) > 0 for v in values):
        return np.concatenate([np.ravel(v) for v in values])
    return np.asarray(values)


def plot_confusion_matrix(y_true, y_predict):
    """Draw a row-normalised confusion matrix and return its figure.

    Parameters
    ----------
    y_true, y_predict : array-like or list of array-like
        Reference and predicted labels. Either flat label sequences or lists
        with one array per cross-validation fold (the format returned by
        ``test``); per-fold lists are pooled before the matrix is computed,
        because folds of unequal size cannot be passed to scikit-learn as-is.

    Returns
    -------
    matplotlib.figure.Figure
        A 10x8 inch figure holding the matrix (tick labels ``'CG'`` and
        ``'DM'``, ``'Blues'`` colour map) and its colour bar.
    """
    from sklearn.metrics import ConfusionMatrixDisplay
    from matplotlib import pyplot as plt
    labels = ['CG', 'DM']
    fig, ax = plt.subplots(figsize=(10,8))
    # Draw directly on our axes; letting from_predictions open its own figure
    # and then re-plotting would leave a stray duplicate figure behind.
    ConfusionMatrixDisplay.from_predictions(
        _pool_folds(y_true), _pool_folds(y_predict),
        display_labels=labels, cmap='Blues', normalize='true', ax=ax,
    )
    # Increase labels size
    ax.tick_params(axis='both', which='major', labelsize=20)
    # Increase xlabel size
    ax.set_xlabel(ax.get_xlabel(), fontsize=18)
    ax.set_ylabel(ax.get_ylabel(), fontsize=20)
    #Increase values size
    for text in ax.texts:
        text.set_size(18)
    # Increase colorbar ticks size (the colour bar is the last axes added to the figure)
    ax.figure.axes[-1].tick_params(labelsize=18)

    return fig

## All features

In [35]:
# Evaluate the SVC tuned on all features: reload its stored hyper-parameters
# (first and only row of the CSV) and cross-validate on the full feature matrix.
param = pd.read_csv('data/best_params/svm_all.csv').to_dict(orient='records')[0]

X = X_df.to_numpy().astype(np.float32)
y = np.ravel(y_df.to_numpy())

# test() prints the accuracy of each fold and returns per-fold labels/predictions.
y_true, y_predict = test(X, y, **param)

Acc. Fold 0: 0.9183673469387755
Acc. Fold 1: 0.9795918367346939
Acc. Fold 2: 0.9387755102040817
Acc. Fold 3: 0.7959183673469388
Acc. Fold 4: 0.9166666666666666


In [48]:
# Summarise the per-fold scores as (mean, std) pairs in the column order of
# metrics_df: accuracy, precision, recall, F1.
acc, prec, rec, f1 = compute_metrics(y_true, y_predict)
metrics_df.loc['All Features'] = [
    statistic
    for fold_scores in (acc, prec, rec, f1)
    for statistic in (fold_scores.mean(), fold_scores.std())
]

In [None]:
# Plot the confusion matrix of the cross-validation predictions and export it as PDF.
figure = plot_confusion_matrix(y_true, y_predict)
# savefig does not create missing directories, so make sure imgs/ exists first.
os.makedirs(os.path.join(testing_results_save_dir, 'imgs'), exist_ok=True)
figure.savefig(os.path.join(testing_results_save_dir, 'imgs/All_Features.pdf'), bbox_inches='tight')

## LASSO
### 10 First Features

In [50]:
# Evaluate the SVC tuned for the first 10 rows of the LASSO feature ranking.
param = pd.read_csv('data/best_params/svm_lasso_10.csv').to_dict(orient='records')[0]
features_df = pd.read_csv('data/features_importance/oversampling/lasso.csv')
features = features_df['Features'].iloc[:10].values

# Restrict the feature matrix to the selected columns.
X = X_df[features].to_numpy().astype(np.float32)
y = np.ravel(y_df.to_numpy())

# test() prints the accuracy of each fold and returns per-fold labels/predictions.
y_true, y_predict = test(X, y, **param)

# Store (mean, std) per metric in the column order of metrics_df.
acc, prec, rec, f1 = compute_metrics(y_true, y_predict)
metrics_df.loc['LASSO 10 Features'] = [
    statistic
    for fold_scores in (acc, prec, rec, f1)
    for statistic in (fold_scores.mean(), fold_scores.std())
]

Acc. Fold 0: 0.9387755102040817
Acc. Fold 1: 0.9183673469387755
Acc. Fold 2: 0.8979591836734694
Acc. Fold 3: 0.8367346938775511
Acc. Fold 4: 0.8958333333333334


In [None]:
# Plot the confusion matrix of the cross-validation predictions and export it as PDF.
figure = plot_confusion_matrix(y_true, y_predict)
# savefig does not create missing directories, so make sure imgs/ exists first.
os.makedirs(os.path.join(testing_results_save_dir, 'imgs'), exist_ok=True)
figure.savefig(os.path.join(testing_results_save_dir, 'imgs/LASSO_10_Features.pdf'), bbox_inches='tight')

### 25 First Features

In [52]:
# Evaluate the SVC tuned for the first 25 rows of the LASSO feature ranking.
param = pd.read_csv('data/best_params/svm_lasso_25.csv').to_dict(orient='records')[0]
features_df = pd.read_csv('data/features_importance/oversampling/lasso.csv')
features = features_df['Features'].iloc[:25].values

# Restrict the feature matrix to the selected columns.
X = X_df[features].to_numpy().astype(np.float32)
y = np.ravel(y_df.to_numpy())

# test() prints the accuracy of each fold and returns per-fold labels/predictions.
y_true, y_predict = test(X, y, **param)

# Store (mean, std) per metric in the column order of metrics_df.
acc, prec, rec, f1 = compute_metrics(y_true, y_predict)
metrics_df.loc['LASSO 25 Features'] = [
    statistic
    for fold_scores in (acc, prec, rec, f1)
    for statistic in (fold_scores.mean(), fold_scores.std())
]

Acc. Fold 0: 0.9591836734693877
Acc. Fold 1: 0.9591836734693877
Acc. Fold 2: 0.9387755102040817
Acc. Fold 3: 0.9183673469387755
Acc. Fold 4: 0.9583333333333334


In [None]:
# Plot the confusion matrix of the cross-validation predictions and export it as PDF.
figure = plot_confusion_matrix(y_true, y_predict)
# savefig does not create missing directories, so make sure imgs/ exists first.
os.makedirs(os.path.join(testing_results_save_dir, 'imgs'), exist_ok=True)
figure.savefig(os.path.join(testing_results_save_dir, 'imgs/LASSO_25_Features.pdf'), bbox_inches='tight')

### 50 First Features

In [53]:
# Evaluate the SVC tuned for the first 50 rows of the LASSO feature ranking.
param = pd.read_csv('data/best_params/svm_lasso_50.csv').to_dict(orient='records')[0]
features_df = pd.read_csv('data/features_importance/oversampling/lasso.csv')
features = features_df['Features'].iloc[:50].values

# Restrict the feature matrix to the selected columns.
X = X_df[features].to_numpy().astype(np.float32)
y = np.ravel(y_df.to_numpy())

# test() prints the accuracy of each fold and returns per-fold labels/predictions.
y_true, y_predict = test(X, y, **param)

# Store (mean, std) per metric in the column order of metrics_df.
acc, prec, rec, f1 = compute_metrics(y_true, y_predict)
metrics_df.loc['LASSO 50 Features'] = [
    statistic
    for fold_scores in (acc, prec, rec, f1)
    for statistic in (fold_scores.mean(), fold_scores.std())
]

Acc. Fold 0: 0.9795918367346939
Acc. Fold 1: 0.9795918367346939
Acc. Fold 2: 0.9183673469387755
Acc. Fold 3: 0.7755102040816326
Acc. Fold 4: 0.9166666666666666


In [None]:
# Plot the confusion matrix of the cross-validation predictions and export it as PDF.
figure = plot_confusion_matrix(y_true, y_predict)
# savefig does not create missing directories, so make sure imgs/ exists first.
os.makedirs(os.path.join(testing_results_save_dir, 'imgs'), exist_ok=True)
figure.savefig(os.path.join(testing_results_save_dir, 'imgs/LASSO_50_Features.pdf'), bbox_inches='tight')

## Concrete Dropout
### 10 First Features

In [54]:
# Evaluate the SVC tuned for the first 10 rows of the Concrete Dropout feature ranking.
param = pd.read_csv('data/best_params/svm_concrete_10.csv').to_dict(orient='records')[0]
features_df = pd.read_csv('data/features_importance/oversampling/concrete_dropout.csv')
features = features_df['Features'].iloc[:10].values

# Restrict the feature matrix to the selected columns.
X = X_df[features].to_numpy().astype(np.float32)
y = np.ravel(y_df.to_numpy())

# test() prints the accuracy of each fold and returns per-fold labels/predictions.
y_true, y_predict = test(X, y, **param)

# Store (mean, std) per metric in the column order of metrics_df.
acc, prec, rec, f1 = compute_metrics(y_true, y_predict)
metrics_df.loc['Concrete 10 Features'] = [
    statistic
    for fold_scores in (acc, prec, rec, f1)
    for statistic in (fold_scores.mean(), fold_scores.std())
]

Acc. Fold 0: 0.8979591836734694
Acc. Fold 1: 0.9387755102040817
Acc. Fold 2: 0.9183673469387755
Acc. Fold 3: 0.7755102040816326
Acc. Fold 4: 0.9166666666666666


In [None]:
# Plot the confusion matrix of the cross-validation predictions and export it as PDF.
figure = plot_confusion_matrix(y_true, y_predict)
# savefig does not create missing directories, so make sure imgs/ exists first.
os.makedirs(os.path.join(testing_results_save_dir, 'imgs'), exist_ok=True)
figure.savefig(os.path.join(testing_results_save_dir, 'imgs/Concrete_10_Features.pdf'), bbox_inches='tight')

### 25 First Features

In [55]:
# Evaluate the SVC tuned for the first 25 rows of the Concrete Dropout feature ranking.
param = pd.read_csv('data/best_params/svm_concrete_25.csv').to_dict(orient='records')[0]
features_df = pd.read_csv('data/features_importance/oversampling/concrete_dropout.csv')
features = features_df['Features'].iloc[:25].values

# Restrict the feature matrix to the selected columns.
X = X_df[features].to_numpy().astype(np.float32)
y = np.ravel(y_df.to_numpy())

# test() prints the accuracy of each fold and returns per-fold labels/predictions.
y_true, y_predict = test(X, y, **param)

# Store (mean, std) per metric in the column order of metrics_df.
acc, prec, rec, f1 = compute_metrics(y_true, y_predict)
metrics_df.loc['Concrete 25 Features'] = [
    statistic
    for fold_scores in (acc, prec, rec, f1)
    for statistic in (fold_scores.mean(), fold_scores.std())
]

Acc. Fold 0: 0.8979591836734694
Acc. Fold 1: 1.0
Acc. Fold 2: 0.8979591836734694
Acc. Fold 3: 0.7551020408163265
Acc. Fold 4: 0.9166666666666666


In [None]:
# Plot the confusion matrix of the cross-validation predictions and export it as PDF.
figure = plot_confusion_matrix(y_true, y_predict)
# savefig does not create missing directories, so make sure imgs/ exists first.
os.makedirs(os.path.join(testing_results_save_dir, 'imgs'), exist_ok=True)
figure.savefig(os.path.join(testing_results_save_dir, 'imgs/Concrete_25_Features.pdf'), bbox_inches='tight')

### 50 First Features

In [56]:
# Evaluate the SVC tuned for the first 50 rows of the Concrete Dropout feature ranking.
param = pd.read_csv('data/best_params/svm_concrete_50.csv').to_dict(orient='records')[0]
features_df = pd.read_csv('data/features_importance/oversampling/concrete_dropout.csv')
features = features_df['Features'].iloc[:50].values

# Restrict the feature matrix to the selected columns.
X = X_df[features].to_numpy().astype(np.float32)
y = np.ravel(y_df.to_numpy())

# test() prints the accuracy of each fold and returns per-fold labels/predictions.
y_true, y_predict = test(X, y, **param)

# Store (mean, std) per metric in the column order of metrics_df.
acc, prec, rec, f1 = compute_metrics(y_true, y_predict)
metrics_df.loc['Concrete 50 Features'] = [
    statistic
    for fold_scores in (acc, prec, rec, f1)
    for statistic in (fold_scores.mean(), fold_scores.std())
]

Acc. Fold 0: 0.8979591836734694
Acc. Fold 1: 0.9591836734693877
Acc. Fold 2: 0.8571428571428571
Acc. Fold 3: 0.7959183673469388
Acc. Fold 4: 0.875


In [None]:
# Plot the confusion matrix of the cross-validation predictions and export it as PDF.
figure = plot_confusion_matrix(y_true, y_predict)
# savefig does not create missing directories, so make sure imgs/ exists first.
os.makedirs(os.path.join(testing_results_save_dir, 'imgs'), exist_ok=True)
figure.savefig(os.path.join(testing_results_save_dir, 'imgs/Concrete_50_Features.pdf'), bbox_inches='tight')

# Save results

In [58]:
# Write the summary table (one row per evaluated feature subset) next to the figures.
# to_csv does not create missing directories, so make sure the folder exists first.
os.makedirs(testing_results_save_dir, exist_ok=True)
metrics_df.to_csv(os.path.join(testing_results_save_dir, 'metrics.csv'))