In [2]:
from sklearn.model_selection import train_test_split
import numpy as np
import h5py
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Loading the dataset

In [3]:
# --- Configuration -----------------------------------------------------------
dir_output = "Output"
features_path = dir_output + "/features.h5"
labels_path = dir_output + "/labels.h5"
test_size = 0.3

# --- Import features and labels ----------------------------------------------
# Both files are opened through context managers so the handles are released
# even if reading a dataset raises (e.g. a missing 'dataset_skin_lesion' key).
with h5py.File(features_path, 'r') as h5f_data, \
        h5py.File(labels_path, 'r') as h5f_label:
    features_string = h5f_data['dataset_skin_lesion']
    labels_string = h5f_label['dataset_skin_lesion']

    # Copy the datasets into in-memory arrays while the files are still open;
    # the dataset handles become unusable once the files are closed.
    X = np.array(features_string)
    Y = np.array(labels_string)

# --- Split data into training and test sets ----------------------------------
# Stratifying on Y keeps the class proportions equal in both subsets; the fixed
# random_state makes the split repeatable.
(X_train, X_test, Y_train, Y_test) = train_test_split(X, Y,
                                                      test_size=test_size,
                                                      random_state=458773245,
                                                      stratify=Y)

# Class counts for the full data set, the training split and the test split.
print(np.unique(Y, return_counts=True))
print(np.unique(Y_train, return_counts=True))
print(np.unique(Y_test, return_counts=True))

# --- Standardize data ---------------------------------------------------------
# The scaler is fitted on the training split only and then applied to both
# splits, so no statistics from the test data leak into the preprocessing.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

(array([0., 1.]), array([100, 100]))
(array([0., 1.]), array([70, 70]))
(array([0., 1.]), array([30, 30]))


# SVM

In [30]:
# Search space for the SVC grid search:
#   C      - 50 evenly spaced values from 3 to 7 (both ends included)
#   kernel - the four kernels to compare
#   gamma  - the two kernel-coefficient presets
param_grid_svm = dict(
    C=np.linspace(start=3, stop=7, num=50),
    kernel=['linear', 'rbf', 'poly', 'sigmoid'],
    gamma=['scale', 'auto'],
)

In [31]:
# Exhaustive search over param_grid_svm: every candidate is scored by its
# 3-fold cross-validated accuracy on the training split.
svm = SVC()
grid_svm = GridSearchCV(
    estimator=svm,
    param_grid=param_grid_svm,
    scoring="accuracy",
    cv=3,
)
grid_svm.fit(X_train, Y_train)

# Keep the winning model and its hyper-parameters, then predict the held-out
# test split with that model.
best_svm = grid_svm.best_estimator_
best_param = grid_svm.best_params_
y_pred_svm = best_svm.predict(X_test)

print(best_param)

{'C': np.float64(5.122448979591836), 'gamma': 'scale', 'kernel': 'rbf'}


## Evaluation

In [33]:
# Test-set evaluation of the tuned SVM.
# The confusion matrix (rows = true class, columns = predicted class) and the
# per-class report are separated by a newline; with print's default separator
# the report header ends up glued to the last row of the matrix.
print(confusion_matrix(Y_test, y_pred_svm),
      classification_report(Y_test, y_pred_svm),
      sep="\n")

# Overall fraction of correctly classified test samples.
print(accuracy_score(Y_test, y_pred_svm))

[[23  7]
 [ 6 24]]               precision    recall  f1-score   support

         0.0       0.79      0.77      0.78        30
         1.0       0.77      0.80      0.79        30

    accuracy                           0.78        60
   macro avg       0.78      0.78      0.78        60
weighted avg       0.78      0.78      0.78        60

0.7833333333333333


# Decision tree

In [7]:
# Search space for the decision-tree grid search.
# The candidates come from truncating evenly spaced floats to integers, which
# can yield repeated values: linspace(5, 10, 15) collapses to only 5..10, each
# repeated several times. np.unique removes those repeats (and returns the
# values sorted) so the grid search does not re-fit identical candidates.
param_grid_tree = {
    "min_samples_split": np.unique(np.linspace(10, 50, 20).astype(int)),
    "min_samples_leaf": np.unique(np.linspace(5, 10, 15).astype(int)),
    # Depth is left unlimited; tree size is controlled by the two limits above.
    "max_depth": [None],
}

In [8]:
# Decision-tree training is randomized (features are permuted when searching
# for the best split), so an unseeded tree can select different parameters and
# give different predictions on every run. The seed is the same value used for
# the train/test split so the whole notebook is reproducible.
tree = DecisionTreeClassifier(random_state=458773245)

# Exhaustive search over param_grid_tree, scored by 3-fold cross-validated
# accuracy on the training split.
grid_tree = GridSearchCV(tree, param_grid_tree, cv=3, scoring="accuracy")
grid_tree.fit(X_train, Y_train)

# Keep the winning model and its hyper-parameters, then predict the held-out
# test split with that model.
best_tree = grid_tree.best_estimator_
best_param = grid_tree.best_params_
y_pred_tree = best_tree.predict(X_test)
print(best_param)

{'max_depth': None, 'min_samples_leaf': np.int64(7), 'min_samples_split': np.int64(12)}


## Evaluation

In [None]:
# Test-set evaluation of the tuned decision tree.
# The confusion matrix (rows = true class, columns = predicted class) and the
# per-class report are separated by a newline; with print's default separator
# the report header ends up glued to the last row of the matrix.
print(confusion_matrix(Y_test, y_pred_tree),
      classification_report(Y_test, y_pred_tree),
      sep="\n")

# Overall fraction of correctly classified test samples.
print(accuracy_score(Y_test, y_pred_tree))

[[11  4]
 [ 3 12]]               precision    recall  f1-score   support

         0.0       0.79      0.73      0.76        15
         1.0       0.75      0.80      0.77        15

    accuracy                           0.77        30
   macro avg       0.77      0.77      0.77        30
weighted avg       0.77      0.77      0.77        30



# Random Forest

In [4]:
# Search space for the random-forest grid search. Each entry is a set of evenly
# spaced values truncated to integers:
#   n_estimators      - 20 values from 100 to 300
#   min_samples_split - 10 values from 5 to 15
#   min_samples_leaf  - 10 values from 2 to 15
#   max_depth         - 1, 2, ..., 10
param_grid_forest = dict(
    n_estimators=np.linspace(start=100, stop=300, num=20).astype(int),
    min_samples_split=np.linspace(start=5, stop=15, num=10).astype(int),
    min_samples_leaf=np.linspace(start=2, stop=15, num=10).astype(int),
    max_depth=np.linspace(start=1, stop=10, num=10).astype(int),
)

In [None]:
# A random forest is randomized (bootstrap samples and feature sub-sampling),
# so it is seeded to make the selected parameters and the predictions
# repeatable. The seed is the same value used for the train/test split.
forest = RandomForestClassifier(random_state=458773245)

# Exhaustive search over param_grid_forest, scored by 3-fold cross-validated
# accuracy. The grid holds 20 * 10 * 10 * 10 = 20,000 candidates (60,000 fits),
# so the fits are spread over all available CPU cores with n_jobs=-1. Because
# the estimator is seeded, parallel execution does not change the results.
grid_forest = GridSearchCV(forest, param_grid_forest, cv=3, scoring="accuracy",
                           n_jobs=-1)
grid_forest.fit(X_train, Y_train)

# Keep the winning model and its hyper-parameters, then predict the held-out
# test split with that model.
best_forest = grid_forest.best_estimator_
best_param = grid_forest.best_params_
y_pred_forest = best_forest.predict(X_test)
print(best_param)

## Evaluation

In [5]:
# Test-set evaluation of the tuned random forest.
# The confusion matrix (rows = true class, columns = predicted class) and the
# per-class report are separated by a newline; with print's default separator
# the report header ends up glued to the last row of the matrix.
print(confusion_matrix(Y_test, y_pred_forest),
      classification_report(Y_test, y_pred_forest),
      sep="\n")

# Overall fraction of correctly classified test samples.
print(accuracy_score(Y_test, y_pred_forest))

[[20 10]
 [ 6 24]]               precision    recall  f1-score   support

         0.0       0.77      0.67      0.71        30
         1.0       0.71      0.80      0.75        30

    accuracy                           0.73        60
   macro avg       0.74      0.73      0.73        60
weighted avg       0.74      0.73      0.73        60

0.7333333333333333
