In [1]:
import sys
sys.path.append('../helpers/')

import pickle

from pathlib import Path
import numpy as np
import pandas as pd
import os
from glob import glob
import natsort
import cv2
import csv
import random
from tqdm import tqdm

import matplotlib.pyplot as plt

# Project-local helper modules; presumably resolved through the '../helpers/'
# entry appended to sys.path at the top of this cell.
from preprocessing import Preprocessing
from feature_extraction import FeatureExtraction

# Shared instances reused by later cells: `preprocessor` provides crop_frame /
# resize_images / extract_hair, `feature_extractor` provides fit.
preprocessor = Preprocessing()
feature_extractor = FeatureExtraction()

# To allow auto reload to this notebook after modifying any external file imported
%load_ext autoreload
%autoreload 2

In [2]:
# Dataset layout on disk: <ROOT_PATH>/<split>/<class>/<files>
ROOT_PATH = Path(os.getcwd()) / "../challenge2"
TRAIN_PATH = ROOT_PATH / "train"
VAL_PATH = ROOT_PATH / "val"


def _sorted_class_files(split_dir, class_name):
    """Return the sorted paths (as strings) of every entry in ``split_dir/class_name``."""
    pattern = str(split_dir / class_name / "*")
    return sorted(glob(pattern))


# One path list per class for the training split ...
train_bcc = _sorted_class_files(TRAIN_PATH, "bcc")
train_mel = _sorted_class_files(TRAIN_PATH, "mel")
train_scc = _sorted_class_files(TRAIN_PATH, "scc")

# ... and for the validation split
val_bcc = _sorted_class_files(VAL_PATH, "bcc")
val_mel = _sorted_class_files(VAL_PATH, "mel")
val_scc = _sorted_class_files(VAL_PATH, "scc")

### SAVING PICKLES OF PREPROCESSED IMAGES

In [3]:
# Directory that receives the pickled, preprocessed images (one pickle per class and split)
prep_imgs_dir = r'../output/'

# Pickle names follow the pattern '<class>_<split>_prep_images.pkl'
bcc_train_prep_filename, bcc_val_prep_filename = 'bcc_train_prep_images.pkl', 'bcc_val_prep_images.pkl'
mel_train_prep_filename, mel_val_prep_filename = 'mel_train_prep_images.pkl', 'mel_val_prep_images.pkl'
scc_train_prep_filename, scc_val_prep_filename = 'scc_train_prep_images.pkl', 'scc_val_prep_images.pkl'

# Output pickle names, ordered bcc -> mel -> scc with train before val
filenames_prep_list = [
    bcc_train_prep_filename,
    bcc_val_prep_filename,
    mel_train_prep_filename,
    mel_val_prep_filename,
    scc_train_prep_filename,
    scc_val_prep_filename,
]

# Source path lists, positionally aligned with filenames_prep_list
dir_list = [
    train_bcc,
    val_bcc,
    train_mel,
    val_mel,
    train_scc,
    val_scc,
]

In [14]:
subsample = False
MAX_IMAGES_PER_CLASS = 1000  # cap applied to every class/split list when `subsample` is True

# Create the output directory up front so a long run cannot fail at the final write
os.makedirs(prep_imgs_dir, exist_ok=True)

# Each output pickle is paired positionally with the list of image paths it is built from
for filename, image_paths in zip(filenames_prep_list, dir_list):

    # Slicing keeps exactly MAX_IMAGES_PER_CLASS paths (the former `count == 999`
    # break stopped one image short of the documented 1k)
    if subsample:
        image_paths = image_paths[:MAX_IMAGES_PER_CLASS]

    preprocessed_images = []

    # Wrapping the list itself (not enumerate) lets tqdm show the total and an ETA
    for image_path in tqdm(image_paths, desc=filename):

        # reading the image as a 3-channel colour image
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread returns None instead of raising when a file cannot be decoded
            tqdm.write(f'Skipping unreadable image: {image_path}')
            continue

        # 1. Cropping black frame
        image_without_black_frame, _ = preprocessor.crop_frame(image)

        # 2. Resizing
        image_resized = preprocessor.resize_images(image_without_black_frame, preserve_ratio=True)

        # 3. Removing hair
        image_without_hair = preprocessor.extract_hair(image_resized)

        # Saving the preprocessed image to a list
        preprocessed_images.append(image_without_hair)

    # Saving the preprocessed images to a file
    with open(os.path.join(prep_imgs_dir, filename), 'wb') as file:
        pickle.dump(preprocessed_images, file)

1993it [13:20,  2.49it/s]
  if mean_outside / mean_inside < threshold:
498it [02:54,  2.85it/s]
376it [02:09,  2.89it/s]
94it [00:36,  2.59it/s]


### EXTRACTING FEATURES FROM PREPROCESSED IMAGES SAVED IN PICKLES

In [17]:
# LOADING PICKLES


def _load_prep_images(pickle_name):
    """Unpickle and return the object stored in ``prep_imgs_dir/pickle_name``."""
    with open(os.path.join(prep_imgs_dir, pickle_name), 'rb') as handle:
        return pickle.load(handle)


# Same loading order as before: mel, then bcc, then scc (train before val)
mel_train_prep_images = _load_prep_images(mel_train_prep_filename)
mel_val_prep_images = _load_prep_images(mel_val_prep_filename)

bcc_train_prep_images = _load_prep_images(bcc_train_prep_filename)
bcc_val_prep_images = _load_prep_images(bcc_val_prep_filename)

scc_train_prep_images = _load_prep_images(scc_train_prep_filename)
scc_val_prep_images = _load_prep_images(scc_val_prep_filename)

In [20]:
# Feature extraction: one header-less CSV per class and split, label in the last column
features_dir = r'../output/features/'

experiment = 0
subsample = False
MAX_IMAGES_PER_CLASS = 1000  # cap applied to every image list when `subsample` is True

bcc_train_filename = f'{experiment}_bcc_train_features.csv'
bcc_val_filename = f'{experiment}_bcc_val_features.csv'

mel_train_filename = f'{experiment}_mel_train_features.csv'
mel_val_filename = f'{experiment}_mel_val_features.csv'

scc_train_filename = f'{experiment}_scc_train_features.csv'
scc_val_filename = f'{experiment}_scc_val_features.csv'

filenames_list = [bcc_train_filename, bcc_val_filename, mel_train_filename, mel_val_filename, scc_train_filename, scc_val_filename]
images_lists = [bcc_train_prep_images, bcc_val_prep_images, mel_train_prep_images, mel_val_prep_images, scc_train_prep_images, scc_val_prep_images]

labels = [0, 0, 1, 1, 2, 2] # bcc = 0, mel = 1, scc = 2

# Create the output directory up front so open(..., 'w') cannot fail on a fresh checkout
os.makedirs(features_dir, exist_ok=True)

# Loop through the lists of images and their corresponding filenames
for filename, image_list, label in zip(filenames_list, images_lists, labels):

    # Slicing keeps exactly MAX_IMAGES_PER_CLASS images (the former `count == 999`
    # break stopped one image short of the documented 1k)
    if subsample:
        image_list = image_list[:MAX_IMAGES_PER_CLASS]

    with open(os.path.join(features_dir, filename), 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)

        # Wrapping the list itself (not enumerate) lets tqdm show the total and an ETA
        for preprocessed_image in tqdm(image_list, desc=filename):

            # 5. Extracting features
            feature_vector = feature_extractor.fit(preprocessed_image)

            # 6. Add label column (np.append flattens, so the row is 1-D with the label last)
            feature_vector = np.append(feature_vector, label)

            # Write the feature vector to the CSV file
            writer.writerow(feature_vector)

376it [09:59,  1.59s/it]
94it [02:15,  1.44s/it]


### PREPARING THE DATA FOR MACHINE LEARNING

In [4]:
import pandas as pd

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix, balanced_accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score, roc_curve


# Project-local helper; presumably resolved through the '../helpers/' sys.path entry — TODO confirm
from feature_selection import FeatureSelection

# NOTE(review): `select_feature` is not referenced by any cell visible in this notebook — confirm it is still needed.
select_feature = FeatureSelection()

In [5]:
features_dir = r'../output/features/'
experiment = 0


def _read_features(class_name, split):
    """Load the header-less CSV '<experiment>_<class_name>_<split>_features.csv' from ``features_dir``."""
    csv_path = os.path.join(features_dir, f'{experiment}_{class_name}_{split}_features.csv')
    return pd.read_csv(csv_path, header=None)


# One frame per class and split; columns are positional because the files carry no header row
train_bcc_df, val_bcc_df = _read_features('bcc', 'train'), _read_features('bcc', 'val')
train_mel_df, val_mel_df = _read_features('mel', 'train'), _read_features('mel', 'val')
train_scc_df, val_scc_df = _read_features('scc', 'train'), _read_features('scc', 'val')

In [6]:
# Per-class frames to stack for each split, in bcc -> mel -> scc order
_train_parts = [train_bcc_df, train_mel_df, train_scc_df]
_val_parts = [val_bcc_df, val_mel_df, val_scc_df]

# Stack with a fresh 0..n-1 index, then shuffle every row; the fixed seed keeps the permutation reproducible
train_features = pd.concat(_train_parts, ignore_index=True).sample(frac=1, random_state=42)
val_features = pd.concat(_val_parts, ignore_index=True).sample(frac=1, random_state=42)

In [7]:
# Every column except the last is a feature; the last column is the class label
X_train, y_train = train_features.iloc[:, :-1], train_features.iloc[:, -1]
X_val, y_val = val_features.iloc[:, :-1], val_features.iloc[:, -1]

In [8]:
# Standardise the features (mean = 0, std = 1). The scaler is fitted on the
# training split only and then reused unchanged for the validation split.
scaler = StandardScaler()
X_train_normalized, X_val_normalized = (
    pd.DataFrame(scaler.fit_transform(X_train)),
    pd.DataFrame(scaler.transform(X_val)),
)

In [9]:
# Map the label column onto integer codes; the mapping is learned from the
# training labels and then applied as-is to the validation labels.
encoder = LabelEncoder()
y_train_encoded, y_val_encoded = encoder.fit_transform(y_train), encoder.transform(y_val)

### CLASSIFICATION

In [13]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN (without a warning) when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


def compare_models(models, param_grids, X_train, y_train, X_val, y_val, scoring_metric='accuracy', cv=10):
    """Grid-search every model, then score each tuned model on the validation split.

    Parameters
    ----------
    models : list
        Estimators to tune; ``models[i]`` is searched over ``param_grids[i]``.
    param_grids : list of dict
        One hyper-parameter grid per model, in the same order as ``models``.
    X_train, y_train
        Training features and labels used by the cross-validated grid search.
    X_val, y_val
        Held-out features and labels used for the report. Labels are expected to
        be the integer codes 0, 1, 2 (reported as 'bcc', 'mel', 'scc').
    scoring_metric : str
        Scoring name handed to ``GridSearchCV`` to pick the best parameters.
    cv : int
        Number of cross-validation folds handed to ``GridSearchCV``.

    Returns
    -------
    list of dict
        A flat list. For every model, in input order: one summary dict (keys
        'model', 'best_params', 'accuracy', 'balanced_accuracy', 'weighted_f1')
        followed by one dict per class with '<class>_specificity',
        '<class>_sensitivity', '<class>_precision', '<class>_f1' and
        '<class>_roc_auc'.

    Raises
    ------
    ValueError
        If ``models`` and ``param_grids`` differ in length (zip would otherwise
        silently skip the unmatched entries).
    """
    if len(models) != len(param_grids):
        raise ValueError(
            f'models ({len(models)}) and param_grids ({len(param_grids)}) must have the same length'
        )

    class_names = ['bcc', 'mel', 'scc']
    class_ids = list(range(len(class_names)))
    y_val_array = np.asarray(y_val)

    # 1. Tune every model and remember its own best estimator and parameters
    tuned_models = []
    for model, param_grid in zip(models, param_grids):
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, verbose=2, scoring=scoring_metric)
        grid_search.fit(X_train, y_train)
        tuned_models.append({'model': grid_search.best_estimator_, 'params': grid_search.best_params_})

    # 2. Evaluate the tuned models on the separate validation set.
    #    The report list is created once, so the rows of every model are kept.
    models_report = []
    for model_info in tuned_models:
        model = model_info['model']
        y_pred = np.asarray(model.predict(X_val))

        models_report.append({
            'model': model,
            # parameters belonging to *this* model, not to the last grid search
            'best_params': model_info['params'],
            'accuracy': accuracy_score(y_val_array, y_pred),
            'balanced_accuracy': balanced_accuracy_score(y_val_array, y_pred),
            'weighted_f1': f1_score(y_val_array, y_pred, average='weighted')})

        # Explicit labels keep one 2x2 matrix per class even when a class is
        # missing from y_val / y_pred; the y_val argument is used, not a global.
        conf_matrix = multilabel_confusion_matrix(y_val_array, y_pred, labels=class_ids)
        for class_id, class_label in zip(class_ids, class_names):
            # Each per-class matrix is laid out as [[tn, fp], [fn, tp]]
            tn, fp = conf_matrix[class_id, 0]
            fn, tp = conf_matrix[class_id, 1]

            specificity = _safe_ratio(tn, tn + fp)
            sensitivity = _safe_ratio(tp, tp + fn)
            precision = _safe_ratio(tp, tp + fp)
            f1 = _safe_ratio(2 * (precision * sensitivity), precision + sensitivity)

            # One-vs-rest ROC built from hard 0/1 predictions (not scores), so
            # the curve has a single operating point per class.
            binary_true_labels = (y_val_array == class_id).astype(int)
            binary_predicted_labels = (y_pred == class_id).astype(int)
            fpr, tpr, _ = roc_curve(binary_true_labels, binary_predicted_labels)
            roc_auc = auc(fpr, tpr)

            models_report.append({
                f'{class_label}_specificity': specificity,
                f'{class_label}_sensitivity': sensitivity,
                f'{class_label}_precision': precision,
                f'{class_label}_f1': f1,
                f'{class_label}_roc_auc': roc_auc
            })

    return models_report

In [14]:
# Define the list of models and their respective hyperparameter grids.
# models[i] is tuned with param_grids[i]: keep both lists in the same order
# when commenting entries in or out.
models = [
    #SVC(decision_function_shape='ovr', class_weight='balanced'),
    # random_state fixes the data shuffling done by the liblinear solver, so repeated runs give the same fit
    LogisticRegression(multi_class='ovr', class_weight='balanced', random_state=42),
    #KNeighborsClassifier(),
    #RandomForestClassifier(),
    #LinearDiscriminantAnalysis(),
    #XGBClassifier(),
]

param_grids = [
    #{'kernel':('linear', 'rbf'), 'C':[0.1, 1, 10], 'gamma':[0.001, 0.01, 0.1]},
    {'penalty' : ['l1', 'l2'], 'C':[0.1, 1, 10], 'solver': ['liblinear']},
    #{'n_neighbors': [3, 5, 7, 9], 'weights' : ['uniform', 'distance'], 'algorithm': ['ball_tree', 'kd_tree', 'brute'], 'p': [1, 2]},
    #{'n_estimators' : [50, 100, 200], 'max_depth': [3, 5, 7, 9], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}, # still takes a lot of time
    #{'solver': ['svd', 'eigen'], 'shrinkage': [None, 'auto', 0.1, 0.5, 1.0]},
    #{'max_depth': [4, 5], 'learning_rate': [0.1, 0.01], 'n_estimators': [200, 300], 'subsample': [0.5, 0.7, 1.0], 'colsample_bytree': [0.8, 0.9, 1.0]},
]

# The two lists are paired with zip(), which would silently ignore unmatched entries
assert len(models) == len(param_grids), "each model needs exactly one parameter grid"

# Tune every model with 5-fold CV on weighted F1 and report validation metrics
models_report = compare_models(models, param_grids, X_train_normalized, y_train_encoded, X_val_normalized, y_val_encoded, scoring_metric='f1_weighted', cv=5)

print("Model Report:", models_report)


Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ................C=0.1, penalty=l1, solver=liblinear; total time=   1.4s
[CV] END ................C=0.1, penalty=l1, solver=liblinear; total time=   2.7s
[CV] END ................C=0.1, penalty=l1, solver=liblinear; total time=   3.6s
[CV] END ................C=0.1, penalty=l1, solver=liblinear; total time=   3.0s
[CV] END ................C=0.1, penalty=l1, solver=liblinear; total time=   2.4s
[CV] END ................C=0.1, penalty=l2, solver=liblinear; total time=   1.5s
[CV] END ................C=0.1, penalty=l2, solver=liblinear; total time=   1.3s
[CV] END ................C=0.1, penalty=l2, solver=liblinear; total time=   1.1s
[CV] END ................C=0.1, penalty=l2, solver=liblinear; total time=   2.2s
[CV] END ................C=0.1, penalty=l2, solver=liblinear; total time=   0.8s
[CV] END ..................C=1, penalty=l1, solver=liblinear; total time=  59.5s
[CV] END ..................C=1, penalty=l1, solve



[CV] END ..................C=1, penalty=l1, solver=liblinear; total time=  40.5s




[CV] END ..................C=1, penalty=l1, solver=liblinear; total time= 1.0min




[CV] END ..................C=1, penalty=l1, solver=liblinear; total time=  44.1s
[CV] END ..................C=1, penalty=l2, solver=liblinear; total time=   3.6s
[CV] END ..................C=1, penalty=l2, solver=liblinear; total time=   3.7s
[CV] END ..................C=1, penalty=l2, solver=liblinear; total time=   2.4s
[CV] END ..................C=1, penalty=l2, solver=liblinear; total time=   1.9s
[CV] END ..................C=1, penalty=l2, solver=liblinear; total time=   3.6s




[CV] END .................C=10, penalty=l1, solver=liblinear; total time= 2.5min




[CV] END .................C=10, penalty=l1, solver=liblinear; total time= 2.8min




[CV] END .................C=10, penalty=l1, solver=liblinear; total time= 3.2min




[CV] END .................C=10, penalty=l1, solver=liblinear; total time= 2.2min




[CV] END .................C=10, penalty=l1, solver=liblinear; total time= 2.8min
[CV] END .................C=10, penalty=l2, solver=liblinear; total time=   5.9s
[CV] END .................C=10, penalty=l2, solver=liblinear; total time=   8.3s
[CV] END .................C=10, penalty=l2, solver=liblinear; total time=   3.4s
[CV] END .................C=10, penalty=l2, solver=liblinear; total time=   6.9s
[CV] END .................C=10, penalty=l2, solver=liblinear; total time=   5.5s
Model Report: [{'model': LogisticRegression(C=1, class_weight='balanced', multi_class='ovr',
                   penalty='l1', solver='liblinear'), 'best_params': {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}, 'accuracy': 0.7724409448818897, 'balanced_accuracy': 0.6185914813462096, 'weighted_f1': 0.7676732965180896}, {'bcc_specificity': 0.8160621761658031, 'bcc_sensitivity': 0.8152610441767069, 'bcc_precision': 0.7408759124087592, 'bcc_f1': 0.7762906309751435, 'bcc_roc_auc': 0.8156616101712549}, {'mel_speci

In [55]:
# Re-display the report returned by compare_models.
# NOTE(review): the output stored under this cell lists solver 'lbfgs', which is not in the
# grid defined above (only 'liblinear'), so it appears to come from an earlier run — re-run to refresh.
print("Model Report:", models_report)

Model Report: [{'model': LogisticRegression(C=0.1, class_weight='balanced', multi_class='ovr'), 'best_params': {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}, 'accuracy': 0.7480314960629921, 'balanced_accuracy': 0.6123697703006844, 'weighted_f1': 0.754554307148425}, {'bcc_specificity': 0.8290155440414507, 'bcc_sensitivity': 0.7791164658634538, 'bcc_precision': 0.7461538461538462, 'bcc_f1': 0.7622789783889982, 'bcc_roc_auc': 0.8040660049524524}, {'mel_specificity': 0.8429054054054054, 'mel_sensitivity': 0.7920353982300885, 'mel_precision': 0.8523809523809524, 'mel_f1': 0.8211009174311927, 'mel_roc_auc': 0.8174704018177469}, {'scc_specificity': 0.91921768707483, 'scc_sensitivity': 0.26595744680851063, 'scc_precision': 0.20833333333333334, 'scc_f1': 0.2336448598130841, 'scc_roc_auc': 0.5925875669416703}]


In [12]:
# Save the best model to a file.
# compare_models returns only the report, so the fitted estimator is recovered
# from it: the summary rows are the ones carrying a 'model' key, and the best
# one is taken to be the row with the highest validation weighted F1.
summary_rows = [row for row in models_report if 'model' in row]
best_model = max(summary_rows, key=lambda row: row['weighted_f1'])['model']

# NOTE(review): the file name says "SVM" although the model tuned above is a
# LogisticRegression — rename once downstream consumers of this path are checked.
best_model_filename = r'../models/best_model_multiclassSVM.pkl'

# Create the target directory so the write cannot fail on a fresh checkout
os.makedirs(os.path.dirname(best_model_filename), exist_ok=True)
with open(best_model_filename, 'wb') as file:
    pickle.dump(best_model, file)