In [1]:
import os

# Run the notebook from the parent directory of the folder it lives in.
# The start directory is captured only the first time this cell runs, so
# re-running the cell does not climb one more level on every execution
# (a bare relative os.chdir("..") is not idempotent).
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))


In [2]:
from skin_lesion_cad.utils.data_utils import get_class
from copy import deepcopy
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import functools as ft
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from skopt import BayesSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
import xgboost as xgb
import time
import warnings
from pathlib import Path
# Set the default matplotlib figure resolution to 200 dpi for the whole notebook.
mpl.rcParams['figure.dpi'] = 200


In [3]:
# Notebook-wide configuration.
# CHALLENGE is the prefix of every feature file name and is also passed to get_class.
CHALLENGE = "chall1"
# Directory holding the pre-computed feature tables, relative to the working directory.
feature_dir = "data/processed/features"


In [18]:

def read_feats(color="raw", verbose=True):
    """Load and merge the pre-computed feature tables of the current CHALLENGE.

    For both the ``train`` and the ``val`` split, the GLCM, LBP, colour,
    masked GLCM, masked LBP and shape feather files are read from
    ``data/processed/features`` and joined on the ``image`` column (in that
    order, using the default ``pd.merge`` join). Every column of the masked
    tables except ``image`` is prefixed with ``masked_`` so that it does not
    clash with the unmasked column of the same name.

    Parameters
    ----------
    color : str, default "raw"
        Colour-feature variant to include: ``"raw"`` reads
        ``*_color_raw.feather``, ``"meanshift"`` reads
        ``*_color_meanshift.feather``. Only the requested variant is loaded.
    verbose : bool, default True
        When true, print the ``image`` column of the validation GLCM table
        as a sanity check. Pass ``False`` to silence it.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The merged train table and the merged val table.

    Raises
    ------
    ValueError
        If ``color`` is not one of the supported variants. Raised before any
        file is read.
    """
    color_tables = {"raw": "color_raw", "meanshift": "color_meanshift"}
    if color not in color_tables:
        # Fail loudly instead of falling through and returning None, which
        # would only surface later as a confusing unpacking error.
        raise ValueError(
            f"Unknown color variant {color!r}; "
            f"expected one of {sorted(color_tables)}")

    save_path = Path("data/processed/features")

    def _load(split, table, masked=False):
        # Read one feather file; optionally tag its feature columns as masked.
        frame = pd.read_feather(
            save_path / f"{CHALLENGE}_{split}_{table}.feather")
        if masked:
            frame.columns = [
                "masked_" + col if col != "image" else col
                for col in frame.columns]
        return frame

    def _load_split(split):
        # Load every table of one split and join them on the image column.
        frames = [
            _load(split, "glcm"),
            _load(split, "lbp"),
            _load(split, color_tables[color]),
            _load(split, "glcm_masked", masked=True),
            _load(split, "lbp_masked", masked=True),
            _load(split, "shape"),
        ]
        if verbose and split == "val":
            print(frames[0]["image"])
        return ft.reduce(
            lambda left, right: pd.merge(left, right, on='image'), frames)

    return _load_split("train"), _load_split("val")


In [19]:
# Single switch for the colour-feature variant ("raw" or "meanshift").
# It is passed on to read_feats so that changing it here actually takes effect.
color = "raw"
all_feat_train, all_feat_val = read_feats(color)

# Features are every column except 'image'; labels are obtained by applying
# get_class to each entry of the 'image' column.
X_train = all_feat_train.drop(columns='image')
y_train = all_feat_train['image'].apply(get_class, chall=CHALLENGE)

# The val feature tables are used as the held-out evaluation set.
X_test = all_feat_val.drop(columns='image')
y_test = all_feat_val['image'].apply(get_class, chall=CHALLENGE)


0       data/processed/chall1/val/others/mel02811_inpa...
1       data/processed/chall1/val/others/bkl01760_inpa...
2       data/processed/chall1/val/others/scc00461_inpa...
3       data/processed/chall1/val/others/bcc02405_inpa...
4       data/processed/chall1/val/others/bkl01900_inpa...
                              ...                        
3791    data/processed/chall1/val/nevus/nev07970_inpai...
3792    data/processed/chall1/val/nevus/nev08398_inpai...
3793    data/processed/chall1/val/nevus/nev08303_inpai...
3794    data/processed/chall1/val/nevus/nev08137_inpai...
3795    data/processed/chall1/val/nevus/nev09035_inpai...
Name: image, Length: 3796, dtype: object


In [21]:
# (rows, columns) of the training feature table.
X_train.shape

(15195, 652)

In [22]:

# Zero out every non-finite entry (NaN, +inf, -inf) in both feature tables:
# infinities are first turned into NaN, then all NaN values are filled with 0.
X_train = X_train.replace([np.inf, -np.inf], np.nan).fillna(0)
X_test = X_test.replace([np.inf, -np.inf], np.nan).fillna(0)


In [23]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply the same
# transformation to both splits so no test statistics leak into the fit.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [24]:
# Full keyword-argument set passed to xgb.XGBClassifier.
xgb_params = dict(
    base_score=0.5,
    booster='gbtree',
    callbacks=None,
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    early_stopping_rounds=None,
    enable_categorical=False,
    eval_metric='auc',
    gamma=0.4,
    gpu_id=-1,
    grow_policy='lossguide',
    importance_type=None,
    interaction_constraints='',
    learning_rate=0.15,
    max_bin=256,
    max_cat_to_onehot=4,
    max_delta_step=0,
    max_depth=10,
    max_leaves=0,
    min_child_weight=1,
    monotone_constraints='()',
    n_estimators=100,
    n_jobs=0,
    num_parallel_tree=1,
    objective='binary:logistic',
    predictor='auto',
    random_state=0,
    reg_alpha=1.6,
    reg_lambda=3.2,
    sampling_method='uniform',
    scale_pos_weight=1,
    subsample=1,
    tree_method='hist',
    use_label_encoder=False,
    validate_parameters=1,
    verbosity=None,
)


In [25]:
# Fit the boosted-tree classifier with the fixed hyper-parameters above.
xgb_best = xgb.XGBClassifier(**xgb_params)
xgb_best.fit(X_train, y_train.to_numpy().ravel())

# Training split: predictions, confusion matrix, accuracy.
train_predictions = xgb_best.predict(X_train)
cfm_train = confusion_matrix(y_train, train_predictions)
accs_train = accuracy_score(y_train, train_predictions)

# Held-out split: predictions, confusion matrix, accuracy.
test_predictions = xgb_best.predict(X_test)
cfm_test = confusion_matrix(y_test, test_predictions)
accs_test = accuracy_score(y_test, test_predictions)

# Report train accuracy, test accuracy and the test confusion matrix, one per line.
print(accs_train, accs_test, cfm_test, sep="\n")


0.9994735110233629
0.8395679662802951
[[1621  310]
 [ 299 1566]]


In [26]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, auc, accuracy_score, f1_score

# Baseline RBF-kernel SVM with balanced class weights, trained on the
# standardised features (fit returns the fitted estimator itself).
svc = SVC(
    C=1.0,
    kernel='rbf',
    class_weight='balanced',
    probability=True,
).fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = svc.predict(X_test)
svc_report = classification_report(y_test, y_pred)
svc_cfm = confusion_matrix(y_test, y_pred)
svc_accuracy = accuracy_score(y_test, y_pred)

print(svc_report)
print('Confusion Matrix\n', svc_cfm)
print('\nAccuracy: ', svc_accuracy)


              precision    recall  f1-score   support

           0       0.83      0.80      0.82      1931
           1       0.80      0.83      0.81      1865

    accuracy                           0.82      3796
   macro avg       0.82      0.82      0.82      3796
weighted avg       0.82      0.82      0.82      3796

Confusion Matrix
 [[1553  378]
 [ 323 1542]]

Accuracy:  0.815331928345627


In [28]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Candidate values for the RBF-SVM hyper-parameters (5 x 5 combinations).
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

# Randomized search: only n_iter of the combinations above are sampled and
# cross-validated, so this is NOT an exhaustive grid search (the variable
# keeps the name `grid` because later cells refer to it).
# random_state pins which candidates are sampled so that the selected
# hyper-parameters are reproducible across kernel restarts.
grid = RandomizedSearchCV(SVC(), param_grid, n_iter=10, refit=True,
                          verbose=3, random_state=0)


# Fit every sampled candidate; with refit=True the best one is then re-trained
# on the full training set.
grid.fit(X_train, y_train)


# Best parameter combination found by the search.
print(grid.best_params_)

# The refitted estimator built from those parameters.
print(grid.best_estimator_)

# Predictions of the refitted best estimator on the held-out split.
grid_predictions = grid.predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, grid_predictions))


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.513 total time= 1.4min
[CV 2/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.511 total time= 1.4min
[CV 3/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.512 total time= 1.6min
[CV 4/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.512 total time= 1.6min
[CV 5/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.512 total time= 1.6min
[CV 1/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.824 total time=  29.3s
[CV 2/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.823 total time=  29.4s
[CV 3/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.830 total time=  29.6s
[CV 4/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.823 total time=  29.4s
[CV 5/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.824 total time=  34.5s
[CV 1/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.816 total time=  58.6s
[CV 2/5] END .......C=1, gamma=0.01, kernel=rbf;

In [29]:
# Confusion matrix and accuracy of the tuned SVM on the held-out split.
grid_cfm = confusion_matrix(y_test, grid_predictions)
grid_accuracy = accuracy_score(y_test, grid_predictions)
print('Confusion Matrix\n', grid_cfm)
print('\nAccuracy: ', grid_accuracy)


Confusion Matrix
 [[1618  313]
 [ 320 1545]]

Accuracy:  0.833245521601686


In [30]:
# Show the hyper-parameter combination selected by the search.
print(grid.best_params_)


{'kernel': 'rbf', 'gamma': 0.001, 'C': 10}
