In [1]:
import os
# All data paths used below are relative, so the notebook moves its working
# directory one level up (presumably to the repository root - confirm).
# The starting directory is remembered the first time this cell runs so that
# re-running the cell does not climb one more level on every execution.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, ".."))


In [2]:
from skin_lesion_cad.utils.data_utils import get_class
from copy import deepcopy
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import functools as ft
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from skopt import BayesSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
import xgboost as xgb
import time
import warnings
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, auc, accuracy_score, f1_score
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from tqdm import tqdm


# Render every matplotlib figure at 200 dpi.
mpl.rcParams.update({'figure.dpi': 200})


In [3]:
# Challenge identifier; read_feats uses it as the file-name prefix of every
# feature table it loads.
CHALLENGE = "chall1"
# Directory with the precomputed feature tables, relative to the working
# directory. NOTE(review): read_feats spells out the same path itself.
feature_dir = "data/processed/features"


In [4]:

def _load_feature_table(feature_path, strip_suffix):
    """Read one Feather feature table and normalise its ``image`` column.

    Args:
        feature_path: Path of the ``.feather`` file to read.
        strip_suffix: When true, everything from the first ``_`` of the file
            stem onwards is dropped as well (used for the color and shape
            tables); otherwise only directory and extension are removed.

    Returns:
        The loaded DataFrame with ``image`` reduced to a bare image id.
    """
    table = pd.read_feather(feature_path)

    def to_image_id(raw_path):
        # Keep the last path component and cut at the first dot.
        image_id = raw_path.split("/")[-1].split(".")[0]
        return image_id.split("_")[0] if strip_suffix else image_id

    table["image"] = table["image"].apply(to_image_id)
    return table


def read_feats(color="raw", challenge=None, save_path="data/processed/features"):
    """Load and merge the color, GLCM, LBP and shape features of every split.

    For each of the ``train``, ``val`` and ``test`` splits the four feature
    tables are read, their ``image`` columns are normalised to bare image ids
    and the tables are merged on ``image`` in the order color, GLCM, LBP,
    shape (``pd.merge`` with its default join).

    Args:
        color: Which color feature table to use, ``"raw"`` or ``"meanshift"``.
        challenge: File-name prefix of the feature tables. Defaults to the
            module-level ``CHALLENGE`` constant.
        save_path: Directory that contains the ``.feather`` files.

    Returns:
        A tuple ``(train, val, test)`` of merged DataFrames.

    Raises:
        ValueError: If ``color`` is not one of the supported variants.
    """
    color_variants = ("raw", "meanshift")
    # Fail loudly before touching the disk; the previous implementation fell
    # through and returned None for an unknown variant.
    if color not in color_variants:
        raise ValueError(
            f"Unknown color variant {color!r}; expected one of {color_variants}")

    if challenge is None:
        challenge = CHALLENGE
    save_path = Path(save_path)

    merged_splits = []
    for split in ("train", "val", "test"):
        prefix = f"{challenge}_{split}"
        # (file name, whether the '_...' suffix of the stem is stripped)
        sources = [
            (f"{prefix}_color_{color}.feather", True),
            (f"{prefix}_glcm_original_image.feather", False),
            (f"{prefix}_lbp_original_image.feather", False),
            (f"{prefix}_shape.feather", True),
        ]
        tables = [_load_feature_table(save_path / file_name, strip_suffix)
                  for file_name, strip_suffix in sources]
        merged_splits.append(ft.reduce(
            lambda left, right: pd.merge(left, right, on='image'), tables))

    return tuple(merged_splits)



In [5]:
# Merged feature tables for the three splits, built from the raw color features.
(all_feat_train,
 all_feat_val,
 all_feat_test) = read_feats(color="raw")


In [25]:
# Persist the merged train/val feature tables.
# NOTE(review): the files are written in Feather format although their names
# end in ".csv" - confirm which of the two downstream readers expect.
for merged_frame, out_path in (
    (all_feat_train, "chall1_all_feat_train_final.csv"),
    (all_feat_val, "chall1_all_feat_val_final.csv"),
):
    merged_frame.to_feather(out_path)

In [6]:
# (rows, columns) of the train, val and test feature tables, in that order.
tuple(frame.shape for frame in (all_feat_train, all_feat_val, all_feat_test))


((15195, 359), (3796, 359), (6340, 359))

In [7]:
# Train and validation tables are pooled into one training set (model
# selection below uses cross-validation instead of the fixed val split).
# ignore_index gives the pooled frame a fresh 0..n-1 index; without it the
# row labels of both inputs are kept and every label of the val frame would
# collide with a train label.
all_feat_Train = pd.concat([all_feat_train, all_feat_val], ignore_index=True)


In [8]:
# Labels come from the image id via get_class (helper imported from
# skin_lesion_cad; presumably maps an id to its class for the challenge).
y_train = all_feat_Train.loc[:, 'image'].apply(get_class, chall=CHALLENGE)

# Feature matrices: every column except the image id.
X_train = all_feat_Train.drop('image', axis=1)
X_test = all_feat_test.drop('image', axis=1)


In [9]:
# Replace NaN and +/-inf feature values with 0. np.nan_to_num returns a new
# array instead of mutating the input, and works the same whether X_* is still
# a DataFrame or already an array from a previous run of this cell.
X_train = np.nan_to_num(X_train, nan=0.0, posinf=0.0, neginf=0.0)
X_test = np.nan_to_num(X_test, nan=0.0, posinf=0.0, neginf=0.0)

# Standardise with statistics learned on the training data only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Project onto 150 principal components fitted on the training data.
# random_state keeps the projection repeatable when scikit-learn picks a
# randomized SVD solver for this problem size.
pca = PCA(n_components=150, random_state=0)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)


In [10]:
# Keyword arguments passed to xgb.XGBClassifier, grouped by role.
xgb_params = {
    # learning task
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "base_score": 0.5,
    "scale_pos_weight": 1,
    # boosting schedule
    "booster": "gbtree",
    "n_estimators": 100,
    "learning_rate": 0.15,
    "num_parallel_tree": 1,
    "early_stopping_rounds": None,
    "callbacks": None,
    # tree construction
    "tree_method": "hist",
    "grow_policy": "lossguide",
    "max_depth": 10,
    "max_leaves": 0,
    "max_bin": 256,
    "min_child_weight": 1,
    "max_delta_step": 0,
    "max_cat_to_onehot": 4,
    "enable_categorical": False,
    "interaction_constraints": "",
    "monotone_constraints": "()",
    # regularisation
    "gamma": 0.4,
    "reg_alpha": 1.6,
    "reg_lambda": 3.2,
    # row / column sampling
    "subsample": 1,
    "sampling_method": "uniform",
    "colsample_bytree": 1,
    "colsample_bylevel": 1,
    "colsample_bynode": 1,
    # runtime and bookkeeping
    "n_jobs": 0,
    "gpu_id": -1,
    "predictor": "auto",
    "random_state": 0,
    "use_label_encoder": False,
    "validate_parameters": 1,
    "verbosity": None,
    "importance_type": None,
}

In [11]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from tqdm import tqdm
# RBF SVM, XGBoost, and a soft-voting ensemble of the two.
# probability=True makes SVC fit an internal cross-validated probability
# model that shuffles the data; random_state pins that shuffle so the
# probabilities (and therefore the soft vote) are repeatable across runs.
clf1 = SVC(kernel='rbf', probability=True,
           class_weight='balanced', C=100, gamma=0.001, random_state=0)
clf2 = xgb.XGBClassifier(**xgb_params)
eclf = VotingClassifier(
    estimators=[('svm', clf1), ('xgb', clf2)], voting='soft')

# 5-fold cross-validated balanced accuracy for each candidate. Iterating over
# a list (rather than a zip object) lets tqdm show the total.
candidates = [('SVM', clf1), ('XGB', clf2), ('Ensemble', eclf)]
for label, clf in tqdm(candidates):
    scores = cross_val_score(clf, X_train_pca, y_train,
                             scoring='balanced_accuracy', cv=5)
    # The metric is balanced accuracy, so label the printout accordingly.
    print("Balanced accuracy: %0.4f (+/- %0.4f) [%s]" %
          (scores.mean(), scores.std(), label))


1it [12:07, 727.66s/it]

Accuracy: 0.8380 (+/- 0.0068) [SVM]


2it [13:04, 332.83s/it]

Accuracy: 0.8288 (+/- 0.0058) [XGB]


3it [27:33, 551.30s/it]

Accuracy: 0.8410 (+/- 0.0066) [Ensemble]





In [12]:
# Fit the soft-voting ensemble on the full training set and predict the test
# set (fit returns the fitted estimator, so the two calls chain).
pred = eclf.fit(X_train_pca, y_train).predict(X_test_pca)

In [17]:
# Fit the stand-alone SVM on the full training set and predict the test set,
# to compare its predictions with the ensemble's.
pred_svm = clf1.fit(X_train_pca, y_train).predict(X_test_pca)

In [14]:
# Test image ids: last path component, cut at the first underscore.
pred_name = all_feat_test["image"].map(
    lambda image_path: image_path.rpartition("/")[2].partition("_")[0])

In [15]:
# One row per test image: the image id (moved from the index into a regular
# column by reset_index) and the ensemble prediction.
pred_df = pd.DataFrame(
    data=pred, index=pred_name, columns=["pred"]
).reset_index()

In [23]:
# Display-only view with a descriptive column name; the result is not assigned,
# so pred_df keeps its "pred" column for the cells that follow.
# pred_svm is attached here explicitly so the displayed table does not depend
# on a cell further down having been executed first.
pred_df.assign(pred_svm=pred_svm).rename(columns={"pred":"pred_xgb+svm"})

Unnamed: 0,image,pred_xgb+svm,pred_svm
0,xxx03908,0,0
1,xxx05677,1,1
2,xxx02725,0,0
3,xxx02891,1,1
4,xxx01938,0,0
...,...,...,...
6335,xxx01129,1,1
6336,xxx02360,1,1
6337,xxx02080,1,0
6338,xxx03119,1,1


In [24]:
# Save the test-set predictions. The SVM predictions are attached here so the
# exported file contains both prediction columns when the notebook is run top
# to bottom; previously the column was only present if a later cell had been
# executed before this one.
pred_df.assign(pred_svm=pred_svm).to_csv("chall1_pred_updated_test_set.csv")

In [19]:
# Attach the stand-alone SVM predictions next to the ensemble predictions
# (adds the column to pred_df in place).
pred_df['pred_svm'] = pred_svm

In [21]:
# Both arguments are model predictions, so this is the fraction of test images
# on which the ensemble and the SVM agree - not accuracy against ground truth.
accuracy_score(y_true=pred_df['pred'], y_pred=pred_df['pred_svm'])

0.9487381703470031