In [1]:
# Enable the Intel(R) Extension for Scikit-learn only when running on an Intel
# CPU. `cpuinfo` (py-cpuinfo) is imported right here because this check runs
# before the main import block and the name is not imported anywhere else.
import cpuinfo

# `.get` keeps the cell working when the CPU report has no 'brand_raw' entry.
if "Intel" in cpuinfo.get_cpu_info().get('brand_raw', ''):
    from sklearnex import patch_sklearn
    patch_sklearn()

import os
# NOTE(review): moves the kernel's working directory one level up (presumably to
# the project root that the relative `data/processed/...` paths expect — confirm).
# Not idempotent: every re-run of this cell climbs one more directory.
os.chdir("..")

import random
from multiprocessing.dummy import Pool
from pathlib import Path
from typing import Counter

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from skin_lesion_cad.data.BOVW import (BagofWords, ColorDescriptor,
                                       DenseDescriptor, DescriptorsTransformer,
                                       LBPDescriptor)
from skin_lesion_cad.features.colour import (ColorFeaturesDescriptor,
                                             ColorFeaturesExtractor)
from skin_lesion_cad.features.texture import get_glcm, glcm_features, lbph
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.metrics import (accuracy_score, classification_report,
                             cohen_kappa_score, confusion_matrix, f1_score,
                             precision_score)
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.svm import SVC
from tqdm import tqdm

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [3]:
# Patch size passed as `kp_size` to the dense descriptor.
KP_SIZE = 25

# Colour-space name -> OpenCV conversion code used by the colour extractors.
# NOTE(review): cv2.imread returns BGR images while these codes start from RGB;
# confirm that the extractors account for the channel order.
color_spaces = {
    'bgr': cv2.COLOR_RGB2BGR,
    'hsv': cv2.COLOR_RGB2HSV,
    'YCrCb': cv2.COLOR_RGB2YCrCb,
}

# Absolute path of the current working directory.
root_path = Path().resolve()


def _class_images(split_dir, lesion_class):
    """Return an array with the `*inpaint_0_5.png` files of one class folder."""
    return np.array(list((split_dir/lesion_class).glob("*inpaint_0_5.png")))


# Training split, concatenated in mel, bcc, scc order.
chall2_train = root_path/"data/processed/chall2/train"
mel_imgs_all = _class_images(chall2_train, 'mel')
bcc_imgs_all = _class_images(chall2_train, 'bcc')
scc_imgs_all = _class_images(chall2_train, 'scc')
test_imgs_train = np.concatenate([mel_imgs_all, bcc_imgs_all, scc_imgs_all])

# Validation split, same class order (the per-class names are rebound here).
chall2_val = root_path/"data/processed/chall2/val"
mel_imgs_all = _class_images(chall2_val, 'mel')
bcc_imgs_all = _class_images(chall2_val, 'bcc')
scc_imgs_all = _class_images(chall2_val, 'scc')
test_imgs_val = np.concatenate([mel_imgs_all, bcc_imgs_all, scc_imgs_all])

print(f'Train images {len(test_imgs_train)}, Validation images {len(test_imgs_val)}')

Train images 5082, Validation images 1270


## Getting Color Descriptors for Images for BoW

Either run the section below to calculate the descriptors from scratch, or skip it and load the saved pickle files in the section after it.

### Calculate features from scratch

In [None]:
# BRISK detector; it is not passed to either descriptor created in this cell.
brisk = cv2.BRISK_create(thresh=30, octaves=0)

# define descriptors used for BoW
# use gaussian sampling
dense_desc = DenseDescriptor(descriptor=None, min_keypoints=100,
                             max_keypoints=500, kp_size=KP_SIZE,
                             sample_method='gaussian')

# The colour descriptor takes its patch size from KP_SIZE as well (instead of
# a separate literal), so both descriptors stay in sync when the constant is
# tuned.
dense_color = ColorDescriptor(dense_desc, color_spaces, meanshift=None,
                              min_keypoints=100, max_keypoints=500,
                              kp_size=KP_SIZE,
                              sample_method='gaussian')

In [None]:
def _load_and_extract_des_color(x):
    """Load one image and compute its descriptors.

    Args:
        x: ``(image_path, descriptor)`` pair. ``image_path`` is a
            ``pathlib.Path``; ``descriptor`` must expose
            ``detectAndCompute(image, mask)``.

    Returns:
        Tuple ``(des, img_cls, name)`` with the computed descriptors, the class
        label (0 = bcc, 1 = mel, 2 = anything else) and the image file name.
    """
    image_path, descriptor = x
    im = cv2.imread(str(image_path))

    # Derive the label from the class folder and file name only: matching
    # against the whole absolute path mislabels images whenever an unrelated
    # parent directory happens to contain "bcc" or "mel".
    label_source = f"{image_path.parent.name}/{image_path.name}"
    if 'bcc' in label_source:
        img_cls = 0
    elif 'mel' in label_source:
        img_cls = 1
    else:
        img_cls = 2

    # Keypoints are not needed downstream; only the descriptors are returned.
    _, des = descriptor.detectAndCompute(im, None)
    return (des, img_cls, image_path.name)

In [None]:
descriptors_color_train = []
img_classes_color_train = []
img_names_color_train = []

# COLOR FOR TRAIN
# Each job pairs an image path with the shared colour descriptor; the pool from
# multiprocessing.dummy runs the jobs on 8 threads and imap yields the results
# in input order.
train_jobs = [(img_path, dense_color) for img_path in test_imgs_train]
with Pool(8) as pool:
    for (des, img_cls, img_name) in tqdm(pool.imap(_load_and_extract_des_color, train_jobs),
                                         total=len(train_jobs)):
        descriptors_color_train.append(des)
        img_classes_color_train.append(img_cls)
        img_names_color_train.append(img_name)

# Saved under the exact file name that the "Load the dump" cell reads back, so
# computing from scratch and loading the dump are interchangeable.
pd.to_pickle((descriptors_color_train, img_classes_color_train, img_names_color_train),
             'descriptors_color_all_train.pkl')

In [None]:
descriptors_color_val = []
img_classes_color_val = []
img_names_color_val = []

# COLOR FOR VALIDATION
# Each job pairs an image path with the shared colour descriptor; the pool from
# multiprocessing.dummy runs the jobs on 8 threads and imap yields the results
# in input order.
val_jobs = [(img_path, dense_color) for img_path in test_imgs_val]
with Pool(8) as pool:
    for (des, img_cls, img_name) in tqdm(pool.imap(_load_and_extract_des_color, val_jobs),
                                         total=len(val_jobs)):
        descriptors_color_val.append(des)
        img_classes_color_val.append(img_cls)
        img_names_color_val.append(img_name)

# Saved under the exact file name that the "Load the dump" cell reads back, so
# computing from scratch and loading the dump are interchangeable.
pd.to_pickle((descriptors_color_val, img_classes_color_val, img_names_color_val),
             'descriptors_color_all_val_gauss.pkl')

### Load the dump

In [4]:
# Each dump holds a (descriptors, classes, names) triple.
# NOTE: unpickling executes arbitrary code; only load dumps you created yourself.
train_dump = pd.read_pickle('descriptors_color_all_train.pkl')
val_dump = pd.read_pickle('descriptors_color_all_val_gauss.pkl')

descriptors_color_train, img_classes_color_train, img_names_color_train = train_dump
descriptors_color_val, img_classes_color_val, img_names_color_val = val_dump

print(len(descriptors_color_train), len(descriptors_color_val))

5082 1270


## Creating and calculating BoW

In [5]:
# Convert the descriptor lists to NumPy arrays before handing them to BagofWords.
descriptors_color_train = np.asarray(descriptors_color_train)
descriptors_color_val = np.asarray(descriptors_color_val)

# The BoW model is fitted on the training descriptors only and then applied,
# unchanged, to the validation descriptors.
bow_color = BagofWords(n_words=100, n_jobs=-1, random_state=42)
train_BoWed = bow_color.fit_transform(descriptors_color_train, img_classes_color_train)
val_BoWed = bow_color.transform(descriptors_color_val)

# creating dataframes that contain the BoW features, the image class and the
# image name for all images
train_bow_cols = [f'bow_{i}' for i in range(train_BoWed.shape[1])]
val_bow_cols = [f'bow_{i}' for i in range(val_BoWed.shape[1])]
bow_train = pd.DataFrame(train_BoWed.toarray(), columns=train_bow_cols)
bow_val = pd.DataFrame(val_BoWed.toarray(), columns=val_bow_cols)

for frame, classes, names in ((bow_train, img_classes_color_train, img_names_color_train),
                              (bow_val, img_classes_color_val, img_names_color_val)):
    frame['class'] = classes
    frame['name'] = names

# classification and evaluation
bow_feats_train = bow_train.drop(columns=['class', 'name'])
bow_feats_val = bow_val.drop(columns=['class', 'name'])
bow_labels_train = bow_train['class']
bow_labels_val = bow_val['class']

svc = SVC(kernel='rbf', C=1, random_state=42, probability=False, class_weight='balanced')
svc.fit(bow_feats_train, bow_labels_train)
y_pred = svc.predict(bow_feats_val)
y_pred_train = svc.predict(bow_feats_train)

print(confusion_matrix(bow_labels_val, y_pred))
print('Train f1_weighted', f1_score(bow_labels_train, y_pred_train, average='weighted'))
print('Validation f1_weighted', f1_score(bow_labels_val, y_pred, average='weighted'))
print('Train kappa', cohen_kappa_score(bow_labels_train, y_pred_train))
print('Validation kappa', cohen_kappa_score(bow_labels_val, y_pred))

[[399  98   1]
 [126 552   0]
 [ 63  26   5]]
Train f1_weighted 0.785624708932611
Validation f1_weighted 0.7308260342898841
Train kappa 0.6381381740298608
Validation kappa 0.5369398409631391


## Extracting color features (global)

In [6]:
def extract_color(image_path):
    """Extract colour features from the central half of an image.

    Args:
        image_path: ``pathlib.Path`` of the image to read.

    Returns:
        Tuple ``(des, img_cls, name)`` with the extracted features, the class
        label (0 = bcc, 1 = mel, 2 = anything else) and the image file name.

    Raises:
        OSError: if OpenCV cannot read the image.
    """
    im = cv2.imread(str(image_path))
    if im is None:
        # cv2.imread signals failure by returning None; fail with a clear
        # message instead of an AttributeError on `.shape` below.
        raise OSError(f"Could not read image: {image_path}")

    # Keep the central half of the image along both axes.
    rows, cols = im.shape[:2]
    im = im[rows//4:rows*3//4, cols//4:cols*3//4, :]

    # All-ones mask: every pixel of the crop is used. (Reading a mask file whose
    # name has 'inpaint' replaced by 'mask' is the disabled alternative.)
    mask = np.ones_like(im[:, :, 0])

    # Derive the label from the class folder and file name only: matching
    # against the whole absolute path mislabels images whenever an unrelated
    # parent directory happens to contain "bcc" or "mel".
    label_source = f"{image_path.parent.name}/{image_path.name}"
    if 'bcc' in label_source:
        img_cls = 0
    elif 'mel' in label_source:
        img_cls = 1
    else:
        img_cls = 2

    cfe = ColorFeaturesExtractor(color_spaces)
    des = cfe.extract_masked(im, mask)

    return (des, img_cls, image_path.name)

# Run the colour extractor over the training images on 8 threads; imap keeps
# the results in input order.
with Pool(8) as pool:
    color_results_train = list(tqdm(pool.imap(extract_color, test_imgs_train),
                                    total=len(test_imgs_train)))

# Split the (features, class, name) triples into three parallel lists.
colf_train = [des for des, _, _ in color_results_train]
colc_train = [img_cls for _, img_cls, _ in color_results_train]
colnam_train = [img_name for _, _, img_name in color_results_train]

  1%|          | 49/5082 [00:00<00:46, 109.04it/s]

In [None]:
# Run the colour extractor over the validation images on 8 threads; imap keeps
# the results in input order.
with Pool(8) as pool:
    color_results_val = list(tqdm(pool.imap(extract_color, test_imgs_val),
                                  total=len(test_imgs_val)))

# Split the (features, class, name) triples into three parallel lists.
colf_val = [des for des, _, _ in color_results_val]
colc_val = [img_cls for _, img_cls, _ in color_results_val]
colnam_val = [img_name for _, _, img_name in color_results_val]

100%|██████████| 1270/1270 [00:42<00:00, 30.09it/s]


In [None]:
# One frame per split: colour features plus the image name and class columns.
color_df_train = pd.DataFrame(colf_train)
color_df_val = pd.DataFrame(colf_val)
for frame, names, classes in ((color_df_train, colnam_train, colc_train),
                              (color_df_val, colnam_val, colc_val)):
    frame['name'] = names
    frame['class'] = classes

color_labels_train = color_df_train['class']
color_labels_val = color_df_val['class']

# The transformer is fitted on the training features and reused for validation.
scaler = DescriptorsTransformer(imputation=None)
X_train = scaler.fit_transform(color_df_train.drop(columns=['class', 'name']))
X_test = scaler.transform(color_df_val.drop(columns=['class', 'name']))

svc = SVC(kernel='rbf', C=1, random_state=42, probability=False, class_weight='balanced')
svc.fit(X_train, color_labels_train)
y_pred = svc.predict(X_test)
y_pred_train = svc.predict(X_train)

print(confusion_matrix(color_labels_val, y_pred))
print('Train f1_weighted', f1_score(color_labels_train, y_pred_train, average='weighted'))
print('Validation f1_weighted', f1_score(color_labels_val, y_pred, average='weighted'))
print('Train kappa', cohen_kappa_score(color_labels_train, y_pred_train))
print('Validation kappa', cohen_kappa_score(color_labels_val, y_pred))

[[435  61   2]
 [ 64 613   1]
 [ 55  34   5]]
Train f1_weighted 0.8177386269055505
Validation f1_weighted 0.8037727515511731
Train kappa 0.7025700925383747
Validation kappa 0.6781362995076101


```
Preprocessed 

[[435  61   2]
 [ 62 616   0]
 [ 57  31   6]]
Train f1_weighted 0.8190181201191215
Validation f1_weighted 0.8075351154802809
Train kappa 0.706336521578109
Validation kappa 0.6840692709452578

```

## Extracting texture features

### GLCM

In [None]:
def extract_text(image_path):
    """Compute GLCM texture features for one image read in grayscale.

    Args:
        image_path: ``pathlib.Path`` of the image to read.

    Returns:
        Tuple ``(des, img_cls, name)`` with the output of ``glcm_features``,
        the class label (0 = bcc, 1 = mel, 2 = anything else) and the image
        file name.
    """
    im = cv2.imread(str(image_path), cv2.IMREAD_GRAYSCALE)

    # Derive the label from the class folder and file name only: matching
    # against the whole absolute path mislabels images whenever an unrelated
    # parent directory happens to contain "bcc" or "mel".
    label_source = f"{image_path.parent.name}/{image_path.name}"
    if 'bcc' in label_source:
        img_cls = 0
    elif 'mel' in label_source:
        img_cls = 1
    else:
        img_cls = 2

    des = glcm_features(im)

    return (des, img_cls, image_path.name)

# GLCM features for both splits, computed on 8 threads; imap keeps the results
# in input order.
with Pool(8) as pool:
    glcm_results_train = list(tqdm(pool.imap(extract_text, test_imgs_train),
                                   total=len(test_imgs_train)))

txtf_train = [des for des, _, _ in glcm_results_train]
txtcl_train = [img_cls for _, img_cls, _ in glcm_results_train]
txtnam_train = [img_name for _, _, img_name in glcm_results_train]

with Pool(8) as pool:
    glcm_results_val = list(tqdm(pool.imap(extract_text, test_imgs_val),
                                 total=len(test_imgs_val)))

txtf_val = [des for des, _, _ in glcm_results_val]
txtcl_val = [img_cls for _, img_cls, _ in glcm_results_val]
txtnam_val = [img_name for _, _, img_name in glcm_results_val]

100%|██████████| 5082/5082 [03:17<00:00, 25.70it/s]
100%|██████████| 1270/1270 [00:47<00:00, 26.88it/s]


In [None]:
def _flatten_glcm(feature_dict):
    """Flatten one GLCM result into a ``{'<feature>_<index>': value}`` dict.

    Only the first entry (index 0) of every feature is unrolled.
    """
    return {f'{feat}_{featvalidx}': featval
            for feat in feature_dict.keys()
            for featvalidx, featval in enumerate(feature_dict[feat][0])}


# unravelling the per-image feature dicts into flat records, one per image
txtf_exp_train = [_flatten_glcm(features) for features in txtf_train]
txtf_exp_val = [_flatten_glcm(features) for features in txtf_val]

# One frame per split: flattened GLCM features plus image name and class.
glcm_df_train = pd.DataFrame(txtf_exp_train)
glcm_df_val = pd.DataFrame(txtf_exp_val)
for frame, names, classes in ((glcm_df_train, txtnam_train, txtcl_train),
                              (glcm_df_val, txtnam_val, txtcl_val)):
    frame['name'] = names
    frame['class'] = classes

glcm_df_train

Unnamed: 0,contrast_0,contrast_1,contrast_2,contrast_3,contrast_4,contrast_5,contrast_6,contrast_7,contrast_8,contrast_9,...,ASM_12,ASM_13,ASM_14,ASM_15,ASM_16,ASM_17,ASM_18,ASM_19,name,class
0,41.987989,31.020672,50.765211,31.703470,117.009718,152.920052,137.938477,148.026931,178.592599,204.223031,...,0.026515,0.026282,0.026584,0.026243,0.025811,0.025288,0.025815,0.025245,mel01343.jpg,1
1,76.355886,43.792750,65.651775,52.692052,118.548239,114.326271,107.928300,122.639454,136.326918,127.007138,...,0.002152,0.002216,0.002174,0.002109,0.001988,0.001990,0.001971,0.001917,mel01227.jpg,1
2,16.526545,9.542996,15.696686,10.666185,41.803324,45.431693,42.570624,48.334715,61.743993,60.587828,...,0.102337,0.100986,0.102186,0.100939,0.100002,0.097187,0.099258,0.096659,mel01949.jpg,1
3,48.840398,37.976623,49.925166,30.026176,88.206350,98.276857,91.186063,90.198793,106.688174,110.918621,...,0.000384,0.000377,0.000372,0.000386,0.000335,0.000328,0.000327,0.000325,mel01053.jpg,1
4,22.129461,16.118936,26.501190,17.036345,45.301778,53.244109,51.498925,54.014548,59.115304,64.008211,...,0.000969,0.000948,0.000897,0.000910,0.000839,0.000798,0.000779,0.000784,mel01651.jpg,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5077,4.467907,2.552965,3.910403,3.266360,11.468224,11.022652,10.991300,13.631830,16.051150,14.319608,...,0.002262,0.002228,0.002204,0.002243,0.001855,0.001783,0.001790,0.001815,scc00018.jpg,2
5078,19.089717,14.352921,22.038526,13.077160,43.867134,58.279395,50.575674,46.675202,59.066682,71.300984,...,0.000709,0.000648,0.000660,0.000724,0.000573,0.000505,0.000535,0.000571,scc00129.jpg,2
5079,12.275377,8.191607,13.701198,8.656603,30.304224,35.452448,30.083798,31.448886,40.072655,42.468367,...,0.000997,0.000980,0.001002,0.001033,0.000864,0.000850,0.000863,0.000867,scc00113.jpg,2
5080,4.870508,3.315998,4.737967,3.053432,12.835820,14.917249,12.731582,13.762764,17.560410,18.368369,...,0.000906,0.000893,0.000894,0.000931,0.000741,0.000714,0.000727,0.000747,scc00124.jpg,2


In [None]:
glcm_labels_train = glcm_df_train['class']
glcm_labels_val = glcm_df_val['class']

# The transformer is fitted on the training features and reused for validation.
scaler = DescriptorsTransformer(None)
X_train = scaler.fit_transform(glcm_df_train.drop(columns=['class', 'name']))
X_test = scaler.transform(glcm_df_val.drop(columns=['class', 'name']))

svc = SVC(kernel='rbf', C=1, random_state=42, probability=False, class_weight='balanced')
svc.fit(X_train, glcm_labels_train)
y_pred = svc.predict(X_test)
y_pred_train = svc.predict(X_train)

print(confusion_matrix(glcm_labels_val, y_pred))
print('Train f1_weighted', f1_score(glcm_labels_train, y_pred_train, average='weighted'))
print('Validation f1_weighted', f1_score(glcm_labels_val, y_pred, average='weighted'))
print('Train kappa', cohen_kappa_score(glcm_labels_train, y_pred_train))
print('Validation kappa', cohen_kappa_score(glcm_labels_val, y_pred))

[[340 158   0]
 [157 521   0]
 [ 60  33   1]]
Train f1_weighted 0.6573103866898445
Validation f1_weighted 0.6545046019424062
Train kappa 0.3969091532627179
Validation kappa 0.3923188426193056


Processed

```
[[347 151   0]
 [147 531   0]
 [ 60  34   0]]
Train f1_weighted 0.6709994708774802
Validation f1_weighted 0.6653965548931365
Train kappa 0.42462073093515984
Validation kappa 0.41537883413969656
```

### LBP

In [None]:
def extract_lbp(image_path):
    """Compute the LBP histogram of the central half of a grayscale image.

    Args:
        image_path: ``pathlib.Path`` of the image to read.

    Returns:
        Tuple ``(des, img_cls, name)`` with the output of ``lbph``, the class
        label (0 = bcc, 1 = mel, 2 = anything else) and the image file name.

    Raises:
        OSError: if OpenCV cannot read the image.
    """
    im = cv2.imread(str(image_path), cv2.IMREAD_GRAYSCALE)
    if im is None:
        # cv2.imread signals failure by returning None; fail with a clear
        # message instead of an AttributeError on `.shape` below.
        raise OSError(f"Could not read image: {image_path}")

    # select central half of the image
    rows, cols = im.shape[:2]
    im = im[rows//4:rows*3//4, cols//4:cols*3//4]

    # Derive the label from the class folder and file name only: matching
    # against the whole absolute path mislabels images whenever an unrelated
    # parent directory happens to contain "bcc" or "mel".
    label_source = f"{image_path.parent.name}/{image_path.name}"
    if 'bcc' in label_source:
        img_cls = 0
    elif 'mel' in label_source:
        img_cls = 1
    else:
        img_cls = 2

    des = lbph(im)

    return (des, img_cls, image_path.name)

# LBP histograms for both splits, computed on 8 threads; imap keeps the results
# in input order. The txtf_*/txtcl_*/txtnam_* names are rebound to the LBP
# results here.
with Pool(8) as pool:
    lbp_results_train = list(tqdm(pool.imap(extract_lbp, test_imgs_train),
                                  total=len(test_imgs_train)))

txtf_train = [des for des, _, _ in lbp_results_train]
txtcl_train = [img_cls for _, img_cls, _ in lbp_results_train]
txtnam_train = [img_name for _, _, img_name in lbp_results_train]

with Pool(8) as pool:
    lbp_results_val = list(tqdm(pool.imap(extract_lbp, test_imgs_val),
                                total=len(test_imgs_val)))

txtf_val = [des for des, _, _ in lbp_results_val]
txtcl_val = [img_cls for _, img_cls, _ in lbp_results_val]
txtnam_val = [img_name for _, _, img_name in lbp_results_val]
        
   


100%|██████████| 5082/5082 [02:18<00:00, 36.68it/s]
100%|██████████| 1270/1270 [00:35<00:00, 35.59it/s]


In [None]:
# Column names lbp_0 .. lbp_{n-1}; n is the length of the first training
# histogram and is used for both splits.
lbp_columns = [f'lbp_{i}' for i in range(len(txtf_train[0]))]

# One frame per split: LBP histogram bins plus image name and class.
lbp_df_train = pd.DataFrame(txtf_train, columns=lbp_columns)
lbp_df_val = pd.DataFrame(txtf_val, columns=lbp_columns)
for frame, names, classes in ((lbp_df_train, txtnam_train, txtcl_train),
                              (lbp_df_val, txtnam_val, txtcl_val)):
    frame['name'] = names
    frame['class'] = classes

lbp_df_train

Unnamed: 0,lbp_0,lbp_1,lbp_2,lbp_3,lbp_4,lbp_5,lbp_6,lbp_7,lbp_8,lbp_9,...,lbp_142,lbp_143,lbp_144,lbp_145,lbp_146,lbp_147,lbp_148,lbp_149,name,class
0,0.027857,0.010952,0.010000,0.011429,0.011786,0.013512,0.016071,0.017024,0.016071,0.019881,...,0.057798,0.137679,0.202857,0.175655,0.091012,0.070714,0.073929,0.099405,mel02171_inpaint_0_5.png,1
1,0.043915,0.016174,0.013763,0.014893,0.011902,0.012543,0.011124,0.012360,0.012299,0.013962,...,0.058426,0.129028,0.167236,0.158142,0.099792,0.083542,0.084488,0.114365,mel01400_inpaint_0_5.png,1
2,0.032272,0.011810,0.009689,0.010071,0.010452,0.012543,0.013748,0.015564,0.016418,0.019989,...,0.038605,0.131165,0.179031,0.198639,0.096130,0.088089,0.097214,0.099976,mel00176_inpaint_0_5.png,1
3,0.028000,0.014404,0.010208,0.010422,0.009918,0.012131,0.011459,0.013794,0.013474,0.015793,...,0.037048,0.107910,0.118851,0.172272,0.097839,0.104355,0.134613,0.132614,mel02216_inpaint_0_5.png,1
4,0.030893,0.011548,0.012917,0.014583,0.012381,0.012083,0.014226,0.013036,0.014345,0.017321,...,0.054583,0.137381,0.210000,0.181071,0.085774,0.066726,0.075833,0.100655,mel00728_inpaint_0_5.png,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5077,0.020889,0.009964,0.007324,0.006836,0.006378,0.008377,0.008240,0.011292,0.010269,0.015030,...,0.027344,0.128677,0.183090,0.201706,0.082962,0.087830,0.122787,0.096771,scc00027_inpaint_0_5.png,2
5078,0.042857,0.012857,0.013571,0.016429,0.013869,0.017679,0.016607,0.016131,0.018631,0.017917,...,0.072083,0.136369,0.204286,0.167917,0.095179,0.063571,0.060417,0.098869,scc00376_inpaint_0_5.png,2
5079,0.037738,0.015357,0.013036,0.013512,0.012738,0.014048,0.013333,0.015357,0.015119,0.017262,...,0.080536,0.133869,0.156369,0.137976,0.087440,0.073988,0.075357,0.127262,scc00239_inpaint_0_5.png,2
5080,0.030792,0.014221,0.010391,0.010437,0.011429,0.012436,0.011932,0.013626,0.012604,0.016113,...,0.028625,0.112717,0.127106,0.184418,0.096832,0.104324,0.146729,0.117432,scc00130_inpaint_0_5.png,2


In [None]:
lbp_labels_train = lbp_df_train['class']
lbp_labels_val = lbp_df_val['class']

# The transformer is fitted on the training features and reused for validation.
# (Feeding the raw, untransformed LBP columns to the classifier is the
# disabled alternative.)
scaler = DescriptorsTransformer(None)
X_train = scaler.fit_transform(lbp_df_train.drop(columns=['class', 'name']))
X_test = scaler.transform(lbp_df_val.drop(columns=['class', 'name']))

svc = SVC(kernel='rbf', C=1, random_state=42, probability=False, class_weight='balanced')
svc.fit(X_train, lbp_labels_train)
y_pred = svc.predict(X_test)
y_pred_train = svc.predict(X_train)

print(confusion_matrix(lbp_labels_val, y_pred))
print('Train f1_weighted', f1_score(lbp_labels_train, y_pred_train, average='weighted'))
print('Validation f1_weighted', f1_score(lbp_labels_val, y_pred, average='weighted'))
print('Train kappa', cohen_kappa_score(lbp_labels_train, y_pred_train))
print('Validation kappa', cohen_kappa_score(lbp_labels_val, y_pred))

[[358 140   0]
 [ 94 584   0]
 [ 43  51   0]]
Train f1_weighted 0.7284629130367141
Validation f1_weighted 0.7118855491264938
Train kappa 0.5373175193643275
Validation kappa 0.504649558827027


# Merging BoW and global features

In [None]:
# Join the colour, GLCM, LBP and BoW frames of each split on the image name.
# 'name' stays a regular column through every merge and becomes the index only
# once at the end, so no merge has to resolve the key against an index level.
# The 'class' column is taken from the GLCM frame alone; it is dropped from the
# other frames to avoid suffixed duplicates.
merged_df_train = (
    color_df_train.drop(columns=['class'])
    .merge(glcm_df_train, on='name', how='inner')
    .merge(lbp_df_train.drop(columns=['class']), on='name', how='inner')
    .merge(bow_train.drop(columns=['class']), on='name', how='inner')
    .set_index('name')
)

merged_df_val = (
    color_df_val.drop(columns=['class'])
    .merge(glcm_df_val, on='name', how='inner')
    .merge(lbp_df_val.drop(columns=['class']), on='name', how='inner')
    .merge(bow_val.drop(columns=['class']), on='name', how='inner')
    .set_index('name')
)


Evaluating the performance of merged features

In [None]:
# Features and labels both come from the merged frames, so they are aligned
# row by row even if the inner merges dropped or reordered images.
X_train = merged_df_train.drop(columns=['class'])
X_test = merged_df_val.drop(columns=['class'])
y_train = merged_df_train['class']
y_test = merged_df_val['class']

# The transformer is fitted on the training features and reused for validation.
scaler = DescriptorsTransformer(None)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fresh classifier with the same settings as the per-feature experiments, so
# this cell does not depend on whichever `svc` an earlier cell left behind.
svc = SVC(kernel='rbf', C=1, random_state=42, probability=False, class_weight='balanced')
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
y_pred_train = svc.predict(X_train)

# Score against the merged labels (y_train / y_test) rather than the GLCM
# frame's labels, which are only correct when both happen to share row order.
print(confusion_matrix(y_test, y_pred))
print('Train f1_weighted', f1_score(y_train, y_pred_train, average='weighted'))
print('Validation f1_weighted', f1_score(y_test, y_pred, average='weighted'))
print('Train kappa', cohen_kappa_score(y_train, y_pred_train))
print('Validation kappa', cohen_kappa_score(y_test, y_pred))

[[449  47   2]
 [ 74 603   1]
 [ 56  27  11]]
Train f1_weighted 0.8551313288300627
Validation f1_weighted 0.8171845388739531
Train kappa 0.7644150306930817
Validation kappa 0.6958122549859067


In [None]:
# Disabled dump of the merged frames. NOTE(review): the feature-selection cell
# below reads 'merged_df_train.pkl' / 'merged_df_val.pkl', so these two lines
# must have been run at least once before a Restart & Run All can succeed.
# merged_df_train.to_pickle('merged_df_train.pkl')
# merged_df_val.to_pickle('merged_df_val.pkl')

# Feature selection

In [77]:
# Reload the merged feature frames (train first, then validation).
# NOTE: unpickling executes arbitrary code; only load dumps you created yourself.
merged_df_train, merged_df_val = map(pd.read_pickle,
                                     ('merged_df_train.pkl', 'merged_df_val.pkl'))

In [78]:
def check_subst_list(list_str, check_str):
    """Tell whether any string of `list_str` occurs inside `check_str`.

    Args:
        list_str: iterable of candidate substrings.
        check_str: string that is searched.

    Returns:
        True as soon as one candidate is found in `check_str`, otherwise False
        (also False when `list_str` is empty).
    """
    return any(candidate in check_str for candidate in list_str)

# Substrings that identify the columns of the colour and GLCM feature families.
color_tokens = ['bgr', 'hsv', 'YCrCb']
glcm_tokens = ['contrast', 'dissimilarity',
               'homogeneity', 'energy',
               'correlation', 'ASM']

# Feature family -> list of matching column names of the merged training frame.
merged_columns = list(merged_df_train.columns)
feature_classes = {
    'color': [col for col in merged_columns if check_subst_list(color_tokens, col)],
    'glcm': [col for col in merged_columns if check_subst_list(glcm_tokens, col)],
    'lbp': [col for col in merged_columns if 'lbp' in col],
    'bow': [col for col in merged_columns if 'bow' in col],
}

In [79]:
# Labels are read straight from the merged frames; everything else is a feature.
y_train = merged_df_train['class']
y_test = merged_df_val['class']
merged_feats_train = merged_df_train.drop(columns=['class'])
merged_feats_val = merged_df_val.drop(columns=['class'])

# The transformer is fitted on the training features and reused for validation.
scaler = DescriptorsTransformer(None)
X_train = scaler.fit_transform(merged_feats_train)
X_test = scaler.transform(merged_feats_val)

print(X_train.shape, X_test.shape)

Unnamed: 0_level_0,bgr_b_mean,bgr_b_std,bgr_b_skew,bgr_b_kurt,bgr_b_max,bgr_b_min,bgr_b_entrp,bgr_b_unq,bgr_g_mean,bgr_g_std,...,bow_90,bow_91,bow_92,bow_93,bow_94,bow_95,bow_96,bow_97,bow_98,bow_99
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mel03088_inpaint_0_5.png,162.672287,61.694851,-0.697348,-1.077998,250.0,19.0,11.006823,227,150.737137,62.352291,...,0.000000,0.000000,0.000000,0.064157,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
mel03261_inpaint_0_5.png,179.156250,30.963547,-1.417106,2.271164,226.0,18.0,9.712302,199,127.307678,43.122520,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.021851,0.0,0.162976,0.772517
mel03342_inpaint_0_5.png,146.851044,37.369217,0.167092,-0.346187,245.0,38.0,11.057418,204,134.243668,45.439552,...,0.000000,0.000000,0.000000,0.403046,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
mel03192_inpaint_0_5.png,205.485016,22.085478,-2.884731,8.592208,231.0,64.0,11.083804,154,193.508499,31.325283,...,0.000000,0.032401,0.038344,0.043096,0.000000,0.000000,0.031847,0.0,0.000000,0.128676
mel02715_inpaint_0_5.png,129.766785,23.655115,-0.514049,-0.549720,185.0,52.0,11.072948,130,70.831055,27.696409,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
scc00396_inpaint_0_5.png,210.440598,20.715895,-1.605261,2.332138,238.0,115.0,9.723982,120,168.585052,33.563663,...,0.203586,0.000000,0.027240,0.000000,0.763834,0.000000,0.022624,0.0,0.000000,0.022853
scc00465_inpaint_0_5.png,198.687119,37.127995,-2.469665,8.442849,247.0,1.0,11.067039,246,163.411026,35.686066,...,0.064951,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
scc00406_inpaint_0_5.png,196.194717,5.175140,-0.053817,-0.366602,228.0,169.0,11.090006,49,203.498489,6.349289,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.097542,0.000000,0.0,0.000000,0.000000
scc00386_inpaint_0_5.png,181.532272,14.199497,-4.223199,27.627628,225.0,42.0,11.086805,180,135.912949,17.570173,...,0.031518,0.000000,0.168686,0.000000,0.000000,0.000000,0.105079,0.0,0.000000,0.035381
