# Feature Extraction
## Importing Libraries

In [27]:
# import cv2 as cv
import SimpleITK as sitk

import numpy as np
import os
from skimage.feature.texture import greycomatrix
from skimage.feature.texture import greycoprops
from skimage.measure import shannon_entropy
from radiomics import glrlm, glcm
# import pyfeats
import pandas as pd
import multiprocessing as mlp
import math
import feature_extraction as fe

In [28]:
# from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

## Define Feature Extraction functions

### Read dataset images

In [3]:
def read_images(folder = "dataset/train",
                classes = (
                            "normal",
                            "fatty",
                            "cirrhosis"
                        )):
    """Load every image under ``folder/<class>/`` together with its ROI positions.

    For each image file the matching ``dataset/masks/<stem>.txt`` file is read;
    every non-blank line is parsed as ``x,y`` and stored as an ``(int(y), int(x))``
    tuple.

    Parameters
    ----------
    folder : str
        Directory containing one sub-directory per class.
    classes : iterable of str
        Sub-directory names to read; each name is also stored as the label.

    Returns
    -------
    list of tuple
        ``(file name, SimpleITK image read as sitkUInt8, class name, mask)``
        where ``mask`` is the list of ``(y, x)`` tuples.

    Raises
    ------
    FileNotFoundError
        If a class directory or a mask file does not exist.
    ValueError
        If a non-blank mask line is not two comma-separated integers.
    """
    images = []
    for cls in classes:
        for name in os.listdir(f'{folder}/{cls}'):
            # splitext copes with any extension length (".png", ".jpeg", ...),
            # unlike slicing off a fixed four characters.
            stem = os.path.splitext(name)[0]
            mask = []
            with open(f'dataset/masks/{stem}.txt', 'r') as file:
                for line in file:
                    line = line.strip()
                    if not line:
                        # tolerate blank/trailing lines instead of failing on unpack
                        continue
                    x, y = line.split(',')
                    # stored as (y, x), the order extract_roi indexes with
                    mask.append((int(y), int(x)))
            img = sitk.ReadImage(f'{folder}/{cls}/{name}', sitk.sitkUInt8)
            images.append((name, img, cls, mask))
    return images

### Extract ROIs from image

In [4]:
def extract_roi(img, start , size = (32,32)):
    """Cut a rectangular ROI out of ``img`` and build a same-shaped mask for it.

    ``start`` is the (first-axis, second-axis) index of the ROI's first element
    and ``size`` its extent along those two axes. Returns ``(roi, mask)`` where
    ``mask`` is a zero array shaped like the full image with ones over the ROI.
    """
    pixels = sitk.GetArrayFromImage(img)
    rows = slice(start[0], start[0] + size[0])
    cols = slice(start[1], start[1] + size[1])

    mask = np.zeros(pixels.shape)
    mask[rows, cols] = 1
    return pixels[rows, cols], mask

### Extract Features from ROIs

In [5]:
def feature_extraction(img, roi_pos):
    """Compute one feature dictionary per ROI position in ``roi_pos``.

    Each dictionary holds, in this insertion order:
    * contrast / homogeneity / energy / correlation of the grey-level
      co-occurrence matrix for distances 1-3 and angles 0/45/90/135 degrees,
      named ``<property>_d<distance>_<degrees>``;
    * ``entropy``, ``mean`` and ``variance`` of the ROI pixels;
    * every feature returned by pyradiomics' GLCM and then GLRLM extractors,
      run on the full image with the ROI mask.

    Returns a list of dictionaries, one per entry of ``roi_pos``.
    """
    rois = [extract_roi(img, pos) for pos in roi_pos]

    # 0 45 90 135 degrees
    angles = [0, np.pi / 4, np.pi / 2, 3 * np.pi / 4]
    distances = [1, 2, 3]

    # Suffix j labels element j of a flattened property array:
    # d1_0, d1_45, d1_90, d1_135, d2_0, ... d3_135.
    suffixes = [f'd{dist}_{deg}' for dist in distances for deg in (0, 45, 90, 135)]
    glcm_props = ('contrast', 'homogeneity', 'energy', 'correlation')

    feat_arr = []
    for roi, roi_mask in rois:
        glcm_mtx = greycomatrix(roi, distances = distances, angles = angles, levels = 256)
        prop_values = {
            prop: greycoprops(glcm_mtx, prop).flatten() for prop in glcm_props
        }

        features = {}
        for j, suffix in enumerate(suffixes):
            for prop in glcm_props:
                features[f'{prop}_{suffix}'] = prop_values[prop][j]

        features['entropy'] = shannon_entropy(roi)
        features['mean'] = np.mean(roi)
        features['variance'] = np.var(roi)

        # pyradiomics works on SimpleITK images, so wrap the mask array
        mask_img = sitk.GetImageFromArray(roi_mask)
        # GLCM features first, then GLRLM features
        for extractor_cls in (glcm.RadiomicsGLCM, glrlm.RadiomicsGLRLM):
            extractor = extractor_cls(img, mask_img)
            extractor.enableAllFeatures()
            for col, value in extractor.execute().items():
                features[col] = value.item()

        feat_arr.append(features)

    return feat_arr

### Construct dataframe from ROI features

In [6]:
def build_dataframe(images):
    """Build a DataFrame with one row of features per ROI.

    Parameters
    ----------
    images : iterable
        Items shaped like the tuples produced by ``read_images``:
        ``(name, img, cls, mask)``. Three-element ``(img, cls, mask)`` items
        are also accepted; their rows get no ``name`` column.

    Returns
    -------
    pandas.DataFrame
        One row per ROI with the feature columns from ``feature_extraction``,
        plus ``target`` (the class) and, when available, ``name`` (the image
        file name, used as the index by the cells that load the cached CSV).
    """
    records = []
    for item in images:
        # The last three fields are always (img, cls, mask); an optional
        # leading field carries the image name.
        *leading, img, cls, mask = item
        for row in feature_extraction(img, roi_pos=mask):
            if leading:
                row['name'] = leading[0]
            row['target'] = cls
            records.append(row)
    # Build the frame once: appending row by row copies the frame each time
    # and DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(records)

### Construct dataframe using multiprocessing
Multiprocessing reduced the runtime by 82%.

In [7]:
def build_with_mlp(images, n=9): 
    """Run ``fe.build_dataframe`` on ``n`` contiguous chunks of ``images`` in parallel.

    Parameters
    ----------
    images : sequence
        Image tuples as produced by ``read_images``.
    n : int
        Number of worker processes and of chunks.

    Returns
    -------
    list of pandas.DataFrame
        One frame per chunk, in chunk order. A chunk may be empty when
        ``len(images) < n``.
    """
    images = list(images)
    # Plain list slicing keeps each tuple intact; converting the tuples
    # (strings, images, variable-length masks) to a NumPy array is fragile.
    # Chunk sizes: the first `extra` chunks get one more item than the rest.
    base, extra = divmod(len(images), n)
    chunks = []
    start = 0
    for i in range(n):
        stop = start + base + (1 if i < extra else 0)
        chunks.append(images[start:stop])
        start = stop

    # The worker is the module-level copy in `feature_extraction` (imported as fe).
    # `with` releases the worker processes even if a task raises; map() blocks
    # until every result is back, so nothing is lost on exit.
    with mlp.Pool(n) as pool:
        results = pool.map(fe.build_dataframe, chunks)
    return results

## Feature Analysis and Selection

### Extract Features and build dataframe

In [29]:
%%time

# The commented-out lines below rebuild dataset/train.csv from the raw images
# (read images -> extract features in parallel -> concatenate -> index by name).
# NOTE(review): presumably left disabled because extraction is slow; re-enable
# them to regenerate the cached CSV.
# images = read_images('dataset/train')
# mlp_data = build_with_mlp(images)
# data = pd.DataFrame()
# for frame in mlp_data:
#     data = data.append(frame)

# data.set_index('name', drop=True, inplace=True)

# data.to_csv("dataset/train.csv")

# Load the cached training features; rows are indexed by image file name, so
# the index repeats when an image contributes several ROIs.
data = pd.read_csv('dataset/train.csv', index_col='name')

data.describe()

Wall time: 209 ms


Unnamed: 0,10Percentile,90Percentile,Autocorrelation,ClusterProminence,ClusterShade,ClusterTendency,Contrast,Correlation,DifferenceAverage,DifferenceEntropy,...,homogeneity_d1_45,homogeneity_d1_90,homogeneity_d2_0,homogeneity_d2_135,homogeneity_d2_45,homogeneity_d2_90,homogeneity_d3_0,homogeneity_d3_135,homogeneity_d3_45,homogeneity_d3_90
count,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,...,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0
mean,49.607947,78.450133,5.265885,6.796472,0.534181,1.057417,0.273418,0.536117,0.254731,0.7737835,...,0.16981,0.163014,0.188472,0.151202,0.16981,0.118795,0.148841,0.117156,0.123361,0.113331
std,22.157389,27.822747,2.935076,18.132138,1.969909,0.89459,0.149987,0.148141,0.124245,0.2413614,...,0.074271,0.067913,0.084368,0.064099,0.074271,0.053416,0.068449,0.053321,0.055464,0.053904
min,0.0,6.0,1.0,0.0,-8.367119,0.0,0.0,-0.002694,0.0,-3.203427e-16,...,0.044883,0.055994,0.049637,0.048944,0.044883,0.03055,0.041362,0.035131,0.036691,0.031642
25%,34.0,58.7,3.249825,0.909022,-0.106256,0.569849,0.16631,0.437385,0.164957,0.6338971,...,0.115812,0.105677,0.121571,0.100899,0.115812,0.07516,0.093532,0.075949,0.081603,0.070858
50%,49.0,79.7,4.539973,2.138432,0.153262,0.833975,0.235635,0.525369,0.231164,0.7685916,...,0.161081,0.158151,0.18836,0.14682,0.161081,0.113689,0.147429,0.111746,0.117534,0.106542
75%,64.0,98.0,6.776047,5.294571,0.474892,1.276992,0.379435,0.634107,0.352453,0.9597778,...,0.205439,0.196115,0.233789,0.184354,0.205439,0.145168,0.183088,0.14291,0.149219,0.139533
max,132.0,170.0,21.50717,444.72062,34.921036,13.172388,0.877211,1.0,0.596986,1.343032,...,0.775769,0.74304,0.741631,0.701923,0.775769,0.67395,0.697514,0.620327,0.699876,0.623982


In [30]:
%%time

# Same pipeline as for the training set, applied to dataset/test and cached in
# dataset/test.csv. NOTE(review): presumably disabled because extraction is
# slow; re-enable to regenerate the cached CSV.
# test_images = read_images("dataset/test")
# mlp_data = build_with_mlp(test_images)
# test_data = pd.DataFrame()
# for frame in mlp_data:
#     test_data = test_data.append(frame)
    
# test_data.set_index('name', drop=True, inplace=True)

# test_data.to_csv("dataset/test.csv")

# Load the cached test features, indexed by image file name.
test_data = pd.read_csv('dataset/test.csv', index_col='name')

test_data.describe()

Wall time: 134 ms


Unnamed: 0,10Percentile,90Percentile,Autocorrelation,ClusterProminence,ClusterShade,ClusterTendency,Contrast,Correlation,DifferenceAverage,DifferenceEntropy,...,homogeneity_d1_45,homogeneity_d1_90,homogeneity_d2_0,homogeneity_d2_135,homogeneity_d2_45,homogeneity_d2_90,homogeneity_d3_0,homogeneity_d3_135,homogeneity_d3_45,homogeneity_d3_90
count,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0,...,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0
mean,50.217277,79.27644,5.295707,7.244694,0.579809,1.082905,0.261253,0.554878,0.244751,0.7640925,...,0.176052,0.164242,0.185952,0.147653,0.176052,0.120195,0.146332,0.117783,0.125761,0.115112
std,22.434761,27.3413,2.85337,17.29401,2.246493,0.853788,0.137671,0.145262,0.113077,0.2203017,...,0.073037,0.064448,0.073204,0.056312,0.073037,0.051804,0.059865,0.051189,0.054049,0.051733
min,1.0,7.0,1.0,0.0,-3.508871,0.0,0.0,-0.000639,0.0,-3.203427e-16,...,0.055655,0.066152,0.063136,0.057985,0.055655,0.03174,0.046913,0.037576,0.039154,0.034577
25%,35.0,60.25,3.509715,0.894214,-0.104566,0.552214,0.167227,0.451109,0.164111,0.6380821,...,0.124869,0.114571,0.124603,0.108026,0.124869,0.082521,0.099154,0.079586,0.089632,0.077233
50%,50.0,80.0,4.72101,2.237011,0.208912,0.825854,0.227071,0.552495,0.223359,0.7549453,...,0.165739,0.161493,0.183858,0.14743,0.165739,0.115742,0.140288,0.111879,0.118271,0.107708
75%,62.0,97.0,6.85844,6.281702,0.529058,1.381738,0.343392,0.654871,0.322666,0.929092,...,0.213714,0.198065,0.23259,0.176334,0.213714,0.144507,0.179333,0.13675,0.148503,0.13559
max,128.0,150.0,16.2151,218.165215,27.204337,6.577641,0.748138,1.0,0.577532,1.28881,...,0.601422,0.594621,0.567898,0.533208,0.601422,0.451463,0.487847,0.414726,0.481178,0.408715


## Training and Testing

In [31]:
def split(data, test_data, drop):
    """Prepare a standardised two-class train/test split.

    Separates the ``target`` column from each frame, removes every row whose
    target equals ``drop``, then standardises both feature frames with a
    ``StandardScaler`` fitted on the remaining training rows only.

    Returns ``(X_train, y_train, X_test, y_test, std)`` where ``std`` is the
    fitted scaler. The input frames are not modified.
    """
    train_features = data.copy()
    train_labels = train_features.pop('target')
    test_features = test_data.copy()
    test_labels = test_features.pop('target')

    # Row filters that discard the unwanted class
    keep_train = train_labels != drop
    keep_test = test_labels != drop
    train_features, train_labels = train_features[keep_train], train_labels[keep_train]
    test_features, test_labels = test_features[keep_test], test_labels[keep_test]

    std = StandardScaler()
    std.fit(train_features)

    def _standardise(frame):
        # transform() returns a bare array; restore column names and index
        return pd.DataFrame(std.transform(frame), columns = frame.columns, index = frame.index)

    return _standardise(train_features), train_labels, _standardise(test_features), test_labels, std

In [32]:
def train_test(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and predict the test rows.

    Returns ``(fitted model, predictions)``; the predictions are a Series that
    reuses the index of ``y_test`` so each one stays tied to its row label.
    """
    fitted = model.fit(X_train, y_train)
    raw_predictions = fitted.predict(X_test)
    return fitted, pd.Series(raw_predictions, index=y_test.index)

In [33]:
def images_pred(y_pred):
    """Collapse per-ROI predictions into one majority-vote prediction per image.

    Parameters
    ----------
    y_pred : pandas.Series
        Predicted class per ROI, indexed by image name (the index repeats when
        an image has several ROIs).

    Returns
    -------
    dict
        ``{image name: most frequent predicted class}``. On a tie the class
        that appears first among that image's predictions wins.
    """
    prediction = {}
    # groupby always yields a Series per image, even for a single ROI; plain
    # y_pred[name] would return a bare string there and the vote would be
    # taken over its characters.
    for name, roi_preds in y_pred.groupby(level=0):
        votes = {}
        for label in roi_preds:
            votes[label] = votes.get(label, 0) + 1
        prediction[name] = max(votes, key=votes.get)
    return prediction

In [34]:
def images_acc(y_test, y_pred):
    """Return the fraction of images whose predicted class equals the true class.

    Parameters
    ----------
    y_test : pandas.Series
        True class indexed by image name; the index may repeat (one entry per
        ROI), in which case the first entry for the image is used.
    y_pred : mapping
        ``{image name: predicted class}``.

    Returns
    -------
    float
        Accuracy over the images in ``y_pred``; ``nan`` when ``y_pred`` is
        empty.

    Raises
    ------
    KeyError
        If an image in ``y_pred`` is missing from ``y_test``.
    """
    if len(y_pred) == 0:
        return float('nan')

    pred_count = 0
    for key, predicted in y_pred.items():
        # .loc[[key]] always gives a Series, so .iloc[0] is the label itself
        # whether the image has one row or many (y_test[key][0] would return
        # the first *character* of the label for a single-row image).
        truth = y_test.loc[[key]].iloc[0]
        if truth == predicted:
            pred_count += 1
    return pred_count / len(y_pred)

In [18]:
# Candidate classifiers, each evaluated on the three pairwise problems obtained
# by dropping one class at a time.
models = {
    "RFC": RandomForestClassifier(
                    random_state=42,
                    # 'sqrt' is what 'auto' meant for classifiers; 'auto' was
                    # deprecated in scikit-learn 1.1 and removed in 1.3.
                    max_features='sqrt',
                    n_estimators= 500,
                    max_depth=6,
                    criterion='entropy'),
    "MLP": MLPClassifier(
                    max_iter=600,
                    # NOTE(review): scikit-learn only uses momentum with
                    # solver='sgd'; it has no effect with 'adam'.
                    momentum=0.6,
                    solver='adam',
                    activation='relu',
                    learning_rate_init=0.005,
                    alpha=0.001),
    "SVC": svm.SVC()
}
classes = ['normal', 'fatty', 'cirrhosis']

for drop in classes:
    # Two-class problem: every class except `drop`
    X_train, y_train, X_test, y_test, std = split(data, test_data, drop)
    print(*[cls for cls in classes if cls != drop])
    for name in models.keys():
        model, y_pred = train_test(models[name], X_train, y_train, X_test, y_test)
        # Image-level score: majority vote over each image's ROI predictions
        prediction = images_pred(y_pred)
        print(name," Image Accuracy: ", images_acc(y_test, prediction))
        # ROI-level precision / recall / f1
        report = classification_report(y_test, y_pred, output_dict = True)
        cr = pd.DataFrame(report).transpose()
        print(cr)
    print('\n\n')

fatty cirrhosis
RFC  Image Accuracy:  0.8181818181818182
              precision    recall  f1-score     support
cirrhosis      0.857143  0.461538  0.600000   78.000000
fatty          0.780105  0.961290  0.861272  155.000000
accuracy       0.793991  0.793991  0.793991    0.793991
macro avg      0.818624  0.711414  0.730636  233.000000
weighted avg   0.805894  0.793991  0.773807  233.000000
MLP  Image Accuracy:  0.8636363636363636
              precision    recall  f1-score     support
cirrhosis      0.704918  0.551282  0.618705   78.000000
fatty          0.796512  0.883871  0.837920  155.000000
accuracy       0.772532  0.772532  0.772532    0.772532
macro avg      0.750715  0.717577  0.728313  233.000000
weighted avg   0.765849  0.772532  0.764535  233.000000
SVC  Image Accuracy:  0.8636363636363636
              precision    recall  f1-score     support
cirrhosis      0.812500  0.500000  0.619048   78.000000
fatty          0.789189  0.941935  0.858824  155.000000
accuracy       0.7939

In [38]:
# feat importance
files = ['fatty_normal', 'cirrhosis_fatty', 'cirrhosis_normal']
features_acc={}
for name in files:
    features_acc[name] = pd.read_csv(f'dataset/manual selection/{name}.csv', index_col = 0)

In [199]:
# Normal-vs-fatty classifier: an MLP trained on the 21 features with the
# highest 'ANN Accuracy' in the 'fatty_normal' table.
normal_fatty_mlp = MLPClassifier(
    alpha=0.001,
    learning_rate_init=0.005,
    activation='relu',
    solver='adam',
    momentum=0.6,
    max_iter=600,
)

feat_imp = features_acc['fatty_normal']['ANN Accuracy'].sort_values(ascending=False)
normal_fatty_cols = feat_imp.index[:21]

# Dropping 'cirrhosis' leaves the two classes of interest; the fitted scaler
# is kept under its own name.
X_train, y_train, X_test, y_test, normal_fatty_std = split(data, test_data, 'cirrhosis')
model, y_pred = train_test(
    normal_fatty_mlp,
    X_train[normal_fatty_cols],
    y_train,
    X_test[normal_fatty_cols],
    y_test,
)
normal_fatty_mlp = model

# Image-level accuracy (majority vote per image), then the ROI-level report
prediction = images_pred(y_pred)
print("Normal/Fatty MLP Image Accuracy: ", images_acc(y_test, prediction))
report = classification_report(y_test, y_pred, output_dict=True)
cr = pd.DataFrame(report).T
print(cr)

Normal/Fatty MLP Image Accuracy:  0.8275862068965517
              precision    recall  f1-score     support
fatty          0.716763  0.800000  0.756098  155.000000
normal         0.763359  0.671141  0.714286  149.000000
accuracy       0.736842  0.736842  0.736842    0.736842
macro avg      0.740061  0.735570  0.735192  304.000000
weighted avg   0.739601  0.736842  0.735604  304.000000


In [208]:
# Normal-vs-cirrhosis classifier: an MLP trained on the 6 features with the
# highest 'ANN Accuracy' in the 'cirrhosis_normal' table.
normal_cirrhosis_mlp = MLPClassifier(
    alpha=0.001,
    learning_rate_init=0.005,
    activation='relu',
    solver='adam',
    momentum=0.6,
    max_iter=600,
)

feat_imp = features_acc['cirrhosis_normal']['ANN Accuracy'].sort_values(ascending=False)
normal_cirrhosis_cols = feat_imp.index[:6]

# Dropping 'fatty' leaves the two classes of interest; the fitted scaler is
# kept under its own name.
X_train, y_train, X_test, y_test, normal_cirrhosis_std = split(data, test_data, 'fatty')
model, y_pred = train_test(
    normal_cirrhosis_mlp,
    X_train[normal_cirrhosis_cols],
    y_train,
    X_test[normal_cirrhosis_cols],
    y_test,
)
normal_cirrhosis_mlp = model

# Image-level accuracy (majority vote per image), then the ROI-level report
prediction = images_pred(y_pred)
print("normal/cirrhosis MLP Image Accuracy: ", images_acc(y_test, prediction))
report = classification_report(y_test, y_pred, output_dict=True)
cr = pd.DataFrame(report).T
print(cr)

normal/cirrhosis MLP Image Accuracy:  0.7894736842105263
              precision    recall  f1-score     support
cirrhosis      0.933333  0.358974  0.518519   78.000000
normal         0.746193  0.986577  0.849711  149.000000
accuracy       0.770925  0.770925  0.770925    0.770925
macro avg      0.839763  0.672776  0.684115  227.000000
weighted avg   0.810497  0.770925  0.735909  227.000000


In [187]:
# Fatty-vs-cirrhosis classifier: an MLP trained on the 82 features with the
# highest 'ANN Accuracy' in the 'cirrhosis_fatty' table.
fatty_cirrhosis_mlp = MLPClassifier(
    alpha=0.001,
    learning_rate_init=0.005,
    activation='relu',
    solver='adam',
    momentum=0.6,
    max_iter=600,
)

feat_imp = features_acc['cirrhosis_fatty']['ANN Accuracy'].sort_values(ascending=False)
fatty_cirrhosis_cols = feat_imp.index[:82]

# Dropping 'normal' leaves the two classes of interest; the fitted scaler is
# kept under its own name.
X_train, y_train, X_test, y_test, fatty_cirrhosis_std = split(data, test_data, 'normal')
model, y_pred = train_test(
    fatty_cirrhosis_mlp,
    X_train[fatty_cirrhosis_cols],
    y_train,
    X_test[fatty_cirrhosis_cols],
    y_test,
)
fatty_cirrhosis_mlp = model

# Image-level accuracy (majority vote per image), then the ROI-level report
prediction = images_pred(y_pred)
print("cirrhosis/Fatty MLP Image Accuracy: ", images_acc(y_test, prediction))
report = classification_report(y_test, y_pred, output_dict=True)
cr = pd.DataFrame(report).T
print(cr)

cirrhosis/Fatty MLP Image Accuracy:  0.9545454545454546
              precision    recall  f1-score     support
cirrhosis      0.784615  0.653846  0.713287   78.000000
fatty          0.839286  0.909677  0.873065  155.000000
accuracy       0.824034  0.824034  0.824034    0.824034
macro avg      0.811951  0.781762  0.793176  233.000000
weighted avg   0.820984  0.824034  0.819577  233.000000


In [209]:
# Registry of the three pairwise classifiers:
# name -> (fitted MLP, scaler fitted on that pair's training rows, selected feature columns).
# Each model must be paired with its own scaler, so no shared scaler is built
# here; the test features are prepared per model where the models are applied.
models = {
    "normal_fatty": (normal_fatty_mlp, normal_fatty_std, normal_fatty_cols),
    "normal_cirrhosis": (normal_cirrhosis_mlp, normal_cirrhosis_std, normal_cirrhosis_cols),
    "fatty_cirrhosis": (fatty_cirrhosis_mlp, fatty_cirrhosis_std, fatty_cirrhosis_cols)
}

In [210]:
pd.set_option('display.max_rows', None)

# Run every pairwise model over the *full* test set (all three classes) and
# keep its per-image majority-vote prediction.
predictions = {}
for name, (clf, scaler, cols) in models.items():
    X_test = test_data.copy()
    y_test = X_test.pop('target')
    # Scale all columns with the model's own scaler, then keep its feature subset
    scaled = scaler.transform(X_test)
    X_test = pd.DataFrame(scaled, columns=X_test.columns, index=X_test.index)[cols]
    y_pred = pd.Series(clf.predict(X_test), index=y_test.index)
    predictions[name] = images_pred(y_pred)

image_names = np.unique(y_test.index)

In [211]:
# Combine the pairwise models: each casts one vote per image. A class needs
# more than one vote to win; otherwise the image is marked 'abstain'.
image_name = np.unique(y_test.index)
final_pred = {}
for image in image_name:
    pred = dict.fromkeys(('normal', 'fatty', 'cirrhosis'), 0)
    for model in predictions.keys():
        pred[predictions[model][image]] += 1
    cls = max(pred, key=pred.get)
    final_pred[image] = cls if pred[cls] != 1 else 'abstain'

In [212]:
# Image-level accuracy of the combined vote. An 'abstain' entry matches none
# of the three class names, so abstentions are scored as misses here.
images_acc(y_test, final_pred)

0.7428571428571429

In [213]:
# One true label per image (first ROI row), aligned with the combined
# predictions by sorting both on image name.
first_roi = ~y_test.index.duplicated(keep='first')
y_test = y_test[first_roi].sort_index()
y_pred = pd.Series(final_pred).sort_index()

# Abstained images are left out of the report and counted separately.
abstain = y_pred.index[(y_pred == 'abstain').to_numpy()]
y_test = y_test.drop(abstain)
y_pred = y_pred.drop(abstain)

report = classification_report(y_test, y_pred, output_dict=True)
cr = pd.DataFrame(report).T
print(cr)
n_abstained = len(abstain)
print("Abstention Rate: ", n_abstained / (len(y_pred) + n_abstained))

              precision    recall  f1-score    support
cirrhosis      1.000000  0.400000  0.571429   5.000000
fatty          0.789474  0.937500  0.857143  16.000000
normal         0.692308  0.692308  0.692308  13.000000
accuracy       0.764706  0.764706  0.764706   0.764706
macro avg      0.827260  0.676603  0.706960  34.000000
weighted avg   0.783282  0.764706  0.752101  34.000000
Abstention Rate:  0.02857142857142857
