# Feature Extraction
## Importing Libraries

In [1]:
# import cv2 as cv
import SimpleITK as sitk

import numpy as np
import os
from skimage.feature.texture import greycomatrix
from skimage.feature.texture import greycoprops
from skimage.measure import shannon_entropy
from radiomics import glrlm, glcm
# import pyfeats
import pandas as pd
import multiprocessing as mlp
import math
import feature_extraction as fe

In [2]:
# from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

## Define Feature Extraction functions

### Read dataset images

In [3]:
def read_images(folder="dataset/train",
                classes=("normal", "fatty", "cirrhosis"),
                mask_folder="dataset/masks"):
    """Load every image of the given classes together with its ROI positions.

    Images are read from ``<folder>/<cls>/<file>``. For each image a text file
    ``<mask_folder>/<file name without extension>.txt`` is parsed; every
    non-blank line must hold two comma-separated integers ``x,y``.

    Args:
        folder: Directory containing one sub-directory per class.
        classes: Iterable of class (sub-directory) names to read.
        mask_folder: Directory holding the per-image ROI position files.

    Returns:
        A list of ``(name, img, cls, mask)`` tuples, where ``img`` is the
        SimpleITK image read as ``sitkUInt8`` and ``mask`` is a list of
        ``(y, x)`` integer tuples (note the swapped order relative to the
        file).

    Raises:
        FileNotFoundError: If a class directory or a mask file is missing.
        ValueError: If a non-blank mask line is not two comma-separated ints.
    """
    images = []
    for cls in classes:
        class_dir = os.path.join(folder, cls)
        # Sort so the result order does not depend on the file system.
        for name in sorted(os.listdir(class_dir)):
            # splitext copes with extensions of any length (e.g. ".jpeg").
            stem = os.path.splitext(name)[0]
            mask = []
            with open(os.path.join(mask_folder, f'{stem}.txt'), 'r') as file:
                for line in file:
                    line = line.strip()
                    if not line:
                        # Blank lines (or an empty file) carry no position.
                        continue
                    x, y = line.split(',')
                    mask.append((int(y), int(x)))
            img = sitk.ReadImage(os.path.join(class_dir, name), sitk.sitkUInt8)
            images.append((name, img, cls, mask))
    return images

### Extract ROIs from image

In [4]:
def extract_roi(img, start, size=(32, 32)):
    """Cut a rectangular window out of an image and build its binary mask.

    Args:
        img: SimpleITK image; converted with ``sitk.GetArrayFromImage``.
        start: Pair of offsets along the first and second array axes.
        size: Window extent along the first and second array axes.

    Returns:
        ``(roi, mask)``: the sliced window, and an array of zeros with the
        full array's shape that is set to 1 inside the window.
    """
    pixels = sitk.GetArrayFromImage(img)
    first_axis = slice(start[0], start[0] + size[0])
    second_axis = slice(start[1], start[1] + size[1])
    window = (first_axis, second_axis)

    mask = np.zeros(pixels.shape)
    mask[window] = 1
    return pixels[window], mask

### Extract Features from ROIs

In [5]:
def feature_extraction(img, roi_pos):
    """Compute one feature dictionary per ROI position of an image.

    For every position in ``roi_pos`` the ROI and its mask are obtained from
    ``extract_roi``. Each dictionary holds, in this order: the four
    grey-level co-occurrence properties (contrast, homogeneity, energy,
    correlation) for distances 1-3 and angles 0/45/90/135 degrees, the
    Shannon entropy, mean and variance of the ROI, and every feature reported
    by the pyradiomics GLCM and GLRLM classes for the full image restricted
    to the ROI mask.

    Args:
        img: SimpleITK image passed to ``extract_roi`` and to pyradiomics.
        roi_pos: Iterable of ROI start positions.

    Returns:
        A list of feature dictionaries, one per entry of ``roi_pos``.
    """
    roi_mask_arr = [extract_roi(img, pos) for pos in roi_pos]

    # 0, 45, 90 and 135 degrees.
    angles = [0, np.pi / 4, np.pi / 2, 3 * np.pi / 4]
    distances = [1, 2, 3]

    # Suffixes assigned to flattened property indices 0..11:
    # d1_0, d1_45, d1_90, d1_135, d2_0, ..., d3_135.
    offset_tags = [
        f"d{dist}_{deg}"
        for dist in distances
        for deg in (0, 45, 90, 135)
    ]
    prop_names = ('contrast', 'homogeneity', 'energy', 'correlation')

    feat_arr = []
    for roi, roi_mask in roi_mask_arr:
        features = {}

        glcm_mtx = greycomatrix(roi, distances=distances, angles=angles, levels=256)
        flat_props = {
            prop: greycoprops(glcm_mtx, prop).flatten()
            for prop in prop_names
        }
        for idx, tag in enumerate(offset_tags):
            for prop in prop_names:
                features[f'{prop}_{tag}'] = flat_props[prop][idx]

        features['entropy'] = shannon_entropy(roi)
        features['mean'] = np.mean(roi)
        features['variance'] = np.var(roi)

        # pyradiomics works on SimpleITK images, so wrap the mask array.
        label_img = sitk.GetImageFromArray(roi_mask)
        # GLCM features first, then GLRLM features.
        for extractor_cls in (glcm.RadiomicsGLCM, glrlm.RadiomicsGLRLM):
            extractor = extractor_cls(img, label_img)
            extractor.enableAllFeatures()
            for col, value in extractor.execute().items():
                features[col] = value.item()

        feat_arr.append(features)

    return feat_arr

### Construct dataframe from ROI features

In [6]:
def build_dataframe(images):
    """Build a feature table with one row per ROI of every image.

    Args:
        images: Iterable of ``(name, img, cls, mask)`` records as produced by
            ``read_images``. Legacy ``(img, cls, mask)`` records are accepted
            as well; rows built from them get no ``name`` value.

    Returns:
        A ``pandas.DataFrame`` holding the ``feature_extraction`` output for
        each ROI plus a ``target`` column (the class) and, when available, a
        ``name`` column (the image file name). Empty input gives an empty
        frame.
    """
    rows = []
    for record in images:
        # The last three fields are always (img, cls, mask); anything before
        # them is the optional image name.
        *prefix, img, cls, mask = record
        for row in feature_extraction(img, roi_pos=mask):
            row['target'] = cls
            if prefix:
                row['name'] = prefix[0]
            rows.append(row)
    # Build the frame once: DataFrame.append copied the whole frame per row
    # and no longer exists in pandas 2.x.
    return pd.DataFrame(rows)

### Construct dataframe using multiprocessing
### Reduced runtime by 82%

In [7]:
def build_with_mlp(images, n=9):
    """Extract features in parallel by spreading the images over ``n`` workers.

    The records are cut into ``n`` contiguous chunks whose sizes differ by at
    most one (the first ``len(images) % n`` chunks are one longer), and each
    chunk is handed to ``fe.build_dataframe`` in a worker process.

    Args:
        images: Sequence of image records accepted by ``fe.build_dataframe``.
        n: Number of worker processes and of chunks.

    Returns:
        A list with the ``n`` results of ``fe.build_dataframe``, in chunk
        order.
    """
    images = list(images)
    base, extra = divmod(len(images), n)

    # Slice the plain list instead of np.array_split: converting records that
    # hold images and variable-length masks to an ndarray is fragile and
    # fails for ragged input on recent NumPy.
    chunks = []
    start = 0
    for i in range(n):
        stop = start + base + (1 if i < extra else 0)
        chunks.append(images[start:stop])
        start = stop

    # The context manager releases the worker processes once map() is done.
    with mlp.Pool(n) as pool:
        results = pool.map(fe.build_dataframe, chunks)
    return results

## Feature Analysis and Selection

### Extract Features and build dataframe

In [175]:
%%time

# The commented-out lines below are the extraction pipeline: read the
# training images, extract features in parallel, concatenate the per-worker
# frames, index by image name and save the result to dataset/train.csv.
# They are disabled; the cell instead loads a CSV from that same path
# (presumably the file a previous run of these lines wrote).
# images = read_images('dataset/train')
# mlp_data = build_with_mlp(images)
# data = pd.DataFrame()
# for frame in mlp_data:
#     data = data.append(frame)

# data.set_index('name', drop=True, inplace=True)

# data.to_csv("dataset/train.csv")

# Training feature table, indexed by the 'name' column.
data = pd.read_csv('dataset/train.csv', index_col='name')

data.describe()

Wall time: 167 ms


Unnamed: 0,10Percentile,90Percentile,Autocorrelation,ClusterProminence,ClusterShade,ClusterTendency,Contrast,Correlation,DifferenceAverage,DifferenceEntropy,...,homogeneity_d2_0,homogeneity_d2_135,homogeneity_d2_45,homogeneity_d2_90,homogeneity_d3_0,homogeneity_d3_135,homogeneity_d3_45,homogeneity_d3_90,mean,variance
count,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,...,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0
mean,49.607947,78.450133,5.265885,6.796472,0.534181,1.057417,0.273418,0.536117,0.254731,0.7737835,...,0.188472,0.151202,0.16981,0.118795,0.148841,0.117156,0.123361,0.113331,63.740117,159.733414
std,22.157389,27.822747,2.935076,18.132138,1.969909,0.89459,0.149987,0.148141,0.124245,0.2413614,...,0.084368,0.064099,0.074271,0.053416,0.068449,0.053321,0.055464,0.053904,24.273363,153.209079
min,0.0,6.0,1.0,0.0,-8.367119,0.0,0.0,-0.002694,0.0,-3.203427e-16,...,0.049637,0.048944,0.044883,0.03055,0.041362,0.035131,0.036691,0.031642,2.023438,3.118103
25%,34.0,58.7,3.249825,0.909022,-0.106256,0.569849,0.16631,0.437385,0.164957,0.6338971,...,0.121571,0.100899,0.115812,0.07516,0.093532,0.075949,0.081603,0.070858,46.347168,59.755928
50%,49.0,79.7,4.539973,2.138432,0.153262,0.833975,0.235635,0.525369,0.231164,0.7685916,...,0.18836,0.14682,0.161081,0.113689,0.147429,0.111746,0.117534,0.106542,64.274414,111.377326
75%,64.0,98.0,6.776047,5.294571,0.474892,1.276992,0.379435,0.634107,0.352453,0.9597778,...,0.233789,0.184354,0.205439,0.145168,0.183088,0.14291,0.149219,0.139533,80.636719,214.108139
max,132.0,170.0,21.50717,444.72062,34.921036,13.172388,0.877211,1.0,0.596986,1.343032,...,0.741631,0.701923,0.775769,0.67395,0.697514,0.620327,0.699876,0.623982,149.47168,2049.339576


In [176]:
%%time

# The commented-out lines below are the extraction pipeline for the test
# images: read, extract features in parallel, concatenate, index by image
# name and save to dataset/test.csv. They are disabled; the cell instead
# loads a CSV from that same path (presumably written by a previous run).
# test_images = read_images("dataset/test")
# mlp_data = build_with_mlp(test_images)
# test_data = pd.DataFrame()
# for frame in mlp_data:
#     test_data = test_data.append(frame)
    
# test_data.set_index('name', drop=True, inplace=True)

# test_data.to_csv("dataset/test.csv")

# Test feature table, indexed by the 'name' column.
test_data = pd.read_csv('dataset/test.csv', index_col='name')

test_data.describe()

Wall time: 128 ms


Unnamed: 0,10Percentile,90Percentile,Autocorrelation,ClusterProminence,ClusterShade,ClusterTendency,Contrast,Correlation,DifferenceAverage,DifferenceEntropy,...,homogeneity_d2_0,homogeneity_d2_135,homogeneity_d2_45,homogeneity_d2_90,homogeneity_d3_0,homogeneity_d3_135,homogeneity_d3_45,homogeneity_d3_90,mean,variance
count,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0,...,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0
mean,50.217277,79.27644,5.295707,7.244694,0.579809,1.082905,0.261253,0.554878,0.244751,0.7640925,...,0.185952,0.147653,0.176052,0.120195,0.146332,0.117783,0.125761,0.115112,64.354444,162.414246
std,22.434761,27.3413,2.85337,17.29401,2.246493,0.853788,0.137671,0.145262,0.113077,0.2203017,...,0.073204,0.056312,0.073037,0.051804,0.059865,0.051189,0.054049,0.051733,24.211294,143.825166
min,1.0,7.0,1.0,0.0,-3.508871,0.0,0.0,-0.000639,0.0,-3.203427e-16,...,0.063136,0.057985,0.055655,0.03174,0.046913,0.037576,0.039154,0.034577,3.886719,5.821152
25%,35.0,60.25,3.509715,0.894214,-0.104566,0.552214,0.167227,0.451109,0.164111,0.6380821,...,0.124603,0.108026,0.124869,0.082521,0.099154,0.079586,0.089632,0.077233,47.729248,62.612046
50%,50.0,80.0,4.72101,2.237011,0.208912,0.825854,0.227071,0.552495,0.223359,0.7549453,...,0.183858,0.14743,0.165739,0.115742,0.140288,0.111879,0.118271,0.107708,65.222656,112.27764
75%,62.0,97.0,6.85844,6.281702,0.529058,1.381738,0.343392,0.654871,0.322666,0.929092,...,0.23259,0.176334,0.213714,0.144507,0.179333,0.13675,0.148503,0.13559,80.237061,220.327254
max,128.0,150.0,16.2151,218.165215,27.204337,6.577641,0.748138,1.0,0.577532,1.28881,...,0.567898,0.533208,0.601422,0.451463,0.487847,0.414726,0.481178,0.408715,138.832031,974.315304


In [177]:
# Total number of records in the train and test image lists.
# NOTE(review): `images` and `test_images` are only assigned in lines that
# are commented out in the two loading cells, so this cell relies on values
# left in the kernel by an earlier run.
len(images)+len(test_images)

189

## Testing

In [178]:
def split(data, test_data, drop):
    """Prepare standardized train/test sets with one class removed.

    Args:
        data: Training table containing a 'target' column.
        test_data: Test table containing a 'target' column.
        drop: Target value whose rows are removed from both tables.

    Returns:
        ``(X_train, y_train, X_test, y_test, std)``: the feature frames scaled
        by a ``StandardScaler`` fitted on the filtered training features
        (columns and index preserved), the filtered targets, and the fitted
        scaler.
    """
    def _features_and_labels(table):
        # Work on a copy so the caller's frame keeps its 'target' column.
        feats = table.copy()
        labels = feats.pop('target')
        keep = labels != drop
        return feats[keep], labels[keep]

    X_train, y_train = _features_and_labels(data)
    X_test, y_test = _features_and_labels(test_data)

    # Fit on the training rows only; the same scaling is applied to both sets.
    std = StandardScaler()
    std.fit(X_train)

    def _standardize(frame):
        return pd.DataFrame(std.transform(frame), columns=frame.columns, index=frame.index)

    return _standardize(X_train), y_train, _standardize(X_test), y_test, std

In [179]:
def train_test(model, X_train, y_train, X_test, y_test):
    """Fit a model on the training set and predict the test set.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training targets.
        X_test: Test features.
        y_test: Test targets; only its index is used, to label the output.

    Returns:
        ``(model, y_pred)``: the object returned by ``fit`` and a Series of
        predictions indexed like ``y_test``.
    """
    fitted = model.fit(X_train, y_train)
    labels = fitted.predict(X_test)
    return fitted, pd.Series(labels, index=y_test.index)

In [180]:
def images_pred(y_pred):
    """Collapse per-ROI predictions into one majority-vote label per image.

    Args:
        y_pred: Series of predicted labels whose index is the image name;
            an image with several ROIs appears several times.

    Returns:
        Dict mapping each distinct index value (in sorted order) to its most
        frequent label. On a tie, the label seen first for that image wins.
    """
    prediction = {}
    # Group on the index so every image yields a Series of votes, even when
    # it has a single ROI; plain `y_pred[name]` returns a bare string in that
    # case and the loop would count its characters instead.
    for name, votes in y_pred.groupby(level=0):
        tally = {}
        for label in votes:
            tally[label] = tally.get(label, 0) + 1
        prediction[name] = max(tally, key=tally.get)
    return prediction

In [181]:
def images_acc(y_test, y_pred):
    """Return the fraction of images whose predicted label is correct.

    Args:
        y_test: Series of true labels indexed by image name; an image may
            appear once or several times, and its first row is used.
        y_pred: Mapping from image name to the predicted label.

    Returns:
        Correct predictions divided by the number of entries in ``y_pred``,
        or 0.0 when ``y_pred`` is empty.

    Raises:
        KeyError: If a name in ``y_pred`` is missing from ``y_test``.
    """
    total = len(y_pred)
    if total == 0:
        return 0.0

    pred_count = 0
    for key in y_pred.keys():
        # .loc[[key]] always yields a Series, so .iloc[0] is the first true
        # label whether the image has one row or many. `y_test[key][0]` took
        # the first *character* of the label for single-row images.
        actual = y_test.loc[[key]].iloc[0]
        if actual == y_pred[key]:
            pred_count += 1
    return pred_count / total

In [182]:
# Candidate classifiers, evaluated on every pair of classes.
models = {
    "RFC": RandomForestClassifier(
                    random_state=42,
                    # 'sqrt' is what the 'auto' alias meant for classifiers;
                    # 'auto' itself is rejected by recent scikit-learn.
                    max_features='sqrt',
                    n_estimators= 500,
                    max_depth=6,
                    criterion='entropy'),
    "MLP": MLPClassifier(
                    max_iter=600,
                    # NOTE(review): scikit-learn documents `momentum` as used
                    # only by solver='sgd'; with 'adam' it looks inert - confirm.
                    momentum=0.6,
                    solver='adam',
                    activation='relu',
                    learning_rate_init=0.005,
                    alpha=0.001),
    "SVC": svm.SVC()
}
classes = ['normal', 'fatty', 'cirrhosis']

# Leave one class out at a time and compare the models on the remaining pair.
for drop in classes:
    X_train, y_train, X_test, y_test, std = split(data, test_data, drop)
    print(*[cls for cls in classes if cls != drop])
    for name in models:
        model, y_pred = train_test(models[name], X_train, y_train, X_test, y_test)
        # Image-level accuracy: majority vote over the ROIs of each image.
        prediction = images_pred(y_pred)
        print(name," Image Accuracy: ", images_acc(y_test, prediction))
        # ROI-level precision / recall / F1.
        report = classification_report(y_test, y_pred, output_dict = True)
        cr = pd.DataFrame(report).transpose()
        print(cr)
    print('\n\n')

fatty cirrhosis
RFC  Image Accuracy:  0.8636363636363636
              precision    recall  f1-score     support
cirrhosis      0.860465  0.474359  0.611570   78.000000
fatty          0.784211  0.961290  0.863768  155.000000
accuracy       0.798283  0.798283  0.798283    0.798283
macro avg      0.822338  0.717825  0.737669  233.000000
weighted avg   0.809738  0.798283  0.779341  233.000000
MLP  Image Accuracy:  0.8636363636363636
              precision    recall  f1-score     support
cirrhosis      0.724138  0.538462  0.617647   78.000000
fatty          0.794286  0.896774  0.842424  155.000000
accuracy       0.776824  0.776824  0.776824    0.776824
macro avg      0.759212  0.717618  0.730036  233.000000
weighted avg   0.770803  0.776824  0.767177  233.000000
SVC  Image Accuracy:  0.8636363636363636
              precision    recall  f1-score   support
cirrhosis      0.795918  0.500000  0.614173   78.0000
fatty          0.788043  0.935484  0.855457  155.0000
accuracy       0.789700  0.

In [285]:
# Binary normal-vs-fatty classifier: split() removes the 'cirrhosis' rows.
# NOTE(review): scikit-learn documents `momentum` as used only by
# solver='sgd'; with 'adam' it looks inert - confirm.
normal_fatty_mlp = MLPClassifier(
                    max_iter=600,
                    momentum=0.6,
                    solver='adam',
                    activation='relu',
                    learning_rate_init=0.005,
                    alpha=0.001)
# Keep the scaler fitted for this pair under its own name (normal_fatty_std).
X_train, y_train, X_test, y_test, normal_fatty_std = split(data, test_data, 'cirrhosis')
model, y_pred = train_test(normal_fatty_mlp, X_train, y_train, X_test, y_test)
# train_test returns whatever fit() returned; rebind the name to it.
normal_fatty_mlp = model
# Image-level result: majority vote over the ROI predictions of each image.
prediction = images_pred(y_pred)
print("Normal/Fatty MLP Image Accuracy: ", images_acc(y_test, prediction))
# ROI-level precision / recall / F1.
report = classification_report(y_test, y_pred, output_dict = True)
cr = pd.DataFrame(report).transpose()
print(cr)

Normal/Fatty MLP Image Accuracy:  0.8275862068965517
              precision    recall  f1-score     support
fatty          0.697531  0.729032  0.712934  155.000000
normal         0.704225  0.671141  0.687285  149.000000
accuracy       0.700658  0.700658  0.700658    0.700658
macro avg      0.700878  0.700087  0.700109  304.000000
weighted avg   0.700812  0.700658  0.700363  304.000000


In [273]:
# Binary normal-vs-cirrhosis classifier: split() removes the 'fatty' rows.
# NOTE(review): scikit-learn documents `momentum` as used only by
# solver='sgd'; with 'adam' it looks inert - confirm.
normal_cirrhosis_mlp = MLPClassifier(
                    max_iter=600,
                    momentum=0.6,
                    solver='adam',
                    activation='relu',
                    learning_rate_init=0.005,
                    alpha=0.001)
# Keep the scaler fitted for this pair under its own name (normal_cirrhosis_std).
X_train, y_train, X_test, y_test, normal_cirrhosis_std = split(data, test_data, 'fatty')
model, y_pred = train_test(normal_cirrhosis_mlp, X_train, y_train, X_test, y_test)
# train_test returns whatever fit() returned; rebind the name to it.
normal_cirrhosis_mlp = model
# Image-level result: majority vote over the ROI predictions of each image.
prediction = images_pred(y_pred)
print("normal/cirrhosis MLP Image Accuracy: ", images_acc(y_test, prediction))
# ROI-level precision / recall / F1.
report = classification_report(y_test, y_pred, output_dict = True)
cr = pd.DataFrame(report).transpose()
print(cr)

normal/cirrhosis MLP Image Accuracy:  0.7894736842105263
              precision    recall  f1-score     support
cirrhosis      0.619048  0.500000  0.553191   78.000000
normal         0.762195  0.838926  0.798722  149.000000
accuracy       0.722467  0.722467  0.722467    0.722467
macro avg      0.690621  0.669463  0.675957  227.000000
weighted avg   0.713008  0.722467  0.714355  227.000000


In [272]:
# Binary fatty-vs-cirrhosis classifier: split() removes the 'normal' rows.
# NOTE(review): scikit-learn documents `momentum` as used only by
# solver='sgd'; with 'adam' it looks inert - confirm.
fatty_cirrhosis_mlp = MLPClassifier(
                    max_iter=600,
                    momentum=0.6,
                    solver='adam',
                    activation='relu',
                    learning_rate_init=0.005,
                    alpha=0.001)
# Keep the scaler fitted for this pair under its own name (fatty_cirrhosis_std).
X_train, y_train, X_test, y_test, fatty_cirrhosis_std = split(data, test_data, 'normal')
model, y_pred = train_test(fatty_cirrhosis_mlp, X_train, y_train, X_test, y_test)
# train_test returns whatever fit() returned; rebind the name to it.
fatty_cirrhosis_mlp = model
# Image-level result: majority vote over the ROI predictions of each image.
prediction = images_pred(y_pred)
print("cirrhosis/Fatty MLP Image Accuracy: ", images_acc(y_test, prediction))
# ROI-level precision / recall / F1.
report = classification_report(y_test, y_pred, output_dict = True)
cr = pd.DataFrame(report).transpose()
print(cr)

cirrhosis/Fatty MLP Image Accuracy:  0.9090909090909091
              precision    recall  f1-score     support
cirrhosis      0.765625  0.628205  0.690141   78.000000
fatty          0.828402  0.903226  0.864198  155.000000
accuracy       0.811159  0.811159  0.811159    0.811159
macro avg      0.797014  0.765715  0.777169  233.000000
weighted avg   0.807387  0.811159  0.805930  233.000000


In [286]:
# Pair every binary classifier with the scaler returned by its own split()
# call. This rebinds `models`, which an earlier cell used for the
# RFC/MLP/SVC comparison dict.
models = {
    "normal_fatty": (normal_fatty_mlp, normal_fatty_std),
    "normal_cirrhosis": (normal_cirrhosis_mlp, normal_cirrhosis_std),
    "fatty_cirrhosis": (fatty_cirrhosis_mlp, fatty_cirrhosis_std)
}

# Full test set (all three classes), separated into features and target.
X_test = test_data.copy()
y_test = X_test.pop('target')

# NOTE(review): `X_train` is whatever the most recently executed training
# cell left in the kernel (a two-class subset), so this scaler depends on
# cell execution order. The next cell rebuilds `X_test` with the per-model
# scalers, so this `std` / `X_test` appear unused - confirm and remove.
std = StandardScaler()
std.fit(X_train)
X_test = pd.DataFrame(std.transform(X_test), columns = X_test.columns, index = X_test.index)

In [287]:
# Lift pandas' row display limit for the outputs that follow.
pd.set_option('display.max_rows', None)
# predictions[model name] -> {image name: majority-vote label}
predictions = {}
for name in models.keys():
    # Start from the untouched test table for each model, because every
    # model is paired with its own scaler (second tuple element).
    X_test = test_data.copy()
    y_test = X_test.pop('target')
    X_test = pd.DataFrame(models[name][1].transform(X_test), columns = X_test.columns, index = X_test.index)
    # ROI-level predictions of this model (first tuple element), indexed by
    # image name, then reduced to one label per image.
    y_pred = pd.Series(models[name][0].predict(X_test),index=y_test.index)
    predictions[name] = images_pred(y_pred)
    
# Distinct image names of the test set.
# NOTE(review): the next cell recomputes this as `image_name`; this variable
# looks unused - confirm.
image_names = np.unique(y_test.index)

In [288]:
# Combine the pairwise models: each model casts one vote per image.
image_name = np.unique(y_test.index)
# final_pred[image name] -> winning class, or 'abstain' when there is none.
final_pred = {}
for image in image_name:
    # Vote counter for this image.
    pred = {
        'normal': 0,
        'fatty': 0,
        'cirrhosis': 0
    }
    # NOTE(review): the loop variable `model` rebinds the global `model`
    # assigned by the training cells.
    for model in predictions.keys():
        pred[predictions[model][image]] += 1
    cls = max(pred, key=pred.get)
    # A top count of 1 means no class received more than one vote, so no
    # class wins and the image is marked as abstained.
    if pred[cls] == 1:
        final_pred[image] = 'abstain'
    else: final_pred[image] = cls

In [289]:
# Image-level accuracy of the voting ensemble over every entry of final_pred.
# An 'abstain' entry can only match if 'abstain' is itself a target label,
# so abstained images count as errors here.
images_acc(y_test, final_pred)

0.7142857142857143

In [290]:
# Reduce the ROI-level targets to one row per image (the first occurrence)
# and sort both series by image name so they line up.
y_test = y_test[~y_test.index.duplicated(keep='first')].sort_index()
y_pred = pd.Series(final_pred).sort_index()
# Names of the images for which the ensemble abstained.
abstain = y_pred[y_pred=='abstain'].index

# Score only the images that received a class prediction.
y_test = y_test.drop(abstain)
y_pred = y_pred.drop(abstain)

report = classification_report(y_test, y_pred, output_dict = True)
cr = pd.DataFrame(report).transpose()
print(cr)
# y_pred no longer contains the abstained images, so they are added back to
# get the total number of images in the denominator.
print("Abstention Rate: ", len(abstain)/(len(y_pred)+len(abstain)))

              precision    recall  f1-score    support
cirrhosis      0.666667  0.400000  0.500000   5.000000
fatty          0.789474  0.937500  0.857143  16.000000
normal         0.666667  0.615385  0.640000  13.000000
accuracy       0.735294  0.735294  0.735294   0.735294
macro avg      0.707602  0.650962  0.665714  34.000000
weighted avg   0.724458  0.735294  0.721597  34.000000
Abstention Rate:  0.02857142857142857
