# Feature Extraction
## Importing Libraries

In [1]:
# import cv2 as cv
import SimpleITK as sitk

import numpy as np
import os
from skimage.feature.texture import greycomatrix
from skimage.feature.texture import greycoprops
from skimage.measure import shannon_entropy
from radiomics import glrlm, glcm
# import pyfeats
import pandas as pd
import multiprocessing as mlp
import math
import feature_extraction as fe

In [2]:
# from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

## Define Feature Extraction functions

### Read dataset images

In [3]:
def read_images(folder = "dataset/train",
                classes = ("normal", "fatty", "cirrhosis"),
                mask_folder = "dataset/masks"):
    """Load every image of the requested classes with its ROI positions.

    For each class the files in ``{folder}/{cls}`` are read with SimpleITK as
    8-bit images.  Every image needs a companion text file
    ``{mask_folder}/{stem}.txt`` (``stem`` is the image file name without its
    extension) that holds one ``x,y`` pair per line.

    Parameters
    ----------
    folder : str
        Directory containing one sub-directory per class.
    classes : sequence of str
        Class (sub-directory) names to load, in this order.
    mask_folder : str
        Directory containing the ``.txt`` position files.

    Returns
    -------
    list of tuple
        One ``(name, image, cls, mask)`` entry per image, where ``mask`` is a
        list of ``(int(y), int(x))`` tuples in file order.

    Raises
    ------
    FileNotFoundError
        If a class directory or a position file does not exist.
    """
    images = []
    for cls in classes:
        for name in os.listdir(f'{folder}/{cls}'):
            # splitext copes with extensions of any length (".png", ".jpeg", ...)
            stem = os.path.splitext(name)[0]
            mask = []
            with open(f'{mask_folder}/{stem}.txt', 'r') as file:
                for line in file:
                    line = line.strip()
                    if not line:
                        # tolerate blank lines and empty position files
                        continue
                    x, y = line.split(',')
                    # stored swapped: (y, x)
                    mask.append((int(y), int(x)))
            img = sitk.ReadImage(f'{folder}/{cls}/{name}', sitk.sitkUInt8)
            images.append((name, img, cls, mask))
    return images

### Extract ROIs from image

In [4]:
def extract_roi(img, start, size=(32, 32)):
    """Cut a rectangular ROI out of a SimpleITK image.

    The image is converted to a NumPy array; ``start`` and ``size`` index its
    first two axes.

    Returns
    -------
    tuple
        ``(roi, mask)``: the cropped pixels and an array of zeros with the
        full array shape whose ROI window is set to 1.
    """
    pixels = sitk.GetArrayFromImage(img)
    first, second = start[0], start[1]
    window = (slice(first, first + size[0]), slice(second, second + size[1]))

    region_mask = np.zeros(pixels.shape)
    region_mask[window] = 1
    return pixels[window], region_mask

### Calculate liver diagonal length

In [5]:
def get_length(img, mask, size = (32, 32)):
    """Return the larger diagonal spanned by the ROI start positions.

    For each of the four image corners, the ROI whose matching corner lies
    closest to that image corner is selected.  The result is the larger of
    the two distances between the start positions of the diagonally opposite
    selections (top-right/bottom-left and top-left/bottom-right).

    Parameters
    ----------
    img : object with a ``shape`` attribute
        Only ``img.shape[0]`` and ``img.shape[1]`` are read.
    mask : sequence of (x, y)
        ROI start positions; ``x`` runs along ``img.shape[0]`` and ``y``
        along ``img.shape[1]``.
    size : (int, int)
        ROI extent along x and y.  Defaults to the 32x32 window that was
        previously hard-coded.

    Returns
    -------
    float
        The larger of the two diagonal distances.

    Raises
    ------
    ValueError
        If ``mask`` is empty.
    """
    if len(mask) == 0:
        raise ValueError("mask must contain at least one ROI position")

    x_max, y_max = img.shape[0], img.shape[1]
    dx, dy = size

    def closest(corner, x_off, y_off):
        # min() returns the first of several equally close positions
        return min(
            mask,
            key=lambda p: math.dist(corner, (p[0] + x_off, p[1] + y_off)),
        )

    top_right = closest((0, y_max), dx, 0)
    bottom_left = closest((x_max, 0), 0, dy)
    top_left = closest((0, 0), 0, 0)
    bottom_right = closest((x_max, y_max), dx, dy)

    return max(math.dist(top_right, bottom_left),
               math.dist(top_left, bottom_right))

### Extract Features from ROIs

In [6]:
def feature_extraction(img, roi_pos):
    """Compute one feature dictionary per ROI position.

    Each dictionary contains, in this order:

    * ``contrast``, ``homogeneity``, ``energy`` and ``correlation`` of the
      grey-level co-occurrence matrix for distances 1-3 and angles
      0/45/90/135 degrees, keyed ``{property}_d{distance}_{angle}``;
    * ``entropy``, ``mean`` and ``variance`` of the ROI pixels;
    * every feature returned by the pyradiomics GLCM extractor, then every
      feature returned by the GLRLM extractor, under the names pyradiomics
      reports.

    Parameters
    ----------
    img : SimpleITK image
        Image the ROIs are taken from.
    roi_pos : iterable
        ROI start positions, passed one by one to ``extract_roi``.

    Returns
    -------
    list of dict
        One feature dictionary per position, in input order.
    """
    regions = [extract_roi(img, pos) for pos in roi_pos]

    # 0 45 90 135 degrees
    angles = [0, np.pi / 4, np.pi / 2, 3 * np.pi / 4]

    # Same order as the flattened (distance, angle) property matrix.
    labels = [
        f'd{distance}_{degrees}'
        for distance in (1, 2, 3)
        for degrees in (0, 45, 90, 135)
    ]
    property_names = ('contrast', 'homogeneity', 'energy', 'correlation')

    feat_arr = []
    for roi, roi_mask in regions:
        glcm_mtx = greycomatrix(roi, distances = [1,2,3], angles = angles, levels = 256)
        flat_props = {
            prop: greycoprops(glcm_mtx, prop).flatten()
            for prop in property_names
        }

        features = {}
        for position, label in enumerate(labels):
            for prop in property_names:
                features[f'{prop}_{label}'] = flat_props[prop][position]

        features['entropy'] = shannon_entropy(roi)
        features['mean'] = np.mean(roi)
        features['variance'] = np.var(roi)

        # pyradiomics: GLCM features first, then GLRLM features
        mask_image = sitk.GetImageFromArray(roi_mask)
        for extractor_cls in (glcm.RadiomicsGLCM, glrlm.RadiomicsGLRLM):
            extractor = extractor_cls(img, mask_image)
            extractor.enableAllFeatures()
            for key, value in extractor.execute().items():
                features[key] = value.item()

        feat_arr.append(features)

    return feat_arr

### Construct dataframe from ROI features

In [7]:
def build_dataframe(images):
    """Build a dataframe with one row of features per ROI.

    Parameters
    ----------
    images : iterable of tuple
        ``(name, img, cls, mask)`` entries as returned by ``read_images``.
        The older ``(img, cls, mask)`` form is still accepted; its rows get
        no ``name`` column.

    Returns
    -------
    pandas.DataFrame
        One row per ROI holding the ``feature_extraction`` output plus
        ``target`` (the class), ``length`` and, when available, ``name``.
        Empty when ``images`` is empty.
    """
    rows = []
    for record in images:
        if len(record) == 4:
            name, img, cls, mask = record
        else:
            img, cls, mask = record
            name = None

        feat_arr = feature_extraction(img, roi_pos=mask)
        if not feat_arr:
            # no ROI positions for this image: nothing to add
            continue

        # NOTE(review): later cells read a 'length' column and index by
        # 'name'.  This assumes 'length' is get_length() of the image's pixel
        # array; confirm against the feature_extraction module.
        length = get_length(sitk.GetArrayFromImage(img), mask)

        for row in feat_arr:
            row['target'] = cls
            row['length'] = length
            if name is not None:
                row['name'] = name
            rows.append(row)

    # One construction instead of DataFrame.append per row, which copied the
    # frame every time and was removed in pandas 2.0.
    return pd.DataFrame(rows)

### Construct dataframe using multiprocessing
Using multiprocessing reduced the runtime by 82%.

In [8]:
def build_with_mlp(images, n=9):
    """Build the feature dataframes in parallel with ``n`` worker processes.

    ``images`` is cut into ``n`` consecutive chunks of near-equal size (the
    first ``len(images) % n`` chunks hold one extra item) and each chunk is
    passed to ``fe.build_dataframe`` in a worker.

    Parameters
    ----------
    images : sequence
        Entries accepted by ``fe.build_dataframe``.
    n : int
        Number of worker processes and of chunks.

    Returns
    -------
    list
        The ``n`` results of ``fe.build_dataframe``, in chunk order.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    # Plain list slicing replaces np.array_split, which first turned the
    # ragged (name, image, class, mask) tuples into a NumPy array; that
    # conversion raises on NumPy >= 1.24.
    images = list(images)
    base, extra = divmod(len(images), n)
    chunks = []
    start = 0
    for index in range(n):
        stop = start + base + (1 if index < extra else 0)
        chunks.append(images[start:stop])
        start = stop

    # The context manager releases the worker processes once map() returns.
    with mlp.Pool(n) as pool:
        return pool.map(fe.build_dataframe, chunks)

## Feature Analysis and Selection

### Extract Features and build dataframe

In [9]:
%%time

images = read_images('dataset/train')
mlp_data = build_with_mlp(images)
data = pd.DataFrame()
for frame in mlp_data:
    data = data.append(frame)

data.set_index('name', drop=True, inplace=True)

# data.to_csv("dataset/train.csv")

# data = pd.read_csv('dataset/train.csv', index_col='name')

data.describe()

  return array(a, dtype, copy=False, order=order)


Wall time: 1min 29s


Unnamed: 0,10Percentile,90Percentile,Autocorrelation,ClusterProminence,ClusterShade,ClusterTendency,Contrast,Correlation,DifferenceAverage,DifferenceEntropy,...,homogeneity_d1_90,homogeneity_d2_0,homogeneity_d2_135,homogeneity_d2_45,homogeneity_d2_90,homogeneity_d3_0,homogeneity_d3_135,homogeneity_d3_45,homogeneity_d3_90,length
count,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,...,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0
mean,49.607947,78.450133,5.265885,6.796472,0.534181,1.057417,0.273418,0.536117,0.254731,0.7737835,...,0.163014,0.188472,0.151202,0.16981,0.118795,0.148841,0.117156,0.123361,0.113331,230.081625
std,22.157389,27.822747,2.935076,18.132138,1.969909,0.89459,0.149987,0.148141,0.124245,0.2413614,...,0.067913,0.084368,0.064099,0.074271,0.053416,0.068449,0.053321,0.055464,0.053904,69.707894
min,0.0,6.0,1.0,0.0,-8.367119,0.0,0.0,-0.002694,0.0,-3.203427e-16,...,0.055994,0.049637,0.048944,0.044883,0.03055,0.041362,0.035131,0.036691,0.031642,74.094534
25%,34.0,58.7,3.249825,0.909022,-0.106256,0.569849,0.16631,0.437385,0.164957,0.6338971,...,0.105677,0.121571,0.100899,0.115812,0.07516,0.093532,0.075949,0.081603,0.070858,180.984724
50%,49.0,79.7,4.539973,2.138432,0.153262,0.833975,0.235635,0.525369,0.231164,0.7685916,...,0.158151,0.18836,0.14682,0.161081,0.113689,0.147429,0.111746,0.117534,0.106542,222.24761
75%,64.0,98.0,6.776047,5.294571,0.474892,1.276992,0.379435,0.634107,0.352453,0.9597778,...,0.196115,0.233789,0.184354,0.205439,0.145168,0.183088,0.14291,0.149219,0.139533,267.824569
max,132.0,170.0,21.50717,444.72062,34.921036,13.172388,0.877211,1.0,0.596986,1.343032,...,0.74304,0.741631,0.701923,0.775769,0.67395,0.697514,0.620327,0.699876,0.623982,586.168918


In [12]:
# Save the 'length' column of the training frame (indexed by image name) to length.csv.
data['length'].to_csv('length.csv')

In [10]:
%%time

test_images = read_images("dataset/test")
mlp_data = build_with_mlp(test_images)
test_data = pd.DataFrame()
for frame in mlp_data:
    test_data = test_data.append(frame)
    
test_data.set_index('name', drop=True, inplace=True)

# test_data.to_csv("dataset/test.csv")

# test_data = pd.read_csv('dataset/test.csv', index_col='name')

test_data.describe()

  return array(a, dtype, copy=False, order=order)


Wall time: 19.3 s


Unnamed: 0,10Percentile,90Percentile,Autocorrelation,ClusterProminence,ClusterShade,ClusterTendency,Contrast,Correlation,DifferenceAverage,DifferenceEntropy,...,homogeneity_d1_90,homogeneity_d2_0,homogeneity_d2_135,homogeneity_d2_45,homogeneity_d2_90,homogeneity_d3_0,homogeneity_d3_135,homogeneity_d3_45,homogeneity_d3_90,length
count,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0,...,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0
mean,50.217277,79.27644,5.295707,7.244694,0.579809,1.082905,0.261253,0.554878,0.244751,0.7640925,...,0.164242,0.185952,0.147653,0.176052,0.120195,0.146332,0.117783,0.125761,0.115112,216.820374
std,22.434761,27.3413,2.85337,17.29401,2.246493,0.853788,0.137671,0.145262,0.113077,0.2203017,...,0.064448,0.073204,0.056312,0.073037,0.051804,0.059865,0.051189,0.054049,0.051733,39.763951
min,1.0,7.0,1.0,0.0,-3.508871,0.0,0.0,-0.000639,0.0,-3.203427e-16,...,0.066152,0.063136,0.057985,0.055655,0.03174,0.046913,0.037576,0.039154,0.034577,134.082064
25%,35.0,60.25,3.509715,0.894214,-0.104566,0.552214,0.167227,0.451109,0.164111,0.6380821,...,0.114571,0.124603,0.108026,0.124869,0.082521,0.099154,0.079586,0.089632,0.077233,189.834138
50%,50.0,80.0,4.72101,2.237011,0.208912,0.825854,0.227071,0.552495,0.223359,0.7549453,...,0.161493,0.183858,0.14743,0.165739,0.115742,0.140288,0.111879,0.118271,0.107708,218.593115
75%,62.0,97.0,6.85844,6.281702,0.529058,1.381738,0.343392,0.654871,0.322666,0.929092,...,0.198065,0.23259,0.176334,0.213714,0.144507,0.179333,0.13675,0.148503,0.13559,238.947693
max,128.0,150.0,16.2151,218.165215,27.204337,6.577641,0.748138,1.0,0.577532,1.28881,...,0.594621,0.567898,0.533208,0.601422,0.451463,0.487847,0.414726,0.481178,0.408715,328.805414


## Testing

In [13]:
def split(data, test_data, drop, cols = None):
    """Prepare standardised train/test features with one class removed.

    The ``target`` column is separated from both frames, rows whose target
    equals ``drop`` are discarded, and the selected columns are scaled with a
    ``StandardScaler`` fitted on the training rows only.

    Parameters
    ----------
    data, test_data : pandas.DataFrame
        Frames containing a ``target`` column; neither is modified.
    drop : object
        Target value to exclude.
    cols : sequence of column labels, optional
        Feature columns to keep; all remaining columns when omitted.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, std)`` where ``std`` is the
        fitted scaler.
    """
    def _without_dropped(frame):
        # Work on a copy so the caller's frame keeps its 'target' column.
        features = frame.copy()
        labels = features.pop('target')
        keep = labels != drop
        return features[keep], labels[keep]

    X_train, y_train = _without_dropped(data)
    X_test, y_test = _without_dropped(test_data)

    if cols is None:
        cols = X_train.columns

    std = StandardScaler()
    std.fit(X_train[cols])

    def _scaled(frame):
        return pd.DataFrame(std.transform(frame[cols]), columns = cols, index = frame.index)

    return _scaled(X_train), y_train, _scaled(X_test), y_test, std

In [14]:
def train_test(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and predict the test rows.

    Returns
    -------
    tuple
        ``(fitted, y_pred)``: whatever ``model.fit`` returned, and its
        predictions as a Series carrying the index of ``y_test``.
    """
    fitted = model.fit(X_train, y_train)
    predicted = fitted.predict(X_test)
    return fitted, pd.Series(predicted, index=y_test.index)

In [15]:
def images_pred(y_pred):
    """Collapse per-ROI predictions into one prediction per image.

    Parameters
    ----------
    y_pred : pandas.Series
        Predicted class per ROI, indexed by image name (ROIs of the same
        image share an index label).

    Returns
    -------
    dict
        Image name -> most frequent predicted class.  On a tie the class
        that appears first among that image's predictions wins.
    """
    from collections import Counter

    prediction = {}
    # Grouping always yields a Series of votes, even for an image with a
    # single ROI; plain y_pred[name] returns a bare string in that case and
    # the vote was taken over its characters.
    for name, votes in y_pred.groupby(level=0):
        counts = Counter(votes)
        # max() keeps the first of several equally frequent classes
        prediction[name] = max(counts, key=counts.get)
    return prediction

In [16]:
def images_acc(y_test, y_pred):
    """Return the fraction of images whose predicted class is correct.

    Parameters
    ----------
    y_test : pandas.Series
        True class per ROI, indexed by image name; the first entry of each
        image is taken as that image's class.
    y_pred : mapping
        Image name -> predicted class.

    Returns
    -------
    float
        Correct predictions divided by the number of entries in ``y_pred``;
        ``0.0`` when ``y_pred`` is empty.

    Raises
    ------
    KeyError
        If ``y_pred`` names an image that is absent from ``y_test``.
    """
    if len(y_pred) == 0:
        return 0.0

    # One label per image.  Indexing y_test[name][0] returned the first
    # character of the label when an image had a single ROI.
    truth = y_test[~y_test.index.duplicated(keep='first')]
    correct = sum(1 for name, label in y_pred.items() if truth[name] == label)
    return correct / len(y_pred)

In [17]:
# Baseline comparison: train each model on every pair of classes, using all
# features, and report image-level accuracy plus the per-ROI report.
models = {
    "RFC": RandomForestClassifier(
                    random_state=42,
                    # 'sqrt' is what 'auto' meant for classifiers; 'auto'
                    # was removed in scikit-learn 1.3
                    max_features='sqrt',
                    n_estimators= 500,
                    max_depth=6,
                    criterion='entropy'),
    "MLP": MLPClassifier(
                    max_iter=600,
                    # NOTE: per the scikit-learn docs momentum only applies
                    # to solver='sgd', so it has no effect with 'adam'
                    momentum=0.6,
                    solver='adam',
                    activation='relu',
                    learning_rate_init=0.005,
                    alpha=0.001,
                    random_state=42),
    "SVC": svm.SVC(random_state=42)
}
classes = ['normal', 'fatty', 'cirrhosis']

for drop in classes:
    # Dropping one class leaves the other two as the classification problem.
    X_train, y_train, X_test, y_test, std = split(data, test_data, drop)
    print(*[cls for cls in classes if cls != drop])
    for name, estimator in models.items():
        model, y_pred = train_test(estimator, X_train, y_train, X_test, y_test)
        prediction = images_pred(y_pred)
        print(name," Image Accuracy: ", images_acc(y_test, prediction))
        report = classification_report(y_test, y_pred, output_dict = True)
        cr = pd.DataFrame(report).transpose()
        print(cr)
    print('\n\n')

fatty cirrhosis
RFC  Image Accuracy:  0.8636363636363636
              precision    recall  f1-score     support
cirrhosis      0.909091  0.512821  0.655738   78.000000
fatty          0.798942  0.974194  0.877907  155.000000
accuracy       0.819742  0.819742  0.819742    0.819742
macro avg      0.854016  0.743507  0.766822  233.000000
weighted avg   0.835816  0.819742  0.803533  233.000000
MLP  Image Accuracy:  0.8636363636363636
              precision    recall  f1-score     support
cirrhosis      0.738462  0.615385  0.671329   78.000000
fatty          0.821429  0.890323  0.854489  155.000000
accuracy       0.798283  0.798283  0.798283    0.798283
macro avg      0.779945  0.752854  0.762909  233.000000
weighted avg   0.793654  0.798283  0.793174  233.000000
SVC  Image Accuracy:  0.8636363636363636
              precision    recall  f1-score     support
cirrhosis      0.877551  0.551282  0.677165   78.000000
fatty          0.809783  0.961290  0.879056  155.000000
accuracy       0.8240

In [18]:
# Feature importance: load one ranking table per class pair, keyed by pair name.
files = ['fatty_normal', 'cirrhosis_fatty', 'cirrhosis_normal']
features_acc = {
    name: pd.read_csv(f'dataset/manual selection/random/{name}.csv', index_col = 0)
    for name in files
}

In [30]:
# Pairwise normal-vs-fatty classifier ('cirrhosis' rows are dropped by split).
normal_fatty_mlp = MLPClassifier(
    max_iter=600, momentum=0.6, solver='adam', activation='relu',
    learning_rate_init=0.005, alpha=0.001, random_state=31,
)

# Rank features by their 'ANN Accuracy' score and keep the first 20, with
# 'length' prepended.
feat_imp = features_acc['fatty_normal']['ANN Accuracy'].sort_values(ascending=False)
normal_fatty_cols = feat_imp.index[:20].insert(0, 'length')

X_train, y_train, X_test, y_test, normal_fatty_std = split(
    data, test_data, 'cirrhosis', cols=normal_fatty_cols)
model, y_pred = train_test(normal_fatty_mlp, X_train, y_train, X_test, y_test)
normal_fatty_mlp = model

# Image-level accuracy from the majority vote, then the per-ROI report.
prediction = images_pred(y_pred)
print("Normal/Fatty MLP Image Accuracy: ", images_acc(y_test, prediction))
report = classification_report(y_test, y_pred, output_dict=True)
cr = pd.DataFrame(report).T
print(cr)

Normal/Fatty MLP Image Accuracy:  0.896551724137931
              precision    recall  f1-score     support
fatty          0.811189  0.748387  0.778523  155.000000
normal         0.757764  0.818792  0.787097  149.000000
accuracy       0.782895  0.782895  0.782895    0.782895
macro avg      0.784476  0.783590  0.782810  304.000000
weighted avg   0.785004  0.782895  0.782726  304.000000


In [31]:
# Pairwise normal-vs-cirrhosis classifier ('fatty' rows are dropped by split).
normal_cirrhosis_mlp = MLPClassifier(
    max_iter=600, momentum=0.6, solver='adam', activation='relu',
    learning_rate_init=0.005, alpha=0.001, random_state=81,
)

# Rank features by their 'ANN Accuracy' score and keep the first 7, with
# 'length' prepended.
feat_imp = features_acc['cirrhosis_normal']['ANN Accuracy'].sort_values(ascending=False)
normal_cirrhosis_cols = feat_imp.index[:7].insert(0, 'length')

X_train, y_train, X_test, y_test, normal_cirrhosis_std = split(
    data, test_data, 'fatty', cols=normal_cirrhosis_cols)
model, y_pred = train_test(normal_cirrhosis_mlp, X_train, y_train, X_test, y_test)
normal_cirrhosis_mlp = model

# Image-level accuracy from the majority vote, then the per-ROI report.
prediction = images_pred(y_pred)
print("normal/cirrhosis MLP Image Accuracy: ", images_acc(y_test, prediction))
report = classification_report(y_test, y_pred, output_dict=True)
cr = pd.DataFrame(report).T
print(cr)

normal/cirrhosis MLP Image Accuracy:  0.8421052631578947
              precision    recall  f1-score     support
cirrhosis      0.785714  0.423077  0.550000   78.000000
normal         0.756757  0.939597  0.838323  149.000000
accuracy       0.762115  0.762115  0.762115    0.762115
macro avg      0.771236  0.681337  0.694162  227.000000
weighted avg   0.766707  0.762115  0.739252  227.000000


In [32]:
# Pairwise fatty-vs-cirrhosis classifier ('normal' rows are dropped by split).
fatty_cirrhosis_mlp = MLPClassifier(
    max_iter=600, momentum=0.6, solver='adam', activation='relu',
    learning_rate_init=0.005, alpha=0.001, random_state=53,
)

# Rank features by their 'ANN Accuracy' score and keep the first 25, with
# 'length' prepended.
feat_imp = features_acc['cirrhosis_fatty']['ANN Accuracy'].sort_values(ascending=False)
fatty_cirrhosis_cols = feat_imp.index[:25].insert(0, 'length')

X_train, y_train, X_test, y_test, fatty_cirrhosis_std = split(
    data, test_data, 'normal', cols=fatty_cirrhosis_cols)
model, y_pred = train_test(fatty_cirrhosis_mlp, X_train, y_train, X_test, y_test)
fatty_cirrhosis_mlp = model

# Image-level accuracy from the majority vote, then the per-ROI report.
prediction = images_pred(y_pred)
print("cirrhosis/Fatty MLP Image Accuracy: ", images_acc(y_test, prediction))
report = classification_report(y_test, y_pred, output_dict=True)
cr = pd.DataFrame(report).T
print(cr)

cirrhosis/Fatty MLP Image Accuracy:  0.9545454545454546
              precision    recall  f1-score     support
cirrhosis      0.885246  0.692308  0.776978   78.000000
fatty          0.860465  0.954839  0.905199  155.000000
accuracy       0.866953  0.866953  0.866953    0.866953
macro avg      0.872856  0.823573  0.841089  233.000000
weighted avg   0.868761  0.866953  0.862275  233.000000


In [33]:
# Bundle each pairwise classifier with the scaler and the feature columns it
# was trained with: (model, scaler, columns).
models = dict(
    normal_fatty=(normal_fatty_mlp, normal_fatty_std, normal_fatty_cols),
    normal_cirrhosis=(normal_cirrhosis_mlp, normal_cirrhosis_std, normal_cirrhosis_cols),
    fatty_cirrhosis=(fatty_cirrhosis_mlp, fatty_cirrhosis_std, fatty_cirrhosis_cols),
)

# Full three-class test set: features and true labels.
X_test = test_data.copy()
y_test = X_test.pop('target')

In [34]:
pd.set_option('display.max_rows', None)

# Run every pairwise classifier on the complete test set (all three classes)
# and keep its per-image majority vote.
predictions = {}
for name, (classifier, scaler, cols) in models.items():
    X_test = test_data.copy()
    y_test = X_test.pop('target')
    # Scale with the scaler fitted for this classifier, on its own columns.
    scaled = scaler.transform(X_test[cols])
    X_test = pd.DataFrame(scaled, columns = cols, index = X_test.index)
    X_test =  X_test[cols]
    roi_pred = classifier.predict(X_test)
    y_pred = pd.Series(roi_pred, index=y_test.index)
    predictions[name] = images_pred(y_pred)

image_names = np.unique(y_test.index)

In [35]:
# Combine the pairwise classifiers: each casts one vote per image.  A class
# needs more than one vote to win; otherwise the image is marked 'abstain'.
image_name = np.unique(y_test.index)
final_pred = {}
for image in image_name:
    pred = dict.fromkeys(('normal', 'fatty', 'cirrhosis'), 0)
    for model in predictions:
        pred[predictions[model][image]] += 1
    cls = max(pred, key=pred.get)
    final_pred[image] = cls if pred[cls] != 1 else 'abstain'

In [36]:
# Image-level accuracy of the combined vote; images_acc divides by every entry
# of final_pred, so images marked 'abstain' count as incorrect.
images_acc(y_test, final_pred)

0.8285714285714286

In [37]:
# Reduce the labels to one per image and align them with the combined votes.
first_per_image = ~y_test.index.duplicated(keep='first')
y_test = y_test[first_per_image].sort_index()
y_pred = pd.Series(final_pred).sort_index()

# Abstained images are excluded from the report and counted separately.
abstain = y_pred[y_pred=='abstain'].index
n_abstained = len(abstain)
y_test = y_test.drop(abstain)
y_pred = y_pred.drop(abstain)

report = classification_report(y_test, y_pred, output_dict=True)
cr = pd.DataFrame(report).T
print(cr)
print("Abstention Rate: ", n_abstained/(len(y_pred)+n_abstained))

              precision    recall  f1-score    support
cirrhosis      1.000000  0.600000  0.750000   5.000000
fatty          0.933333  0.875000  0.903226  16.000000
normal         0.750000  0.923077  0.827586  13.000000
accuracy       0.852941  0.852941  0.852941   0.852941
macro avg      0.894444  0.799359  0.826937  34.000000
weighted avg   0.873039  0.852941  0.851772  34.000000
Abstention Rate:  0.02857142857142857
