# Feature Extraction
## Importing Libraries

In [43]:
import cv2 as cv
import numpy as np
import os
from skimage.feature.texture import greycomatrix
from skimage.feature.texture import greycoprops
from skimage.measure import shannon_entropy
import pyfeats
import pandas as pd
import multiprocessing as mlp
import math
import feature_extraction as fe

In [44]:
# from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

## Define Feature Extraction functions

### Read dataset images

In [45]:
def read_images(folder = "dataset/train",
                classes = (
                            "normal",
                            "fatty",
#                             "cirrhosis"
                        ),
                masks_folder = "dataset/masks"):
    """Load every grayscale image of the given classes with its ROI positions.

    Parameters
    ----------
    folder : str
        Directory containing one sub-directory per class.
    classes : iterable of str
        Names of the class sub-directories to read.
    masks_folder : str
        Directory holding one ``<image stem>.txt`` file per image. Each
        non-empty line holds two comma-separated integers.

    Returns
    -------
    list of (img, cls, mask)
        ``img`` is the array returned by ``cv.imread`` in grayscale mode,
        ``cls`` the class name and ``mask`` a list of integer pairs. The two
        numbers of every line are stored in swapped order (second, first);
        presumably the file is "x,y" and the swap yields (row, col) --
        confirm against the mask file format.

    Raises
    ------
    IOError
        If OpenCV cannot decode an image file.
    """
    images = []
    for cls in classes:
        class_dir = os.path.join(folder, cls)
        # os.listdir order is OS-dependent; sort so runs are reproducible.
        for name in sorted(os.listdir(class_dir)):
            # splitext copes with extensions of any length (".png", ".jpeg", ...)
            stem = os.path.splitext(name)[0]
            mask = []
            with open(os.path.join(masks_folder, f'{stem}.txt'), 'r') as file:
                for line in file:
                    line = line.strip()
                    if not line:
                        # tolerate blank / trailing lines in the mask file
                        continue
                    x, y = line.split(',')
                    mask.append((int(y), int(x)))

            image_path = os.path.join(class_dir, name)
            img = cv.imread(image_path, cv.IMREAD_GRAYSCALE)
            if img is None:
                # cv.imread signals failure by returning None instead of raising
                raise IOError(f'could not read image: {image_path}')
            images.append((img, cls, mask))
    return images

### Extract ROIs from image

In [46]:
def extract_roi(img, start , size = (32,32)):
    """Cut a rectangular window out of ``img`` and build its footprint mask.

    ``start`` gives the first index along axis 0 and axis 1, ``size`` the
    extent along those two axes. Returns ``(roi, mask)``: ``roi`` is the
    sliced window (clipped by numpy if it runs past the image border) and
    ``mask`` is a float array of ``img.shape`` holding 1 inside the window
    and 0 everywhere else.
    """
    first_row, first_col = start[0], start[1]
    n_rows, n_cols = size[0], size[1]
    window = (slice(first_row, first_row + n_rows),
              slice(first_col, first_col + n_cols))

    footprint = np.zeros(img.shape)
    footprint[window] = 1
    return img[window], footprint

### Extract Features from ROIs

In [47]:
def feature_extraction(img, roi_pos = (
        (160,230),
        (118,224),
        (241,151),
        (120,420),
        (170,300),
        (400,200),
        (300,120),
        (240,240),
        (360,160)
    ), roi_size = (32,32)):
    """Compute GLCM texture features and entropy for each ROI of ``img``.

    Parameters
    ----------
    img : array
        Image passed to ``extract_roi``; ``greycomatrix`` is called with
        ``levels=256`` on the windows cut from it.
    roi_pos : iterable of (int, int)
        Start position of every ROI, forwarded to ``extract_roi``.
    roi_size : (int, int)
        Window size forwarded to ``extract_roi``. ROIs whose extracted shape
        differs from it (window clipped at the image border) are skipped.

    Returns
    -------
    list of dict
        One dict per kept ROI with keys ``<prop>_d<distance>_<angle>`` for
        contrast, homogeneity, energy and correlation at distances 1-3 and
        angles 0/45/90/135 degrees, plus ``entropy``.
    """
    # 0 45 90 135 degrees
    angles = [0, np.pi / 4, np.pi / 2, 3 * np.pi / 4]
    angle_names = (0, 45, 90, 135)
    distances = [1, 2, 3]
    props = ('contrast', 'homogeneity', 'energy', 'correlation')

    # flatten() below walks the per-property result distance-major, so the
    # suffixes are generated in that same order: d1_0, d1_45, ..., d3_135.
    suffixes = [f'd{dist}_{deg}' for dist in distances for deg in angle_names]
    expected_shape = tuple(roi_size)

    feat_arr = []
    for pos in roi_pos:
        # The full-image mask is only needed by the disabled GLRLM features
        # (pyfeats.glrlm_features(img, mask, 256)), so it is not kept around.
        roi, _ = extract_roi(img, pos, roi_size)
        if roi.shape != expected_shape:
            continue

        glcm_mtx = greycomatrix(roi, distances = distances, angles = angles, levels = 256)
        values = {prop: greycoprops(glcm_mtx, prop).flatten() for prop in props}

        features = {}
        for j, suffix in enumerate(suffixes):
            for prop in props:
                features[f'{prop}_{suffix}'] = values[prop][j]

        features['entropy'] = shannon_entropy(roi)
        # Disabled in the original notebook:
        #   features['var'] = np.var(roi)
        #   features['mean'] = np.mean(roi)

        feat_arr.append(features)

    return feat_arr

### Construct dataframe from ROI features

In [48]:
def build_dataframe(images):
    """Build the feature table for a collection of images.

    ``images`` is an iterable of ``(img, cls, mask)`` tuples. Each image is
    passed to ``feature_extraction`` with ``mask`` as its ROI positions, and
    every returned feature dict becomes one row, labelled with ``cls`` in the
    ``target`` column. The result therefore has one row per ROI, not one per
    image.

    Rows are collected in a list and the DataFrame is built once:
    ``DataFrame.append`` copied the whole frame on every call and was removed
    in pandas 2.0. Columns follow the insertion order of the feature dicts.
    """
    records = []
    for img, cls, mask in images:
        for row in feature_extraction(img, roi_pos=mask):
            row['target'] = cls
            records.append(row)
    return pd.DataFrame(records)

### Construct dataframe using multiprocessing
Using multiprocessing reduced the dataframe construction runtime by 82%.

In [49]:
def _split_evenly(items, n):
    """Cut ``items`` into ``n`` consecutive lists whose lengths differ by at most one."""
    items = list(items)
    base, extra = divmod(len(items), n)
    chunks = []
    start = 0
    for i in range(n):
        # the first ``extra`` chunks take one additional element
        stop = start + base + (1 if i < extra else 0)
        chunks.append(items[start:stop])
        start = stop
    return chunks


def build_with_mlp(images, n=9): 
    """Extract features in parallel using ``n`` worker processes.

    ``images`` is split into ``n`` consecutive chunks and each chunk is handed
    to ``fe.build_dataframe`` in a process pool. Returns the list of per-chunk
    results in chunk order; the caller concatenates them.

    The chunks are plain list slices. ``np.array_split`` would first convert
    the list of ``(img, cls, mask)`` tuples into an array, which fails on
    recent NumPy when the tuples are ragged.
    """
    # NOTE(review): the worker function comes from the ``fe`` module rather
    # than this notebook, presumably so worker processes can import it.
    # The context manager shuts the pool down once map() has returned.
    with mlp.Pool(n) as pool:
        results = pool.map(fe.build_dataframe, _split_evenly(images, n))
    return results

## Feature Analysis and Selection

### Extract Features and build dataframe

In [50]:
%%time

# images = read_images('dataset/train')
# mlp_data = build_with_mlp(images)
# data = pd.DataFrame()
# for frame in mlp_data:
#     data = data.append(frame)

# data.to_csv("dataset/masked/masked_train.csv",index = False)

# Load the precomputed training feature table instead of re-running the
# extraction.
# NOTE(review): the disabled extraction code above writes
# "dataset/masked/masked_train.csv" while this reads "mean_train.csv" --
# confirm which file is the intended input.
data = pd.read_csv('dataset/masked/mean_train.csv')

# Last expression of the cell: displays summary statistics of the numeric columns.
data.describe()

Wall time: 75 ms


Unnamed: 0,contrast_d1_0,contrast_d1_135,contrast_d1_45,contrast_d1_90,contrast_d2_0,contrast_d2_135,contrast_d2_45,contrast_d2_90,contrast_d3_0,contrast_d3_135,...,homogeneity_d2_0,homogeneity_d2_135,homogeneity_d2_45,homogeneity_d2_90,homogeneity_d3_0,homogeneity_d3_135,homogeneity_d3_45,homogeneity_d3_90,mean,var
count,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,...,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0
mean,42.641099,160.823666,131.501265,132.120369,118.952965,160.823666,131.501265,267.387443,181.271789,273.480895,...,0.174096,0.139964,0.156005,0.10989,0.137497,0.109203,0.114632,0.106308,64.86129,179.340426
std,44.116824,131.308133,110.363571,95.471258,112.225614,131.308133,110.363571,205.530096,157.252923,223.501355,...,0.095455,0.070678,0.079771,0.057531,0.076688,0.056947,0.059201,0.057945,22.996375,149.29273
min,0.711694,5.770031,4.151925,5.266129,1.972917,5.770031,4.151925,9.3375,3.769397,9.34,...,0.049637,0.048944,0.044883,0.03055,0.041362,0.035131,0.036691,0.03357,2.023438,6.862118
25%,8.12878,50.94719,39.89282,45.828629,26.265625,50.94719,39.89282,86.159115,46.645474,88.945556,...,0.089303,0.082083,0.088811,0.065395,0.073321,0.063989,0.06774,0.060833,49.284668,64.109409
50%,18.470262,113.25078,86.879292,115.373488,60.597917,113.25078,86.879292,233.2,107.332435,215.263333,...,0.16906,0.129166,0.144265,0.098724,0.129677,0.098221,0.104749,0.093322,66.624023,155.122579
75%,72.994708,256.390479,221.417534,212.150454,206.879427,256.390479,221.417534,406.415104,309.508082,410.924167,...,0.228006,0.175648,0.197577,0.138103,0.178218,0.138499,0.144265,0.13635,80.556396,240.393167
max,198.784274,770.995838,803.924037,543.011089,561.59375,770.995838,803.924037,1328.046875,895.590517,1479.873333,...,0.741631,0.701923,0.775769,0.67395,0.697514,0.620327,0.699876,0.623982,132.904297,977.87793


In [51]:
%%time

# test_images = read_images("dataset/test")
# mlp_data = build_with_mlp(test_images)
# test_data = pd.DataFrame()
# for frame in mlp_data:
#     test_data = test_data.append(frame)

# test_data.to_csv("dataset/masked/masked_test.csv",index = False)

# Load the precomputed test feature table instead of re-running the
# extraction.
# NOTE(review): the disabled extraction code above writes
# "dataset/masked/masked_test.csv" while this reads "mean_test.csv" --
# confirm which file is the intended input.
test_data = pd.read_csv('dataset/masked/mean_test.csv')

# Last expression of the cell: displays summary statistics of the numeric columns.
test_data.describe()

Wall time: 64.8 ms


Unnamed: 0,contrast_d1_0,contrast_d1_135,contrast_d1_45,contrast_d1_90,contrast_d2_0,contrast_d2_135,contrast_d2_45,contrast_d2_90,contrast_d3_0,contrast_d3_135,...,homogeneity_d2_0,homogeneity_d2_135,homogeneity_d2_45,homogeneity_d2_90,homogeneity_d3_0,homogeneity_d3_135,homogeneity_d3_45,homogeneity_d3_90,mean,var
count,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,...,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0
mean,32.168814,148.163276,122.050563,124.209149,94.573793,148.163276,122.050563,253.357228,152.826371,257.965657,...,0.175539,0.133182,0.153083,0.1084,0.135835,0.10768,0.111735,0.104816,66.36053,180.521505
std,34.955737,117.92028,106.393304,93.245051,93.60961,117.92028,106.393304,206.878429,136.815513,215.657733,...,0.070479,0.04576,0.063136,0.046006,0.055432,0.045693,0.046821,0.045791,22.428803,139.607247
min,3.028226,21.087409,8.798127,14.202621,9.723958,21.087409,8.798127,24.4375,17.729526,23.526667,...,0.063136,0.057985,0.055655,0.03174,0.046913,0.037576,0.039154,0.034577,21.544922,16.106956
25%,8.112147,58.728668,41.527836,48.006804,26.839323,58.728668,41.527836,101.527344,48.959591,96.380556,...,0.104911,0.089177,0.095746,0.066455,0.082069,0.067818,0.070661,0.06358,49.64624,68.745688
50%,12.838206,95.281998,74.869407,83.175907,42.577083,95.281998,74.869407,169.665625,78.901401,162.496111,...,0.189176,0.140066,0.148571,0.110767,0.140691,0.104527,0.110335,0.102638,66.181152,136.870718
75%,50.22505,238.208117,187.405307,193.13256,151.395833,238.208117,187.405307,391.334375,247.413793,400.707222,...,0.228617,0.166946,0.190679,0.134331,0.17637,0.132703,0.141772,0.12869,82.778564,254.344851
max,160.758065,501.284079,484.985432,450.571573,415.38125,501.284079,484.985432,1062.876042,626.979526,1106.467778,...,0.333751,0.242221,0.350524,0.240615,0.272176,0.258909,0.2456,0.236305,117.783203,623.635558


In [52]:
from sklearn.preprocessing import StandardScaler

# Separate the 'target' label column from the predictors of each table.
y_train = data['target'].copy()
X_train = data.drop(columns='target')
y_test = test_data['target'].copy()
X_test = test_data.drop(columns='target')

# Fit the scaler on the training predictors only, then apply that same
# transformation to both tables (rebuilt as DataFrames to keep column names).
std = StandardScaler().fit(X_train)
X_train = pd.DataFrame(std.transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(std.transform(X_test), columns = X_test.columns)

In [53]:
# Random forest on the predefined train/test feature tables.
# max_features='auto' was deprecated and later removed in scikit-learn; for
# classifiers it meant 'sqrt', so 'sqrt' keeps the same model.
rfc = RandomForestClassifier(random_state=42, max_features='sqrt', n_estimators=500, max_depth=6, criterion='entropy')

rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

# Per-class precision / recall / f1 as a table (one row per class or average).
report = classification_report(y_test, y_pred, output_dict = True)
cr = pd.DataFrame(report).transpose()
print(cr)

              precision    recall  f1-score     support
fatty          0.730769  0.612903  0.666667   93.000000
normal         0.581395  0.704225  0.636943   71.000000
accuracy       0.652439  0.652439  0.652439    0.652439
macro avg      0.656082  0.658564  0.651805  164.000000
weighted avg   0.666101  0.652439  0.653798  164.000000


In [54]:
# Shallow decision tree on the predefined train/test feature tables.
dtc = DecisionTreeClassifier(random_state = 42, criterion = 'entropy', max_depth = 2, max_features = 'log2', splitter = 'best')

# Fit once; the original refit the same estimator after predicting, which
# only repeated the training work.
dtc = dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

# Per-class precision / recall / f1 as a table (one row per class or average).
report = classification_report(y_test, y_pred, output_dict = True)
cr = pd.DataFrame(report).transpose()
print(cr)

              precision    recall  f1-score    support
fatty          0.636364  0.376344  0.472973   93.00000
normal         0.467890  0.718310  0.566667   71.00000
accuracy       0.524390  0.524390  0.524390    0.52439
macro avg      0.552127  0.547327  0.519820  164.00000
weighted avg   0.563427  0.524390  0.513535  164.00000


In [55]:
# Multi-layer perceptron on the predefined train/test feature tables.
# random_state is fixed so the reported scores are reproducible.
# NOTE(review): per the scikit-learn docs `momentum` is only used by
# solver='sgd', so it has no effect with 'adam' -- confirm the intended solver.
mlpc = MLPClassifier(max_iter=300,
                    momentum=0.6,
                    solver='adam',
                    activation='relu',
                    learning_rate_init=0.005,
                    alpha=0.001,
                    random_state=42)

# Fit once. The original refit the unseeded network after predicting, so the
# `mlpc` left in the kernel was not the model the report describes.
mlpc = mlpc.fit(X_train, y_train)
y_pred = mlpc.predict(X_test)

# Per-class precision / recall / f1 as a table (one row per class or average).
report = classification_report(y_test, y_pred, output_dict = True)
cr = pd.DataFrame(report).transpose()
print(cr)

              precision    recall  f1-score     support
fatty          0.714286  0.752688  0.732984   93.000000
normal         0.651515  0.605634  0.627737   71.000000
accuracy       0.689024  0.689024  0.689024    0.689024
macro avg      0.682900  0.679161  0.680361  164.000000
weighted avg   0.687111  0.689024  0.687420  164.000000


In [56]:
# k-nearest neighbours (k=2, Minkowski distance with p=3) on the predefined
# train/test feature tables.
knnc = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=10, n_neighbors=2, p=3,
                     weights='uniform')

# Fit once; the original refit the same estimator after predicting, which
# only repeated the work.
knnc = knnc.fit(X_train, y_train)
y_pred = knnc.predict(X_test)

# Per-class precision / recall / f1 as a table (one row per class or average).
report = classification_report(y_test, y_pred, output_dict = True)
cr = pd.DataFrame(report).transpose()
print(cr)

              precision    recall  f1-score     support
fatty          0.625000  0.806452  0.704225   93.000000
normal         0.590909  0.366197  0.452174   71.000000
accuracy       0.615854  0.615854  0.615854    0.615854
macro avg      0.607955  0.586324  0.578200  164.000000
weighted avg   0.610241  0.615854  0.595106  164.000000


In [57]:
# Support vector classifier with default settings on the predefined
# train/test feature tables.
clf = svm.SVC()

# Fit once; the original refit the same estimator after predicting, which
# only repeated the training work.
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class precision / recall / f1 as a table (one row per class or average).
report = classification_report(y_test, y_pred, output_dict = True)
cr = pd.DataFrame(report).transpose()
print(cr)

              precision    recall  f1-score     support
fatty          0.670455  0.634409  0.651934   93.000000
normal         0.552632  0.591549  0.571429   71.000000
accuracy       0.615854  0.615854  0.615854    0.615854
macro avg      0.611543  0.612979  0.611681  164.000000
weighted avg   0.619446  0.615854  0.617081  164.000000


## Models trained on a random split of the pooled train and test ROIs

In [58]:
# Pool the train and test feature tables into one.
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
# and, like the original call, keeps each frame's own row index.
full_data = pd.concat([data, test_data])

# Separate the 'target' label column from the predictors.
X = full_data.copy()
y = X.pop('target')

In [59]:
from sklearn.model_selection import train_test_split

# NOTE(review): this is a row-level random split of the pooled ROI table. If
# several rows come from the same source image they can land on both sides of
# the split and inflate the scores -- confirm, and consider a group-aware split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training part only, then apply it to both parts.
std = StandardScaler()
std.fit(X_train)
X_train = pd.DataFrame(std.transform(X_train), columns = X_train.columns)

X_test = pd.DataFrame(std.transform(X_test), columns = X_test.columns)

# max_features='auto' was deprecated and later removed in scikit-learn; for
# classifiers it meant 'sqrt', so 'sqrt' keeps the same model.
rfc = RandomForestClassifier(random_state=42, max_features='sqrt', n_estimators=500, max_depth=6, criterion='entropy')

rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

# Per-class precision / recall / f1 as a table (one row per class or average).
report = classification_report(y_test, y_pred, output_dict = True)
cr = pd.DataFrame(report).transpose()
print(cr)

              precision    recall  f1-score     support
fatty          0.781513  0.768595  0.775000  121.000000
normal         0.764706  0.777778  0.771186  117.000000
accuracy       0.773109  0.773109  0.773109    0.773109
macro avg      0.773109  0.773186  0.773093  238.000000
weighted avg   0.773250  0.773109  0.773125  238.000000


In [60]:
# Shallow decision tree on the random split of the pooled ROI table.
dtc = DecisionTreeClassifier(random_state = 42, criterion = 'entropy', max_depth = 2, max_features = 'log2', splitter = 'best')

# Fit once; the original refit the same estimator after predicting, which
# only repeated the training work.
dtc = dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

# Per-class precision / recall / f1 as a table (one row per class or average).
report = classification_report(y_test, y_pred, output_dict = True)
cr = pd.DataFrame(report).transpose()
print(cr)

              precision    recall  f1-score     support
fatty          0.746667  0.462810  0.571429  121.000000
normal         0.601227  0.837607  0.700000  117.000000
accuracy       0.647059  0.647059  0.647059    0.647059
macro avg      0.673947  0.650208  0.635714  238.000000
weighted avg   0.675169  0.647059  0.634634  238.000000


In [61]:
# Multi-layer perceptron on the random split of the pooled ROI table.
# random_state is fixed so the reported scores are reproducible.
# NOTE(review): per the scikit-learn docs `momentum` is only used by
# solver='sgd', so it has no effect with 'adam' -- confirm the intended solver.
mlpc = MLPClassifier(max_iter=300,
                    momentum=0.6,
                    solver='adam',
                    activation='relu',
                    learning_rate_init=0.005,
                    alpha=0.001,
                    random_state=42)

# Fit once. The original refit the unseeded network after predicting, so the
# `mlpc` left in the kernel was not the model the report describes.
mlpc = mlpc.fit(X_train, y_train)
y_pred = mlpc.predict(X_test)

# Per-class precision / recall / f1 as a table (one row per class or average).
report = classification_report(y_test, y_pred, output_dict = True)
cr = pd.DataFrame(report).transpose()
print(cr)

              precision    recall  f1-score     support
fatty          0.745902  0.752066  0.748971  121.000000
normal         0.741379  0.735043  0.738197  117.000000
accuracy       0.743697  0.743697  0.743697    0.743697
macro avg      0.743640  0.743554  0.743584  238.000000
weighted avg   0.743678  0.743697  0.743675  238.000000


In [62]:
# k-nearest neighbours (k=2, Minkowski distance with p=3) on the random split
# of the pooled ROI table.
knnc = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=10, n_neighbors=2, p=3,
                     weights='uniform')

# Fit once; the original refit the same estimator after predicting, which
# only repeated the work.
knnc = knnc.fit(X_train, y_train)
y_pred = knnc.predict(X_test)

# Per-class precision / recall / f1 as a table (one row per class or average).
report = classification_report(y_test, y_pred, output_dict = True)
cr = pd.DataFrame(report).transpose()
print(cr)

              precision    recall  f1-score     support
fatty          0.691358  0.925620  0.791519  121.000000
normal         0.881579  0.572650  0.694301  117.000000
accuracy       0.752101  0.752101  0.752101    0.752101
macro avg      0.786468  0.749135  0.742910  238.000000
weighted avg   0.784870  0.752101  0.743727  238.000000


In [63]:
# Support vector classifier with default settings on the random split of the
# pooled ROI table.
clf = svm.SVC()

# Fit once; the original refit the same estimator after predicting, which
# only repeated the training work.
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class precision / recall / f1 as a table (one row per class or average).
report = classification_report(y_test, y_pred, output_dict = True)
cr = pd.DataFrame(report).transpose()
print(cr)

              precision    recall  f1-score     support
fatty          0.742188  0.785124  0.763052  121.000000
normal         0.763636  0.717949  0.740088  117.000000
accuracy       0.752101  0.752101  0.752101    0.752101
macro avg      0.752912  0.751536  0.751570  238.000000
weighted avg   0.752732  0.752101  0.751763  238.000000
