# Feature Extraction
## Importing Libraries

In [1]:
import cv2 as cv
import numpy as np
import os
from skimage.feature.texture import greycomatrix
from skimage.feature.texture import greycoprops
from skimage.measure import shannon_entropy
import pyfeats
import pandas as pd
import multiprocessing as mlp
import math
import feature_extraction as fe

## Define Feature Extraction functions

### Read dataset images

In [2]:
def read_images(folder="dataset/train",
                classes=(
                    "normal",
                    "fatty",
                    # "cirrhosis" is currently left out of the default classes
                )):
    """Load every readable image of the requested classes as grayscale.

    Args:
        folder: Dataset root containing one sub-directory per class.
        classes: Names of the class sub-directories to read.

    Returns:
        A list of ``(image, class_name)`` tuples, grouped by class in the
        order given by ``classes`` and sorted by file name within a class.
        Files that OpenCV cannot decode are left out.
    """
    images = []
    for cls in classes:
        class_dir = f'{folder}/{cls}'
        # os.listdir returns names in arbitrary order; sort them so that
        # repeated runs produce the same sequence of images.
        for name in sorted(os.listdir(class_dir)):
            img = cv.imread(f'{class_dir}/{name}', cv.IMREAD_GRAYSCALE)
            # cv.imread signals an unreadable file by returning None instead
            # of raising; skip it so later array slicing does not crash.
            if img is None:
                continue
            images.append((img, cls))
    return images

### Extract ROIs from image

In [3]:
def extract_roi(img, start, size=(32, 32)):
    """Cut a rectangular region out of ``img`` and build a matching mask.

    Args:
        img: Array to crop, indexed as ``img[row, column]``.
        start: ``(row, column)`` of the region's first corner.
        size: ``(rows, columns)`` extent of the region.

    Returns:
        ``(roi, mask)``: ``roi`` is the slice of ``img`` covering the region
        (smaller than ``size`` if the region runs past the array edge) and
        ``mask`` is a zero array of ``img.shape`` holding ones in the region.
    """
    first_row, first_col = start[0], start[1]
    # One pair of slices drives both the crop and the mask so they always
    # describe the same window.
    window = (
        slice(first_row, first_row + size[0]),
        slice(first_col, first_col + size[1]),
    )
    mask = np.zeros(img.shape)
    mask[window] = 1
    return img[window], mask

### Extract Features from ROIs

In [9]:
def feature_extraction(img, roi_pos=(
        (160, 230),
        (118, 224),
        (241, 151),
        (120, 420),
        (170, 300),
        (400, 200),
        (300, 120),
        (240, 240),
        (360, 160),
    )):
    """Compute texture features for each usable ROI of a grayscale image.

    For every position in ``roi_pos`` a 32x32 ROI is cropped. ROIs that are
    clipped by the image border or contain a zero-valued pixel are skipped.
    For each remaining ROI the GLCM contrast, homogeneity, energy and
    correlation (distances 1-3, angles 0/45/90/135 degrees), the Shannon
    entropy, and the pyfeats GLRLM features are collected.

    Args:
        img: 2-D grayscale image array.
        roi_pos: Iterable of ``(row, column)`` ROI start positions.

    Returns:
        A list with one ``{feature_name: value}`` dict per usable ROI. The
        list is empty if no ROI qualifies.
    """
    roi_size = (32, 32)

    # An image with a zero-length axis cannot contain a full ROI.
    if 0 in img.shape:
        return []

    distances = [1, 2, 3]
    # 0, 45, 90 and 135 degrees
    angles = [0, np.pi / 4, np.pi / 2, 3 * np.pi / 4]
    angle_names = ("0", "45", "90", "135")
    # greycoprops returns a (distance, angle) array, so the flattened values
    # run d1_0, d1_45, d1_90, d1_135, d2_0, ... in exactly this order.
    offset_names = [f"d{d}_{a}" for d in distances for a in angle_names]
    glcm_props = ("contrast", "homogeneity", "energy", "correlation")

    feat_arr = []
    for pos in roi_pos:
        roi, mask = extract_roi(img, pos, roi_size)
        # Skip ROIs clipped by the image border or touching zero pixels.
        if roi.shape != roi_size or 0 in roi:
            continue

        features = {}

        # NOTE(review): newer scikit-image releases rename these functions to
        # graycomatrix/graycoprops -- confirm against the installed version.
        glcm_mtx = greycomatrix(roi, distances=distances, angles=angles,
                                levels=256)
        prop_values = {
            prop: greycoprops(glcm_mtx, prop).flatten() for prop in glcm_props
        }
        for j, offset in enumerate(offset_names):
            for prop in glcm_props:
                features[f'{prop}_{offset}'] = prop_values[prop][j]

        features['entropy'] = shannon_entropy(roi)

        # pyfeats takes the full image plus a mask selecting the ROI.
        feat, labels = pyfeats.glrlm_features(img, mask, 256)
        features.update(zip(labels, feat))

        feat_arr.append(features)

    return feat_arr

### Construct dataframe from ROI features

In [5]:
def build_dataframe(images):
    """Build a feature table with one row per usable ROI.

    Args:
        images: Iterable of ``(image, class_name)`` pairs.

    Returns:
        A ``pandas.DataFrame`` with one column per feature name returned by
        ``feature_extraction`` plus a ``target`` column holding the class
        name. The frame is empty when no ROI yields features.
    """
    rows = []
    for img, cls in images:
        for feats in feature_extraction(img):
            rows.append({**feats, 'target': cls})
    # Build the frame once: appending row by row copies the whole frame on
    # every step, and DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(rows)

### Construct dataframe using multiprocessing
### Reduced runtime by 82%

In [6]:
def build_with_mlp(images, n=9):
    """Extract features in parallel across ``n`` worker processes.

    The images are split into ``n`` contiguous, near-equal chunks and each
    chunk is handed to ``fe.build_dataframe`` in a process pool.

    Args:
        images: Sequence of ``(image, class_name)`` pairs.
        n: Number of worker processes and of chunks.

    Returns:
        A list of ``n`` DataFrames, one per chunk, in chunk order. Chunks may
        be empty when there are fewer images than workers.
    """
    images = list(images)
    # Split with plain slicing: converting a list of (ndarray, str) pairs to
    # a NumPy array yields a ragged object array, which warns on older NumPy
    # and raises on recent releases. The first `extra` chunks get one more
    # item, matching np.array_split's chunk sizes.
    base, extra = divmod(len(images), n)
    chunks = []
    begin = 0
    for i in range(n):
        end = begin + base + (1 if i < extra else 0)
        chunks.append(images[begin:end])
        begin = end

    # The context manager releases the worker processes once map() returns.
    # NOTE(review): the worker is the module-level fe.build_dataframe,
    # presumably so child processes can import it -- confirm.
    with mlp.Pool(n) as pool:
        return pool.map(fe.build_dataframe, chunks)

## Feature Analysis and Selection

### Extract Features and build dataframe

In [7]:
%%time

# Load the training images and extract their features in parallel.
images = fe.read_images()
mlp_data = build_with_mlp(images)
# Join the per-worker frames in one pass; DataFrame.append in a loop copies
# the accumulated frame each time and is gone in pandas 2.x.
data = pd.concat(mlp_data)

data.to_csv("dataset/train.csv", index=False)

# To reuse previously extracted features instead of recomputing them:
# data = pd.read_csv('dataset/train.csv')

data.describe()

  return array(a, dtype, copy=False, order=order)


Wall time: 3min 41s


Unnamed: 0,GLRLM_GrayLevelNo-Uniformity,GLRLM_HighGrayLevelRunEmphasis,GLRLM_LongRunEmphasis,GLRLM_LongRunHighGrayLevelEmphasis,GLRLM_LongRunLowGrayLevelEmphasis,GLRLM_LowGrayLevelRunEmphasis,GLRLM_RunLengthNonUniformity,GLRLM_RunPercentage,GLRLM_Short owGrayLevelEmphasis,GLRLM_ShortRunEmphasis,...,homogeneity_d1_45,homogeneity_d1_90,homogeneity_d2_0,homogeneity_d2_135,homogeneity_d2_45,homogeneity_d2_90,homogeneity_d3_0,homogeneity_d3_135,homogeneity_d3_45,homogeneity_d3_90
count,538.0,538.0,538.0,538.0,538.0,538.0,538.0,538.0,538.0,538.0,...,538.0,538.0,538.0,538.0,538.0,538.0,538.0,538.0,538.0,538.0
mean,1925.232105,5774.272061,1.503124,10155.578381,1.0,25185750000000.0,121919.002542,0.843519,25185750000000.0,0.94132,...,0.172105,0.147955,0.15737,0.134351,0.172105,0.108392,0.125869,0.107984,0.118957,0.10362
std,524.995394,2437.493587,0.321007,2859.686127,0.0,40624640000000.0,19505.427337,0.14085,40624640000000.0,0.024265,...,0.099986,0.080487,0.092244,0.073891,0.099986,0.067247,0.075835,0.068726,0.07277,0.069491
min,1078.202338,1464.268635,1.284691,4608.793048,1.0,1240330000000.0,102848.447946,0.707023,1240330000000.0,0.886683,...,0.054134,0.059946,0.053075,0.049359,0.054134,0.039637,0.043536,0.034124,0.037732,0.031738
25%,1500.073308,3733.833612,1.304044,7717.184507,1.0,1334575000000.0,104618.309135,0.711475,1334575000000.0,0.922127,...,0.099556,0.090622,0.084655,0.073377,0.099556,0.062077,0.070193,0.058787,0.068558,0.056954
50%,1865.672812,5520.22913,1.336752,9688.416212,1.0,6943478000000.0,105921.335451,0.715179,6943478000000.0,0.959842,...,0.143184,0.115324,0.119014,0.118352,0.143184,0.080068,0.090022,0.088096,0.093659,0.074901
75%,2382.670557,7578.758296,1.559836,12182.917893,1.0,30358610000000.0,136428.268996,0.981161,30358610000000.0,0.962491,...,0.216489,0.188462,0.214921,0.173649,0.216489,0.135791,0.16866,0.132393,0.146854,0.130552
max,3590.511843,11461.201056,3.137791,18661.048823,1.0,295094000000000.0,157672.709448,1.079653,295094000000000.0,0.965812,...,0.576049,0.515312,0.565827,0.457325,0.576049,0.41744,0.480954,0.443175,0.436596,0.401612


In [11]:
%%time

# Load the test images and extract their features in parallel.
test_images = fe.read_images("dataset/test")
mlp_data = build_with_mlp(test_images)
# Join the per-worker frames in one pass; DataFrame.append in a loop copies
# the accumulated frame each time and is gone in pandas 2.x.
test_data = pd.concat(mlp_data)

test_data.to_csv("dataset/test.csv", index=False)

# To reuse previously extracted features instead of recomputing them:
# test_data = pd.read_csv('dataset/test.csv')

test_data.describe()

  return array(a, dtype, copy=False, order=order)


Wall time: 52.2 s


Unnamed: 0,GLRLM_GrayLevelNo-Uniformity,GLRLM_HighGrayLevelRunEmphasis,GLRLM_LongRunEmphasis,GLRLM_LongRunHighGrayLevelEmphasis,GLRLM_LongRunLowGrayLevelEmphasis,GLRLM_LowGrayLevelRunEmphasis,GLRLM_RunLengthNonUniformity,GLRLM_RunPercentage,GLRLM_Short owGrayLevelEmphasis,GLRLM_ShortRunEmphasis,...,homogeneity_d1_45,homogeneity_d1_90,homogeneity_d2_0,homogeneity_d2_135,homogeneity_d2_45,homogeneity_d2_90,homogeneity_d3_0,homogeneity_d3_135,homogeneity_d3_45,homogeneity_d3_90
count,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,...,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0
mean,1903.946076,5371.710352,1.472699,9556.628554,1.0,20626570000000.0,126918.040225,0.875497,20626570000000.0,0.939439,...,0.190564,0.161031,0.171177,0.145738,0.190564,0.121638,0.136812,0.120364,0.132531,0.117174
std,420.951375,1743.432747,0.180305,1912.487578,0.0,16911160000000.0,17877.257511,0.131548,16911160000000.0,0.019281,...,0.106736,0.078074,0.086657,0.068175,0.106736,0.06719,0.071393,0.06725,0.076055,0.069441
min,1061.252459,3914.446757,1.288211,7516.92609,1.0,1243992000000.0,103365.102908,0.707909,1243992000000.0,0.918801,...,0.066058,0.068746,0.058558,0.046886,0.066058,0.0444,0.046755,0.037139,0.040142,0.039434
25%,1688.037278,4088.271309,1.298737,8089.978417,1.0,1561496000000.0,105090.662445,0.712104,1561496000000.0,0.922239,...,0.103642,0.094917,0.09007,0.080329,0.103642,0.065961,0.073897,0.067267,0.072608,0.064385
50%,1865.239006,4790.254984,1.523177,8445.810661,1.0,28174210000000.0,135266.932348,0.947234,28174210000000.0,0.92584,...,0.168007,0.145468,0.160182,0.145143,0.168007,0.101065,0.124574,0.113933,0.109772,0.097475
75%,2223.748641,6595.428845,1.551953,11191.496754,1.0,29259530000000.0,140571.590221,0.986328,29259530000000.0,0.962412,...,0.254397,0.21796,0.228532,0.188054,0.254397,0.16064,0.180768,0.150438,0.172394,0.149905
max,2549.296693,10202.813696,1.943695,14619.311165,1.0,59270140000000.0,149313.519267,1.031691,59270140000000.0,0.964516,...,0.54898,0.406083,0.39453,0.359026,0.54898,0.373591,0.35871,0.369174,0.402374,0.381117
