# Feature extraction

### 1.) Top correlations

In [1]:
import pandas as pd 
# Load the normalized measurement table that the following cells work on.
measurements_csv = 'all_measurements_normalized_image.csv'
table = pd.read_csv(measurements_csv)

In [2]:
# `table` still contains the string-valued 'filename' column at this point, and
# a correlation is only defined for numeric data. Selecting the numeric/bool
# columns explicitly keeps this cell working regardless of whether the
# installed pandas rejects or silently skips non-numeric columns in corr().
table.select_dtypes(include=['number', 'bool']).corr().to_csv('correlation_matrix.csv')

In [3]:
# Remove the 'filename' column before the correlation analysis below.
to_drop = ["filename"]
table_shortened = table.drop(columns=to_drop)

In [8]:
def get_redundant_pairs(table_shortened):
    '''Collect the diagonal and lower-triangular column pairs of the correlation matrix.

    For every column position i, the pair (columns[i], columns[j]) is included
    for each j <= i, i.e. each self-pair plus one orientation of every
    two-column combination.

    Returns a set of (column, column) tuples.
    '''
    column_names = table_shortened.columns
    n_columns = table_shortened.shape[1]
    return {
        (column_names[row], column_names[col])
        for row in range(n_columns)
        for col in range(row + 1)
    }

def get_top_abs_correlations(table_shortened, n=5):
    '''Return the n largest absolute pairwise correlations between columns.

    Parameters
    ----------
    table_shortened : pandas.DataFrame
        Table whose columns are correlated. Columns that are neither numeric
        nor boolean are ignored.
    n : int, default 5
        Number of top pairs to return.

    Returns
    -------
    pandas.Series
        Absolute correlation values sorted in descending order and indexed by
        a (column, column) MultiIndex. Self-pairs and mirrored duplicates are
        removed, so every column pair appears at most once.
    '''
    # Correlations are only defined for numeric data. Filtering first keeps
    # the correlation matrix and the list of redundant pairs in sync;
    # otherwise a non-numeric column either makes corr() fail or makes
    # drop() fail on pair labels that are absent from the matrix.
    numeric = table_shortened.select_dtypes(include=['number', 'bool'])
    au_corr = numeric.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(numeric)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    # Explicit positional slice: the index holds labels, not positions.
    return au_corr.iloc[:n]

print("Top Absolute Correlations")
# Take the 56 strongest pairs, store them in a one-column table named
# 'correlation', and export that table to an Excel sheet.
strongest_pairs = get_top_abs_correlations(table_shortened, 56)
top_correlations = pd.DataFrame({'correlation': strongest_pairs})
top_correlations.to_excel("top_correlations.xlsx", sheet_name='Sheet_name_1')

Top Absolute Correlations


### 2.) Automated correlation filtering

In [9]:
from library_crystals import correlation_filter

In [10]:
# Run the project's correlation filter on the measurement CSV.
# NOTE(review): the second argument is presumably the output path for the
# filtered table - confirm against library_crystals.correlation_filter.
input_csv = 'all_measurements_normalized_image.csv'
filtered_csv = 'correlation_filtered_normalized.csv'
correlation_filter(input_csv, filtered_csv)


['bbox_area', 'convex_area', 'equivalent_diameter', 'perimeter_skimage', 'perimeter_crofton', 'feret_diameter_max', 'major_axis_length', 'standard_deviation_intensity', 'maximum', 'mean', 'minimum', 'sigma', 'elongation', 'feret_diameter', 'flatness', 'roundness_simpleitk', 'equivalent_ellipsoid_diameter_0', 'equivalent_ellipsoid_diameter_1', 'equivalent_spherical_perimeter', 'equivalent_spherical_radius', 'number_of_pixels', 'perimeter_simpleitk', 'perimeter_on_border']
Index(['Unnamed: 0', 'label', 'area', 'max_intensity', 'mean_intensity',
       'min_intensity', 'solidity', 'minor_axis_length', 'eccentricity',
       'aspect_ratio', 'roundness_skimage', 'circularity', 'median', 'sum',
       'variance', 'number_of_pixels_on_border', 'perimeter_on_border_ratio',
       'filename'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,label,area,max_intensity,mean_intensity,min_intensity,solidity,minor_axis_length,eccentricity,aspect_ratio,roundness_skimage,circularity,median,sum,variance,number_of_pixels_on_border,perimeter_on_border_ratio,filename
0,0,1,1031,6.700000,2.412512,0.000000,0.917260,21.988822,0.931903,2.757047,0.357171,0.589689,2.300977,2487.300000,1.715426,0,0.000000,img01_20220429_ET358_56hpf_1.tif
1,1,2,716,3.000000,1.067877,0.000000,0.837427,16.007419,0.965269,3.827640,0.242839,0.298642,0.986133,764.600000,0.265792,0,0.000000,img01_20220429_ET358_56hpf_1.tif
2,2,3,2629,11.000000,2.758273,0.000000,0.738898,61.312215,0.495087,1.150955,0.672187,0.356395,2.181445,7251.500000,3.731877,0,0.000000,img01_20220429_ET358_56hpf_1.tif
3,3,4,51,2.600000,1.239216,0.200000,0.980769,6.114710,0.828159,1.784145,0.545592,0.973584,1.225195,63.200000,0.309631,10,0.370292,img01_20220429_ET358_56hpf_1.tif
4,4,5,61,2.600000,0.929508,0.100000,0.726190,6.594027,0.888910,2.183019,0.374821,0.682198,0.806836,56.700000,0.224448,0,0.000000,img01_20220429_ET358_56hpf_1.tif
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248,248,18,94,4.555556,0.990544,0.111111,0.817391,7.768466,0.889713,2.190479,0.413323,0.588574,0.906033,93.111111,0.352226,0,0.000000,img18_20220520_hsp-mKate-DN-dynactin_56hpf_5.tif
249,249,19,280,2.777778,0.936111,0.000000,0.723514,12.948729,0.926686,2.660733,0.300339,0.283033,0.906033,262.111111,0.211798,0,0.000000,img18_20220520_hsp-mKate-DN-dynactin_56hpf_5.tif
250,250,20,89,2.555556,1.018727,0.111111,0.773913,5.100794,0.977147,4.704469,0.196790,0.440556,0.978516,90.666667,0.239965,0,0.000000,img18_20220520_hsp-mKate-DN-dynactin_56hpf_5.tif
251,251,21,358,3.444444,1.021105,0.000000,0.624782,20.270777,0.806044,1.689601,0.388583,0.242783,0.978516,365.555556,0.259055,0,0.000000,img18_20220520_hsp-mKate-DN-dynactin_56hpf_5.tif


### 3.) Manual correlation filtering

#### 3.1) All columns that should be kept

In [36]:
# Manually chosen measurement columns, framed by 'label' and 'filename'.
columns_to_keep = [
    'label',
    'aspect_ratio', 'max_intensity', 'min_intensity', 'perimeter_skimage',
    'area', 'mean_intensity', 'major_axis_length', 'minor_axis_length',
    'circularity', 'solidity', 'eccentricity', 'roundness_skimage',
    'median', 'sum', 'variance',
    'perimeter_on_border', 'perimeter_on_border_ratio',
    'filename',
]
keep = table[columns_to_keep]
keep.to_csv('size_shape_intensity.csv')

In [13]:
# Display the names of the columns that were kept.
keep.columns

Index(['label', 'aspect_ratio', 'max_intensity', 'min_intensity',
       'perimeter_skimage', 'area', 'mean_intensity', 'major_axis_length',
       'minor_axis_length', 'circularity', 'solidity', 'eccentricity',
       'roundness_skimage', 'median', 'sum', 'variance', 'perimeter_on_border',
       'perimeter_on_border_ratio', 'filename'],
      dtype='object')

#### 3.2) Intensity-, size- and shape-based table

In [17]:
# Split the kept columns into intensity, size and shape tables. Each table
# carries 'label' and 'filename', which the later cells use as merge keys.
intensity_measurements = [
    'max_intensity', 'mean_intensity', 'min_intensity',
    'median', 'sum', 'variance',
]
shape_measurements = [
    'aspect_ratio', 'perimeter_skimage', 'major_axis_length',
    'minor_axis_length', 'circularity', 'solidity', 'eccentricity',
    'roundness_skimage', 'perimeter_on_border', 'perimeter_on_border_ratio',
]
df_intensity = keep[['label'] + intensity_measurements + ['filename']]
df_size = keep[['label', 'area', 'filename']]
df_shape = keep[['label'] + shape_measurements + ['filename']]

In [31]:
# Write each single-category table to its own CSV file.
for frame, csv_name in (
    (df_intensity, 'intensity.csv'),
    (df_shape, 'shape.csv'),
    (df_size, 'size.csv'),
):
    frame.to_csv(csv_name)

#### 3.3) Size-intensity-based table

In [32]:
# Join size and intensity measurements on the shared key columns.
df_size_intensity = pd.merge(df_size, df_intensity, on=('label', 'filename'))
# The merge leaves 'filename' in the middle; move it back to the last column.
measurement_columns = [name for name in df_size_intensity.columns if name != 'filename']
df_size_intensity_ordered = df_size_intensity[measurement_columns + ['filename']]
df_size_intensity_ordered.to_csv('size_intensity.csv')

#### 3.4) Size-shape-based table

In [33]:
# Join size and shape measurements on the shared key columns.
df_size_shape = pd.merge(df_size, df_shape, on=('label', 'filename'))
# The merge leaves 'filename' in the middle; move it back to the last column.
measurement_columns = [name for name in df_size_shape.columns if name != 'filename']
df_size_shape_ordered = df_size_shape[measurement_columns + ['filename']]
df_size_shape_ordered.to_csv('size_shape.csv')

#### 3.5) Shape-intensity-based table

In [35]:
# Join shape and intensity measurements on the shared key columns.
df_shape_intensity = pd.merge(df_shape, df_intensity, on=('label', 'filename'))
# The merge leaves 'filename' in the middle; move it back to the last column.
measurement_columns = [name for name in df_shape_intensity.columns if name != 'filename']
df_shape_intensity_ordered = df_shape_intensity[measurement_columns + ['filename']]
df_shape_intensity_ordered.to_csv('shape_intensity.csv')

#### 3.6) Resulting function for column selection

In [None]:
def _filename_last(frame):
    '''Return `frame` with its 'filename' column moved to the last position.'''
    other_columns = [name for name in frame.columns if name != 'filename']
    return frame[other_columns + ['filename']]


def column_selection(table, output_dir='.'):
    '''
    Select the manually chosen measurements from `table` and save them as CSV
    files in different combinations of size, shape and intensity measurements.

    Files written (each starts with 'label' and ends with 'filename'; the row
    index is written as well, which is the pandas default):

    - size_shape_intensity.csv : all selected measurements
    - intensity.csv, size.csv, shape.csv : one measurement category each
    - size_intensity.csv, size_shape.csv, shape_intensity.csv : pairwise
      combinations, joined on 'label' and 'filename'

    Parameters
    ----------
    table : pandas.DataFrame
        Measurement table containing at least the selected columns.
    output_dir : str, default '.'
        Directory the CSV files are written to; created if it does not exist.
        The default writes into the current working directory.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If `table` lacks one of the selected columns.
    '''
    import os  # local import: this notebook only imports pandas at the top

    key_first, key_last = ['label'], ['filename']
    intensity_columns = ['max_intensity', 'mean_intensity', 'min_intensity',
                         'median', 'sum', 'variance']
    size_columns = ['area']
    shape_columns = ['aspect_ratio', 'perimeter_skimage', 'major_axis_length',
                     'minor_axis_length', 'circularity', 'solidity', 'eccentricity',
                     'roundness_skimage', 'perimeter_on_border',
                     'perimeter_on_border_ratio']
    merge_keys = ['label', 'filename']

    os.makedirs(output_dir, exist_ok=True)

    def save(frame, csv_name):
        frame.to_csv(os.path.join(output_dir, csv_name))

    # correlation-filtered table with intensity, shape and size measurements
    keep = table[['label', 'aspect_ratio', 'max_intensity', 'min_intensity',
                  'perimeter_skimage', 'area', 'mean_intensity', 'major_axis_length',
                  'minor_axis_length', 'circularity', 'solidity', 'eccentricity',
                  'roundness_skimage', 'median', 'sum', 'variance',
                  'perimeter_on_border', 'perimeter_on_border_ratio', 'filename']]
    save(keep, 'size_shape_intensity.csv')

    # intensity table
    df_intensity = keep[key_first + intensity_columns + key_last]
    save(df_intensity, 'intensity.csv')

    # size table
    df_size = keep[key_first + size_columns + key_last]
    save(df_size, 'size.csv')

    # shape table
    df_shape = keep[key_first + shape_columns + key_last]
    save(df_shape, 'shape.csv')

    # Merging keeps the left table's 'filename' position, so it ends up in the
    # middle of each combined table; move it back to the end by name rather
    # than by hard-coded column positions.

    # size intensity table
    df_size_intensity = pd.merge(df_size, df_intensity, on=merge_keys)
    save(_filename_last(df_size_intensity), 'size_intensity.csv')

    # size shape table
    df_size_shape = pd.merge(df_size, df_shape, on=merge_keys)
    save(_filename_last(df_size_shape), 'size_shape.csv')

    # shape intensity table
    df_shape_intensity = pd.merge(df_shape, df_intensity, on=merge_keys)
    save(_filename_last(df_shape_intensity), 'shape_intensity.csv')