## Dataset Analysis

In [25]:
import os
import pandas as pd

Read extracted features dataset into a data frame

In [26]:
# Path of the extracted-features CSV, built from the current working directory
# (note: despite its name, `data_dir` points at the file, not at a directory)
data_dir = os.path.join(os.getcwd(), "data", "extracted_features.csv")
data_df = pd.read_csv(filepath_or_buffer=data_dir)

# Report the (rows, columns) shape of the freshly loaded frame
print('Data frame dimensions are:', data_df.shape)

Data frame dimensions are: (2734, 128)


### Preprocessing

Drop columns that have only 1 unique value

In [27]:
# Collect every column whose number of distinct values is exactly 1;
# such columns are constant across all rows and are dropped in the next cell.
drop_columns = [
    column for column in data_df.columns
    if data_df[column].nunique() == 1
]
count = len(drop_columns)

print("There are", count, "columns with unique values.")
print("These columns are:", drop_columns)

There are 13 columns with unique values.
These columns are: ['diagnostics_Versions_PyRadiomics', 'diagnostics_Versions_Numpy', 'diagnostics_Versions_SimpleITK', 'diagnostics_Versions_PyWavelet', 'diagnostics_Versions_Python', 'diagnostics_Configuration_Settings', 'diagnostics_Configuration_EnabledImageTypes', 'diagnostics_Image-original_Dimensionality', 'diagnostics_Image-original_Spacing', 'diagnostics_Image-original_Size', 'diagnostics_Image-original_Minimum', 'diagnostics_Mask-original_Spacing', 'diagnostics_Mask-original_Size']


In [28]:
# Remove the constant columns collected in `drop_columns`.
# The frame is rebound (no in-place mutation) and labels that are already absent
# are ignored, so re-running this cell after the columns are gone no longer
# raises a KeyError.
data_df = data_df.drop(columns=drop_columns, errors="ignore")

# Shape of data frame after the drop
print('New data frame dimensions are:', data_df.shape)

New data frame dimensions are: (2734, 115)


### Analysis

Remove all categorical (non-numeric) columns, as they are not needed for the numerical analysis that follows

In [29]:
# Columns stored with the generic "object" dtype are treated as categorical here
is_object_column = (data_df.dtypes == "object").to_numpy()
categorical_feats = data_df.columns[is_object_column]
categorical_column_names = categorical_feats.tolist()

print("Number of categorical features:", len(categorical_column_names))
print("Categorical column names:", categorical_column_names)

Number of categorical features: 7
Categorical column names: ['diagnostics_Image-original_Hash', 'diagnostics_Mask-original_Hash', 'diagnostics_Mask-original_BoundingBox', 'diagnostics_Mask-original_CenterOfMassIndex', 'diagnostics_Mask-original_CenterOfMass', 'name', 'provider']


In [30]:
# Build the analysis frame in a single step: drop the categorical columns
# together with the "class" label column, leaving `data_df` itself untouched.
df_analysis = data_df.drop(columns=[*categorical_column_names, "class"])

Split data frame into 2 smaller data frames based on the image class

In [31]:
# Render floats with three decimals in every table displayed from here on
pd.set_option('display.float_format', lambda value: '%.3f' % value)

# `df_analysis` no longer has the "class" column, so the row masks are taken
# from `data_df`, which shares the same row index.
class_0_df = df_analysis.loc[data_df['class'].eq(0)]
class_1_df = df_analysis.loc[data_df['class'].eq(1)]

In [32]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
# of the class-0 data frame
class_0_df.describe()

Unnamed: 0,diagnostics_Image-original_Mean,diagnostics_Image-original_Maximum,diagnostics_Mask-original_VoxelNum,diagnostics_Mask-original_VolumeNum,original_shape2D_Elongation,original_shape2D_MajorAxisLength,original_shape2D_MaximumDiameter,original_shape2D_MeshSurface,original_shape2D_MinorAxisLength,original_shape2D_Perimeter,...,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength,patient
count,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,...,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0
mean,94.752,255.0,50523.693,1.176,0.663,295.315,312.015,50523.141,190.61,923.095,...,0.032,5.312,0.387,16.279,48.492,2926.117,15.751,0.034,0.066,965.213
std,27.413,0.0,56971.959,0.57,0.191,161.714,175.185,56971.981,121.929,555.934,...,0.032,0.529,0.093,43.649,71.143,54034.149,4.776,0.014,0.12,809.069
min,20.228,255.0,2.0,1.0,0.0,4.619,3.606,1.0,0.0,5.657,...,0.007,0.918,0.065,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,76.129,255.0,10580.0,1.0,0.546,158.527,164.508,10579.5,92.728,456.718,...,0.018,5.036,0.333,4.356,10.567,0.0,13.137,0.023,0.016,75.0
50%,93.43,255.0,30920.0,1.0,0.69,280.814,294.267,30920.0,163.87,846.441,...,0.025,5.371,0.38,7.225,24.93,0.001,15.551,0.033,0.033,692.0
75%,111.945,255.0,69037.0,1.0,0.804,409.267,427.293,69036.25,261.854,1272.645,...,0.037,5.653,0.441,14.001,55.388,0.002,18.331,0.042,0.075,1869.0
max,224.354,255.0,262143.0,8.0,1.0,748.147,723.371,262142.5,591.2,4339.164,...,0.625,6.94,1.0,847.99,575.792,1000000.0,41.251,0.145,2.744,2182.0


In [33]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
# of the second (class-1) data frame
class_1_df.describe()

Unnamed: 0,diagnostics_Image-original_Mean,diagnostics_Image-original_Maximum,diagnostics_Mask-original_VoxelNum,diagnostics_Mask-original_VolumeNum,original_shape2D_Elongation,original_shape2D_MajorAxisLength,original_shape2D_MaximumDiameter,original_shape2D_MeshSurface,original_shape2D_MinorAxisLength,original_shape2D_Perimeter,...,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength,patient
count,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,...,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0
mean,83.254,254.627,261121.0,1.0,1.0,590.051,721.956,261120.5,590.051,2042.828,...,0.066,5.41,0.334,82.65,390.679,1463.058,18.539,0.021,0.007,965.213
std,31.695,9.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.033,0.571,0.106,1060.684,131.252,38235.935,5.964,0.013,0.009,809.069
min,0.0,0.0,261121.0,1.0,1.0,590.051,721.956,261120.5,590.051,2042.828,...,0.0,-0.0,0.002,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,60.414,255.0,261121.0,1.0,1.0,590.051,721.956,261120.5,590.051,2042.828,...,0.041,5.043,0.261,6.346,304.319,0.0,14.195,0.01,0.004,75.0
50%,79.691,255.0,261121.0,1.0,1.0,590.051,721.956,261120.5,590.051,2042.828,...,0.06,5.449,0.326,13.707,388.584,0.0,17.797,0.018,0.005,692.0
75%,100.869,255.0,261121.0,1.0,1.0,590.051,721.956,261120.5,590.051,2042.828,...,0.088,5.796,0.395,31.828,469.678,0.0,22.315,0.029,0.007,1869.0
max,248.045,255.0,261121.0,1.0,1.0,590.051,721.956,261120.5,590.051,2042.828,...,0.232,7.725,0.682,29501.387,981.509,1000000.0,43.676,0.085,0.263,2182.0


Calculate means for every column of both data frames in order to compare them

In [34]:
# Per-column means of each class, taken from the "mean" row of describe()
df0_mean = class_0_df.describe().loc['mean']
df1_mean = class_1_df.describe().loc['mean']

# Both Series are indexed by feature name. The 'Column Name' column is derived
# from that shared index instead of being attached positionally from
# df_analysis.columns, so a label can never be paired with the wrong mean
# (and no length mismatch occurs if describe() skips a column).
mean_df = (
    pd.DataFrame({'mean_0': df0_mean, 'mean_1': df1_mean})
    .rename_axis('Column Name')
    .reset_index()
)

In [35]:
# Initialize the column with float zeros: the values written below are floats,
# and assigning floats into an integer column relies on a silent upcast that
# pandas has deprecated.
mean_df['Difference_%'] = 0.0

# NOTE: despite the column name, the stored value is the ratio of the larger
# class mean to the smaller one, expressed in percent (e.g. 200 means "twice
# as large"). The absolute value is taken because some means are negative.
# Rows where both means are equal keep the initial 0.0; a zero mean in the
# denominator yields inf.
condition = mean_df['mean_1'] > mean_df['mean_0']
mean_df.loc[condition, 'Difference_%'] = (mean_df['mean_1'] / mean_df['mean_0'] * 100).abs()

condition = mean_df['mean_0'] > mean_df['mean_1']
mean_df.loc[condition, 'Difference_%'] = (mean_df['mean_0'] / mean_df['mean_1'] * 100).abs()

In [36]:
# Rank features by 'Difference_%' (largest first) and preview the top 30 rows
sorted_mean_df = mean_df.sort_values(by='Difference_%', ascending=False)
sorted_mean_df.iloc[:30]

Unnamed: 0,Column Name,mean_0,mean_1,Difference_%
23,original_firstorder_Minimum,14.126,0.014,101631.579
91,original_glszm_LargeAreaLowGrayLevelEmphasis,2.985,412.895,13833.479
75,original_glrlm_LongRunLowGrayLevelEmphasis,2.99,412.895,13808.408
89,original_glszm_LargeAreaEmphasis,24.856,485.771,1954.319
73,original_glrlm_LongRunEmphasis,24.909,485.771,1950.202
105,original_ngtdm_Strength,0.066,0.007,1011.617
101,original_ngtdm_Busyness,48.492,390.679,805.649
27,original_firstorder_Skewness,-0.098,0.778,791.064
59,original_gldm_GrayLevelNonUniformity,7338.563,52882.042,720.605
10,original_shape2D_PerimeterSurfaceRatio,0.049,0.008,630.271
