## Dataset Analysis

In [1]:
import os
import pandas as pd

Read extracted features dataset into a data frame

In [2]:
# Locate the extracted-features CSV inside the "data" folder of the current
# working directory (note: data_dir ends up holding the CSV file path).
data_folder = os.path.join(os.getcwd(), "data")
data_dir = os.path.join(data_folder, "extracted_features.csv")

# Load the CSV into a data frame
data_df = pd.read_csv(data_dir)

# Report (rows, columns) of the freshly loaded frame
rows_and_columns = data_df.shape
print('Data frame dimensions are:', rows_and_columns)

Data frame dimensions are: (2734, 128)


### Preprocessing

Drop columns that have only 1 unique value

In [3]:
# Count the distinct values of every column in one vectorized pass.
# NOTE: nunique() ignores NaN by default, exactly like the per-column call.
distinct_per_column = data_df.nunique()

# Columns holding exactly one distinct value are constant across all rows.
is_constant = distinct_per_column.to_numpy() == 1
drop_columns = distinct_per_column.index[is_constant].tolist()
count = len(drop_columns)

print("There are", count, "columns with unique values.")
print("These columns are:", drop_columns)

There are 13 columns with unique values.
These columns are: ['diagnostics_Versions_PyRadiomics', 'diagnostics_Versions_Numpy', 'diagnostics_Versions_SimpleITK', 'diagnostics_Versions_PyWavelet', 'diagnostics_Versions_Python', 'diagnostics_Configuration_Settings', 'diagnostics_Configuration_EnabledImageTypes', 'diagnostics_Image-original_Dimensionality', 'diagnostics_Image-original_Spacing', 'diagnostics_Image-original_Size', 'diagnostics_Image-original_Minimum', 'diagnostics_Mask-original_Spacing', 'diagnostics_Mask-original_Size']


In [4]:
# Remove the constant columns collected in drop_columns.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: running it a second time no longer raises a KeyError for
# columns that were already removed by the first run.
data_df = data_df.drop(columns=drop_columns, errors="ignore")

# Shape of data frame
print('New data frame dimensions are:', data_df.shape)

New data frame dimensions are: (2734, 115)


### Analysis

Remove all categorical (non-numeric) columns, as they are not needed for the numerical analysis that follows

In [5]:
# Columns stored with the generic "object" dtype are treated as categorical.
is_object_column = data_df.dtypes.eq("object")
categorical_feats = data_df.columns[is_object_column.to_numpy()]
categorical_column_names = categorical_feats.tolist()

print("Number of categorical features:", len(categorical_column_names))
print("Categorical column names:", categorical_column_names)

Number of categorical features: 7
Categorical column names: ['diagnostics_Image-original_Hash', 'diagnostics_Mask-original_Hash', 'diagnostics_Mask-original_BoundingBox', 'diagnostics_Mask-original_CenterOfMassIndex', 'diagnostics_Mask-original_CenterOfMass', 'name', 'provider']


In [6]:
# Build the numeric analysis frame in a single step: drop every categorical
# column and the "class" label column from a copy of data_df.
columns_to_exclude = [*categorical_column_names, "class"]
df_analysis = data_df.drop(columns=columns_to_exclude)

Split data frame into 2 smaller data frames based on the image class

In [7]:
# Render floats with five decimal places in all subsequent data frame output
pd.set_option('display.float_format', lambda value: '%.5f' % value)

# The labels are read from data_df because df_analysis no longer contains the
# 'class' column; both frames share the same row index, so the masks line up.
class_labels = data_df['class']
class_0_df = df_analysis.loc[class_labels == 0]
class_1_df = df_analysis.loc[class_labels == 1]

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of every column
# for the class 0 data frame
class_0_df.describe()

Unnamed: 0,diagnostics_Image-original_Mean,diagnostics_Image-original_Maximum,diagnostics_Mask-original_VoxelNum,diagnostics_Mask-original_VolumeNum,original_shape2D_Elongation,original_shape2D_MajorAxisLength,original_shape2D_MaximumDiameter,original_shape2D_MeshSurface,original_shape2D_MinorAxisLength,original_shape2D_Perimeter,...,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength,patient
count,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,...,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0
mean,94.75173,255.0,50523.69276,1.1763,0.66297,295.31467,312.0153,50523.14082,190.6098,923.09483,...,0.03239,5.31181,0.38721,16.27916,48.49247,2926.11713,15.75089,0.03372,0.06578,965.21287
std,27.41308,0.0,56971.95886,0.57045,0.19125,161.71384,175.18532,56971.98106,121.92897,555.93381,...,0.03188,0.52912,0.09348,43.64894,71.14318,54034.14899,4.7758,0.01439,0.11986,809.06866
min,20.2276,255.0,2.0,1.0,0.0,4.6188,3.60555,1.0,0.0,5.65685,...,0.00745,0.9183,0.06507,0.0,0.0,4e-05,0.0,0.0,0.0,1.0
25%,76.12902,255.0,10580.0,1.0,0.54619,158.52743,164.5083,10579.5,92.72815,456.71782,...,0.01839,5.03637,0.33275,4.35647,10.56728,0.00035,13.13699,0.02307,0.01558,75.0
50%,93.43042,255.0,30920.0,1.0,0.6899,280.81359,294.26689,30920.0,163.86956,846.44069,...,0.02527,5.37098,0.38007,7.22535,24.9301,0.00071,15.55078,0.03267,0.03306,692.0
75%,111.94511,255.0,69037.0,1.0,0.80404,409.26679,427.29306,69036.25,261.85411,1272.64527,...,0.03731,5.6531,0.44122,14.001,55.38842,0.00175,18.3309,0.0424,0.07469,1869.0
max,224.35402,255.0,262143.0,8.0,1.0,748.14703,723.37058,262142.5,591.19994,4339.16392,...,0.625,6.9402,1.0,847.98978,575.79211,1000000.0,41.25065,0.14526,2.74366,2182.0


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of every column
# for the class 1 data frame
class_1_df.describe()

Unnamed: 0,diagnostics_Image-original_Mean,diagnostics_Image-original_Maximum,diagnostics_Mask-original_VoxelNum,diagnostics_Mask-original_VolumeNum,original_shape2D_Elongation,original_shape2D_MajorAxisLength,original_shape2D_MaximumDiameter,original_shape2D_MeshSurface,original_shape2D_MinorAxisLength,original_shape2D_Perimeter,...,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength,patient
count,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,...,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0,1367.0
mean,83.25425,254.62692,261121.0,1.0,1.0,590.05085,721.95637,261120.5,590.05085,2042.82843,...,0.06628,5.40985,0.33383,82.6497,390.67918,1463.05788,18.53905,0.02082,0.0065,965.21287
std,31.6952,9.75016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.03348,0.5712,0.10594,1060.684,131.25159,38235.93517,5.96445,0.01303,0.00928,809.06866
min,0.0,0.0,261121.0,1.0,1.0,590.05085,721.95637,261120.5,590.05085,2042.82843,...,0.0,-0.0,0.00196,0.0,0.0,3e-05,0.0,0.0,0.0,1.0
25%,60.41353,255.0,261121.0,1.0,1.0,590.05085,721.95637,261120.5,590.05085,2042.82843,...,0.04082,5.0425,0.26107,6.346,304.31895,5e-05,14.19498,0.01044,0.00392,75.0
50%,79.69069,255.0,261121.0,1.0,1.0,590.05085,721.95637,261120.5,590.05085,2042.82843,...,0.05964,5.44872,0.32645,13.70706,388.58358,7e-05,17.79668,0.0182,0.0053,692.0
75%,100.86866,255.0,261121.0,1.0,1.0,590.05085,721.95637,261120.5,590.05085,2042.82843,...,0.08752,5.79627,0.39544,31.82816,469.67781,0.0001,22.31549,0.02885,0.00737,1869.0
max,248.04543,255.0,261121.0,1.0,1.0,590.05085,721.95637,261120.5,590.05085,2042.82843,...,0.23189,7.72474,0.68178,29501.38738,981.50908,1000000.0,43.67643,0.08535,0.26293,2182.0


Calculate means for every column of both data frames in order to compare them

In [10]:
# Per-column means of each class, read from the 'mean' row of describe()
df0_mean = class_0_df.describe().loc['mean']
df1_mean = class_1_df.describe().loc['mean']

# Place the two mean series side by side (aligned on the feature name), put the
# feature names in a leading 'Column Name' column, and switch to a 0..n-1 index.
mean_df = pd.concat(
    [df0_mean.rename('mean_0'), df1_mean.rename('mean_1')],
    axis=1,
)
mean_df.insert(0, 'Column Name', df_analysis.columns)
mean_df = mean_df.reset_index(drop=True)

In [11]:
# 'Difference_%' is the ratio of the larger-magnitude class mean to the
# smaller-magnitude one, expressed as a percentage (200.0 means one class mean
# is twice the other in absolute value).
#
# The means can be negative, so both the choice of numerator/denominator and
# the ratio itself are based on absolute values. Choosing the direction by a
# signed comparison would produce ratios below 100 for some columns whose
# means have opposite signs or are both negative.
abs_mean_0 = mean_df['mean_0'].abs()
abs_mean_1 = mean_df['mean_1'].abs()

class_0_is_larger = abs_mean_0 >= abs_mean_1
larger_mean = abs_mean_0.where(class_0_is_larger, abs_mean_1)
smaller_mean = abs_mean_1.where(class_0_is_larger, abs_mean_0)

# Columns whose two means are identical keep the value 0.0. A zero mean in
# only one class gives inf, and a missing mean propagates as NaN.
means_differ = mean_df['mean_0'].ne(mean_df['mean_1'])

# Assigning the whole column at once keeps it float-typed from the start
# (no integer column that later has to be upcast).
mean_df['Difference_%'] = (larger_mean / smaller_mean * 100).where(means_differ, 0.0)

In [12]:
# Rank features from the largest to the smallest ratio between class means
sorted_mean_df = mean_df.sort_values(by='Difference_%', ascending=False)

# Display the 30 highest-ranked features
sorted_mean_df.head(n=30)

Unnamed: 0,Column Name,mean_0,mean_1,Difference_%
23,original_firstorder_Minimum,14.12582,0.0139,101631.57895
91,original_glszm_LargeAreaLowGrayLevelEmphasis,2.98475,412.89468,13833.47887
75,original_glrlm_LongRunLowGrayLevelEmphasis,2.99017,412.89468,13808.40805
89,original_glszm_LargeAreaEmphasis,24.85629,485.77108,1954.31862
73,original_glrlm_LongRunEmphasis,24.90876,485.77108,1950.20189
105,original_ngtdm_Strength,0.06578,0.0065,1011.61688
101,original_ngtdm_Busyness,48.49247,390.67918,805.64922
27,original_firstorder_Skewness,-0.09831,0.77773,791.06352
59,original_gldm_GrayLevelNonUniformity,7338.56317,52882.04248,720.60485
10,original_shape2D_PerimeterSurfaceRatio,0.04931,0.00782,630.27097
