In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from glob import glob
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

##Import any other packages you may need here

EDA is open-ended, and it is up to you to decide how to look at different ways to slice and dice your data. A good starting point is to look at the requirements for the FDA documentation in the final part of this project to guide (some) of the analyses you do. 

This EDA should also help to inform you of how pneumonia looks in the wild. E.g. what other types of diseases it's commonly found with, how often it is found, what ages it affects, etc. 

Note that this NIH dataset was not specifically acquired for pneumonia. So, while this is a representation of 'pneumonia in the wild,' the prevalence of pneumonia may be different if you were to take only chest x-rays that were acquired in an ER setting with suspicion of pneumonia. 

Perform the following EDA:
* The patient demographic data such as gender, age, patient position,etc. (as it is available)
* The x-ray views taken (i.e. view position)
* The number of cases including: 
    * number of pneumonia cases,
    * number of non-pneumonia cases
* The distribution of other diseases that are comorbid with pneumonia
* Number of disease per patient 
* Pixel-level assessments of the imaging data for healthy & disease states of interest (e.g. histograms of intensity values) and compare distributions across diseases.

Note: use full NIH data to perform the first a few EDA items and use `sample_labels.csv` for the pixel-level assassements. 

Also, **describe your findings and how will you set up the model training based on the findings.**

In [2]:
## Below is some helper code to read data for you.
## Load NIH data
all_xray_df = pd.read_csv('/data/Data_Entry_2017.csv')
all_xray_df.sample(3)

## Load 'sample_labels.csv' data for pixel level assessments
sample_df = pd.read_csv('sample_labels.csv')
sample_df.sample(3)

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImageWidth,OriginalImageHeight,OriginalImagePixelSpacing_x,OriginalImagePixelSpacing_y
4178,00020482_061.png,Edema|Infiltration|Pneumonia,61,20482,029Y,F,AP,2500,2048,0.168,0.168
4067,00020065_008.png,No Finding,8,20065,064Y,M,PA,3056,2544,0.139,0.139
4853,00025514_008.png,Nodule,8,25514,059Y,M,PA,2992,2991,0.143,0.143


In [4]:
## EDA
# Todo 
# The patient demographic data such as gender, age, patient position,etc. (as it is available)
# The x-ray views taken (i.e. view position)
# The number of cases including:
# number of pneumonia cases,
# number of non-pneumonia cases
# The distribution of other diseases that are comorbid with pneumonia
# Number of disease per patient
# Pixel-level assessments of the imaging data for healthy & disease states of interest (e.g. histograms of intensity values) and compare distributions across diseases.

In [6]:
all_xray_df.to_csv('Data_Entry_2017.csv')

In [7]:
# The patient demographic data such as gender, age, patient position,etc. (as it is available)
all_xray_df.describe()

Unnamed: 0,Follow-up #,Patient ID,Patient Age,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
count,112120.0,112120.0,112120.0,112120.0,112120.0,112120.0,112120.0,0.0
mean,8.573751,14346.381743,46.901463,2646.078844,2486.438842,0.155649,0.155649,
std,15.40632,8403.876972,16.839923,341.246429,401.268227,0.016174,0.016174,
min,0.0,1.0,1.0,1143.0,966.0,0.115,0.115,
25%,0.0,7310.75,35.0,2500.0,2048.0,0.143,0.143,
50%,3.0,13993.0,49.0,2518.0,2544.0,0.143,0.143,
75%,10.0,20673.0,59.0,2992.0,2991.0,0.168,0.168,
max,183.0,30805.0,414.0,3827.0,4715.0,0.1988,0.1988,


In [9]:
# The x-ray views taken (i.e. view position)
all_xray_df['View Position'].unique()

array(['PA', 'AP'], dtype=object)

In [24]:
# The number of cases including:
print(f'There are {len(all_xray_df)} cases overall')
# number of pneumonia cases,
# find rows in `all_xray_df` which contain "Pneumonia" 
pneumonia_df = all_xray_df[all_xray_df['Finding Labels'].str.contains('Pneumonia', regex=False)]
print(f'There are {len(pneumonia_df)} Pneumonia cases overall')
# number of non-pneumonia cases
non_pneumonia_df= all_xray_df[~all_xray_df['Finding Labels'].str.contains('Pneumonia', regex=False)]
print(f'There are {len(non_pneumonia_df)} cases overall')

There are 112120 cases overall
There are 1431 Pneumonia cases overall
There are 110689 cases overall


In [35]:
# The distribution of other diseases that are comorbid with pneumonia
disease_dist_dct = {}
for label in pneumonia_df['Finding Labels']:
    # split the data based on | separator
    for disease in label.split('|'):
        if disease not in disease_dist_dct:
            disease_dist_dct[disease] = 0
        else:
            disease_dist_dct[disease] += 1
print(disease_dist_dct)

{'Effusion': 268, 'Pneumonia': 1430, 'Pneumothorax': 40, 'Atelectasis': 261, 'Consolidation': 122, 'Edema': 339, 'Nodule': 69, 'Infiltration': 604, 'Pleural_Thickening': 47, 'Cardiomegaly': 40, 'Fibrosis': 10, 'Emphysema': 22, 'Mass': 70, 'Hernia': 2}


In [54]:
# Number of disease per patient
patient_id_grp = all_xray_df.groupby('Patient ID')
# Let's print the first entries
# in all the groups formed.
# patient_id_grp.first()
num_of_disease_per_patient = patient_id_grp['Finding Labels'].count()

In [55]:
num_of_disease_per_patient

Patient ID
1        3
2        1
3        8
4        1
5        8
        ..
30801    2
30802    1
30803    1
30804    1
30805    1
Name: Finding Labels, Length: 30805, dtype: int64

In [59]:
print(all_xray_df.shape)
all_xray_df.sample(3)

(112120, 12)


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
79530,00019535_000.png,No Finding,0,19535,30,F,AP,3056,2544,0.139,0.139,
88000,00021770_016.png,Infiltration|Pneumothorax,16,21770,7,F,AP,3056,2544,0.139,0.139,
52098,00013140_001.png,Atelectasis|Consolidation|Emphysema,1,13140,51,F,AP,2500,2048,0.168,0.168,


In [58]:
# Pixel-level assessments of the imaging data for healthy & disease states of interest (e.g. histograms of intensity values) and compare distributions across diseases.
print(sample_df.shape)
sample_df.sample(3)

(5606, 11)


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImageWidth,OriginalImageHeight,OriginalImagePixelSpacing_x,OriginalImagePixelSpacing_y
3810,00018724_019.png,Edema|Effusion|Infiltration,19,18724,044Y,F,AP,2500,2048,0.168,0.168
4902,00025839_004.png,No Finding,4,25839,028Y,M,AP,3056,2544,0.139,0.139
4480,00022316_000.png,Infiltration,0,22316,020Y,M,PA,3056,2492,0.139,0.139


In [None]:
# DICOM Checking Steps:

# Preprocessing Steps:

# CNN Architecture:

# **Patient Population Description for FDA Validation Dataset:**
# * Types of augmentation used during training
# * Batch size
# * Optimizer learning rate
# * Layers of pre-existing architecture that were frozen
# * Layers of pre-existing architecture that were fine-tuned
# * Layers added to pre-existing architecture
# **Description of Training Dataset:** 


# **Description of Validation Dataset:** 
# **Ground Truth Acquisition Methodology:**

# **Algorithm Performance Standard:**