In [1]:
# Group all `import`s here at a later date.

# Data Cleaning & Checking

## Importing, checking and cleaning the (non-image) data

First, import the csv file for non-image features and take a look at it.

In [2]:
import pandas as pd

In [3]:
# Relative locations of the two raw CSV files: the main data entries and the BB list.
raw_data_path, bb_data_path = (
    "../raw_data/Data_Entry_2017.csv",
    "../raw_data/BBox_List_2017.csv",
)

# Read each file into its own DataFrame (main CSV first, then the BB CSV).
data = pd.read_csv(filepath_or_buffer=raw_data_path)
data_bb = pd.read_csv(filepath_or_buffer=bb_data_path)

FileNotFoundError: [Errno 2] No such file or directory: '../raw_data/Data_Entry_2017.csv'

In [None]:
data_bb.head()

In [None]:
# Keep everything except the last three columns and give the rest short names.
# The work is done on a new frame that is bound to `data_bb` only at the end,
# so a failed run of this cell (e.g. running it a second time, when the column
# count no longer matches the six names) leaves `data_bb` untouched instead of
# half-modified.
bb_trimmed = data_bb.drop(columns=data_bb.columns[-3:])
bb_trimmed.columns = ["img_idx", "label", "bb_x", "bb_y", "bb_w", "bb_h"]
data_bb = bb_trimmed

In [None]:
data_bb.head()

In [None]:
data_bb.img_idx.nunique()

In [None]:
data_bb.shape

**Note:** We will forget about the BB data for the time being and move on to the main CSV.

In [None]:
data.shape

In [None]:
data.head()

Drop the columns we don't need and set shorter column names.

In [None]:
# Drop the last column plus the two named columns we do not use, then set
# shorter column names. Everything happens on a new frame that is bound to
# `data` only once all steps have succeeded, so re-running this cell raises
# (the named columns are already gone) without first removing another column
# from `data` in place.
data_trimmed = (
    data
    .drop(columns=data.columns[-1])
    .drop(columns=["Follow-up #", "Patient ID"])
)
data_trimmed.columns = ["img_idx", "labels", "age", "gender", "view_pos", "img_w", "img_h", "img_pix_spc_x", "img_pix_spc_y"]
data = data_trimmed

In [None]:
data.shape

In [None]:
data.head()

Take a look at data statistics, cardinality, types, etc.

In [None]:
data.info()

In [None]:
data.isnull().values.any()

It appears that we do not have any `null`s, which is good.  
**_To do:_** Consider downcasting data types to save memory.  
For example:  
```python
data["age"] = pd.to_numeric(data["age"], downcast="unsigned") # picks the smallest unsigned int that fits, e.g. uint8 for 0-255
```
Binary columns can be `bool` and image specs can be smaller variants of `int`.

In [None]:
data.nunique()

In [None]:
data.describe()

Maximum age of `414` does not make sense. There appears to be an issue with the `age` column.

In [None]:
# import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Box-style plot of the age distribution, drawn horizontally.
# The ages are passed explicitly as `x`: a bare positional argument is treated
# differently depending on the installed seaborn version.
sns.boxenplot(x=data.age, orient="horizontal");

In [None]:
# The oldest person to ever live was ~122 years old when they passed, so any
# age of 125 or more is treated as a data error and the row is removed.
max_plausible_age = 125
# .copy() makes `data` an independent frame rather than a selection of the
# previous one, so adding columns to it in later cells does not trigger
# pandas' SettingWithCopyWarning.
data = data[data.age < max_plausible_age].copy()

In [None]:
# Re-draw the age distribution (horizontally) after removing implausible ages.
# The ages are passed explicitly as `x`: a bare positional argument is treated
# differently depending on the installed seaborn version.
sns.boxenplot(x=data.age, orient="horizontal");

In [None]:
data.shape

Check for duplicates.

In [None]:
# Remove fully duplicated rows. Rebind the result instead of using
# inplace=True, so the cell never mutates a frame that may itself be a
# filtered selection of another frame (which makes pandas warn about setting
# values on a copy).
data = data.drop_duplicates()

In [None]:
data.shape

In [None]:
data.tail()

Visualize the data.

In [None]:
data.hist(figsize=(10,10));

In [None]:
# Split the pipe-delimited labels into one row per label, count how often each
# label occurs, and draw the counts as a bar chart.
# Series.explode() takes no column name: its only parameter is `ignore_index`,
# so the previous "labels" argument was silently read as ignore_index=True.
data["labels"].str.split("|").explode().value_counts().plot(kind="bar");

In [None]:
# Bar chart of the number of rows for each `gender` value.
data.gender.value_counts().plot.bar();

In [None]:
# Bar chart of the number of rows for each `view_pos` value.
data.view_pos.value_counts().plot.bar();

Next, convert and expand the `labels` column to corresponding individual columns with boolean values for each condition that we care about, plus an aggregate "other" column for all the other ones. We will keep the original `labels` column because the EDA notebook uses it. But we should remove it (or pass it through the pipeline without using it) during training.
Effectively, we are multi-label binary-encoding the labels (manually for now; in the future, we could use a [`MultiLabelBinarizer`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html) in our pipeline).

In [None]:
# Every distinct label found in the pipe-delimited `labels` column, in order of
# first appearance. Series.explode() takes no column name (its only parameter
# is `ignore_index`), and Series.unique() can be called directly, so no
# intermediate DataFrame is needed.
all_labels = list(data["labels"].str.split("|").explode().unique())

# Labels that keep a column of their own.
do_care_labels = [
    "Pneumonia",
    "Cardiomegaly",
    "Pleural_Thickening",
    "Consolidation",
    "Pneumothorax",
    "Effusion",
    "No Finding",
]
# Every remaining label found in the data.
other_labels = [label for label in all_labels if label not in do_care_labels]

In [None]:
# Add one boolean column per label. Membership is tested against the split
# list of labels rather than the raw "a|b|c" string, so a label whose name is
# a substring of another label cannot produce a false positive.
labels_per_row = data["labels"].str.split("|")
for label in all_labels:
    data[label] = labels_per_row.apply(lambda row_labels: label in row_labels)

In [None]:
# "Other" is True for a row when at least one of the `other_labels` columns is
# True for that row, and False otherwise.
data["Other"] = data[other_labels].any(axis=1)

In [None]:
# Remove the individual `other_labels` columns, then rename each kept label
# column (and "Other") to "label_" + its lower-cased name with spaces replaced
# by underscores.
data.drop(columns=other_labels, inplace=True)
label_column_names = {
    name: "label_" + name.lower().replace(" ", "_")
    for name in do_care_labels + ["Other"]
}
data.rename(columns=label_column_names, inplace=True)

In [None]:
data

Actually, we don't care about the columns related to the dimensions and "pixel spacing" of the original images, so we will now remove them.

In [None]:
# Drop the image width/height and pixel-spacing columns, then preview the frame.
image_spec_columns = ["img_w", "img_h", "img_pix_spc_x", "img_pix_spc_y"]
data.drop(labels=image_spec_columns, axis="columns", inplace=True)
data.head()

Save the cleaned data to disk.

In [None]:
import os

# Destination of the cleaned table, relative to the working directory.
clean_data_path = "../clean_data/cleaned_data.csv"
# Create the target folder first: to_csv does not create missing directories.
os.makedirs(os.path.dirname(clean_data_path), exist_ok=True)
# index=False keeps the DataFrame index out of the written file.
data.to_csv(clean_data_path, index=False)

# Resizing (Downsizing) the Images

Save a down-sized (224x224) copy of every image, compatible with ResNet50.

In [None]:
import os, pathlib
from PIL import Image

# Target size of the down-sized images, in pixels.
new_height = 224
new_width = 224

# Input and output folders, resolved relative to the current working directory.
cwd = os.getcwd()
raw_images_dir = pathlib.Path(cwd, "..", "raw_data", "images")
out_images_dir = pathlib.Path(cwd, "..", "raw_data", "images_224_224")
# Make sure the output folder exists; saving into a missing folder fails.
out_images_dir.mkdir(parents=True, exist_ok=True)

for filename in os.listdir(raw_images_dir):
    # Only PNG files are processed; anything else in the folder is skipped.
    if not filename.endswith('.png'):
        continue
    # The context manager closes the source file once the resized copy exists.
    with Image.open(raw_images_dir / filename) as img:
        # Pillow expects the size as (width, height).
        resized_img = img.resize((new_width, new_height))
    # Saved under the same file name in the output folder.
    resized_img.save(out_images_dir / filename)