## Dataset Analysis: Simple JPEG set (No bounding boxes.)

In [2]:
# print distributions
# get global stddev and mean
# get min/max/mean/median height-width to help inform size for CNN

In [1]:
import os
from PIL import Image
base_dir = "/ssd/datasets/chest_xray"

# Using the dir structure of dataset/classification, create a metadata
# representation in json format.
metadata = []
classifications = ["NORMAL", "PNEUMONIA"]
for dataset_dir in ['val', 'test', 'train']:
    for classification in os.listdir(os.path.join(base_dir, dataset_dir)):
        for image in os.listdir(os.path.join(base_dir, dataset_dir, classification)):
            if ".jpeg" in image:
                metadata.append({
                    "image": os.path.join(dataset_dir, classification, image),
                    "set": dataset_dir,
                    "label": classifications.index(classification),
                })
                
def add_dims(record: dict) -> dict:
    w, h = Image.open(os.path.join(base_dir, record["image"])).convert("L").size
    record["width"] = w
    record["height"] = h
    return record
    
                
metadata = [add_dims(record) for record in metadata]

In [13]:
# dump to file
import json
with open(os.path.join(base_dir, "metadata.json"), "w") as f:
    json.dump(metadata, f, indent=2)

In [14]:
heights = [record["height"] for record in metadata]
widths = [record["width"] for record in metadata]

In [24]:
import numpy as np
print(f"Height:\nMax: {max(heights)}, Min: {min(heights)}, Mean: {np.mean(heights)}, Median: {np.median(heights)}")

Height:
Max: 2713, Min: 127, Mean: 970.6890368852459, Median: 888.0


In [26]:
print(f"Width:\nMax: {max(widths)}, Min: {min(widths)}, Mean: {np.mean(widths)}, Median: {np.median(widths)}")

Width:
Max: 2916, Min: 384, Mean: 1327.880806010929, Median: 1281.0


In [28]:
# we should try with a resize of 256x256 and move it up as a parameter tuner.
# next, get global stddev and mean.
stds = []
means = []
from tqdm import tqdm
for record in tqdm(metadata):
    image = Image.open(os.path.join(base_dir, record["image"])).convert("L")
    image = np.array(image)
    stds.append(np.std(np.divide(image, 255)))
    means.append(np.mean(np.divide(image, 255)))
print(f"Global Std-dev: {np.std(stds)}. Global Mean: {np.mean(means)}")


100%|██████████| 5856/5856 [01:14<00:00, 78.78it/s] 

Global Std-dev: 0.03684682087550211. Global Mean: 0.4815147875163741





In [30]:
global_std = np.std(stds)
global_mean = np.mean(means)
with open(os.path.join(base_dir, "global_params.json"), "w") as f:
    json.dump({
        "channel_mean": global_mean, 
        "channel_std": global_std,
        "input_width": 256,
        "input_height": 256
    }, f, indent=2)

In [3]:
# Distributions:
train_normal = [record for record in metadata if record["set"] == "train" and record["label"] == 0]
train_pneu = [record for record in metadata if record["set"] == "train" and record["label"] == 1]
print(f"Training\nNormal: {len(train_normal)}\tPneumonia: {len(train_pneu)}")

Training
Normal: 1341	Pneumonia: 3875


In [4]:
test_normal = [record for record in metadata if record["set"] == "test" and record["label"] == 0]
test_pneu = [record for record in metadata if record["set"] == "test" and record["label"] == 1]
print(f"Testing\nNormal: {len(test_normal)}\tPneumonia: {len(test_pneu)}")

Testig
Normal: 234	Pneumonia: 390


In [5]:
234/(390 + 234)

0.375

In [6]:
390/(390+294)

0.5701754385964912

In [None]:
# by saying everyone has pneumonia we can be 57% correct.