In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to local .py files take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
# The wildcard import stays first on purpose: every explicit import below then
# takes precedence over any same-named symbol that `utils` happens to export.
from utils import *
from ml_utils import XrayTF

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [5]:
# --- Configuration ----------------------------------------------------------
# Subset selectors: the rows kept for training are filtered on these values
# in a later cell.
GENDER   = "M"
POSITION = "PA"

# Location of the cached, already-normalised data sheet.
# The cache is written after prepend_image_full_path() has run, so it stores
# whatever image paths that call produced (absolute D:\ paths in the output
# shown below). Delete the file to force a rebuild, e.g. on another machine.
csv_file = "../sheet/normalized_xray_data_without_no_finding.csv"

# --- Load the sheet, or build and cache it ----------------------------------
if os.path.exists(csv_file):
    # Fast path: reuse the sheet saved by a previous run.
    df = pd.read_csv(csv_file)
    xray_class = XrayTF(df)
else:
    # Slow path: rebuild the sheet from the raw data, then cache it.
    df = get_data_sheet()
    df = normalize_data_frame(df)
    xray_class = XrayTF(df)
    xray_class.prepend_image_full_path()

    # DataFrame.to_csv does not create missing parent directories; make sure
    # the target folder exists so the freshly built sheet is not lost.
    cache_dir = os.path.dirname(csv_file)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    xray_class.df.to_csv(csv_file, index=False)

# Quick visual sanity check of the first rows.
xray_class.df.head()

Unnamed: 0,Image Index,labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position
0,D:\xray\images\Cardiomegaly\00000001_000.png,Cardiomegaly,0,1,58,M,PA
1,D:\xray\images\Infiltration\00000005_006.png,Infiltration,6,5,70,F,PA
2,D:\xray\images\Cardiomegaly\00000008_000.png,Cardiomegaly,0,8,69,F,PA
3,D:\xray\images\Nodule\00000008_002.png,Nodule,2,8,73,F,PA
4,D:\xray\images\Emphysema\00000009_000.png,Emphysema,0,9,73,M,PA


In [6]:
# Class balancing is currently switched off. To turn it back on, run:
#     xray_class.balance_all_labels()

# Print the value counts of the label, gender and view-position columns to
# see how evenly (or unevenly) the samples are distributed.
for column in ("labels", "Patient Gender", "View Position"):
    print(xray_class.df[column].value_counts())

Infiltration          19894
Effusion              13317
Atelectasis           11559
Nodule                 6331
Mass                   5782
Pneumothorax           5302
Consolidation          4667
Pleural_Thickening     3385
Cardiomegaly           2776
Emphysema              2516
Edema                  2303
Fibrosis               1686
Pneumonia              1431
Name: labels, dtype: int64
M    46281
F    34668
Name: Patient Gender, dtype: int64
PA    42136
AP    38813
Name: View Position, dtype: int64


In [None]:
# Narrow the data down to a single patient gender and a single view position,
# as selected by the GENDER and POSITION constants.
# Note: this overwrites xray_class.df, so earlier displays of the frame no
# longer reflect its contents.
is_selected_gender = xray_class.df["Patient Gender"] == GENDER
xray_class.df = xray_class.df.loc[is_selected_gender]

is_selected_view = xray_class.df["View Position"] == POSITION
xray_class.df = xray_class.df.loc[is_selected_view]

In [None]:
# Split the current frame into a training frame and a validation frame.
# The full row count is handed to the splitter — presumably the number of
# images to draw from; confirm against XrayTF.get_test_train_split_data.
total_rows = len(xray_class.df)
train_df, valid_df = xray_class.get_test_train_split_data(total_rows)

In [None]:
# Turn the training and validation frames into image batch sources.
# (An older xray_class.create_data_batches(...) pipeline used to be called
# here; it has been replaced by generate_image.)
train_data = xray_class.generate_image(train_df)
val_data = xray_class.generate_image(valid_df)

In [None]:
# Pull one (images, labels) batch from the training generator and display it.
# (This replaces the earlier train_data.as_numpy_iterator() variant.)
sample_batch = next(train_data)
train_images, train_labels = sample_batch
xray_class.show_25_images(train_images, train_labels)

In [None]:
# Train on the training batches with the validation batches passed alongside,
# then save the resulting model under the name "all_sickness".
# The leading 5 is presumably the number of epochs — TODO confirm against
# XrayTF.train_model.
n_epochs = 5
model = xray_class.train_model(n_epochs, train_data, val_data)
xray_class.save_model(model, "all_sickness")

In [None]:
# Run the trained model over the validation batches.
# verbose=1 turns on the progress display while predicting.
predictions = model.predict(
    val_data,
    verbose=1,
)

In [None]:
# Inspect a handful of validation samples next to the model's output for them.
val_images, val_labels = next(val_data)

# Predict on exactly the batch that is about to be displayed. Indexing the
# whole-set `predictions` with positions inside this batch is only correct if
# the generator yields samples in the same order on every pass (TODO confirm
# the shuffle setting used by generate_image); predicting here removes that
# assumption, so every plotted prediction belongs to the plotted image.
batch_predictions = model.predict(val_images, verbose=0)

i_multiplier = 0  # offset of the first sample to show within the batch
num_rows = 5
num_cols = 3
# Never request more samples than the batch actually contains.
num_images = max(0, min(num_rows * num_cols, len(val_images) - i_multiplier))

plt.figure(figsize=(5 * 2 * num_cols, 5 * num_rows))
for i in range(num_images):
    sample = i + i_multiplier
    # Each sample occupies two adjacent panels: plot_pred on the left,
    # plot_pred_conf on the right.
    plt.subplot(num_rows, 2 * num_cols, 2 * i + 1)
    xray_class.plot_pred(prediction_probabilities=batch_predictions,
                         labels=val_labels,
                         images=val_images,
                         n=sample)
    plt.subplot(num_rows, 2 * num_cols, 2 * i + 2)
    xray_class.plot_pred_conf(prediction_probabilities=batch_predictions,
                              labels=val_labels,
                              n=sample)
plt.tight_layout(h_pad=1.0)
plt.show()