# Train a model for classifying chest X-ray images into normal vs pneumonia

## Import needed libraries

In [18]:
# download from urls
import urllib.request


import os
import tarfile
import pandas as pd
import shutil
from sklearn.model_selection import train_test_split
# tf
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout


## Get the data ready

### Download the images

In [19]:
# Download the image archives of the dataset. Only the 12 gzipped tar
# archives listed below are fetched by this cell; each one is stored as
# images_NN.tar.gz (NN = 01, 02, ...) in the image directory.
links = [
    'https://nihcc.box.com/shared/static/vfk49d74nhbxq3nqjg0900w5nvkorp5c.gz',
    'https://nihcc.box.com/shared/static/i28rlmbvmfjbl8p2n3ril0pptcmcu9d1.gz',
    'https://nihcc.box.com/shared/static/f1t00wrtdk94satdfb9olcolqx20z2jp.gz',
    'https://nihcc.box.com/shared/static/0aowwzs5lhjrceb3qp67ahp0rd1l1etg.gz',
    'https://nihcc.box.com/shared/static/v5e3goj22zr6h8tzualxfsqlqaygfbsn.gz',
    'https://nihcc.box.com/shared/static/asi7ikud9jwnkrnkj99jnpfkjdes7l6l.gz',
    'https://nihcc.box.com/shared/static/jn1b4mw4n6lnh74ovmcjb8y48h8xj07n.gz',
    'https://nihcc.box.com/shared/static/tvpxmn7qyrgl0w8wfh9kqfjskv6nmm1j.gz',
    'https://nihcc.box.com/shared/static/upyy3ml7qdumlgk2rfcvlb9k6gvqq2pj.gz',
    'https://nihcc.box.com/shared/static/l6nilvfa9cg3s28tqv1qc1olm3gnz54p.gz',
    'https://nihcc.box.com/shared/static/hhq8fkdgvcari67vfhs7ppg2w6ni4jze.gz',
    'https://nihcc.box.com/shared/static/ioqwiy20ihqwyr8pf4c24eazhh281pbu.gz',
]


for idx, link in enumerate(links):
    fn = '/mnt/Data2/CXR8/imgs/images_%02d.tar.gz' % (idx+1)
    # urlretrieve does not create missing directories; without this the very
    # first download fails with FileNotFoundError on a fresh machine.
    os.makedirs(os.path.dirname(fn), exist_ok=True)
    print('downloading'+fn+'...')
    urllib.request.urlretrieve(link, fn)  # download the .tar.gz archive


print("Download complete. Please check the checksums")

downloading/mnt/Data2/CXR8/imgs/images_01.tar.gz...
downloading/mnt/Data2/CXR8/imgs/images_02.tar.gz...
downloading/mnt/Data2/CXR8/imgs/images_03.tar.gz...
downloading/mnt/Data2/CXR8/imgs/images_04.tar.gz...
downloading/mnt/Data2/CXR8/imgs/images_05.tar.gz...
downloading/mnt/Data2/CXR8/imgs/images_06.tar.gz...
downloading/mnt/Data2/CXR8/imgs/images_07.tar.gz...
downloading/mnt/Data2/CXR8/imgs/images_08.tar.gz...
downloading/mnt/Data2/CXR8/imgs/images_09.tar.gz...
downloading/mnt/Data2/CXR8/imgs/images_10.tar.gz...
downloading/mnt/Data2/CXR8/imgs/images_11.tar.gz...
downloading/mnt/Data2/CXR8/imgs/images_12.tar.gz...
Download complete. Please check the checksums


### Define the paths

In [20]:
# Locations of the downloaded archives, the label CSV and the processed output,
# all below one common root folder.
_cxr8_root = '/mnt/Data2/CXR8'
data_dir = f'{_cxr8_root}/imgs/'
csv_path = f'{_cxr8_root}/Data_Entry_2017_v2020.csv'
output_dir = f'{_cxr8_root}/processed/'

### Unpack the files

In [21]:
# Unpack every .tar.gz archive found in data_dir into its own subdirectory,
# named after the archive without the '.tar.gz' suffix.
# Sorting makes the processing order deterministic (os.listdir returns the
# entries in arbitrary order).
tar_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.tar.gz'))

# The archives come from the network, so extract them with the 'data' filter,
# which refuses members that would be written outside the target directory.
# The filter only exists on Python versions that provide tarfile.data_filter;
# on older versions fall back to the plain extractall call.
extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

for tar_file in tar_files:
    tar_path = os.path.join(data_dir, tar_file)
    subdir = os.path.join(data_dir, tar_file[:-len('.tar.gz')])  # strip the suffix
    os.makedirs(subdir, exist_ok=True)
    with tarfile.open(tar_path, 'r:gz') as tar:
        tar.extractall(path=subdir, **extract_kwargs)
    print(f'Unpacked {tar_file} into {subdir}')

Unpacked images_01.tar.gz into /mnt/Data2/CXR8/imgs/images_01
Unpacked images_02.tar.gz into /mnt/Data2/CXR8/imgs/images_02
Unpacked images_03.tar.gz into /mnt/Data2/CXR8/imgs/images_03
Unpacked images_04.tar.gz into /mnt/Data2/CXR8/imgs/images_04
Unpacked images_05.tar.gz into /mnt/Data2/CXR8/imgs/images_05
Unpacked images_06.tar.gz into /mnt/Data2/CXR8/imgs/images_06


KeyboardInterrupt: 

In [None]:
# Make sure every <split>/<label> folder below output_dir exists before any
# image is copied into it.
for split_and_label in ('train/normal', 'train/pneumonia',
                        'test/normal', 'test/pneumonia'):
    os.makedirs(os.path.join(output_dir, split_and_label), exist_ok=True)

### Load the csv file for identifying the labels for normal vs pneumonia

In [None]:
# Read the label CSV at csv_path into a DataFrame
df = pd.read_csv(filepath_or_buffer=csv_path)

### Keep only the Pneumonia and Normal images

In [None]:
# Keep only the rows whose 'Finding Labels' value is exactly 'No Finding' or
# exactly 'Pneumonia'; rows with any other value in that column are dropped.
# .copy() turns the selection into an independent DataFrame, so adding the
# 'label' column below does not trigger pandas' SettingWithCopyWarning.
filtered_df = df[df['Finding Labels'].isin(['No Finding', 'Pneumonia'])].copy()
# Map 'No Finding' to 'normal' and 'Pneumonia' to 'pneumonia'
filtered_df['label'] = filtered_df['Finding Labels'].map({'No Finding': 'normal', 'Pneumonia': 'pneumonia'})

### Split into train and test

In [None]:
# Hold out 20% of the filtered rows as the test set. The split is stratified
# on the 'label' column and uses a fixed seed so it is reproducible.
train_df, test_df = train_test_split(
    filtered_df,
    test_size=0.2,
    stratify=filtered_df['label'],
    random_state=42,
)

### copy images to corresponding folders

In [None]:
# Function to copy images to their respective directories
def copy_images(df, split, image_root=None, output_root=None):
    """Copy the images listed in ``df`` to ``<output_root>/<split>/<label>/``.

    The previous version looked the files up via an undefined ``image_dir``
    name (NameError on the first row) and only directly inside one folder.
    The archives are unpacked into one subdirectory per archive, so the files
    are now located by searching ``image_root`` recursively.

    Args:
        df: DataFrame with an 'Image Index' column (file name) and a 'label'
            column (name of the destination sub-folder).
        split: Name of the split folder below ``output_root``, e.g. 'train'.
        image_root: Folder that is searched recursively for the image files.
            Defaults to the module-level ``data_dir``.
        output_root: Folder that receives the copies. Defaults to the
            module-level ``output_dir``.

    Returns:
        The number of images that were copied. Rows whose file cannot be
        found below ``image_root`` are skipped, so compare this number with
        ``len(df)`` to detect missing images.
    """
    if image_root is None:
        image_root = data_dir
    if output_root is None:
        output_root = output_dir

    # Index all files below image_root by base name once, instead of probing
    # the file system for every row. The first occurrence of a name wins.
    located = {}
    for folder, _, file_names in os.walk(image_root):
        for file_name in file_names:
            located.setdefault(file_name, os.path.join(folder, file_name))

    prepared_dirs = set()
    copied = 0
    for _, row in df.iterrows():
        image_name = row['Image Index']
        source_path = located.get(image_name)
        if source_path is None:
            continue  # image not available locally (e.g. archive not unpacked)
        target_dir = os.path.join(output_root, split, row['label'])
        if target_dir not in prepared_dirs:
            os.makedirs(target_dir, exist_ok=True)
            prepared_dirs.add(target_dir)
        shutil.copy(source_path, os.path.join(target_dir, image_name))
        copied += 1
    return copied

# Fill the 'train' and 'test' folders from their respective splits
for split_frame, split_name in ((train_df, 'train'), (test_df, 'test')):
    copy_images(split_frame, split_name)

print("Images have been successfully filtered and organized.")

## Get the model ready

### Parameters

In [None]:
# Training hyperparameters: input image size, mini-batch size, number of epochs
img_height = 224
img_width = 224
batch_size, epochs = 32, 10

### Generate the data for the DL model training

In [None]:
# Data generators: both splits only rescale the pixel values by 1/255
train_datagen = ImageDataGenerator(rescale=1.0/255.0)
test_datagen = ImageDataGenerator(rescale=1.0/255.0)

# Options shared by the train and test directory iterators
_flow_options = dict(
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='binary',
)

# Read the images from <output_dir>/train and <output_dir>/test; the class of
# an image is taken from the sub-folder it is stored in.
train_generator = train_datagen.flow_from_directory(
    os.path.join(output_dir, 'train'), **_flow_options
)
test_generator = test_datagen.flow_from_directory(
    os.path.join(output_dir, 'test'), **_flow_options
)

### Model building