In [None]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.utils import class_weight
import matplotlib
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from tensorflow.keras import backend as K
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os, shutil

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab VM; the dataset paths used in this notebook
# all live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Root folder of the dataset on the mounted Drive
base_dir = '/content/drive/My Drive/chest-xrays'

# One folder per split sits directly under the root
train_dir, val_dir, test_dir = (
    os.path.join(base_dir, split_name) for split_name in ('train', 'val', 'test')
)

# The labelled splits (train / val) each hold one sub-folder per class;
# no class sub-folders are derived for the test split.
train_normal, train_pneumonia = (
    os.path.join(train_dir, class_name) for class_name in ('NORMAL', 'PNEUMONIA')
)
val_normal, val_pneumonia = (
    os.path.join(val_dir, class_name) for class_name in ('NORMAL', 'PNEUMONIA')
)

In [None]:
# Count directory entries per split; the labelled splits add up their two class folders.
for title, folders in (
    ('Total training images:', (train_normal, train_pneumonia)),
    ('Total validation images:', (val_normal, val_pneumonia)),
    ('Total test images:', (test_dir,)),
):
    print(title, sum(len(os.listdir(folder)) for folder in folders))

Total training images: 5226
Total validation images: 16
Total test images: 624


In [None]:
import os
import shutil
import numpy as np
from sklearn.model_selection import train_test_split

def get_all_image_paths(base_dir, splits=('train', 'val'), class_names=('NORMAL', 'PNEUMONIA')):
    """Collect every regular file under ``base_dir/<split>/<class_name>``.

    Directory listings are sorted so the returned order is deterministic.
    ``os.listdir`` makes no ordering guarantee, and a seeded split performed on
    the result is only reproducible if the input order is stable.

    Args:
        base_dir: Dataset root containing one folder per split.
        splits: Split folder names to scan, in order.
        class_names: Class sub-folder names to scan inside each split, in order.
            The folder name is used as the label.

    Returns:
        A ``(image_paths, labels)`` tuple of equally long 1-D string arrays.
        Both are empty string arrays when no files are found.

    Raises:
        OSError: If one of the ``<split>/<class_name>`` folders cannot be listed.
    """
    image_paths = []
    labels = []

    for split in splits:
        for class_name in class_names:
            class_dir = os.path.join(base_dir, split, class_name)
            for fname in sorted(os.listdir(class_dir)):
                full_path = os.path.join(class_dir, fname)
                # Skip sub-directories; only plain files are treated as images.
                if os.path.isfile(full_path):
                    image_paths.append(full_path)
                    labels.append(class_name)

    # An explicit string dtype keeps empty results from defaulting to float64.
    return np.array(image_paths, dtype=str), np.array(labels, dtype=str)

def reorganize_split_dataset(image_paths, labels, output_base, test_size=0.2, random_state=42):
    """Make a stratified train/val split and copy the files into a new tree.

    Files are copied (with metadata, via ``shutil.copy2``) to
    ``output_base/<train|val>/<label>/<original basename>``.

    Args:
        image_paths: Sequence of source file paths.
        labels: Sequence of class labels aligned with ``image_paths``; also
            used as the destination sub-folder name and for stratification.
        output_base: Root folder of the new dataset tree.
        test_size: Fraction (or count) of samples placed in the ``val`` subset.
        random_state: Seed forwarded to ``train_test_split``.

    Raises:
        ValueError: If two sources would be copied to the same destination
            (same subset, label and basename). Raised before anything is
            copied, because the later copy would silently overwrite the
            earlier one.
    """
    x_train, x_val, y_train, y_val = train_test_split(
        image_paths, labels, test_size=test_size, stratify=labels, random_state=random_state
    )

    # Plan every copy first so a name collision is reported before touching disk.
    copy_plan = []
    planned_destinations = set()
    for subset, x, y in (('train', x_train, y_train), ('val', x_val, y_val)):
        for src, label in zip(x, y):
            dest_dir = os.path.join(output_base, subset, label)
            dest_path = os.path.join(dest_dir, os.path.basename(src))
            if dest_path in planned_destinations:
                raise ValueError(
                    'Multiple source files map to the same destination: ' + dest_path
                )
            planned_destinations.add(dest_path)
            copy_plan.append((src, dest_dir, dest_path))

    # Create each destination folder once instead of once per file.
    created_dirs = set()
    for src, dest_dir, dest_path in copy_plan:
        if dest_dir not in created_dirs:
            os.makedirs(dest_dir, exist_ok=True)
            created_dirs.add(dest_dir)
        shutil.copy2(src, dest_path)

# paths
original_data = '/content/drive/My Drive/chest-xrays'
output_data = '/content/drive/My Drive/chest-xrays-split'

# run
paths, labels = get_all_image_paths(original_data)

# Copying the whole dataset is slow, and copying a second, different split on
# top of an existing one would leave the same image in both train and val.
# Only materialise the split when the output folder does not exist yet;
# delete that folder manually to force a rebuild.
if os.path.isdir(output_data):
    print('Split dataset already exists, skipping copy:', output_data)
else:
    reorganize_split_dataset(paths, labels, output_data)

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Training images: rescale to [0, 1] and apply light random augmentation.
train_augmenter = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
)
train_gen = train_augmenter.flow_from_directory(
    directory='/content/drive/My Drive/chest-xrays-split/train',
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary',
    shuffle=True,
)

# Validation images: rescale only, no augmentation.
val_rescaler = ImageDataGenerator(rescale=1./255)
val_gen = val_rescaler.flow_from_directory(
    directory='/content/drive/My Drive/chest-xrays-split/val',
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary',
)


Found 4193 images belonging to 2 classes.
Found 1049 images belonging to 2 classes.


In [None]:
import pandas as pd

# The test folder is flat and unlabeled, so the images are listed in a
# one-column DataFrame and streamed from it without targets.
img_dir = '/content/drive/My Drive/chest-xrays/test'
filenames = os.listdir(img_dir)
df = pd.DataFrame(filenames, columns=['filename'])

test_rescaler = ImageDataGenerator(rescale=1./255)
test_gen = test_rescaler.flow_from_dataframe(
    df,
    directory=img_dir,
    x_col='filename',
    y_col=None,
    class_mode=None,
    target_size=(224, 224),
    batch_size=32,
    # keep the listing order so predictions can be paired with test_gen.filenames
    shuffle=False,
)

Found 624 validated image filenames.


In [None]:
from tensorflow.keras import layers, models
import tensorflow as tf

# ResNet50V2 backbone with ImageNet weights and without its classification head
IMAGE_SHAPE = (224, 224, 3)
pretrained_model = tf.keras.applications.ResNet50V2(
    weights='imagenet',
    include_top=False,
    input_shape=IMAGE_SHAPE,
)

# Freeze the whole backbone for the first training phase; part of it is
# unfrozen in a later cell for fine-tuning.
pretrained_model.trainable = False

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras import regularizers

# Classification head on top of the frozen backbone:
# pooled features -> Dense(128, relu) -> Dropout(0.5) -> one sigmoid unit.
transfer_model = Sequential()
transfer_model.add(tf.keras.Input(shape=(224, 224, 3)))
# The data generators rescale pixels to [0, 1], but the ImageNet weights of
# ResNet50V2 expect inputs in [-1, 1] (what resnet_v2.preprocess_input
# produces). Map [0, 1] -> [-1, 1] inside the model so training, validation
# and test inputs are all converted the same way.
transfer_model.add(tf.keras.layers.Rescaling(scale=2.0, offset=-1.0))
transfer_model.add(pretrained_model)
transfer_model.add(GlobalAveragePooling2D())
transfer_model.add(Dense(128, activation='relu'))
transfer_model.add(Dropout(0.5))
transfer_model.add(Dense(1, activation='sigmoid'))
transfer_model.summary()

transfer_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-3),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Notes from earlier runs (presumably other Adam learning rates that were tried):
#   1e-3: accuracy & loss increasing
#   1e-2: loss still increasing, accuracy flat

BATCH_SIZE = 32

# steps_per_epoch is not set explicitly; the fit() call gets the generator itself.

from sklearn.utils.class_weight import compute_class_weight

# Integer class index of every training image, as exposed by the generator
class_labels = train_gen.classes
present_classes = np.unique(class_labels)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=present_classes,
    y=class_labels,
)

# Position in the weight array -> weight, the dict form passed to fit(class_weight=...)
class_weights_dict = {idx: weight for idx, weight in enumerate(class_weights)}

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop when val_loss has not improved for 2 epochs and restore the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Write the weights to disk whenever val_loss reaches a new best
checkpoint = ModelCheckpoint(
    filepath='best_model.weights.h5',
    monitor='val_loss',
    save_weights_only=True,
    save_best_only=True,
)

# Phase 1: train only the new head on top of the frozen backbone
phase1_callbacks = [early_stop, checkpoint]
history = transfer_model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=5,
    class_weight=class_weights_dict,
    callbacks=phase1_callbacks,
)

  self._warn_if_super_not_called()


Epoch 1/5
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1167s[0m 9s/step - accuracy: 0.8354 - loss: 0.4483 - val_accuracy: 0.8665 - val_loss: 0.3314
Epoch 2/5
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1060s[0m 8s/step - accuracy: 0.9176 - loss: 0.2152 - val_accuracy: 0.8541 - val_loss: 0.3612
Epoch 3/5
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1096s[0m 8s/step - accuracy: 0.9176 - loss: 0.2183 - val_accuracy: 0.9399 - val_loss: 0.1420
Epoch 4/5
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1062s[0m 8s/step - accuracy: 0.9339 - loss: 0.1752 - val_accuracy: 0.9552 - val_loss: 0.1297
Epoch 5/5
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1062s[0m 8s/step - accuracy: 0.9396 - loss: 0.1723 - val_accuracy: 0.9209 - val_loss: 0.1897


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam

# Unfreeze some of the deeper convolutional layers.
# BatchNormalization layers are left frozen so they keep running in inference
# mode; updating their statistics on small batches during fine-tuning can undo
# what the pretrained backbone has learned.
for layer in pretrained_model.layers[-10:]:
    if not isinstance(layer, tf.keras.layers.BatchNormalization):
        layer.trainable = True

# Re-compile so the changed trainable flags take effect; use a much smaller
# learning rate than in the first phase.
transfer_model.compile(optimizer=Adam(1e-5), loss='binary_crossentropy', metrics=['accuracy'])

# Use fresh callbacks for this phase. A ModelCheckpoint instance remembers the
# best val_loss it has seen across fit() calls, so reusing the phase-1 callback
# only overwrites the file if fine-tuning beats the phase-1 best. Otherwise
# 'best_model.weights.h5' silently keeps the pre-fine-tuning weights.
early_stop_ft = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)
checkpoint_ft = ModelCheckpoint(
    'best_model.weights.h5',
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True
)

history_ft = transfer_model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=5,
    callbacks=[early_stop_ft, checkpoint_ft],
    class_weight=class_weights_dict
)

Epoch 1/5
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1208s[0m 9s/step - accuracy: 0.9269 - loss: 0.2646 - val_accuracy: 0.8856 - val_loss: 0.2819
Epoch 2/5
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1172s[0m 9s/step - accuracy: 0.9395 - loss: 0.1891 - val_accuracy: 0.9190 - val_loss: 0.2057
Epoch 3/5
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1252s[0m 9s/step - accuracy: 0.9428 - loss: 0.1559 - val_accuracy: 0.9314 - val_loss: 0.1804
Epoch 4/5
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1227s[0m 9s/step - accuracy: 0.9462 - loss: 0.1490 - val_accuracy: 0.9380 - val_loss: 0.1628
Epoch 5/5
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1287s[0m 9s/step - accuracy: 0.9441 - loss: 0.1556 - val_accuracy: 0.9571 - val_loss: 0.1367


In [None]:
import pandas as pd
import numpy as np
import os
# Restore the checkpointed weights before scoring the unlabeled test images
transfer_model.load_weights('best_model.weights.h5')

# Sigmoid outputs -> hard 0/1 decisions at a 0.5 threshold
y_pred_probs = transfer_model.predict(test_gen)
y_pred = np.ravel(y_pred_probs > 0.5).astype(int)

# Translate class indices into the label strings written to the submission
label_map = {0: 'NORMAL', 1: 'PNEUMONIA'}
y_labels = list(map(label_map.__getitem__, y_pred))

# File names in generator order, without any directory part
file_names = list(map(os.path.basename, test_gen.filenames))

# One row per test image, then write it out as CSV
submission_df = pd.DataFrame({'Id': file_names, 'class': y_labels})
submission_df.to_csv('submission7.csv', index=False)

  saveable.load_own_variables(weights_store.get(inner_path))


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 8s/step


In [None]:
from google.colab import files

# Download the submission CSV from the Colab VM
SUBMISSION_FILE = 'submission7.csv'
files.download(SUBMISSION_FILE)

# submission3 = class weights w/ 132 steps per epoch
# submission6 = fine-tuning

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# final accuracy (last epoch of the fine-tuning run)
# The fine-tuning fit() result is stored in `history_ft`; `history_fine` is
# never defined in this notebook and raised a NameError here.
acc = history_ft.history['accuracy']
print('Training accuracy:', acc[-1])
val_acc = history_ft.history['val_accuracy']
print('Validation accuracy:', val_acc[-1])

In [None]:
# accuracy plots: per-epoch training vs. validation accuracy
epochs = range(1, len(acc) + 1)
fig, ax = plt.subplots()
ax.plot(epochs, acc, label='Training Accuracy')
ax.plot(epochs, val_acc, label='Validation Accuracy')
ax.set_title('Training and Validation Accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()