In [1]:
# Cell 1: Import libraries and set seeds for reproducibility.
import os
import math
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import pydicom
import cv2

from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

# Set seeds for reproducibility
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

print("✅ Libraries imported and seeds set!")


✅ Libraries imported and seeds set!


In [2]:
# Cell 2: Load dataset metadata and split data into training and validation sets.
dataset_path = "/kaggle/input/rsna-pneumonia-detection-challenge"
train_labels_csv = os.path.join(dataset_path, "stage_2_train_labels.csv")
class_info_csv = os.path.join(dataset_path, "stage_2_detailed_class_info.csv")

# Read CSV files
labels_df = pd.read_csv(train_labels_csv)
class_info_df = pd.read_csv(class_info_csv)

# Merge on 'patientId' and simplify labels (0: Normal, 1: Pneumonia)
merged_df = pd.merge(labels_df, class_info_df, on="patientId")
labels_simple = merged_df[['patientId', 'Target']].drop_duplicates().reset_index(drop=True)
labels_simple['Target'] = labels_simple['Target'].map({0: 'Normal', 1: 'Pneumonia'})
labels_simple['patientId'] = labels_simple['patientId'].astype(str) + ".dcm"

# Split data (80% train, 20% validation), stratified by target.
train_df, val_df = train_test_split(labels_simple, test_size=0.2, random_state=SEED, stratify=labels_simple['Target'])
print("Train samples:", len(train_df))
print("Validation samples:", len(val_df))


Train samples: 21347
Validation samples: 5337


In [3]:
# Cell 3: Define a function to load and preprocess DICOM images.
def load_preprocess_dicom(dicom_path, img_size=(240,240)):
    dicom_data = pydicom.dcmread(dicom_path)
    img_array = dicom_data.pixel_array.astype(np.float32)
    # Normalize pixel values to [0,1]
    img_norm = (img_array - np.min(img_array)) / (np.max(img_array) - np.min(img_array) + 1e-10)
    # Resize image
    img_resized = cv2.resize(img_norm, img_size)
    # Convert grayscale to 3-channel RGB
    img_rgb = np.stack([img_resized]*3, axis=-1)
    return img_rgb

# Test the function on a sample image
sample_image_path = os.path.join(dataset_path, "stage_2_train_images", train_df.iloc[0]['patientId'])
sample_img = load_preprocess_dicom(sample_image_path)
print("✅ Sample image shape (should be 240x240x3):", sample_img.shape)


✅ Sample image shape (should be 240x240x3): (240, 240, 3)


In [4]:
# Cell 4: Create data generators for training and validation.
def data_generator(df, batch_size=64, img_size=(240,240), infinite=True):
    def gen():
        if infinite:
            while True:
                shuffled_df = df.sample(frac=1).reset_index(drop=True)
                for _, row in shuffled_df.iterrows():
                    patient_id = row['patientId']
                    label = 1 if row['Target'] == 'Pneumonia' else 0
                    dicom_path = os.path.join(dataset_path, "stage_2_train_images", patient_id)
                    img = load_preprocess_dicom(dicom_path, img_size)
                    yield img, label
        else:
            for _, row in df.iterrows():
                patient_id = row['patientId']
                label = 1 if row['Target'] == 'Pneumonia' else 0
                dicom_path = os.path.join(dataset_path, "stage_2_train_images", patient_id)
                img = load_preprocess_dicom(dicom_path, img_size)
                yield img, label

    ds = tf.data.Dataset.from_generator(
        gen,
        output_types=(tf.float32, tf.int32),
        output_shapes=((img_size[0], img_size[1], 3), ())
    )
    ds = ds.shuffle(buffer_size=1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds

BATCH_SIZE = 64
train_ds = data_generator(train_df, batch_size=BATCH_SIZE, img_size=(240,240), infinite=True)
val_ds = data_generator(val_df, batch_size=BATCH_SIZE, img_size=(240,240), infinite=False)
print("✅ Data generators created with batch size:", BATCH_SIZE)


✅ Data generators created with batch size: 64


In [5]:
# Cell 5: Compute class weights to mitigate class imbalance.
y_train = train_df['Target'].apply(lambda x: 1 if x == 'Pneumonia' else 0)
weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = {i: w for i, w in enumerate(weights)}
print("✅ Class weights computed:", class_weights)


✅ Class weights computed: {0: 0.6454314567333858, 1: 2.219022869022869}


In [6]:
from tensorflow.keras.applications import DenseNet201
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout, Input
from tensorflow.keras.models import Model

def build_best_model():
    # Best hyperparameters from tuning:
    unfreeze = True
    dense_units = 64
    dropout_rate = 0.3
    lr = 0.00010733251249694184
    
    # Load DenseNet201 with ImageNet weights without the top classifier
    base_model = DenseNet201(weights='imagenet', include_top=False, input_shape=(240,240,3))
    base_model.trainable = unfreeze  # Unfreeze the base if True
    
    inputs = Input(shape=(240,240,3))
    # Pass inputs through the base model; use training=False to avoid updating BatchNorm stats inadvertently.
    x = base_model(inputs, training=False)
    x = GlobalAveragePooling2D()(x)
    x = Dense(dense_units, activation='relu')(x)
    x = Dropout(dropout_rate)(x)
    outputs = Dense(1, activation='sigmoid')(x)
    
    model = Model(inputs, outputs)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

best_model = build_best_model()
best_model.summary()


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet201_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m74836368/74836368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [7]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Set up EarlyStopping to monitor validation loss:
early_stopping = EarlyStopping(
    monitor='val_loss',      # monitor the validation loss
    patience=2,              # stop if no improvement after 2 epochs
    restore_best_weights=True
)

# Set up ReduceLROnPlateau to lower the learning rate when the validation loss plateaus:
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',      # monitor the validation loss
    factor=0.5,              # reduce learning rate by a factor of 0.5
    patience=1,              # wait 1 epoch before reducing the LR
    min_lr=1e-6              # set a minimum learning rate
)

# Now train the model with these callbacks.
history_final = best_model.fit(
    train_ds,
    epochs=20,               # set a high max epochs; early stopping will stop training earlier if needed
    validation_data=val_ds,
    class_weight=class_weights,  # if you computed class weights in your earlier cells
    callbacks=[early_stopping, reduce_lr]
)


Epoch 1/20
  21715/Unknown [1m18643s[0m 838ms/step - accuracy: 0.9326 - loss: 0.1436

KeyboardInterrupt: 

In [None]:
# # Define callbacks
# early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True, verbose=1)
# reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1)

# EPOCHS = 10

# history = best_model.fit(
#     train_ds,
#     epochs=EPOCHS,
#     validation_data=val_ds,
#     callbacks=[early_stop, reduce_lr]
# )


In [None]:
val_loss, val_accuracy = best_model.evaluate(val_ds)
print(f"Validation Loss: {val_loss:.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}")


In [None]:
history_final=history

In [None]:
# Cell 12 (Optional): Visualize Training History

import matplotlib.pyplot as plt

plt.figure(figsize=(12, 5))

# Plot accuracy over epochs
plt.subplot(1, 2, 1)
plt.plot(history_final.history['accuracy'], label='Train Accuracy')
plt.plot(history_final.history['val_accuracy'], label='Validation Accuracy')
plt.title('Final Model Accuracy Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

# Plot loss over epochs
plt.subplot(1, 2, 2)
plt.plot(history_final.history['loss'], label='Train Loss')
plt.plot(history_final.history['val_loss'], label='Validation Loss')
plt.title('Final Model Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.show()


In [None]:
# Evaluate the model on the validation dataset
loss, accuracy = best_model.evaluate(val_ds)
print("Validation Loss: {:.4f}".format(loss))
print("Validation Accuracy: {:.2f}%".format(accuracy * 100))

# Import additional libraries for evaluation
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt

# Get predictions from the model (y_pred_probs are probabilities)
y_pred_probs = best_model.predict(val_ds)
# Convert probabilities to class predictions (binary classification with threshold 0.5)
y_pred = (y_pred_probs > 0.5).astype(int).reshape(-1)

# Retrieve true labels from the val_ds dataset
y_true = []
for images, labels in val_ds:
    y_true.extend(labels.numpy())
y_true = np.array(y_true)

# Compute the confusion matrix
cm = confusion_matrix(y_true, y_pred)
print("\nConfusion Matrix:")
print(cm)

# Compute and print the classification report (precision, recall, F1-score)
print("\nClassification Report:")
print(classification_report(y_true, y_pred))

# Compute ROC-AUC score
roc_auc = roc_auc_score(y_true, y_pred_probs)
print("ROC-AUC Score: {:.2f}".format(roc_auc))

# Compute ROC curve values
fpr, tpr, thresholds = roc_curve(y_true, y_pred_probs)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label="ROC curve (area = {:.2f})".format(roc_auc))
plt.plot([0, 1], [0, 1], 'k--', label="Random Guess")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()


In [None]:
    import matplotlib.pyplot as plt
    
    # Replace 'history' with the variable name returned by model.fit() if different (e.g., history_final)
    plt.figure(figsize=(14, 5))
    
    # Plot Accuracy Curves
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Training vs. Validation Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    
    # Plot Loss Curves
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Training vs. Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    
    plt.show()


In [None]:
import matplotlib.pyplot as plt

# Replace 'history' with the variable name returned by model.fit() if different (e.g., history_final)
plt.figure(figsize=(14, 5))

# Plot Accuracy Curves
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Training vs. Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

# Plot Loss Curves
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training vs. Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.show()
