In [1]:
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import chain
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_curve, auc
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input


# Record the TensorFlow release so results can be tied to an environment.
print(f"TensorFlow Version: {tf.__version__}")


2025-07-09 14:17:22.319547: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752070642.579576      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752070642.650673      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


TensorFlow Version: 2.18.0


In [3]:
# Side length in pixels that every image is resized to; also the model input size.
IMAGE_SIZE = 224
# Per-device batch size; multiplied by the replica count to get the global batch size.
BATCH_SIZE_PER_REPLICA = 32
# Single seed shared by the patient-level split and the framework RNGs.
SEED = 42
# Root folder holding 'Data_Entry_2017.csv' and the PNG images (searched recursively).
DATA_DIR = '/kaggle/input/data'

# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation, dropout and random augmentation are repeatable across
# Restart & Run All, not just the train/val/test split.
tf.keras.utils.set_random_seed(SEED)

In [4]:

# Pick a distribution strategy: TPU if one can be resolved, otherwise all
# visible GPUs, otherwise the default (single-device) strategy on CPU.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    strategy = tf.distribute.TPUStrategy(tpu)
    print('Running on TPU')
except ValueError:
    # No TPU available. Report what hardware is actually present instead of
    # unconditionally claiming GPUs.
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        strategy = tf.distribute.MirroredStrategy()
        print(f'Running on {len(gpus)} GPU(s)')
    else:
        strategy = tf.distribute.get_strategy()
        print('No TPU or GPU detected; running on CPU')

print("REPLICAS: ", strategy.num_replicas_in_sync)

# The global batch is split evenly across replicas, so scale it with their count.
BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
print(f"Effective batch size: {BATCH_SIZE}")



Running on GPU(s)
REPLICAS:  1
Effective batch size: 32


2025-07-09 14:19:05.009587: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [5]:

# Load the per-image metadata table. A missing file is fatal for every later
# cell, so fail loudly with an actionable message instead of calling exit(),
# which in a notebook shuts down the kernel rather than stopping cleanly.
metadata_csv = os.path.join(DATA_DIR, 'Data_Entry_2017.csv')
try:
    df = pd.read_csv(metadata_csv)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"'Data_Entry_2017.csv' not found in '{DATA_DIR}'. "
        "Please ensure the dataset is available and the DATA_DIR path is correct."
    ) from err

print("Metadata loaded successfully.")


Metadata loaded successfully.


In [None]:
# Index every PNG below DATA_DIR by its bare filename so metadata rows can be
# joined to files on disk regardless of which sub-folder holds them.
png_pattern = os.path.join(DATA_DIR, '**', '*.png')
all_image_paths = {}
for png_path in glob.glob(png_pattern, recursive=True):
    all_image_paths[os.path.basename(png_path)] = png_path

# Look up each row's file; rows without a match get a missing value ...
df['path'] = df['Image Index'].map(all_image_paths.get)

# ... and are discarded, leaving only rows whose image was found by the glob.
df = df.dropna(subset=['path'])
print(f"Found {len(df)} images with corresponding metadata.")


In [None]:
# Split each pipe-delimited 'Finding Labels' string into a list of labels once,
# so that both label discovery and encoding work on exact label names.
findings_per_image = df['Finding Labels'].str.split('|')

# Identify all unique pathology labels (sorted for a stable column order).
# 'No Finding' is the absence of other labels, so we treat it as the baseline.
all_labels = sorted(
    {label for label in chain.from_iterable(findings_per_image) if label != 'No Finding'}
)
print(f'All Labels ({len(all_labels)}): {all_labels}')

# Create multi-hot encoded columns for each pathology. Membership is tested
# against the split list, not the raw string, so one label can never match
# merely because it is a substring of another.
for label in all_labels:
    df[label] = findings_per_image.map(
        lambda findings, label=label: 1.0 if label in findings else 0.0
    )

print("\nSample of processed DataFrame:")
print(df.head())


In [None]:

# Number of positive images per pathology, most frequent first.
pathology_counts = df[all_labels].sum().sort_values(ascending=False)

# Bar chart of the class distribution, drawn on an explicit axes object.
fig, ax = plt.subplots(figsize=(18, 8))
sns.barplot(
    x=pathology_counts.index,
    y=pathology_counts.values,
    palette='viridis',
    ax=ax,
)
ax.set_title('Distribution of Pathologies in NIH Chest X-ray Dataset', fontsize=16)
ax.set_ylabel('Number of Cases', fontsize=12)
ax.set_xlabel('Pathology', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

print("\nPathology Counts:")
print(pathology_counts)



In [None]:



# %%
# Split by patient rather than by image so that no patient contributes images
# to more than one of the train / validation / test sets.
# (train_test_split is already imported in the notebook's first cell.)
patient_ids = df['Patient ID'].unique()

# Split patient IDs: 80% for training/validation, 20% for testing
train_val_ids, test_ids = train_test_split(patient_ids, test_size=0.2, random_state=SEED)

# Split the first group further: 90% for training, 10% for validation
train_ids, val_ids = train_test_split(train_val_ids, test_size=0.1, random_state=SEED)

# Create dataframes based on the patient ID splits. `.copy()` gives each split
# its own frame so later column assignments cannot alias `df`.
train_df = df[df['Patient ID'].isin(train_ids)].copy()
val_df = df[df['Patient ID'].isin(val_ids)].copy()
test_df = df[df['Patient ID'].isin(test_ids)].copy()

# Guard against patient leakage and dropped rows.
assert set(train_ids).isdisjoint(val_ids), "patient overlap between train and validation"
assert set(train_ids).isdisjoint(test_ids), "patient overlap between train and test"
assert set(val_ids).isdisjoint(test_ids), "patient overlap between validation and test"
assert len(train_df) + len(val_df) + len(test_df) == len(df), "split does not cover all rows"

# Print summary
print(f"Total Patients: {len(patient_ids)}")
print(f"Train Patients: {len(train_ids)}, Validation Patients: {len(val_ids)}, Test Patients: {len(test_ids)}")
print(f"Train Samples: {len(train_df)}, Validation Samples: {len(val_df)}, Test Samples: {len(test_df)}")

# %%
# We calculate class weights from the TRAINING SET ONLY to avoid data leakage.
# These weights will be used in a custom loss function to penalize errors
# on minority classes more heavily.

# Count positive and negative cases for each class in the training set
pos_counts = train_df[all_labels].sum()
neg_counts = len(train_df) - pos_counts
total_samples = len(train_df)

# A class with zero positives (or zero negatives) in the training split would
# give an infinite weight and turn the loss into NaN. Report such classes and
# clamp the divisor to 1 so the weight stays finite.
degenerate_labels = [
    label for label in all_labels
    if pos_counts[label] == 0 or neg_counts[label] == 0
]
if degenerate_labels:
    print(f"Warning: classes with a single outcome in the training set: {degenerate_labels}")

safe_pos_counts = np.maximum(pos_counts.to_numpy(dtype=float), 1.0)
safe_neg_counts = np.maximum(neg_counts.to_numpy(dtype=float), 1.0)

# Create a (num_classes, 2) array of weights
# weights[i, 1] is for the positive case of class i, weights[i, 0] is for the negative
weights = np.zeros((len(all_labels), 2))
weights[:, 1] = (1.0 / safe_pos_counts) * (total_samples / 2.0)
weights[:, 0] = (1.0 / safe_neg_counts) * (total_samples / 2.0)

print("Weights calculated successfully.")
print("Sample Positive Weights:", {label: f"{w:.2f}" for label, w in zip(all_labels[:5], weights[:5, 1])})
print("Sample Negative Weights:", {label: f"{w:.2f}" for label, w in zip(all_labels[:5], weights[:5, 0])})

# Visualize the positive class weights
plt.figure(figsize=(18, 8))
sns.barplot(x=all_labels, y=weights[:, 1], palette='rocket')
plt.title('Calculated Positive Class Weights for Each Pathology', fontsize=16)
plt.ylabel('Weight', fontsize=12)
plt.xlabel('Pathology', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.show()

# %%
# Random augmentations, packaged as a Keras model so the same object can be
# previewed here and reused as a layer inside the classifier.
data_augmentation_pipeline = keras.Sequential(
    [
        layers.RandomFlip("horizontal"),
        layers.RandomRotation(0.1),
        layers.RandomZoom(0.1),
        layers.RandomBrightness(0.1),
    ],
    name="data_augmentation",
)

# Visualize augmentation effects on the first training image.
sample_image_path = train_df.iloc[0]['path']
sample_image = tf.image.resize(
    tf.image.decode_png(tf.io.read_file(sample_image_path), channels=3),
    (IMAGE_SIZE, IMAGE_SIZE),
)
# The pipeline expects a batch, so add a leading batch dimension once.
sample_batch = tf.expand_dims(sample_image, 0)

fig, axes = plt.subplots(3, 3, figsize=(10, 10))
fig.suptitle("Data Augmentation Examples", fontsize=16)
for ax in axes.flat:
    # training=True forces the random layers to be active; [0] drops the batch axis.
    augmented_image = data_augmentation_pipeline(sample_batch, training=True)[0]
    # The layers output float pixel values; scale by 255 for display.
    ax.imshow(augmented_image / 255.0)
    ax.axis("off")
plt.show()

# %%
def parse_image(path, label):
    """Read a PNG from disk and resize it to the model's input size.

    Args:
        path: File path of the PNG image (string tensor).
        label: Label value associated with the image; passed through untouched.

    Returns:
        A tuple ``(image, label)`` where ``image`` is the 3-channel image
        resized to ``IMAGE_SIZE x IMAGE_SIZE``.
    """
    raw_bytes = tf.io.read_file(path)
    decoded = tf.image.decode_png(raw_bytes, channels=3)
    # Bilinear, no aspect-ratio preservation, no antialiasing: these are
    # tf.image.resize's defaults.
    resized = tf.image.resize(decoded, [IMAGE_SIZE, IMAGE_SIZE])
    return resized, label

def create_dataset(df, augment=False, shuffle=None):
    """Build a batched, prefetched ``tf.data`` pipeline from a split dataframe.

    Args:
        df: DataFrame with a ``'path'`` column and one column per entry of
            ``all_labels``.
        augment: Marks the training split. Image augmentation itself lives in
            the model; this flag only decides the default for ``shuffle``.
        shuffle: Whether to shuffle the examples. Defaults to ``augment``, so
            only the training split is shuffled. Validation and test data keep
            dataframe row order, which keeps ``model.predict`` output aligned
            with the dataframe's label columns.

    Returns:
        A ``tf.data.Dataset`` yielding ``(images, labels)`` batches of size
        ``BATCH_SIZE``.
    """
    if shuffle is None:
        shuffle = augment

    image_paths = df['path'].to_numpy()
    label_matrix = df[all_labels].to_numpy(dtype='float32')
    dataset = tf.data.Dataset.from_tensor_slices((image_paths, label_matrix))

    if shuffle:
        # Shuffle the cheap (path, label) pairs before decoding: a buffer that
        # spans the whole split gives a uniform shuffle without holding
        # decoded images in memory. buffer_size must be positive.
        dataset = dataset.shuffle(buffer_size=max(len(df), 1), seed=SEED)

    # Decode and resize in parallel; map keeps element order by default.
    dataset = dataset.map(parse_image, num_parallel_calls=tf.data.AUTOTUNE)

    # Configure for performance
    return dataset.batch(BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)

# Build one pipeline per split; only the training split is flagged for augmentation.
train_ds, val_ds, test_ds = (
    create_dataset(split_df, augment=is_training)
    for split_df, is_training in ((train_df, True), (val_df, False), (test_df, False))
)

print("tf.data pipelines created successfully.")

# %%
# --- Custom Weighted Loss Function ---
def get_weighted_loss(weights):
    """Create a class-weighted binary cross-entropy loss for multi-label output.

    Args:
        weights: Array-like of shape ``(num_classes, 2)``. ``weights[i, 0]`` is
            applied where class ``i`` is negative, ``weights[i, 1]`` where it
            is positive.

    Returns:
        A function ``weighted_loss(y_true, y_pred)`` returning the scalar mean
        of the weighted per-element cross-entropy. ``y_pred`` must contain
        probabilities (sigmoid outputs), not logits.
    """
    def weighted_loss(y_true, y_pred):
        y_true = tf.cast(y_true, tf.float32)
        y_pred = tf.cast(y_pred, tf.float32)
        weights_tensor = tf.constant(weights, dtype=tf.float32)

        # Element-wise BCE of shape (batch, num_classes).
        # tf.keras.losses.binary_crossentropy cannot be used here: it averages
        # over the class axis and returns shape (batch,), which discards the
        # per-class terms the weights must be applied to.
        eps = 1e-7  # keep log() away from 0
        y_pred = tf.clip_by_value(y_pred, eps, 1.0 - eps)
        bce = -(y_true * tf.math.log(y_pred)
                + (1.0 - y_true) * tf.math.log(1.0 - y_pred))

        # tf.where selects, per element, the positive or negative weight of
        # that element's class (the (num_classes,) vectors broadcast over batch).
        loss_weights = tf.where(tf.equal(y_true, 1.0), weights_tensor[:, 1], weights_tensor[:, 0])
        return tf.reduce_mean(loss_weights * bce)
    return weighted_loss

# %%
# Build and compile inside the strategy scope so that variables are created
# under the chosen distribution strategy.
with strategy.scope():
    input_shape = (IMAGE_SIZE, IMAGE_SIZE, 3)

    # ImageNet-pretrained backbone without its classification head, frozen for
    # the initial feature-extraction phase.
    base_model = EfficientNetB0(
        weights='imagenet',
        include_top=False,
        input_shape=input_shape,
    )
    base_model.trainable = False

    inputs = layers.Input(shape=input_shape)
    augmented = data_augmentation_pipeline(inputs)
    preprocessed = preprocess_input(augmented)  # EfficientNet preprocessing
    # training=False keeps the backbone in inference mode.
    features = base_model(preprocessed, training=False)
    pooled = layers.GlobalAveragePooling2D()(features)
    regularized = layers.Dropout(0.3)(pooled)
    # One independent sigmoid per pathology (multi-label output).
    outputs = layers.Dense(len(all_labels), activation='sigmoid')(regularized)
    model = keras.Model(inputs, outputs)

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss=get_weighted_loss(weights),
        metrics=[keras.metrics.AUC(name='auc_roc', multi_label=True)],
    )

model.summary()

# %%
print("Starting initial training (feature extraction)...")

# Every callback tracks the same quantity: validation AUC, higher is better.
best_val_auc = dict(monitor='val_auc_roc', mode='max')

# Stop after 3 epochs without improvement and roll back to the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    patience=3,
    restore_best_weights=True,
    **best_val_auc,
)
# Persist only the weights of the best epoch.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'initial_training_weights.weights.h5',
    save_weights_only=True,
    save_best_only=True,
    **best_val_auc,
)

# NOTE(review): this full-model checkpoint is constructed but not included in
# the callbacks passed to fit() below, so no .keras file is written. Confirm
# whether that is intentional.
checkpoint_model = tf.keras.callbacks.ModelCheckpoint(
    'initial_training_model.keras',
    save_weights_only=False,
    save_best_only=True,
    verbose=1,
    **best_val_auc,
)

initial_callbacks = [early_stopping, model_checkpoint]
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    callbacks=initial_callbacks,
)

print("Initial training complete.")

# %%
def plot_training_history(history):
    """Draw training vs. validation curves for loss and AUC, one panel each.

    Args:
        history: Object whose ``.history`` dict holds the per-epoch series
            ``'loss'``, ``'val_loss'``, ``'auc_roc'`` and ``'val_auc_roc'``.
    """
    # (history key, display name, legend position) for each stacked panel.
    panels = (
        ('loss', 'Loss', 'upper right'),
        ('auc_roc', 'AUC', 'lower right'),
    )

    fig, axes = plt.subplots(len(panels), 1, figsize=(12, 12))
    for ax, (key, name, legend_loc) in zip(axes, panels):
        ax.plot(history.history[key], label=f'Training {name}')
        ax.plot(history.history[f'val_{key}'], label=f'Validation {name}')
        ax.set_title(f'Model {name}', fontsize=16)
        ax.set_ylabel(name)
        ax.set_xlabel('Epoch')
        ax.legend(loc=legend_loc)

    plt.tight_layout()
    plt.show()

plot_training_history(history)
# %%
# Fine-tuning: make the last layers of the backbone trainable and continue
# training with a smaller learning rate.
print("Starting fine-tuning...")

N_UNFROZEN_TOP_LAYERS = 20

# Mark the whole backbone trainable, then re-freeze everything except the
# final N_UNFROZEN_TOP_LAYERS layers.
base_model.trainable = True
for frozen_layer in base_model.layers[:-N_UNFROZEN_TOP_LAYERS]:
    frozen_layer.trainable = False

# Recompile with a lower learning rate after changing the trainable flags.
with strategy.scope():
    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-4),
        loss=get_weighted_loss(weights),
        metrics=[tf.keras.metrics.AUC(name='auc_roc', multi_label=True)],
    )

# Fresh callbacks for this phase, all tracking validation AUC (higher is better).
fine_tune_monitor = dict(monitor='val_auc_roc', mode='max')

fine_tune_early_stopping = tf.keras.callbacks.EarlyStopping(
    patience=3,
    restore_best_weights=True,
    **fine_tune_monitor,
)
fine_tune_model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'fine_tuning_weights.weights.h5',
    save_weights_only=True,
    save_best_only=True,
    **fine_tune_monitor,
)

# NOTE(review): this full-model checkpoint is constructed but not included in
# the callbacks passed to fit() below, so no .keras file is written. Confirm
# whether that is intentional.
fine_tune_checkpoint_model = tf.keras.callbacks.ModelCheckpoint(
    'fine_tuning_model.keras',
    save_weights_only=False,
    save_best_only=True,
    verbose=1,
    **fine_tune_monitor,
)

# Continue training
fine_tune_callbacks = [fine_tune_early_stopping, fine_tune_model_checkpoint]
history_fine_tune = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    callbacks=fine_tune_callbacks,
)

print("Fine-tuning complete.")

# %%
def plot_fine_tuning_history(initial_history, fine_tune_history):
    """Plot AUC and loss across both training phases on one epoch axis.

    The per-epoch series of the two runs are concatenated, and a vertical
    marker is drawn at the last epoch of the initial phase.

    Args:
        initial_history: History of the feature-extraction run.
        fine_tune_history: History of the fine-tuning run.
    """
    def combined(key):
        # Initial-phase epochs followed by the fine-tuning epochs.
        return initial_history.history[key] + fine_tune_history.history[key]

    # Index of the final epoch of the initial phase.
    switch_epoch = len(initial_history.history['auc_roc']) - 1

    fig, (auc_ax, loss_ax) = plt.subplots(2, 1, figsize=(12, 12))

    auc_ax.plot(combined('auc_roc'), label='Training AUC')
    auc_ax.plot(combined('val_auc_roc'), label='Validation AUC')
    # Keep the automatic lower bound but pin the top of the axis at 1.
    auc_ax.set_ylim([min(auc_ax.get_ylim()), 1])
    auc_ax.plot([switch_epoch, switch_epoch], auc_ax.get_ylim(), label='Start Fine Tuning')
    auc_ax.legend(loc='lower right')
    auc_ax.set_title('Training and Validation AUC', fontsize=16)

    loss_ax.plot(combined('loss'), label='Training Loss')
    loss_ax.plot(combined('val_loss'), label='Validation Loss')
    loss_ax.legend(loc='upper right')
    loss_ax.set_title('Training and Validation Loss', fontsize=16)
    loss_ax.set_xlabel('epoch')
    plt.show()

plot_fine_tuning_history(history, history_fine_tune)
# %%
# Restore the best fine-tuning checkpoint before measuring test performance.
best_fine_tune_weights = 'fine_tuning_weights.weights.h5'
model.load_weights(best_fine_tune_weights)

print("Evaluating on the test set...")
# evaluate() returns the loss followed by the compiled metrics (here: AUC).
test_metrics = model.evaluate(test_ds)
test_loss, test_auc = test_metrics
print(f"Test Loss: {test_loss}")
print(f"Test AUC: {test_auc}")
# %%
# Generate predictions for the test set.
# Labels and predictions are gathered in the SAME pass over test_ds, so they
# stay aligned row-for-row even if the dataset yields examples in a different
# order than test_df (e.g. because the pipeline shuffles).
y_true_batches, y_pred_batches = [], []
for batch_images, batch_labels in test_ds:
    y_pred_batches.append(model.predict_on_batch(batch_images))
    y_true_batches.append(batch_labels.numpy())

y_true = np.concatenate(y_true_batches).astype(int)
y_pred = np.concatenate(y_pred_batches)

# Binarise at 0.5 for the thresholded metrics.
y_pred_binary = (y_pred > 0.5).astype(int)

# Generate classification report
print("Classification Report:")
print(classification_report(y_true, y_pred_binary, target_names=all_labels, zero_division=0))

# Plot ROC curves for each pathology
plt.figure(figsize=(15, 15))
for i, label in enumerate(all_labels):
    # ROC is undefined when the test split holds only one outcome for a class.
    if len(np.unique(y_true[:, i])) < 2:
        print(f"Skipping ROC for '{label}': only one class present in the test set.")
        continue
    fpr, tpr, _ = roc_curve(y_true[:, i], y_pred[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{label} (AUC = {roc_auc:.2f})')

plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Each Pathology')
plt.legend(loc="lower right")
plt.show()