In [None]:
import os
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers, models
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

## CNN Model Creation
The following function builds a basic CNN that serves as a baseline ("benchmark") model, so that we can compare how changes to the training data affect prediction results. The model consists of three convolutional layers, each followed by max-pooling to downsample the feature maps.
A fully connected dense layer helps learn patterns before the final classification layer.
The softmax output layer assigns probabilities for normal vs pneumonia classification.

In [None]:
# Build and compile the baseline CNN classifier
def create_cnn(input_shape=(224, 224, 3), num_classes=2):
    """Assemble and compile the benchmark convolutional network.

    Architecture: three Conv2D(3x3, ReLU) + MaxPooling2D(2x2) stages with
    32, 64 and 128 filters, then Flatten, Dense(128, ReLU) and a softmax
    layer with ``num_classes`` units.

    Args:
        input_shape: Shape of one input image, passed to the first Conv2D layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A Sequential model compiled with the Adam optimizer,
        sparse categorical cross-entropy loss and an accuracy metric.
    """
    network = models.Sequential()

    # First stage carries the input shape so the model is built immediately.
    network.add(layers.Conv2D(32, (3,3), activation='relu', input_shape=input_shape))
    network.add(layers.MaxPooling2D((2,2)))

    # Remaining convolutional stages only differ in their filter count.
    for filter_count in (64, 128):
        network.add(layers.Conv2D(filter_count, (3,3), activation='relu'))
        network.add(layers.MaxPooling2D((2,2)))

    # Classification head.
    network.add(layers.Flatten())
    network.add(layers.Dense(128, activation='relu'))
    network.add(layers.Dense(num_classes, activation='softmax'))

    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [None]:
# Function to load dataset dynamically
def load_dataset(data_filepath, img_size=(224,224)):
    """Index the image files of one dataset split into a DataFrame.

    Scans the ``NORMAL`` and ``PNEUMONIA`` sub-directories of
    ``data_filepath`` and records every file whose extension is
    .png/.jpg/.jpeg (case-insensitive). No image data is read.

    Args:
        data_filepath: Directory containing ``NORMAL`` and ``PNEUMONIA`` folders.
        img_size: Currently unused; kept so existing callers keep working.

    Returns:
        DataFrame with columns ``directory`` (the class folder path),
        ``image_path`` (full path to the file) and ``label`` (the class
        folder name). Rows are ordered by class, then by file name.

    Raises:
        FileNotFoundError: If one of the class folders does not exist
            (raised by ``os.listdir``).
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    records = []

    # Loop through NORMAL and PNEUMONIA directories, and label images accordingly
    for label_name in ["NORMAL", "PNEUMONIA"]:
        label_dir = os.path.join(data_filepath, label_name)
        # Sort so the row order is reproducible across runs and platforms.
        for file_name in sorted(os.listdir(label_dir)):
            # Ensure it's an image file (basic check); lower() also accepts .JPG/.PNG
            if not file_name.lower().endswith(image_extensions):
                continue
            # Store the class folder itself, not the builtin `dir` function.
            records.append([label_dir, os.path.join(label_dir, file_name), label_name])

    # Create DataFrame
    return pd.DataFrame(records, columns=["directory", "image_path", "label"])

In [None]:
# Function to train and evaluate the model
def train_and_evaluate(model, train_ds, val_ds, test_ds, test_description="example_test", epochs=10):
    """Fit ``model``, score it on ``test_ds`` and save the metrics to CSV.

    Args:
        model: Compiled Keras model whose output is one score per class.
        train_ds: Training data passed to ``model.fit``.
        val_ds: Validation data passed to ``model.fit``.
        test_ds: Iterable yielding ``(images, labels)`` batches.
        test_description: Prefix of the CSV file the metrics are written to
            (``"<test_description>_results.csv"``).
        epochs: Number of training epochs (defaults to the previous fixed value, 10).

    Returns:
        Tuple ``(history, y_true, y_pred, results_df)``: the object returned
        by ``model.fit``, the collected true and predicted class labels, and a
        one-row DataFrame with accuracy and weighted precision/recall/F1.
    """
    results_path = f"{test_description}_results.csv"
    history = model.fit(train_ds, validation_data=val_ds, epochs=epochs)

    y_true, y_pred = [], []
    for images, labels in test_ds:
        # verbose=0: otherwise a progress bar is printed for every test batch.
        batch_scores = model.predict(images, verbose=0)
        y_pred.extend(np.argmax(batch_scores, axis=1))
        # np.asarray accepts both eager tensors and plain arrays; ravel
        # flattens labels delivered with shape (batch, 1).
        y_true.extend(np.asarray(labels).ravel())

    # zero_division=0 gives the same value as the default but without a
    # warning when a class is never predicted.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    results_df = pd.DataFrame({
        'Accuracy': [accuracy],
        'Precision': [precision],
        'Recall': [recall],
        'F1 Score': [f1]
    })

    results_df.to_csv(results_path, index=False)
    print("Metrics saved to", results_path)
    print(classification_report(y_true, y_pred, zero_division=0))
    return history, y_true, y_pred, results_df

In [None]:
# Function to visualize results
def plot_results(history, y_true, y_pred):
    """Plot the training curves and print the test accuracy.

    The left panel shows training/validation accuracy per epoch and the
    right panel shows training/validation loss per epoch (this panel was
    previously created but left empty).

    Args:
        history: Object returned by ``model.fit``; its ``history`` dict must
            contain ``accuracy``, ``val_accuracy``, ``loss`` and ``val_loss``.
        y_true: True class labels of the test set.
        y_pred: Predicted class labels of the test set.
    """
    fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(10,5))

    acc_ax.plot(history.history['accuracy'], label='Train Acc')
    acc_ax.plot(history.history['val_accuracy'], label='Val Acc')
    acc_ax.set_xlabel('Epoch')
    acc_ax.set_ylabel('Accuracy')
    acc_ax.set_title('Training & Validation Accuracy')
    acc_ax.legend()

    loss_ax.plot(history.history['loss'], label='Train Loss')
    loss_ax.plot(history.history['val_loss'], label='Val Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_title('Training & Validation Loss')
    loss_ax.legend()

    fig.tight_layout()
    print("Accuracy:", accuracy_score(y_true, y_pred))
    plt.show()

In [None]:
import shutil

import kagglehub

# Download latest version (kagglehub returns its local cache path)
path = kagglehub.dataset_download("paultimothymooney/chest-xray-pneumonia")
print("Path to dataset files:", path)

# Move the nested dataset folder next to the notebook, but only once:
# re-running this cell after the move would otherwise fail (source gone)
# or nest the data as ./chest_xray/chest_xray.
local_data_dir = "./chest_xray"
if os.path.isdir(local_data_dir):
    print("Dataset already present at", local_data_dir)
else:
    shutil.move(os.path.join(path, "chest_xray", "chest_xray"), local_data_dir)

In [None]:
data_path = "./chest_xray"
test_description = "short_test_description"
IMG_SIZE = (224, 224)
BATCH_SIZE = 32
# Position in this list is the integer class id fed to the model.
CLASS_NAMES = ["NORMAL", "PNEUMONIA"]


def dataframe_to_dataset(df, img_size=IMG_SIZE, batch_size=BATCH_SIZE, shuffle=False):
    """Convert a load_dataset() table into a batched tf.data pipeline.

    load_dataset() only returns file paths and string labels, which Keras
    cannot train on directly. This reads each ``image_path``, decodes it to
    3 channels, resizes it to ``img_size``, scales pixels to [0, 1] and pairs
    it with the integer id of its ``label``.

    Args:
        df: DataFrame with ``image_path`` and ``label`` columns.
        img_size: Target (height, width) of the decoded images.
        batch_size: Number of images per batch.
        shuffle: Whether to shuffle the examples (use for training only).

    Returns:
        A ``tf.data.Dataset`` yielding ``(images, labels)`` batches.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    if df.empty:
        raise ValueError("No images found; check the dataset directory.")

    class_ids = {name: idx for idx, name in enumerate(CLASS_NAMES)}
    image_paths = df["image_path"].tolist()
    label_ids = df["label"].map(class_ids).astype("int32").tolist()

    def _load_image(image_path, label_id):
        raw_bytes = tf.io.read_file(image_path)
        # expand_animations=False guarantees a rank-3 tensor that resize() accepts.
        image = tf.io.decode_image(raw_bytes, channels=3, expand_animations=False)
        image = tf.image.resize(image, img_size) / 255.0
        return image, label_id

    dataset = tf.data.Dataset.from_tensor_slices((image_paths, label_ids))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(image_paths))
    dataset = dataset.map(_load_image, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)


# Index the image files of each split.
train_df = load_dataset(os.path.join(data_path, "train"))
val_df = load_dataset(os.path.join(data_path, "val"))
test_df = load_dataset(os.path.join(data_path, "test"))

# Build the actual (images, labels) pipelines that fit()/predict() consume.
train_ds = dataframe_to_dataset(train_df, shuffle=True)
val_ds = dataframe_to_dataset(val_df)
test_ds = dataframe_to_dataset(test_df)

cnn_model = create_cnn(input_shape=(*IMG_SIZE, 3), num_classes=len(CLASS_NAMES))
history, y_true, y_pred, results_df = train_and_evaluate(cnn_model, train_ds, val_ds, test_ds, test_description=test_description)
plot_results(history, y_true, y_pred)