In [62]:
import os
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from tqdm import tqdm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import cv2
import matplotlib.pyplot as plt

In [63]:
# Load Data Function
def load_images_from_folder(folder, label):
    images = []
    labels = []
    for filename in tqdm(os.listdir(folder)):
        img_path = os.path.join(folder, filename)
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is not None:
            img = cv2.resize(img, (128, 128))
            images.append(img)
            labels.append(label)
    return images, labels

In [64]:
X_train = []
y_train = []

X_test = []
y_test = []
CATEGORIES = ["NORMAL", "PNEUMONIA"]
DATASET_DIR = "chest_xray"

# Load images from folders
for dataset_type in ["train", "val"]:
    for idx, category in enumerate(CATEGORIES):
        folder_path = os.path.join(DATASET_DIR, dataset_type, category)
        imgs, labels = load_images_from_folder(folder_path, idx)
        X_train.extend(imgs)
        y_train.extend(labels)

for idx, category in enumerate(CATEGORIES):
    folder_path = os.path.join(DATASET_DIR, "test", category)
    imgs, labels = load_images_from_folder(folder_path, idx)
    X_test.extend(imgs)
    y_test.extend(labels)

X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

100%|██████████| 1342/1342 [01:20<00:00, 16.75it/s]
100%|██████████| 3876/3876 [01:01<00:00, 63.22it/s] 
100%|██████████| 9/9 [00:00<00:00, 25.38it/s]
100%|██████████| 9/9 [00:00<00:00, 63.07it/s]
100%|██████████| 234/234 [00:14<00:00, 16.18it/s]
100%|██████████| 390/390 [00:09<00:00, 40.06it/s]


In [65]:
def plot_confusion_matrix(y_true, y_pred, categories):
    cm = confusion_matrix(y_true, y_pred)
    fig = go.Figure(
        data=go.Heatmap(
            z=cm,
            x=categories,
            y=categories,
            colorscale="Blues",
            zmin=0,
            zmax=np.max(cm),
            colorbar=dict(title="Count"),
            text=cm,
            texttemplate="%{text}",
            showscale=True,
        )
    )
    fig.update_layout(
        title="Confusion Matrix",
        xaxis_title="Predicted",
        yaxis_title="True",
        height=500,
        width=600,
    )
    fig.show()

In [66]:
# Define flatten and scale function (this was missing)
def flatten_and_scale(X):
    """Flatten images and apply standardization"""
    X_flat = X.reshape(len(X), -1).astype(np.float32)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_flat)
    return X_scaled, scaler

# Define adaptive PCA function
def apply_pca(X_train, X_test, n_components=150):
    pca = PCA(n_components=n_components, svd_solver='randomized', random_state=42)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    return X_train_pca, X_test_pca, pca

# Preprocess Data
print("Flattening and scaling...")
X_train_scaled, scaler = flatten_and_scale(X_train)
X_test_flat = X_test.reshape(len(X_test), -1).astype(np.float32)
X_test_scaled = scaler.transform(X_test_flat)

print("Applying PCA...")
n_components = min(150, X_train_scaled.shape[0], X_train_scaled.shape[1])
X_train_pca, X_test_pca, pca = apply_pca(
    X_train_scaled, X_test_scaled, n_components=n_components
)

print(f"Original feature dimensions: {X_train_scaled.shape[1]}")
print(f"Reduced feature dimensions: {X_train_pca.shape[1]}")

Flattening and scaling...
Applying PCA...
Original feature dimensions: 16384
Reduced feature dimensions: 150


In [67]:
base_path = "chest_xray"
categories = ["NORMAL", "PNEUMONIA"]

print("\nChest X-ray Analysis: Exploratory Data Analysis")
print(f"Dataset train split shape: {X_train.shape}")
print(f"X-ray image size: {X_train.shape[1]}x{X_train.shape[2]} pixels\n")

# Check class distribution
print(f"Class distribution in training set:")
print(f"NORMAL: {np.sum(y_train == 0)} ({np.mean(y_train == 0)*100:.1f}%)")
print(f"PNEUMONIA: {np.sum(y_train == 1)} ({np.mean(y_train == 1)*100:.1f}%)")

explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

print(f"Explained variance by {n_components} components: {cumulative_variance[-1]*100:.2f}%")

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=list(range(1, len(cumulative_variance) + 1)),
        y=cumulative_variance,
        mode="lines+markers",
        name="Cumulative Explained Variance",
    )
)
fig.update_layout(
    title="PCA Explained Variance",
    xaxis_title="Principal Components",
    yaxis_title="Cumulative Variance",
    height=400,
    width=600,
)
fig.show()


Chest X-ray Analysis: Exploratory Data Analysis
Dataset train split shape: (5232, 128, 128)
X-ray image size: 128x128 pixels

Class distribution in training set:
NORMAL: 1349 (25.8%)
PNEUMONIA: 3883 (74.2%)
Explained variance by 150 components: 88.33%


In [None]:
print("\nOptimizing SVM Hyperparameters using GridSearchCV...")

# Enhanced parameter grid
param_grid = [
    {
        'C': [0.01, 0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.001, 0.01, 0.1],
        'kernel': ['rbf'],
        'class_weight': ['balanced']
    }
]

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)

svm = SVC(random_state=10, probability=True)
grid_search = GridSearchCV(
    estimator=svm, 
    param_grid=param_grid, 
    cv=cv, 
    scoring='roc_auc', 
    n_jobs=-1, 
    verbose=2
)

grid_search.fit(X_train_pca, y_train)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_pca)
y_pred_proba = best_model.predict_proba(X_test_pca)[:, 1]

print("\nBest parameters found:", grid_search.best_params_)
print("Best cross-validation AUC score:", grid_search.best_score_)

# Calculate test AUC
test_auc = roc_auc_score(y_test, y_pred_proba)
print(f"Test AUC: {test_auc:.4f}")

print("\nBest Model Performance:")
print(classification_report(y_test, y_pred, target_names=categories, digits=4))

# Save results
results_dir = "results_svm"
os.makedirs(results_dir, exist_ok=True)

classification_report_dict = classification_report(
    y_test, y_pred, target_names=categories, output_dict=True
)
classification_report_df = pd.DataFrame(classification_report_dict).transpose()
classification_report_df.to_parquet(f"{results_dir}/svm_classification_report.parquet")
print("\nClassification Report saved to parquet.")

confusion_mat = confusion_matrix(y_test, y_pred)
confusion_matrix_df = pd.DataFrame(confusion_mat, columns=categories, index=categories)
confusion_matrix_df.to_parquet(f"{results_dir}/svm_confusion_matrix.parquet")
print("Confusion Matrix saved to parquet.")

# Save model performance metrics
performance_metrics = {
    'best_params': [str(grid_search.best_params_)],
    'cv_auc_score': [grid_search.best_score_],
    'test_auc_score': [test_auc],
    'test_accuracy': [classification_report_dict['accuracy']],
    'test_f1_score': [classification_report_dict['weighted avg']['f1-score']]
}
performance_df = pd.DataFrame(performance_metrics)
performance_df.to_parquet(f"{results_dir}/svm_performance_metrics.parquet")

plot_confusion_matrix(y_test, y_pred, categories)

print(f"\nAll results saved to {results_dir}/")


Optimizing SVM Hyperparameters using GridSearchCV...
Fitting 5 folds for each of 25 candidates, totalling 125 fits

Best parameters found: {'C': 1, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'rbf'}
Best cross-validation AUC score: 0.9952872394917837
Test AUC: 0.9187

Best Model Performance:
              precision    recall  f1-score   support

      NORMAL     0.9339    0.4829    0.6366       234
   PNEUMONIA     0.7594    0.9795    0.8555       390

    accuracy                         0.7933       624
   macro avg     0.8467    0.7312    0.7461       624
weighted avg     0.8249    0.7933    0.7734       624


Classification Report saved to parquet.
Confusion Matrix saved to parquet.



All results saved to results_svm/
