In [1]:
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [None]:
# CONFIG
IMG_SIZE = 64
CATEGORIES = ["NORMAL", "PNEUMONIA"]
DATASET_DIR = "./chest_Xray"
RESULTS_DIR = "./results_knn"
os.makedirs(RESULTS_DIR, exist_ok=True)

In [3]:
def load_images_from_folder(folder, label):
    images = []
    labels = []
    for filename in tqdm(os.listdir(folder)):
        img_path = os.path.join(folder, filename)
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is not None:
            img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
            images.append(img)
            labels.append(label)
    return images, labels

In [None]:
def flatten_and_scale(X):
    X_flat = X.reshape(len(X), -1).astype(np.float32)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_flat)
    return X_scaled, scaler


def apply_pca(X_train, X_test, n_components=150):
    pca = PCA(n_components=n_components, svd_solver="randomized", random_state=42)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    return X_train_pca, X_test_pca, pca

In [5]:
def plot_distribution(distribution_dict):
    df = pd.DataFrame(distribution_dict).T
    df.plot(kind="bar", figsize=(10, 6))
    plt.title("Class Distribution per Dataset Split")
    plt.ylabel("Number of Images")
    plt.xlabel("Dataset Split")
    plt.xticks(rotation=0)
    plt.legend(title="Class")
    plt.grid(axis="y")
    plt.tight_layout()
    plt.show()


def save_confusion_matrix(y_true, y_pred, filename="confusion_matrix.png"):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 5))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        xticklabels=CATEGORIES,
        yticklabels=CATEGORIES,
        cmap="Blues",
    )
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()

In [None]:
print("Loading data from train and val folders...")

X_train = []
y_train = []

X_test = []
y_test = []

for dataset_type in ["train", "val"]:
    for idx, category in enumerate(CATEGORIES):
        folder_path = os.path.join(DATASET_DIR, dataset_type, category)
        imgs, labels = load_images_from_folder(folder_path, idx)
        X_train.extend(imgs)
        y_train.extend(labels)

for idx, category in enumerate(CATEGORIES):
    folder_path = os.path.join(DATASET_DIR, "test", category)
    imgs, labels = load_images_from_folder(folder_path, idx)
    X_test.extend(imgs)
    y_test.extend(labels)

X_train = np.array(X_train)
y_train = np.array(y_train)

X_test = np.array(X_test)
y_test = np.array(y_test)

print(f"Loaded {len(X_train)} images. Splitting into train/test sets...")


print("Flattening and scaling...")
X_train_scaled, scaler = flatten_and_scale(X_train)
X_test_flat = X_test.reshape(len(X_test), -1).astype(np.float32)
X_test_scaled = scaler.transform(X_test_flat)

print("Applying PCA...")
n_components = min(150, X_train_scaled.shape[0], X_train_scaled.shape[1])
X_train_pca, X_test_pca, pca = apply_pca(
    X_train_scaled, X_test_scaled, n_components=n_components
)


In [15]:
print("Running GridSearchCV for KNN...")
param_grid = {
    "n_neighbors": [3, 5, 7, 9],
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan"],
    "algorithm": ["auto", "ball_tree"],
}

grid_search = GridSearchCV(
    KNeighborsClassifier(n_jobs=-1),
    param_grid,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    verbose=2,
)

grid_search.fit(X_train_pca, y_train)

Running GridSearchCV for KNN...
Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [37]:
# test the best model
print("Best model found:")
print(grid_search.best_estimator_)

print("Best parameters found:")
print(grid_search.best_params_)

print("Best cross-validation score:")
print(grid_search.best_score_)

print("Evaluating model on test set...")
best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(X_test_pca)

print("Saving classification report and confusion matrix...")
report_dict = classification_report(
    y_test, y_pred, target_names=CATEGORIES, output_dict=True
)
report_df = pd.DataFrame(report_dict).T
report_df = report_df.drop(index=["accuracy"])


Best model found:
KNeighborsClassifier(metric='euclidean', n_jobs=-1, n_neighbors=7)
Best parameters found:
{'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
Best cross-validation score:
0.9647782269699553
Evaluating model on test set...
Saving classification report and confusion matrix...


In [38]:
report = classification_report(
    y_test, y_pred, target_names=CATEGORIES
)
print(report)

              precision    recall  f1-score   support

      NORMAL       0.96      0.41      0.57       234
   PNEUMONIA       0.74      0.99      0.84       390

    accuracy                           0.77       624
   macro avg       0.85      0.70      0.71       624
weighted avg       0.82      0.77      0.74       624



In [None]:
report_df.to_parquet(os.path.join(RESULTS_DIR, "classification_report.parquet"))

cm = confusion_matrix(y_test, y_pred)
cm_df = (
    pd.DataFrame(cm, index=CATEGORIES, columns=CATEGORIES)
    .reset_index()
    .rename(columns={"index": "Actual"})
)
cm_df.to_parquet(os.path.join(RESULTS_DIR, "confusion_matrix.parquet"))

save_confusion_matrix(y_test, y_pred, os.path.join(RESULTS_DIR, "confusion_matrix.png"))

print("Evaluation complete. All results saved.")


Evaluation complete. All results saved.
