In [1]:
# Standard library imports
import os
import time
import warnings
import random
import glob
import pickle
from pathlib import Path
from typing import Tuple, Dict, List
import numpy as np
import pandas as pd
import requests
import zipfile
import cv2
from PIL import Image
from skimage import io, transform
from skimage.feature import hog
from skimage import data, exposure
from skimage.color import rgb2gray
from skimage.transform import resize
from collections import Counter

# ML & utilities 
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Data visualization
import matplotlib.pyplot as plt
plt.ion()  

warnings.filterwarnings("ignore")

In [2]:
def load_pickle_image(file_path):
    with open(file_path, "rb") as f:         
        return pickle.load(f)                 

file_paths = []                              
for vol in range(1, 9):                      
    folder = f"archive/vol{vol:02d}"         
    file_paths += sorted(glob.glob(f"{folder}/*.pck"))  

print(f"Found {len(file_paths)} total image files")

Found 736 total image files


In [3]:
# Associate labels from metadata.csv to image file paths

meta = pd.read_csv("metadata.csv")
# Map filename -> numeric label
fn_to_label = dict(zip(meta['volumeFilename'], meta['aclDiagnosis']))
label_names = {0: 'Normal', 1: 'Torn', 2: 'Partially torn'}

# Build mapping list (file_path, numeric_label, label_name_or_None)
mapped = []
for p in file_paths:
    fname = os.path.basename(p)
    label = fn_to_label.get(fname, None)
    mapped.append((p, label, label_names.get(label) if label is not None else None))


# Summary
print("Label distribution (numeric):", Counter(l for _p, l, _n in mapped if l is not None))
print("Label distribution (names):", Counter(n for _p, l, n in mapped if n is not None))

Label distribution (numeric): Counter({0: 547, 1: 144, 2: 45})
Label distribution (names): Counter({'Normal': 547, 'Torn': 144, 'Partially torn': 45})


In [None]:
# Improved balanced training with simple augmentations to address class imbalance
MAX_SAMPLES = 300
IMG_SIZE = (64, 64)
hog_params = dict(orientations=9, pixels_per_cell=(8,8), cells_per_block=(2,2))
AUG_PER_SAMPLE = 1  # augment each drawn sample once when needed

# Build per-label lists
by_label = {}
for p, label, _n in mapped:
    if label is None:
        continue
    by_label.setdefault(label, []).append(p)

classes = sorted(by_label.keys())
n_classes = len(classes)
if n_classes == 0:
    raise RuntimeError('No labeled samples found')
target_per_class = max(1, MAX_SAMPLES // n_classes)

# Helper augmentation (works on resized 2D slices)
from skimage.transform import rotate as _rotate
def augment_slice(img):
    a = img.copy()
    if random.random() < 0.5:
        a = np.fliplr(a)
    ang = random.uniform(-10, 10)
    a = _rotate(a, ang, mode='edge', preserve_range=True)
    return a

fds = []
imgs = []
ys = []
failed = []
t0 = time.time()
for label in classes:
    paths = by_label[label]
    if len(paths) == 0:
        continue
    # Sample with replacement until we reach target_per_class
    while sum(1 for y in ys if y == label) < target_per_class:
        p = random.choice(paths)
        try:
            img = load_pickle_image(p)
        except Exception as e:
            failed.append((p, str(e)))
            continue
        arr = np.asarray(img)
        sl = arr[arr.shape[0] // 2] if arr.ndim == 3 else arr
        if sl.ndim == 3:
            sl = rgb2gray(sl)
        sl_resized = resize(sl, IMG_SIZE, anti_aliasing=True)
        # original
        fd = hog((sl_resized * 255).astype(np.uint8), orientations=hog_params['orientations'], pixels_per_cell=hog_params['pixels_per_cell'], cells_per_block=hog_params['cells_per_block'], visualize=False, feature_vector=True)
        fds.append(fd)
        imgs.append(np.expand_dims(sl_resized.astype(np.float32), -1))
        ys.append(label)
        # augmentation if still need more for this class
        if sum(1 for y in ys if y == label) < target_per_class:
            for _ in range(AUG_PER_SAMPLE):
                aug = augment_slice(sl_resized)
                fd2 = hog((aug * 255).astype(np.uint8), orientations=hog_params['orientations'], pixels_per_cell=hog_params['pixels_per_cell'], cells_per_block=hog_params['cells_per_block'], visualize=False, feature_vector=True)
                fds.append(fd2)
                imgs.append(np.expand_dims(aug.astype(np.float32), -1))
                ys.append(label)
t1 = time.time()
print(f'Preprocessing done — {len(fds)} items (failed {len(failed)}) in {t1-t0:.1f}s')

# Truncate or pad to MAX_SAMPLES
if len(fds) > MAX_SAMPLES:
    idx = random.sample(range(len(fds)), MAX_SAMPLES)
    fds = [fds[i] for i in idx]
    imgs = [imgs[i] for i in idx]
    ys = [ys[i] for i in idx]

X = np.vstack(fds)
y = np.array(ys)
X_img = np.stack(imgs)
y_img = np.array(ys)

# Compute class weights for CNN
from sklearn.utils import class_weight
unique = np.unique(y_img)
cw = class_weight.compute_class_weight('balanced', classes=unique, y=y_img)
class_weight_dict = {int(c): float(w) for c, w in zip(unique, cw)}
print('Class weights:', class_weight_dict)

# Train/test split for HOG features
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# SVM (balanced)
svm = SVC(kernel='rbf', class_weight='balanced', random_state=42)
svm.fit(X_tr, y_tr)
y_svm = svm.predict(X_te)
print('\nSVM acc:', accuracy_score(y_te, y_svm))
print(classification_report(y_te, y_svm))

# Logistic Regression
logr = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
logr.fit(X_tr, y_tr)
y_log = logr.predict(X_te)
print('\nLogReg acc:', accuracy_score(y_te, y_log))
print(classification_report(y_te, y_log))

# CNN training with class weights and small augmentations baked in
Xtr_img, Xte_img, ytr_img, yte_img = train_test_split(X_img, y_img, test_size=0.2, stratify=y_img, random_state=42)
n_classes = len(np.unique(y_img))
model = models.Sequential([
    layers.Input(shape=Xtr_img.shape[1:]),
    layers.RandomFlip('horizontal'),
    layers.RandomRotation(0.05),
    layers.Conv2D(16, 3, activation='relu'),
    layers.MaxPool2D(),
    layers.Conv2D(32, 3, activation='relu'),
    layers.MaxPool2D(),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(n_classes, activation='softmax'),
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
print('\nTraining small CNN (5 epochs) with class weights...')
model.fit(Xtr_img, ytr_img, epochs=5, batch_size=16, validation_data=(Xte_img, yte_img), class_weight=class_weight_dict, verbose=2)
loss, acc = model.evaluate(Xte_img, yte_img, verbose=0)
print('\nCNN test acc:', acc)

Preprocessing done — 300 items (failed 0) in 133.3s
Class weights: {0: 1.0, 1: 1.0, 2: 1.0}

SVM acc: 0.5833333333333334
              precision    recall  f1-score   support

           0       0.52      0.70      0.60        20
           1       0.58      0.35      0.44        20
           2       0.67      0.70      0.68        20

    accuracy                           0.58        60
   macro avg       0.59      0.58      0.57        60
weighted avg       0.59      0.58      0.57        60


LogReg acc: 0.6333333333333333
              precision    recall  f1-score   support

           0       0.60      0.60      0.60        20
           1       0.69      0.45      0.55        20
           2       0.63      0.85      0.72        20

    accuracy                           0.63        60
   macro avg       0.64      0.63      0.62        60
weighted avg       0.64      0.63      0.62        60


Training small CNN (5 epochs) with class weights...
Epoch 1/5
15/15 - 1s - 48ms/step