In [22]:
# Standard library imports
import os
import time
import warnings
import random
import glob
import pickle
from pathlib import Path
from typing import Tuple, Dict, List

# Third-party imports
import numpy as np
import pandas as pd
import requests
import zipfile
import cv2
from PIL import Image
from skimage import io, transform
from skimage.feature import hog
from skimage import data, exposure
from skimage.color import rgb2gray
from skimage.transform import resize

# ML & utilities (kept at top for convenience)
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Data visualization
import matplotlib.pyplot as plt
plt.ion()  # interactive mode

# Ignore warnings
warnings.filterwarnings("ignore")

In [23]:
def load_pickle_image(file_path):
    with open(file_path, "rb") as f:         
        return pickle.load(f)                 

# Get all .pck file paths (don't load all images at once - too memory intensive!)
file_paths = []                              
for vol in range(1, 9):                      
    folder = f"archive/vol{vol:02d}"         
    file_paths += sorted(glob.glob(f"{folder}/*.pck"))  

print(f"Found {len(file_paths)} total image files")
# Load images individually as needed, not all at once

Found 736 total image files


In [24]:
# Associate labels from metadata.csv to image file paths
import pandas as pd
from collections import Counter

# Read metadata
meta = pd.read_csv("metadata.csv")
# Map filename -> numeric label
fn_to_label = dict(zip(meta['volumeFilename'], meta['aclDiagnosis']))
label_names = {0: 'Normal', 1: 'Torn', 2: 'Partially torn'}

# Build mapping list (file_path, numeric_label, label_name_or_None)
mapped = []
for p in file_paths:
    fname = os.path.basename(p)
    label = fn_to_label.get(fname, None)
    mapped.append((p, label, label_names.get(label) if label is not None else None))


# Summary
print("Label distribution (numeric):", Counter(l for _p, l, _n in mapped if l is not None))
print("Label distribution (names):", Counter(n for _p, l, n in mapped if n is not None))

# Show 10 examples
mapped[:10]

Label distribution (numeric): Counter({0: 547, 1: 144, 2: 45})
Label distribution (names): Counter({'Normal': 547, 'Torn': 144, 'Partially torn': 45})


[('archive/vol01/329637-8.pck', 0, 'Normal'),
 ('archive/vol01/390116-9.pck', 0, 'Normal'),
 ('archive/vol01/404663-8.pck', 1, 'Torn'),
 ('archive/vol01/406320-9.pck', 0, 'Normal'),
 ('archive/vol01/412857-8.pck', 0, 'Normal'),
 ('archive/vol01/412865-8.pck', 1, 'Torn'),
 ('archive/vol01/415102-9.pck', 0, 'Normal'),
 ('archive/vol01/425707-8.pck', 0, 'Normal'),
 ('archive/vol01/425713-8.pck', 0, 'Normal'),
 ('archive/vol01/437474-8.pck', 0, 'Normal')]

In [None]:
# Parameters
MAX_SAMPLES = 300
IMG_SIZE = (64, 64)
hog_params = dict(orientations=9, pixels_per_cell=(8,8), cells_per_block=(2,2))

# Build per-label lists then sample without loading files yet
by_label = {}
for p, label, _n in mapped:
    by_label.setdefault(label, []).append(p)

sampled = []
total = len(mapped)
for label, paths in by_label.items():
    k = max(1, int(len(paths) / total * MAX_SAMPLES))
    k = min(k, len(paths))
    sampled += [(p, label) for p in random.sample(paths, k)]
if len(sampled) > MAX_SAMPLES:
    sampled = random.sample(sampled, MAX_SAMPLES)

print(f'Using {len(sampled)} sampled items (seed=42)')

# Now load only sampled files and extract HOG features
X = []
y = []
for p, label in sampled:
    img = load_pickle_image(p)
    arr = np.asarray(img)
    sl = arr[arr.shape[0] // 2] if arr.ndim == 3 else arr
    if sl.ndim == 3:
        sl = rgb2gray(sl)
    sl_resized = resize(sl, IMG_SIZE, anti_aliasing=True)
    fd = hog((sl_resized * 255).astype(np.uint8), orientations=hog_params['orientations'], pixels_per_cell=hog_params['pixels_per_cell'], cells_per_block=hog_params['cells_per_block'], visualize=False, feature_vector=True)
    X.append(fd)
    y.append(label)

X = np.vstack(X)
y = np.array(y)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# SVM
svm = SVC(kernel='rbf', class_weight='balanced', random_state=42)
svm.fit(X_tr, y_tr)
y_svm = svm.predict(X_te)
print('\nSVM acc:', accuracy_score(y_te, y_svm))
print(classification_report(y_te, y_svm))

# Logistic Regression
logr = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
logr.fit(X_tr, y_tr)
y_log = logr.predict(X_te)
print('\nLogReg acc:', accuracy_score(y_te, y_log))
print(classification_report(y_te, y_log))


import tensorflow as tf
from tensorflow.keras import layers, models

# Build small image dataset from sampled files
X_img = []
y_img = []
for p, label in sampled:
    img = load_pickle_image(p)
    arr = np.asarray(img)
    sl = arr[arr.shape[0] // 2] if arr.ndim == 3 else arr
    if sl.ndim == 3:
        sl = rgb2gray(sl)
    sl_resized = resize(sl, IMG_SIZE, anti_aliasing=True)
    X_img.append(np.expand_dims(sl_resized.astype(np.float32), -1))
    y_img.append(label)
X_img = np.stack(X_img)
y_img = np.array(y_img)

Xtr_img, Xte_img, ytr_img, yte_img = train_test_split(X_img, y_img, test_size=0.2, stratify=y_img, random_state=42)

n_classes = len(np.unique(y_img))
model = models.Sequential([
    layers.Input(shape=Xtr_img.shape[1:]),
    layers.Conv2D(16, 3, activation='relu'),
    layers.MaxPool2D(),
    layers.Conv2D(32, 3, activation='relu'),
    layers.MaxPool2D(),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(n_classes, activation='softmax'),
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
print('\nTraining small CNN (3 epochs)...')
model.fit(Xtr_img, ytr_img, epochs=3, batch_size=16, validation_data=(Xte_img, yte_img), verbose=2)
loss, acc = model.evaluate(Xte_img, yte_img, verbose=0)
print('\nCNN test acc:', acc)

Using 298 sampled items (seed=42)

SVM acc: 0.7
              precision    recall  f1-score   support

           0       0.84      0.82      0.83        45
           1       0.33      0.42      0.37        12
           2       0.00      0.00      0.00         3

    accuracy                           0.70        60
   macro avg       0.39      0.41      0.40        60
weighted avg       0.70      0.70      0.70        60


LogReg acc: 0.7166666666666667
              precision    recall  f1-score   support

           0       0.85      0.89      0.87        45
           1       0.33      0.25      0.29        12
           2       0.00      0.00      0.00         3

    accuracy                           0.72        60
   macro avg       0.39      0.38      0.39        60
weighted avg       0.70      0.72      0.71        60


SVM acc: 0.7
              precision    recall  f1-score   support

           0       0.84      0.82      0.83        45
           1       0.33      0.42  