### Libraries

In [1]:
# Import libraries

# Essential imports for ensemble learning pipeline
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# PyTorch deep learning framework
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR
import torchvision
from torchvision import models, transforms
import timm  # For Xception and other models

# Data augmentation
import albumentations as A
from albumentations.pytorch import ToTensorV2

# Machine learning utilities
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, roc_curve, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# Genetic algorithm for hyperparameter optimization
import deap
from deap import base, creator, tools, algorithms
import random
from tqdm import tqdm
import time

import datetime
import os
from pathlib import Path
import h5py
import numpy as np
import pydicom
from pydicom.dataset import Dataset, FileMetaDataset
from pydicom.uid import generate_uid
import xmltodict

import pydicom
import numpy as np
from PIL import Image
from pathlib import Path
import os

from pathlib import Path

### GPU Setup

In [2]:
# Reproducibility: seed Python's `random`, NumPy and PyTorch with the same value.
# The CUDA generators are only seeded when a GPU is actually present.
has_cuda = torch.cuda.is_available()

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if has_cuda:
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)

# Ask cuDNN for deterministic kernels and switch off its autotuner
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Pick the compute device once; later cells reuse `device`
device = torch.device('cuda') if has_cuda else torch.device('cpu')

# Report the environment so the run can be identified from its output
print("Environment setup complete.")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {has_cuda}")
if has_cuda:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA memory: {gpu_props.total_memory / 1024**3:.1f} GB")
print(f"Using device: {device}")


Environment setup complete.
PyTorch version: 2.8.0+cu126
CUDA available: True
CUDA device: NVIDIA GeForce RTX 3060
CUDA memory: 12.0 GB
Using device: cuda


### Data Processor Class

In [3]:
# Load and process annotations from knee.csv
class DataProcessor:
    """Load knee annotations from a CSV and pair exported PNG slices with labels.

    Typical use is ``load_annotations()`` -> ``create_target_labels()`` ->
    ``get_available_images()``.

    Attributes:
        csv_path: ``Path`` to the annotation CSV.
        png_dir: ``Path`` to the directory holding ``<file_id>_<slice>.png`` images.
        df: Annotation ``DataFrame`` (``None`` until ``load_annotations`` runs).
        subject_labels: Mapping of subject key -> ``'ACL_tear'`` / ``'Meniscus_tear'``
            filled in by ``create_target_labels``.
    """

    # Substrings matched case-insensitively against the 'label' column.
    ACL_KEYWORDS = ('ACL', 'Anterior Cruciate', 'Anterior Cruciate Ligament',
                    'ACL High Grade Sprain', 'ACL Low Grade sprain')
    MENISCUS_KEYWORDS = ('Meniscus', 'Meniscus Tear')

    # Suffixes appended to a file id when one file carries both findings.
    ACL_SUFFIX = '_ACL'
    MENISCUS_SUFFIX = '_Meniscus'

    def __init__(self, csv_path, png_dir):
        self.csv_path = Path(csv_path)
        self.png_dir = Path(png_dir)
        self.df = None
        self.subject_labels = {}

    def _require_annotations(self):
        """Return the loaded annotation frame or fail with an actionable message."""
        if self.df is None:
            raise RuntimeError("Annotations are not loaded; call load_annotations() first.")
        return self.df

    def load_annotations(self):
        """Read the annotation CSV into ``self.df`` and print a short summary.

        Returns:
            The loaded ``DataFrame``.

        Raises:
            KeyError: If the CSV lacks the 'file' or 'label' column.
        """
        annotations = pd.read_csv(self.csv_path)
        missing = {'file', 'label'} - set(annotations.columns)
        if missing:
            raise KeyError(f"{self.csv_path} is missing required column(s): {sorted(missing)}")

        self.df = annotations
        print(f"Loaded {len(self.df)} annotations")
        print(f"Unique files: {self.df['file'].nunique()}")
        print("Label distribution:")
        print(self.df['label'].value_counts())
        return self.df

    def create_target_labels(self):
        """Derive one target label per file from its annotation rows.

        A file whose labels mention an ACL keyword becomes ``'ACL_tear'``; one whose
        labels mention a meniscus keyword becomes ``'Meniscus_tear'``. A file with
        both findings produces two entries keyed ``<file_id>_ACL`` and
        ``<file_id>_Meniscus`` instead of a combined class. Files with neither
        finding are omitted from the result.

        Returns:
            Dict mapping subject key -> label, in order of first appearance of each
            file in the annotations. Also stored on ``self.subject_labels``.

        Raises:
            RuntimeError: If annotations have not been loaded.
        """
        annotations = self._require_annotations()
        acl_terms = [keyword.lower() for keyword in self.ACL_KEYWORDS]
        meniscus_terms = [keyword.lower() for keyword in self.MENISCUS_KEYWORDS]

        # Single pass over the rows: file_id -> [has_acl, has_meniscus].
        # Dict insertion order keeps files in order of first appearance.
        findings = {}
        for file_id, label in zip(annotations['file'], annotations['label']):
            # Rows with a missing file id or label cannot contribute a finding.
            if pd.isna(file_id) or pd.isna(label):
                continue
            text = str(label).lower()
            flags = findings.setdefault(file_id, [False, False])
            if not flags[0] and any(term in text for term in acl_terms):
                flags[0] = True
            if not flags[1] and any(term in text for term in meniscus_terms):
                flags[1] = True

        subject_conditions = {}
        for file_id, (has_acl, has_meniscus) in findings.items():
            if has_acl and has_meniscus:
                # Two separate entries rather than a "Both" class.
                subject_conditions[f"{file_id}{self.ACL_SUFFIX}"] = 'ACL_tear'
                subject_conditions[f"{file_id}{self.MENISCUS_SUFFIX}"] = 'Meniscus_tear'
            elif has_acl:
                subject_conditions[file_id] = 'ACL_tear'
            elif has_meniscus:
                subject_conditions[file_id] = 'Meniscus_tear'

        self.subject_labels = subject_conditions
        print("\nSubject-level label distribution:")
        label_counts = pd.Series(list(subject_conditions.values()), dtype='object').value_counts()
        print(label_counts)

        return subject_conditions

    def get_bounding_boxes(self, file_id, slice_num):
        """Return the annotated boxes for one slice of one file.

        Args:
            file_id: Value to match against the 'file' column.
            slice_num: Value to match against the 'slice' column.

        Returns:
            List of dicts with keys 'x', 'y', 'width', 'height', 'label'; empty when
            the slice has no annotations.

        Raises:
            RuntimeError: If annotations have not been loaded.
        """
        annotations = self._require_annotations()
        on_slice = (annotations['file'] == file_id) & (annotations['slice'] == slice_num)
        box_columns = ['x', 'y', 'width', 'height', 'label']
        return annotations.loc[on_slice, box_columns].to_dict('records')

    def get_available_images(self):
        """List PNG slices in ``png_dir`` that have an entry in ``self.subject_labels``.

        Filenames are expected to look like ``<file_id>_<slice>.png`` (for example
        ``file1000002_000.png``). Files without an underscore or with a non-numeric
        slice part are skipped. An image belonging to a file with both findings is
        returned twice, once under ``<file_id>_ACL`` and once under
        ``<file_id>_Meniscus``, each with its own label.

        Returns:
            List of dicts with keys 'path', 'file_id', 'slice', 'label', ordered by
            sorted file path so the result is reproducible across runs.
        """
        available_images = []

        # glob() yields entries in arbitrary order; sort for determinism.
        for png_file in sorted(self.png_dir.glob('*.png')):
            file_id, separator, slice_text = png_file.stem.rpartition('_')
            if not separator:
                continue
            try:
                slice_num = int(slice_text)
            except ValueError:
                # Not a slice export (e.g. "<id>_thumb.png"); ignore it.
                continue

            # The plain id and the two split-entry ids are all valid label keys.
            candidate_keys = (
                file_id,
                f"{file_id}{self.ACL_SUFFIX}",
                f"{file_id}{self.MENISCUS_SUFFIX}",
            )
            for key in candidate_keys:
                if key in self.subject_labels:
                    available_images.append({
                        'path': str(png_file),
                        'file_id': key,  # suffixed key keeps split entries distinct
                        'slice': slice_num,
                        'label': self.subject_labels[key]
                    })

        print(f"\nFound {len(available_images)} available images")
        return available_images

In [4]:
# Initialize data processor and test the new functionality.
# The project root defaults to the original location but can be redirected with
# the JOINTWISE_ROOT environment variable, so the notebook runs on other machines.
PROJECT_ROOT = Path(os.environ.get('JOINTWISE_ROOT', '/home/bictor0301/Code/JointWise'))
data_processor = DataProcessor(PROJECT_ROOT / 'annotations' / 'knee.csv', PROJECT_ROOT / 'png-output')
annotations = data_processor.load_annotations()
subject_labels = data_processor.create_target_labels()
available_images = data_processor.get_available_images()

print(f"\nReady to process {len(available_images)} images from {len(subject_labels)} subject entries")

# Let's also check if there are any cases that were previously "Both":
# those are the file ids that appear under both the _ACL and _Meniscus suffixes.
print("\nChecking for cases with both ACL and Meniscus tears:")
ACL_SUFFIX = '_ACL'
MENISCUS_SUFFIX = '_Meniscus'
original_files = set()
acl_files = set()
meniscus_files = set()

for file_id in subject_labels:
    if file_id.endswith(ACL_SUFFIX):
        acl_files.add(file_id[:-len(ACL_SUFFIX)])  # strip the suffix to recover the file id
    elif file_id.endswith(MENISCUS_SUFFIX):
        meniscus_files.add(file_id[:-len(MENISCUS_SUFFIX)])
    else:
        original_files.add(file_id)

both_cases = acl_files.intersection(meniscus_files)
print(f"Found {len(both_cases)} cases that have both ACL and Meniscus tears (now treated as separate entries)")
if both_cases:
    # Sorted so the examples shown are the same on every run.
    print(f"Examples: {sorted(both_cases)[:5]}")  # Show first 5 examples

Loaded 16167 annotations
Unique files: 974
Label distribution:
label
Meniscus Tear                                5658
Cartilage - Partial Thickness loss/defect    2985
Joint Effusion                               1311
Bone-Fracture/Contusion/dislocation          1060
Bone- Subchondral edema                       986
Periarticular cysts                           864
Ligament - ACL Low Grade sprain               765
Ligament - ACL High Grade Sprain              677
Cartilage - Full Thickness loss/defect        615
Ligament - MCL Low-Mod Grade Sprain           285
Displaced Meniscal Tissue                     232
Bone - Lesion                                 183
Ligament - PCL Low-Mod grade sprain           142
LCL Complex - Low-Mod Grade Sprain            130
Soft Tissue Lesion                             90
Muscle Strain                                  65
Joint Bodies                                   38
Patellar Retinaculum - High grade sprain       24
Ligament - PCL High Grade      