# Install Required Libraries
-  The project is implemented using following libraries in Python:

In [None]:
pip install pandas numpy opencv-python matplotlib scikit-learn mahotas tqdm scikit-image

In [1]:
import pandas as pd
import os
import cv2
import numpy as np
from tqdm import tqdm
import joblib
from joblib import parallel_backend
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from skimage.feature import hog, graycomatrix, graycoprops
import mahotas
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA
from sklearn.feature_selection import VarianceThreshold

# Read the Data
- Load the Metadata CSV file:

In [7]:
# Path to dataset folder
DATASET_PATH = "dataset" 
IMAGE_DIRS = [os.path.join(DATASET_PATH, f"images_{str(i).zfill(3)}/images") for i in range(1, 13)]  # images_001 to 12
CSV_PATH = os.path.join(DATASET_PATH, "Data_Entry_2017.csv")

# Load dataset metadata
df = pd.read_csv(CSV_PATH)
df = df.drop(columns=["Unnamed: 11"], errors="ignore")

# Show dataset info
print("Dataset shape:", df.shape)
print(df)

Dataset shape: (112120, 11)
             Image Index          Finding Labels  Follow-up #  Patient ID  \
0       00000001_000.png            Cardiomegaly            0           1   
1       00000001_001.png  Cardiomegaly|Emphysema            1           1   
2       00000001_002.png   Cardiomegaly|Effusion            2           1   
3       00000002_000.png              No Finding            0           2   
4       00000003_000.png                  Hernia            0           3   
...                  ...                     ...          ...         ...   
112115  00030801_001.png          Mass|Pneumonia            1       30801   
112116  00030802_000.png              No Finding            0       30802   
112117  00030803_000.png              No Finding            0       30803   
112118  00030804_000.png              No Finding            0       30804   
112119  00030805_000.png              No Finding            0       30805   

        Patient Age Patient Gender View Positio

- Dataset Information:

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112120 entries, 0 to 112119
Data columns (total 11 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Image Index                  112120 non-null  object 
 1   Finding Labels               112120 non-null  object 
 2   Follow-up #                  112120 non-null  int64  
 3   Patient ID                   112120 non-null  int64  
 4   Patient Age                  112120 non-null  int64  
 5   Patient Gender               112120 non-null  object 
 6   View Position                112120 non-null  object 
 7   OriginalImage[Width          112120 non-null  int64  
 8   Height]                      112120 non-null  int64  
 9   OriginalImagePixelSpacing[x  112120 non-null  float64
 10  y]                           112120 non-null  float64
dtypes: float64(2), int64(5), object(4)
memory usage: 9.4+ MB


# Define Labels for Classification:

In [9]:
# List of all 14 diseases
all_diseases = [
    "Atelectasis", "Cardiomegaly", "Consolidation", "Edema", "Effusion",
    "Emphysema", "Fibrosis", "Hernia", "Infiltration", "Mass", "Nodule",
    "Pleural_Thickening", "Pneumonia", "Pneumothorax"
]

# 🔹 Binary Classification (Finding vs. No Finding)
df["Binary_Label"] = df["Finding Labels"].apply(lambda x: 0 if x == "No Finding" else 1)

# 🔹 Multi-Label Classification
for disease in all_diseases:
    df[disease] = df["Finding Labels"].apply(lambda x: 1 if disease in x else 0)

# Keep only necessary columns
df = df[["Image Index", "Binary_Label"] + all_diseases]

# Show processed data
print(df.head())


OUTPUT_LABELS_DIR = "classification_labels"
os.makedirs(OUTPUT_LABELS_DIR, exist_ok=True)

        Image Index  Binary_Label  Atelectasis  Cardiomegaly  Consolidation  \
0  00000001_000.png             1            0             1              0   
1  00000001_001.png             1            0             1              0   
2  00000001_002.png             1            0             1              0   
3  00000002_000.png             0            0             0              0   
4  00000003_000.png             1            0             0              0   

   Edema  Effusion  Emphysema  Fibrosis  Hernia  Infiltration  Mass  Nodule  \
0      0         0          0         0       0             0     0       0   
1      0         0          1         0       0             0     0       0   
2      0         1          0         0       0             0     0       0   
3      0         0          0         0       0             0     0       0   
4      0         0          0         0       1             0     0       0   

   Pleural_Thickening  Pneumonia  Pneumothorax  
0

- Save Labels as .npy Files

In [10]:
# Save Binary Labels
binary_labels = df["Binary_Label"].values  # Extract binary labels as NumPy array
np.save(os.path.join(OUTPUT_LABELS_DIR, "binary_labels.npy"), binary_labels)

# Save Multi-Label Classification Labels
multi_labels = df[all_diseases].values  # Extract multi-labels as NumPy array
np.save(os.path.join(OUTPUT_LABELS_DIR, "multi_labels.npy"), multi_labels)

print("✅ Labels saved as binary_labels.npy and multi_labels.npy")

✅ Labels saved as binary_labels.npy and multi_labels.npy


# Load and Preprocess Images

In [2]:
# Define Image Size and Output Directory
DATASET_PATH = "dataset" 
RESIZED_DIR = "resized_images"  # Folder to save resized images
CSV_PATH = os.path.join(DATASET_PATH, "Data_Entry_2017.csv")
IMAGE_DIRS = [os.path.join(DATASET_PATH, d) for d in os.listdir(DATASET_PATH) if d.startswith("images_")]
IMG_SIZE = (112, 112)  # Resize to 112x112

os.makedirs(RESIZED_DIR, exist_ok=True)  # Create folder if not exist

# Load metadata CSV
df = pd.read_csv(CSV_PATH)  # Update with your metadata file

# Function to check folder existence
def check_folders():
    for folder in IMAGE_DIRS:
        images_subdir = os.path.join(folder, "images")  # Always use os.path.join()
        if not os.path.exists(images_subdir):
            print(f"❌ 'images' folder is MISSING in {images_subdir}")
        else:
            print(f"✅ Found 'images' folder in {images_subdir}")

# Run the check
check_folders()

# Function to find image paths dynamically (inside "images" subfolder)
def get_image_path(image_index):
    """
    Constructs the image file path dynamically based on the known folder structure.
    """
    for folder in IMAGE_DIRS:  # Iterate over 'images_001' to 'images_012'
        img_folder = os.path.join(folder, "images")  # Navigate inside "images" subfolder
        img_path = os.path.join(img_folder, image_index)

        if os.path.exists(img_path):
            return img_path

# Function to resize and save images
def resize_and_save_images(df):
    unique_images = set()  # Track unique images
    
    for i, image_index in enumerate(df["Image Index"]):  # Assuming 'Image Index' column exists
        img_path = get_image_path(image_index)
        if img_path is None:
            print(f"❌ Warning: Image {image_index} not found in any folder.")
            continue  # Skip missing images

        try:
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)  # Load as grayscale
            if img is None:
                print(f"❌ Warning: Could not read {img_path}")
                continue  # Skip if image can't be read

            img = cv2.resize(img, IMG_SIZE)  # Resize
            save_path = os.path.join(RESIZED_DIR, image_index + ".png")  # Ensure extension
            cv2.imwrite(save_path, img)  # Save image

            # Track unique images
            unique_images.add(img.tobytes())

            if i % 5000 == 0:
                print(f" Processed {i} images...")

        except Exception as e:
            print(f"⚠️ Error processing {img_path}: {e}")

    print(f"✅ Resizing complete! Unique images count: {len(unique_images)}")

# Function to load resized images
def load_resized_images(image_folder):
    images = []
    filenames = sorted(os.listdir(image_folder))  # Ensure order consistency

    for filename in filenames:
        img_path = os.path.join(image_folder, filename)
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)  # Load resized image
        
        if img is None:
            print(f"❌ Error: Could not load {img_path}")
            continue

        img = img / 255.0  # Normalize pixel values
        images.append(img.flatten())  # Flatten for ML

    return np.array(images)

# Run the resizing function
resize_and_save_images(df)

# Load resized images
X = load_resized_images(RESIZED_DIR)
print("✅ Loaded resized images shape:", X.shape)

✅ Found 'images' folder in dataset\images_001\images
✅ Found 'images' folder in dataset\images_002\images
✅ Found 'images' folder in dataset\images_003\images
✅ Found 'images' folder in dataset\images_004\images
✅ Found 'images' folder in dataset\images_005\images
✅ Found 'images' folder in dataset\images_006\images
✅ Found 'images' folder in dataset\images_007\images
✅ Found 'images' folder in dataset\images_008\images
✅ Found 'images' folder in dataset\images_009\images
✅ Found 'images' folder in dataset\images_010\images
✅ Found 'images' folder in dataset\images_011\images
✅ Found 'images' folder in dataset\images_012\images
✅ Processed 0 images...
✅ Processed 5000 images...
✅ Processed 10000 images...
✅ Processed 15000 images...
✅ Processed 20000 images...
✅ Processed 25000 images...
✅ Processed 30000 images...
✅ Processed 35000 images...
✅ Processed 40000 images...
✅ Processed 45000 images...
✅ Processed 50000 images...
✅ Processed 55000 images...
✅ Processed 60000 images...
✅ Pro

# Apply PCA for Dimensionality Reduction

In [5]:
# ✅ Define Directories & Parameters
RESIZED_DIR = "resized_images"  # Directory containing resized images
BATCH_SIZE = 5000  # Adjust based on RAM size
N_COMPONENTS = 100  # Adjust based on experiments

# Get sorted filenames
image_files = sorted(os.listdir(RESIZED_DIR))

# Define PCA model
pca = IncrementalPCA(n_components=N_COMPONENTS, batch_size=BATCH_SIZE)

# ✅ First Pass: Fit PCA on Batches
print("Fitting PCA incrementally on image batches...")

for i in tqdm(range(0, len(image_files), BATCH_SIZE), desc="Processing Batches", unit="batch"):
    batch_images = []

    for filename in image_files[i:i + BATCH_SIZE]:
        img_path = os.path.join(RESIZED_DIR, filename)
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)

        # ✅ Fix: Ensure the image is loaded correctly
        if img is None:
            print(f"⚠️ Warning: Could not read {filename}, skipping.")
            continue

        img = img / 255.0  # Normalize
        img_flattened = img.flatten()

        # ✅ Fix: Skip images that are all zeros (empty)
        if np.all(img_flattened == 0):
            print(f"⚠️ Warning: {filename} is all zeros, skipping.")
            continue

        batch_images.append(img_flattened)

    # ✅ Fix: Ensure batch is not empty before calling PCA
    if len(batch_images) == 0:
        print(f"⚠️ Skipping empty batch {i // BATCH_SIZE + 1}.")
        continue

    batch_images = np.array(batch_images)
    
    # Fit PCA incrementally
    pca.partial_fit(batch_images)

print("✅ PCA fitting completed.")

# ✅ Second Pass: Transform Data using PCA
X_pca = []
print("Transforming images using trained PCA...")

for i in tqdm(range(0, len(image_files), BATCH_SIZE), desc="Transforming Batches", unit="batch"):
    batch_images = []

    for filename in image_files[i:i + BATCH_SIZE]:
        img_path = os.path.join(RESIZED_DIR, filename)
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)

        if img is None:
            continue

        img = img / 255.0
        img_flattened = img.flatten()

        if np.all(img_flattened == 0):
            continue

        batch_images.append(img_flattened)

    if len(batch_images) == 0:
        continue

    batch_images = np.array(batch_images)
    X_pca.append(pca.transform(batch_images))

# ✅ Convert to Array & Save
X_pca = np.vstack(X_pca)
X_pca = np.array(X_pca, dtype=np.float16)  # Reduce storage size
np.save("pca_features.npy", X_pca)

print("✅ PCA transformation completed and saved as 'pca_features.npy'.")

🔄 Fitting PCA incrementally on image batches...


Processing Batches: 100%|██████████████████████████████████████████████████████████| 23/23 [47:36<00:00, 124.19s/batch]


✅ PCA fitting completed.
🔄 Transforming images using trained PCA...


Transforming Batches: 100%|█████████████████████████████████████████████████████████| 23/23 [01:19<00:00,  3.44s/batch]

✅ PCA transformation completed and saved as 'pca_features.npy'.





# Train Binary Classification Model

In [2]:
# Define file paths
PCA_FEATURES_FILE = "pca_features.npy"  # Single file instead of batches
LABEL_PATH = "classification_labels" 
BINARY_LABELS_FILE = os.path.join(LABEL_PATH, "binary_labels.npy")

def train_svm_binary_classifier(
    X_pca, 
    y, 
    test_size=0.2, 
    random_state=42, 
    kernel='rbf', 
    C=1.0
):
    """
    Train an SVM Binary Classifier with progress tracking.
    
    Parameters:
    -----------
    X_pca : numpy.ndarray
        PCA-transformed feature matrix
    y : numpy.ndarray
        Binary labels
    test_size : float, optional (default=0.2)
        Proportion of the dataset to include in the test split
    random_state : int, optional (default=42)
        Controls the shuffling applied to the data before splitting
    kernel : str, optional (default='rbf')
        Specifies the kernel type to be used in the algorithm
    C : float, optional (default=1.0)
        Regularization parameter
    
    Returns:
    --------
    tuple: (trained_svm_model, training_metrics)
    """
    # Print initial data information
    print("\n Dataset Information:")
    print(f"Total Samples: {X_pca.shape[0]}")
    print(f"Feature Dimensions: {X_pca.shape[1]}")
    print(f"Label Distribution:\n{np.bincount(y)}")
    
    # Stratified train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_pca, y, 
        test_size=test_size, 
        random_state=random_state, 
        stratify=y
    )
    
    # Print split information
    print("\n Data Split:")
    print(f"Training Samples: {X_train.shape[0]}")
    print(f"Test Samples: {X_test.shape[0]}")
    
    # Initialize SVM Classifier
    svm_model = SVC(
        kernel=kernel, 
        C=C, 
        random_state=random_state,
        probability=True  # Enable probability estimates
    )
    
    # Training with progress bar
    print("\n Training SVM Classifier...")
    with tqdm(total=1, desc="Training Progress", unit="model") as pbar:
        svm_model.fit(X_train, y_train)
        pbar.update(1)
    
    # Evaluation with progress bar
    print("\n Model Evaluation...")
    with tqdm(total=1, desc="Evaluation Progress", unit="eval") as pbar:
        y_pred = svm_model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        pbar.update(1)
    
    # Print evaluation metrics
    print(f"\n✅ Model Accuracy: {accuracy:.4f}")
    print("\n Detailed Classification Report:")
    print(classification_report(y_test, y_pred))
    
    # Prepare training metrics
    training_metrics = {
        'accuracy': accuracy,
        'total_samples': X_pca.shape[0],
        'training_samples': X_train.shape[0],
        'test_samples': X_test.shape[0],
        'kernel': kernel,
        'C': C
    }
    
    return svm_model, training_metrics
    
# Load PCA-transformed features
X_pca = np.load(PCA_FEATURES_FILE)
print("Loaded PCA Features Shape:", X_pca.shape)
    
# Load binary labels (Finding vs. No Finding)
y = np.load(BINARY_LABELS_FILE)
print("Labels Shape:", y.shape)
    
# Ensure feature and label count match
assert X_pca.shape[0] == y.shape[0], "Error: Features and labels count mismatch!"
    
# Train SVM Classifier
svm_model, metrics = train_svm_binary_classifier(X_pca, y)
    
# Save Trained Model
joblib.dump(svm_model, "svm_binary_model.pkl")
print("\n✅ Model saved as 'svm_binary_model.pkl'.")

Loaded PCA Features Shape: (112120, 100)
Labels Shape: (112120,)

 Dataset Information:
Total Samples: 112120
Feature Dimensions: 100
Label Distribution:
[60361 51759]

 Data Split:
Training Samples: 89696
Test Samples: 22424

 Training SVM Classifier...


Training Progress: 100%|██████████████████████████████████████████████████████████| 1/1 [1:30:53<00:00, 5453.68s/model]



 Model Evaluation...


Evaluation Progress: 100%|████████████████████████████████████████████████████████████| 1/1 [02:31<00:00, 151.31s/eval]


✅ Model Accuracy: 0.6593

 Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.75      0.70     12072
           1       0.66      0.55      0.60     10352

    accuracy                           0.66     22424
   macro avg       0.66      0.65      0.65     22424
weighted avg       0.66      0.66      0.66     22424


✅ Model saved as 'svm_binary_model.pkl'.





# Train Multi-Label Classification Model 

In [4]:
# Directories
IMAGE_DIR = "resized_images"  # Directory containing resized images
LABEL_PATH = "classification_labels"
MULTILABEL_FILE = os.path.join(LABEL_PATH, "multi_labels.npy")

# Image Preprocessing Parameters
IMAGE_SIZE = (112, 112)  # Ensure this matches your resizing process

# Load all images as flattened feature vectors
image_files = sorted(os.listdir(IMAGE_DIR))  # Ensure order matches labels
X_images = []

print("Loading and processing images...")
for img_file in tqdm(image_files, desc="Processing Images", unit="img"):
    img_path = os.path.join(IMAGE_DIR, img_file)
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)  # Load as grayscale
    img = cv2.resize(img, IMAGE_SIZE)  # Resize if needed
    X_images.append(img.flatten())  # Flatten into 1D vector

# Convert to NumPy array
X_images = np.array(X_images, dtype=np.float32) / 255.0  # Normalize pixel values
print("Loaded Image Data Shape:", X_images.shape)

# Load multi-label classification labels
y_multi = np.load(MULTILABEL_FILE)  # Ensure labels match image count
print("Labels Shape:", y_multi.shape)

# Ensure feature and label count match
assert X_images.shape[0] == y_multi.shape[0], "Error: Image count and labels count mismatch!"

# Debug Function
# print(f"Number of images: {len(image_files)}, Number of labels: {len(y_multi)}")
# print("Label Distribution:\n", np.sum(y_multi, axis=0))  # Sum labels per class
# print("Unique values in labels:", np.unique(y_multi))
# print("Mean pixel values per image:", np.mean(X_images, axis=1)[:10])  # Check first 10 images
# print("Standard deviation per image:", np.std(X_images, axis=1)[:10])
# print("Are all images identical?", np.all(X_images[0] == X_images))


# Split Data: 80% Train, 20% Test
X_train, X_test, y_train, y_test = train_test_split(X_images, y_multi, test_size=0.2, random_state=42)

def train_multi_output_random_forest(X_train, y_train, n_estimators=50, random_state=42):
    """
    Train a Multi-Output Random Forest Classifier with a progress bar.
    
    Parameters:
    -----------
    X_train : array-like of shape (n_samples, n_features)
        Training data features
    y_train : array-like of shape (n_samples, n_outputs)
        Training data multi-label targets
    n_estimators : int, optional (default=50)
        Number of trees in the forest
    random_state : int, optional (default=42)
        Controls randomness of the model
    
    Returns:
    --------
    MultiOutputClassifier: Trained multi-output random forest model
    """
    # Create the multi-output classifier
    rf_model = MultiOutputClassifier(
        RandomForestClassifier(
            n_estimators=n_estimators, 
            random_state=random_state,
            n_jobs=-1  # Use all available cores
        )
    )
    
    # Wrap the fitting process with tqdm
    with tqdm(total=1, desc="Training Multi-Output Random Forest", unit="model") as pbar:
        rf_model.fit(X_train, y_train)
        pbar.update(1)
    
    print("\n✅ Training Completed Successfully!")
    
    # Print additional training information
    print("\n🔍 Training Details:")
    print(f"- Number of estimators: {n_estimators}")
    print(f"- Training data shape: {X_train.shape}")
    print(f"- Number of output labels: {y_train.shape[1]}")
    
    return rf_model

rf_model = train_multi_output_random_forest(X_train, y_train)

# Evaluate Model
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"✅ Model Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred, zero_division=1))

# Check label distribution
print("Label distribution in training data:")
print(np.sum(y_train, axis=0))
print("\nLabel distribution in test data:")
print(np.sum(y_test, axis=0))

# Save Trained Model
joblib.dump(rf_model, "random_forest_multi_label.pkl")
print("✅ Multi-Label Model saved as 'random_forest_multi_label.pkl'.")

Loading and processing images...


Processing Images: 100%|█████████████████████████████████████████████████████| 112120/112120 [09:51<00:00, 189.59img/s]


Loaded Image Data Shape: (112120, 12544)
Labels Shape: (112120, 14)


Training Multi-Output Random Forest: 100%|██████████████████████████████████████████| 1/1 [38:46<00:00, 2326.68s/model]



✅ Training Completed Successfully!

🔍 Training Details:
- Number of estimators: 50
- Training data shape: (89696, 12544)
- Number of output labels: 14
✅ Model Accuracy: 0.5394
              precision    recall  f1-score   support

           0       0.21      0.00      0.01      2225
           1       1.00      0.00      0.00       557
           2       0.00      0.00      0.00       894
           3       1.00      0.00      0.00       446
           4       0.44      0.02      0.04      2625
           5       0.00      0.00      0.00       487
           6       0.00      0.00      0.00       319
           7       1.00      0.00      0.00        46
           8       0.39      0.02      0.03      3951
           9       0.18      0.00      0.00      1141
          10       0.00      0.00      0.00      1285
          11       0.00      0.00      0.00       663
          12       0.00      0.00      0.00       294
          13       0.17      0.00      0.00      1046

   micro av

# Load and Use Model

In [None]:
# Load PCA model
pca_loaded = joblib.load("pca_model.pkl")

# Load binary classification SVM model
svm_loaded = joblib.load("svm_model.pkl")

# Load multi-label classification model
rf_loaded = joblib.load("random_forest_multi_label.pkl")

# Predict on a new sample (assume X_new_features is preprocessed)
y_new_pred = rf_model.predict(X_new_features)
print("Predicted Labels:", y_new_pred)