In [1]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
import time
from sklearn.preprocessing import LabelEncoder
from skimage.feature import graycomatrix, graycoprops
from skimage.feature import hog, local_binary_pattern, graycomatrix, graycoprops
from skimage.filters import gabor


In [2]:
# Download dataset
import kagglehub
path = kagglehub.dataset_download("fatemehmehrparvar/lung-disease")
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/fatemehmehrparvar/lung-disease?dataset_version_number=1...


100%|██████████| 34.9M/34.9M [00:00<00:00, 51.7MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/fatemehmehrparvar/lung-disease/versions/1


In [3]:
def load_images_from_folder(folder, label, augment=False):
    images = []
    labels = []

    for filename in os.listdir(folder):
        # Indent the following lines to be part of the for loop
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        img_path = os.path.join(folder, filename)
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is not None:
            # Preprocessing
            img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
            img = cv2.bilateralFilter(img, d=9, sigmaColor=75, sigmaSpace=75)
            img = cv2.equalizeHist(img)  # Histogram equalization

            images.append(img)
            labels.append(label)

            if augment:
                # More sophisticated augmentation
                augmentations = [
                    cv2.flip(img, 1),  # Horizontal flip
                    cv2.flip(img, 0),  # Vertical flip
                    cv2.rotate(img, cv2.ROTATE_90_CLOCKWISE),
                    cv2.GaussianBlur(img, (5,5), 0.5),
                    # Add the beta parameter (weight for the second image) and gamma
                    cv2.addWeighted(img, 0.8, cv2.GaussianBlur(img, (5,5), 0.2, 0), 0.2, 0)  # Mixup-like
                ]

                for aug_img in augmentations:
                    images.append(aug_img)
                    labels.append(label)

    return images, labels

In [4]:
def extract_color_histogram(images, bins=32):
    """Extract color histogram features"""
    features_list = []

    for img in images:
        if len(img.shape) == 2:  # Grayscale image
            hist = cv2.calcHist([img], [0], None, [bins], [0, 256])
            hist = hist.flatten()
        else:  # Color image
            # Compute histogram for each channel
            hist_r = cv2.calcHist([img], [0], None, [bins], [0, 256])
            hist_g = cv2.calcHist([img], [1], None, [bins], [0, 256])
            hist_b = cv2.calcHist([img], [2], None, [bins], [0, 256])

            # Concatenate and normalize
            hist = np.concatenate([hist_r, hist_g, hist_b]).flatten()

        # Normalize histogram
        hist = hist / (hist.sum() + 1e-7)
        features_list.append(hist)

    features=np.array(features_list)
    features = np.nan_to_num(features)

    # Dimensionality reduction
    pca = PCA(n_components=0.99)
    # Return both the transformed features and the fitted pca object
    return pca.fit_transform(features), pca

In [5]:
from pathlib import Path
base_dir = Path(path) / 'Lung X-Ray Image' / 'Lung X-Ray Image'
categories = ['Lung_Opacity', 'Normal', 'Viral Pneumonia']
X, y = [], []


In [6]:
IMG_SIZE = 200
RANDOM_STATE = 42

for category in categories:
    folder = os.path.join(base_dir, category)
    augment = True  # Augment all classes
    images, labels = load_images_from_folder(folder, category, augment=augment)
    X.extend(images)
    y.extend(labels)

X = np.array(X)
y = np.array(y)

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)



In [7]:
# Extract features
print("Extracting combined features...")
X_features, feature_pca = extract_color_histogram(X)



Extracting combined features...


In [8]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

In [9]:
# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
rf_model=RandomForestClassifier(bootstrap=False, max_depth=None,
                                min_samples_split=5, n_estimators=300)
svm_model=SVC(C=10, gamma=0.1, kernel='rbf', probability=True)

voting = VotingClassifier(
    estimators=[('rf', rf_model), ('svm', svm_model)],
    voting='soft'
)
voting.fit(X_train_scaled, y_train)
y_pred = voting.predict(X_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred, target_names=categories))

Accuracy: 0.9691
                 precision    recall  f1-score   support

   Lung_Opacity       0.98      0.96      0.97      1350
         Normal       0.96      0.99      0.97      1500
Viral Pneumonia       0.98      0.96      0.97      1320

       accuracy                           0.97      4170
      macro avg       0.97      0.97      0.97      4170
   weighted avg       0.97      0.97      0.97      4170



In [12]:
import pickle
import os
import joblib
# Import drive for Google Colab
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Define the base path to your Google Drive
drive_path = '/content/drive/MyDrive' # This is the default mount point

# Define the directory within your Drive where you want to save the model
model_dir_in_drive = os.path.join(drive_path, 'GP', 'Lung Disease')

# Ensure the directory exists in Google Drive
os.makedirs(model_dir_in_drive, exist_ok=True)

# Define the full path to the pickle file
pkl_path = os.path.join(model_dir_in_drive, "voting.pkl")

# Save as a pickle file
with open(pkl_path, 'wb') as f:
    pickle.dump(voting, f)  # Save the 'voting' model object

# After training in your notebook, add:
joblib.dump(feature_pca, '/content/drive/MyDrive/GP/Lung Disease/feature_pca.pkl')
joblib.dump(scaler, '/content/drive/MyDrive/GP/Lung Disease/scaler.pkl')
joblib.dump(label_encoder, '/content/drive/MyDrive/GP/Lung Disease/label_encoder.pkl')

print("Model saved successfully to:", pkl_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model saved successfully to: /content/drive/MyDrive/GP/Lung Disease/voting.pkl
