# Disease Detection from Chest X-rays using CNN
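This notebook loads a subset (up to 5,000 images) of the NIH Chest X-ray dataset from Google Drive, reduces each image's findings to a single label, resizes the grayscale images to 128×128, and then builds and trains a small CNN with TensorFlow/Keras to classify the disease shown in each X-ray scan.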

In [1]:

# ✅ STEP 1: Import Required Libraries
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical
from tqdm.auto import tqdm


In [2]:

# ✅ STEP 2: Mount Google Drive
# The label CSV, the image folder and the saved model used by later cells all
# live under /content/drive/MyDrive, so Drive has to be mounted before they run.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:

# ✅ STEP 3: Load Labels from CSV
csv_path = '/content/drive/MyDrive/disease-detection/dataset/Data_Entry_2017.csv'
label_columns = ['Image Index', 'Finding Labels']

# Parse only the two columns this notebook uses; the trailing [label_columns]
# pins the column order regardless of the order in the file.
raw_labels = pd.read_csv(csv_path, usecols=label_columns)[label_columns]

# 'Finding Labels' is a '|'-separated list of findings. Keep only the first one
# so that every image ends up with exactly one class. Using .assign() builds a
# new frame instead of writing into a slice (avoids SettingWithCopyWarning).
labels_df = raw_labels.assign(
    **{'Finding Labels': raw_labels['Finding Labels'].str.split('|').str[0]}
)

# Drop images without any finding, then make sure each image appears once.
labels_df = labels_df[labels_df['Finding Labels'] != 'No Finding']
labels_df = labels_df.drop_duplicates(subset='Image Index')
labels_df.head()


Unnamed: 0,Image Index,Finding Labels
0,00000001_000.png,Cardiomegaly
1,00000001_001.png,Cardiomegaly
2,00000001_002.png,Cardiomegaly
4,00000003_001.png,Hernia
5,00000003_002.png,Hernia


In [4]:

# ✅ STEP 4: Preprocess Images (at most LIMIT readable images, for a faster demo)
IMG_SIZE = 128  # every image is resized to IMG_SIZE x IMG_SIZE pixels
LIMIT = 5000    # maximum number of images to load
image_dir = '/content/drive/MyDrive/disease-detection/dataset/images/'
data = []
labels = []

# Map every label present in labels_df to an integer class index.
used_labels = labels_df['Finding Labels'].unique().tolist()
label_map = {label: i for i, label in enumerate(used_labels)}

# The bar advances once per image actually loaded (not once per CSV row), so it
# finishes at exactly LIMIT when enough images are available on disk.
with tqdm(total=min(LIMIT, len(labels_df)), desc='Loading images') as progress:
    for image_name, finding in zip(labels_df['Image Index'], labels_df['Finding Labels']):
        if len(data) >= LIMIT:
            break
        img_path = os.path.join(image_dir, image_name)
        if not os.path.exists(img_path):
            continue  # the CSV lists this image but it is not in image_dir
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread does not raise on an undecodable file; it returns None,
            # which would otherwise crash cv2.resize below.
            continue
        data.append(cv2.resize(img, (IMG_SIZE, IMG_SIZE)))
        labels.append(label_map[finding])
        progress.update(1)

if not data:
    raise FileNotFoundError(f"No readable images found in {image_dir}")

# Scale pixels to [0, 1] and add the single grayscale channel axis. float32
# halves the memory of the float64 array that plain `/ 255.0` would produce.
data = (np.asarray(data, dtype=np.float32) / 255.0).reshape(-1, IMG_SIZE, IMG_SIZE, 1)
labels = to_categorical(labels, num_classes=len(used_labels))


100%|█████████▉| 4999/5000 [35:50<00:00,  2.32it/s]


In [5]:

# ✅ STEP 5: Train-Test Split
# Hold out 20% of the samples as the test set; the fixed random_state makes the
# partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    random_state=42,
    test_size=0.2,
)


In [6]:

# ✅ STEP 6: Build CNN Model
# Two convolution + max-pooling stages, followed by a dense classifier head
# with dropout and one softmax output per entry in used_labels.
model = Sequential([
    Conv2D(filters=32, kernel_size=(3, 3), activation='relu',
           input_shape=(IMG_SIZE, IMG_SIZE, 1)),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(units=128, activation='relu'),
    Dropout(rate=0.5),
    Dense(units=len(used_labels), activation='softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:

# ✅ STEP 7: Train the Model
# Light augmentation: small random rotations, zooms and shifts of the training images.
datagen = ImageDataGenerator(rotation_range=15, zoom_range=0.1, width_shift_range=0.1, height_shift_range=0.1)
# No datagen.fit(X_train) here: fit() only computes dataset statistics for
# featurewise_center / featurewise_std_normalization / zca_whitening, and none
# of those options is enabled above.

# Stop once val_loss has not improved for 5 epochs and roll back to the best
# weights, instead of always paying for all 50 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# NOTE: X_test doubles as the validation set (and now drives early stopping),
# so metrics reported on it are optimistic; carve out a separate validation
# split if an unbiased test score is needed.
history = model.fit(datagen.flow(X_train, y_train, batch_size=32, seed=42),  # seed fixes shuffling/augmentation
                    validation_data=(X_test, y_test),
                    epochs=50,
                    callbacks=[early_stopping])


Epoch 1/50


  self._warn_if_super_not_called()


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 860ms/step - accuracy: 0.2057 - loss: 2.5942 - val_accuracy: 0.2680 - val_loss: 2.2934
Epoch 2/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 811ms/step - accuracy: 0.2411 - loss: 2.3266 - val_accuracy: 0.2370 - val_loss: 2.2371
Epoch 3/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 829ms/step - accuracy: 0.2430 - loss: 2.2808 - val_accuracy: 0.2540 - val_loss: 2.3072
Epoch 4/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 858ms/step - accuracy: 0.2514 - loss: 2.2944 - val_accuracy: 0.2480 - val_loss: 2.2192
Epoch 5/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 809ms/step - accuracy: 0.2374 - loss: 2.2715 - val_accuracy: 0.2590 - val_loss: 2.2041
Epoch 6/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 785ms/step - accuracy: 0.2366 - loss: 2.2902 - val_accuracy: 0.2710 - val_loss: 2.2102
Epoch 7/50
[1m1

In [12]:

# ✅ STEP 8: Save the Model to Google Drive
save_path = '/content/drive/MyDrive/disease-detection/model/chest_disease_model.h5'
# model.save() does not create missing parent folders; make sure the target
# directory exists so a long training run is not lost to a failed write.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
model.save(save_path)
print("Model saved to Google Drive:", save_path)




Model saved to Google Drive: /content/drive/MyDrive/disease-detection/model/chest_disease_model.h5
