# Download Dataset

In [1]:
import kagglehub
import os

# Fetch the chest X-ray dataset (or reuse the local copy).
# The version is pinned in the handle so the cache lookup and the download
# always refer to the same release; kagglehub maintains its own cache and
# returns the cached directory when that version is already present, so no
# hard-coded cache path is needed here.
dataset_handle = 'paultimothymooney/chest-xray-pneumonia/versions/2'
dataset_root = kagglehub.dataset_download(dataset_handle)

# the archive unpacks into a top-level 'chest_xray' folder holding the splits
path = os.path.join(dataset_root, 'chest_xray')

# fail early, with a clear message, if the expected split folders are missing
missing_splits = [
    split for split in ('train', 'test', 'val')
    if not os.path.isdir(os.path.join(path, split))
]
if missing_splits:
  raise FileNotFoundError(f'missing split folder(s) {missing_splits} under {path}')

print('dataset available at:', path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/paultimothymooney/chest-xray-pneumonia?dataset_version_number=2...


100%|██████████| 2.29G/2.29G [00:29<00:00, 83.3MB/s]

Extracting files...





file downloaded at: /root/.cache/kagglehub/datasets/paultimothymooney/chest-xray-pneumonia/versions/2/chest_xray


In [None]:
# `path` already ends in 'chest_xray', so this lists <path>/chest_xray
# NOTE(review): confirm the nested folder is the one intended to be inspected
nested_dir = os.path.join(path, 'chest_xray')
os.listdir(nested_dir)

['test', 'train', '.DS_Store', 'val']

In [3]:
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from sklearn.metrics import accuracy_score

# run on the GPU when CUDA is available, otherwise fall back to the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

In [4]:
class PneumoniaDataset(Dataset):
  """Image dataset read from ``root_dir/NORMAL`` and ``root_dir/PNEUMONIA``.

  Each item is an ``(image, label)`` pair, where ``label`` is 0 for files
  found under ``NORMAL`` and 1 for files found under ``PNEUMONIA``.

  Args:
    root_dir: directory that contains the two class sub-folders.
    transform: optional callable applied to the RGB PIL image before it is
      returned.

  Raises:
    FileNotFoundError: raised by ``os.listdir`` when a class sub-folder does
      not exist.
  """

  # class sub-folder name -> integer label (iteration order: NORMAL, PNEUMONIA)
  CLASS_TO_LABEL = {'NORMAL': 0, 'PNEUMONIA': 1}

  def __init__(self, root_dir, transform=None):
    self.root_dir = root_dir
    self.transform = transform
    self.image_path = []  # file path of every sample
    self.labels = []      # integer label aligned with image_path

    for class_name, class_label in self.CLASS_TO_LABEL.items():
      folder_dir = os.path.join(root_dir, class_name)
      # os.listdir returns entries in arbitrary order; sorting keeps the
      # index -> sample mapping identical from run to run
      for file_name in sorted(os.listdir(folder_dir)):
        file_path = os.path.join(folder_dir, file_name)
        # skip hidden entries (e.g. macOS '.DS_Store') and sub-directories:
        # Image.open would fail on them in __getitem__
        if file_name.startswith('.') or not os.path.isfile(file_path):
          continue
        self.image_path.append(file_path)
        self.labels.append(class_label)

  def __getitem__(self, index):
    """Return ``(image, label)`` for ``index``; the image is converted to RGB."""
    # the context manager closes the underlying file once the converted
    # copy has been created
    with Image.open(self.image_path[index]) as raw_image:
      image = raw_image.convert('RGB')
    label = self.labels[index]

    if self.transform is not None:
      image = self.transform(image)

    return image, label

  def __len__(self):
    """Return the number of samples that were indexed."""
    return len(self.image_path)

In [5]:
# preprocessing for resnet18: fixed 224x224 input, tensor conversion, then
# per-channel normalization (the standard ImageNet statistics)
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
]
transform = transforms.Compose(preprocessing_steps)

In [6]:
# one dataset per split folder, all sharing the same preprocessing pipeline
train_dataset, test_dataset, val_dataset = (
    PneumoniaDataset(os.path.join(path, split_name), transform=transform)
    for split_name in ('train', 'test', 'val')
)

In [7]:
batch_size = 32

# only the training split is shuffled; the evaluation splits keep a fixed order
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader, val_loader = (
    DataLoader(dataset=eval_dataset, batch_size=batch_size, shuffle=False)
    for eval_dataset in (test_dataset, val_dataset)
)

In [8]:
# start from the pre-trained resnet18 weights (IMAGENET1K_V1)
pretrained_weights = models.ResNet18_Weights.IMAGENET1K_V1
model = models.resnet18(weights=pretrained_weights)

# replace the final fully-connected layer with a 2-output head:
# one neuron for normal, one for pneumonia
head_in_features = model.fc.in_features
model.fc = nn.Linear(head_in_features, 2)

model = model.to(device)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 136MB/s]


In [9]:
# training loop: optimize all model parameters with Adam on cross-entropy loss,
# then report accuracy on the validation split after every epoch
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10

for epoch in range(num_epochs):
  model.train()  # put layers back into training mode after the previous eval pass
  running_loss = 0.0

  for images, labels in train_loader:
    images = images.to(device)
    labels = labels.to(device)

    optimizer.zero_grad()

    logits = model(images)
    loss = criterion(logits, labels)

    loss.backward()
    optimizer.step()

    # .item() extracts a plain Python float; adding the tensor itself would
    # keep every batch's autograd history reachable for the whole epoch
    running_loss += loss.item()

  # mean of the per-batch losses for this epoch
  print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss / len(train_loader)}")

  # validate the model
  model.eval()
  val_labels = []
  val_preds = []

  # gradients are not needed for evaluation; disable them for the whole pass
  with torch.no_grad():
    for images, labels in val_loader:
      # labels are only compared on the CPU, so only the images are moved
      logits = model(images.to(device))

      # index of the largest logit per row is the predicted class
      _, preds = torch.max(logits, 1)

      val_labels.extend(labels.cpu().numpy())
      val_preds.extend(preds.cpu().numpy())

  val_accuracy = accuracy_score(val_labels, val_preds)
  print('Validation accuracy:', val_accuracy)

Epoch 1/10, Loss: 0.13701067864894867
Validation accuracy: 0.9375
Epoch 2/10, Loss: 0.05570507422089577
Validation accuracy: 0.75
Epoch 3/10, Loss: 0.04978053271770477
Validation accuracy: 0.875
Epoch 4/10, Loss: 0.04051050543785095
Validation accuracy: 0.875
Epoch 5/10, Loss: 0.02736072614789009
Validation accuracy: 0.75
Epoch 6/10, Loss: 0.023962484672665596
Validation accuracy: 0.875
Epoch 7/10, Loss: 0.02764562889933586
Validation accuracy: 0.625
Epoch 8/10, Loss: 0.028442004695534706
Validation accuracy: 0.8125
Epoch 9/10, Loss: 0.020580146461725235
Validation accuracy: 0.6875
Epoch 10/10, Loss: 0.003951539751142263
Validation accuracy: 0.9375


In [10]:
# score the trained model on the held-out test split
model.eval()
test_labels, test_preds = [], []

with torch.no_grad():
  for batch_images, batch_labels in test_loader:
    batch_logits = model(batch_images.to(device))
    # predicted class = position of the largest logit in each row
    batch_preds = torch.max(batch_logits, 1).indices

    test_labels.extend(batch_labels.cpu().numpy())
    test_preds.extend(batch_preds.cpu().numpy())

test_accuracy = accuracy_score(test_labels, test_preds)
print('Test accuracy:', test_accuracy)

Test accuracy: 0.7451923076923077


In [11]:
# persist only the learned parameters (the state dict), written to the
# current working directory
trained_weights = model.state_dict()
torch.save(trained_weights, 'pneumonia_classifier.pth')