In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Breast Cancer Detection Using Computer Vision 

### Goal 
The goal of this project is to use computer vision (via PyTorch) to detect breast cancer, using mammogram imaging with ResNet18

In [2]:
import os
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision import datasets, models
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt


In [3]:
#configuration 
def get_config():
    return {
        'base_dir': '/kaggle/input/breast-cancer-detection', #where the images are coming from
        'img_size': 224, #desired size to standardize images
        'batch_size': 32, #how many batches per epoch
        'learning_rate': 1e-4,
        'num_epochs': 10,
        'device': torch.device("cuda" if torch.cuda.is_available() else "cpu") #use GPU if available else CPU
    }


In [4]:
#load the data from the folders
def load_data(cfg):
    data_transforms = {
        'train': transforms.Compose([
            transforms.Resize((cfg['img_size'], cfg['img_size'])),
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(10),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5])
        ]),
        'valid': transforms.Compose([
            transforms.Resize((cfg['img_size'], cfg['img_size'])),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5])
        ]),
        'test': transforms.Compose([
            transforms.Resize((cfg['img_size'], cfg['img_size'])),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5])
        ])
    }

    loaders = {}
    datasets_dict = {}

    for phase in ['train', 'valid', 'test']:
        path = os.path.join(cfg['base_dir'], phase)
        dataset = datasets.ImageFolder(path, transform=data_transforms[phase])
        loaders[phase] = DataLoader(dataset, batch_size=cfg['batch_size'], shuffle=(phase == 'train'))
        datasets_dict[phase] = dataset

    # Count class imbalance in training set
    train_labels = [label for _, label in datasets_dict['train']]
    neg_count = train_labels.count(0)
    pos_count = train_labels.count(1)
    pos_weight = torch.tensor([neg_count / pos_count], dtype=torch.float32).to(cfg['device'])

    return loaders, pos_weight


In [5]:
#build model
def build_model(cfg):
    model = models.resnet18(pretrained=True)

    for param in model.parameters():
        param.requires_grad = False  # Freeze all layers except final FC

    num_features = model.fc.in_features
    model.fc = nn.Linear(num_features, 1)  # Binary classification

    return model.to(cfg['device'])


In [6]:
#Teain model
def train_model(model, loaders, cfg, pos_weight):
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = optim.Adam(model.fc.parameters(), lr=cfg['learning_rate'])

    for epoch in range(cfg['num_epochs']):
        model.train()
        running_loss = 0.0
        correct, total = 0, 0

        for inputs, labels in loaders['train']:
            inputs = inputs.to(cfg['device'])
            labels = labels.float().unsqueeze(1).to(cfg['device'])

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            preds = (torch.sigmoid(outputs) > 0.5).float()
            correct += (preds == labels).sum().item()
            total += labels.size(0)

        print(f"Epoch {epoch+1}: Loss = {running_loss:.4f}, Accuracy = {100 * correct / total:.2f}%")


In [7]:
#Evaluate model
def evaluate_model(model, loader, cfg):
    model.eval()
    y_true, y_pred = [], []

    with torch.no_grad():
        for inputs, labels in loader:
            inputs = inputs.to(cfg['device'])
            labels = labels.to(cfg['device'])

            outputs = model(inputs)
            probs = torch.sigmoid(outputs)
            preds = (probs > 0.5).int()

            y_true.extend(labels.cpu().numpy())
            y_pred.extend(preds.cpu().numpy())

    print("\nTest Classification Report:")
    print(classification_report(y_true, y_pred, target_names=['No Cancer', 'Cancer']))
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))


In [8]:
#run the pipeline
def run_pipeline():
    cfg = get_config()
    loaders, pos_weight = load_data(cfg)
    model = build_model(cfg)
    train_model(model, loaders, cfg, pos_weight)
    evaluate_model(model, loaders['test'], cfg)


In [9]:
run_pipeline()

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 148MB/s] 


Epoch 1: Loss = 71.3764, Accuracy = 50.97%
Epoch 2: Loss = 70.0015, Accuracy = 50.21%
Epoch 3: Loss = 69.5610, Accuracy = 50.25%
Epoch 4: Loss = 68.8085, Accuracy = 53.16%
Epoch 5: Loss = 68.7745, Accuracy = 52.95%
Epoch 6: Loss = 68.3543, Accuracy = 52.32%
Epoch 7: Loss = 68.3010, Accuracy = 56.32%
Epoch 8: Loss = 67.9897, Accuracy = 55.06%
Epoch 9: Loss = 68.2771, Accuracy = 54.89%
Epoch 10: Loss = 67.0351, Accuracy = 54.93%

Test Classification Report:
              precision    recall  f1-score   support

   No Cancer       0.67      0.55      0.60       208
      Cancer       0.43      0.55      0.48       128

    accuracy                           0.55       336
   macro avg       0.55      0.55      0.54       336
weighted avg       0.58      0.55      0.56       336

Confusion Matrix:
[[114  94]
 [ 57  71]]
