In [1]:
!pip install python-gdcm pylibjpeg --no-index --find-links "../input/rsna-wheels/"

Looking in links: ../input/rsna-wheels/
Processing /kaggle/input/rsna-wheels/python_gdcm-3.0.20-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Processing /kaggle/input/rsna-wheels/pylibjpeg-1.4.0-py3-none-any.whl
Installing collected packages: python-gdcm, pylibjpeg
Successfully installed pylibjpeg-1.4.0 python-gdcm-3.0.20
[0m

In [2]:
!git clone https://github.com/ultralytics/yolov5  # clone
%cd yolov5
%pip install -qr requirements.txt  # install

import torch

device = torch.device('cuda')

Cloning into 'yolov5'...
fatal: unable to access 'https://github.com/ultralytics/yolov5/': Could not resolve host: github.com
[Errno 2] No such file or directory: 'yolov5'
/kaggle/working
[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
from sklearn.metrics import f1_score
import torch.nn as nn
from torchvision import transforms
import torch.optim as optim
import numpy as np
import os
import pydicom
from pydicom.pixel_data_handlers import apply_windowing
import torch.nn.functional as F
from torchmetrics.classification import BinaryF1Score

In [4]:
roi_extractor_model = torch.hub.load('/kaggle/input/yolov5-repo', 'custom', path='/kaggle/input/rsna-breast-cancer-detection-roi-model/rsna-roi-003.pt', source='local')

YOLOv5 🚀 2022-12-23 Python-3.7.12 torch-1.11.0 CUDA:0 (Tesla P100-PCIE-16GB, 16281MiB)

Fusing layers... 
Model summary: 157 layers, 7012822 parameters, 0 gradients
Adding AutoShape... 


### 

In [5]:
# helper functions
def read_dicom_with_windowing(dcm_file):
    # from: https://www.kaggle.com/code/davidbroberts/mammography-apply-windowing/
    im = pydicom.dcmread(dcm_file)
    data = im.pixel_array
    
    # This line is the only difference in the two functions
    data = apply_windowing(data, im)
    
    if im.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data
    else:
        data = data - np.min(data)
        
    if np.max(data) != 0:
        data = data / np.max(data)
    data=(data * 255).astype(np.uint8)

    return data

In [6]:
def crop_yolo_ROI(image):
    # https://www.kaggle.com/code/remekkinas/breast-cancer-roi-brest-extractor/notebook
    detections = roi_extractor_model(image)
    
    bbox_df = detections.pandas().xyxy[0]
    bbox_df.drop(["name"], axis=1, inplace=True)
    bboxes = bbox_df.astype(int).to_dict(orient="records")
    
    if len(bboxes) == 0:
        return image
    
    if len(bboxes) != 1:
        print(bboxes)
        print("More than one ROI detected")

    bbox = bboxes[0]
    
    image = image[bbox["ymin"]:bbox["ymax"], bbox["xmin"]:bbox["xmax"]]
    return image

In [7]:
class BreastCancerDataset(torch.utils.data.Dataset):
    # image to cancer
    def __init__(self, df, dataset_folder, transform=None):
        self.dataset_folder = dataset_folder
        self.transform = transform
        
        if "breast_id" not in df.columns:
            df["breast_id"] = df["patient_id"].astype(str) + "_" + df["laterality"]

        if "image_path" not in df.columns:
            df["image_path"] = df["patient_id"].astype(str) + "/" + df["image_id"].astype(str) + ".dcm"

        self.df = df
        if 'cancer' in self.df.columns:
            self.subset = 'train'
        else:
            self.subset = 'test'

    def __len__(self):
        return len(self.df.groupby("breast_id"))

    def __getitem__(self, idx):
        # return MLO and CC images from given breast
        breast_id = self.df["breast_id"].unique()[idx]
        breast_df = self.df[self.df["breast_id"] == breast_id]
        
        MLO = breast_df[breast_df["view"] == "MLO"]
        CC = breast_df[breast_df["view"] == "CC"]

        MLO_image_path = os.path.join(self.dataset_folder, self.subset+ "_images", MLO["image_path"].values[0])
        CC_image_path = os.path.join(self.dataset_folder, self.subset+ "_images", CC["image_path"].values[0])

        MLO_image = read_dicom_with_windowing(MLO_image_path)
        CC_image = read_dicom_with_windowing(CC_image_path)
        
        MLO_image = crop_yolo_ROI(MLO_image)
        CC_image = crop_yolo_ROI(CC_image)

        MLO_image = torch.from_numpy(MLO_image)
        CC_image = torch.from_numpy(CC_image)
        
        if self.transform:  # normalization and augmentation are in here
            MLO_image = self.transform(MLO_image)
            CC_image = self.transform(CC_image)
        
        # add these images into a 2 channel image
        image = torch.cat([MLO_image, CC_image], dim=0)

        if self.subset == 'train':
            label = breast_df['cancer'].values[0]
            return image, label
        
        
        return image

In [8]:
train_df = pd.read_csv('/kaggle/input/rsna-breast-cancer-detection/train.csv')
test_df = pd.read_csv('/kaggle/input/rsna-breast-cancer-detection/test.csv')

dataset_folder = '/kaggle/input/rsna-breast-cancer-detection'

transform = transforms.Compose([ # also add augmentation
    transforms.ToPILImage(),
    transforms.Resize((100, 100)),
    transforms.ToTensor(),
])

train_dataset = BreastCancerDataset(train_df, dataset_folder, transform=transform)
test_dataset = BreastCancerDataset(test_df, dataset_folder, transform=transform)

In [9]:
print(test_df)
print(train_df.columns)

   site_id  patient_id    image_id laterality view  ...  implant  machine_id  \
0        2       10008   736471439          L  MLO  ...        0          21   
1        2       10008  1591370361          L   CC  ...        0          21   
2        2       10008    68070693          R  MLO  ...        0          21   
3        2       10008   361203119          R   CC  ...        0          21   

   prediction_id breast_id            image_path  
0        10008_L   10008_L   10008/736471439.dcm  
1        10008_L   10008_L  10008/1591370361.dcm  
2        10008_R   10008_R    10008/68070693.dcm  
3        10008_R   10008_R   10008/361203119.dcm  

[4 rows x 11 columns]
Index(['site_id', 'patient_id', 'image_id', 'laterality', 'view', 'age',
       'cancer', 'biopsy', 'invasive', 'BIRADS', 'implant', 'density',
       'machine_id', 'difficult_negative_case', 'breast_id', 'image_path'],
      dtype='object')


In [10]:
# plot first batch
batch_size = 32
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size = 4, shuffle=False)
images, labels = next(iter(train_loader))

print(images.shape, labels.shape)

torch.Size([32, 2, 100, 100]) torch.Size([32])


In [11]:

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        input_size = 100*100*.25*.25*12
        self.conv1 = nn.Conv2d(2, 6, 5, padding='same')
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 12, 5, padding='same')
        self.fc1 = nn.Linear(int(input_size), int(input_size * .33))
        #self.fc2 = nn.Linear(int(input_size*.33), int(input_size * .33**2))
        self.fc3 = nn.Linear(int(input_size * .33), 1)
        
    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.sigmoid(self.fc3(x))
        x = torch.flatten(x)
        return x

In [12]:
cnn_model = CNN().to(device)
criterion = nn.BCELoss()
optimizer = optim.SGD(cnn_model.parameters(), lr=.001, momentum=.9)

In [13]:
num_epochs = 1
n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    f1 = 0
    for idx, (image, target) in enumerate(train_loader):
        target = target.to(device)
        image = image.to(device)
        outputs=cnn_model(image)

        loss = criterion(outputs, target.float())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        output_binary = torch.Tensor.cpu(outputs).detach().numpy()
        output_binary[output_binary > .5] = 1
        output_binary[output_binary<=.5] = 0
        f1 += f1_score(output_binary, torch.Tensor.cpu(target).numpy())
        if idx%5 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], {idx+1}/{n_total_steps}, Loss: {loss.item():.4f}, F1: {f1/(5):.4f}')
            f1 = 0
        if idx > 110:
            break

Epoch [1/1], 1/745, Loss: 0.6945, F1: 0.0000
Epoch [1/1], 6/745, Loss: 0.6676, F1: 0.0000
Epoch [1/1], 11/745, Loss: 0.5997, F1: 0.0000
Epoch [1/1], 16/745, Loss: 0.5345, F1: 0.0000
Epoch [1/1], 21/745, Loss: 0.4078, F1: 0.0000
Epoch [1/1], 26/745, Loss: 0.2120, F1: 0.0000
Epoch [1/1], 31/745, Loss: 0.0822, F1: 0.0000
[{'xmin': 1853, 'ymin': 551, 'xmax': 3304, 'ymax': 2880, 'confidence': 0, 'class': 0}, {'xmin': 2145, 'ymin': 2975, 'xmax': 3328, 'ymax': 4096, 'confidence': 0, 'class': 0}]
More than one ROI detected
Epoch [1/1], 36/745, Loss: 0.0218, F1: 0.0000
Epoch [1/1], 41/745, Loss: 0.0089, F1: 0.0000
Epoch [1/1], 46/745, Loss: 0.3428, F1: 0.0000
Epoch [1/1], 51/745, Loss: 0.0086, F1: 0.0000
Epoch [1/1], 56/745, Loss: 0.0104, F1: 0.0000
Epoch [1/1], 61/745, Loss: 0.2685, F1: 0.0000
Epoch [1/1], 66/745, Loss: 0.2739, F1: 0.0000
Epoch [1/1], 71/745, Loss: 0.0212, F1: 0.0000
[{'xmin': 950, 'ymin': 793, 'xmax': 1905, 'ymax': 2268, 'confidence': 0, 'class': 0}, {'xmin': 902, 'ymin': 35,

In [14]:
with torch.no_grad():
    for images in test_loader:
        images = images.to(device)
        outputs = cnn_model(images)
outputs = torch.Tensor.cpu(outputs).numpy()
print(outputs)
submission = {}
submission['prediction_id'] = ['10008_L', '10008_R']
submission['cancer'] = outputs
submission_df = pd.DataFrame(submission)
print(submission_df)
submission_df.to_csv('submission.csv', index=False)


[  0.0069687    0.010756]
  prediction_id    cancer
0       10008_L  0.006969
1       10008_R  0.010756
