In [1]:
import os 

In [2]:
# Move up one directory so the relative 'data/...' paths used by the cells
# below resolve (presumably the notebook lives one level below the project
# root -- confirm). Guarded on the presence of the data folder so that
# re-running this cell does not climb a further level each time.
if not os.path.isdir('data'):
    os.chdir('../')

In [3]:
from pydicom import dcmread
import pandas as pd 
import numpy as np
from processing import MammographyPreprocessor
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm, trange
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torch
from torchmetrics import Accuracy

In [4]:
# Collect the path of every sub-directory found directly under
# data/train_images, then read the train and test metadata tables.
with os.scandir('data/train_images') as entries:
    training_data_paths = [entry.path for entry in entries if entry.is_dir()]

training_csv = pd.read_csv('data/train.csv')
test_csv = pd.read_csv('data/test.csv')

In [22]:
# Load the table in this cell so it runs on a fresh kernel (Restart & Run All)
# without depending on `train_csv` being defined by a cell further down.
train_csv = pd.read_csv('data/train.csv')
train_csv

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
0,2,10006,462822612,L,CC,61.0,0,0,0,,0,,29,False
1,2,10006,1459541791,L,MLO,61.0,0,0,0,,0,,29,False
2,2,10006,1864590858,R,MLO,61.0,0,0,0,,0,,29,False
3,2,10006,1874946579,R,CC,61.0,0,0,0,,0,,29,False
4,2,10011,220375232,L,CC,55.0,0,0,0,0.0,0,,21,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54701,1,9973,1729524723,R,MLO,43.0,0,0,0,1.0,0,C,49,False
54702,1,9989,63473691,L,MLO,60.0,0,0,0,,0,C,216,False
54703,1,9989,1078943060,L,CC,60.0,0,0,0,,0,C,216,False
54704,1,9989,398038886,R,MLO,60.0,0,0,0,0.0,0,C,216,True


In [21]:
# Training metadata table; `train_csv` is the name the dataset and split cells below read from.
train_csv = pd.read_csv('data/train.csv')

In [6]:
class XRayDataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs for the given image ids.

    Images are read from ``{base_dir}/train_images/{patient_id}/{image_id}.png``;
    the ``patient_id`` and ``cancer`` values are looked up by ``image_id`` in
    ``{base_dir}/train.csv``.

    Args:
        base_dir: Directory containing ``train.csv`` and ``train_images/``.
        image_ids: Iterable of ``image_id`` values (list, array or pandas
            Series) selecting the samples this dataset exposes.
    """

    def __init__(self, base_dir, image_ids):
        self.base_dir = base_dir
        # Materialise as a plain list so ``self.image_ids[i]`` is always
        # positional. A pandas Series is indexed by *label*, which returns the
        # wrong element (or raises KeyError) as soon as its index is not
        # 0..n-1, e.g. for the output of a shuffled train/test split.
        self.image_ids = list(image_ids)
        train_csv = pd.read_csv(f'{base_dir}/train.csv')
        # Index rows by image_id (the column itself is kept) for direct lookup.
        self.df = train_csv.set_index('image_id', drop=False)

    def __len__(self):
        """Return the number of samples exposed by this dataset."""
        return len(self.image_ids)

    def __getitem__(self, i):
        """Load the i-th sample.

        Returns:
            tuple: ``(image, label)`` where ``image`` is a float tensor of the
            pixel values divided by 255 with a new leading axis, and ``label``
            is an int64 scalar tensor holding the row's ``cancer`` value.
        """
        image_id = self.image_ids[i]
        # Column-wise scalar lookups keep each column's own dtype.
        patient_id = self.df.loc[image_id, 'patient_id']
        label = self.df.loc[image_id, 'cancer']
        # Context manager closes the file handle as soon as the pixels are
        # copied out, rather than leaving it open until garbage collection.
        with Image.open(f'{self.base_dir}/train_images/{patient_id}/{image_id}.png') as xray:
            pixels = np.array(xray)
        image = torch.tensor(pixels / 255, dtype=torch.float)[None, :]
        return image, torch.tensor(label, dtype=torch.long)

In [7]:
# Pass the ids as a plain list (as the train/val/test datasets are built) so
# the dataset indexes them by position rather than by pandas label.
train_dataset = XRayDataset('data', train_csv['image_id'].tolist())

In [9]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/val/test partition is identical after a kernel restart.
SPLIT_SEED = 42

X = train_csv['image_id']
y = train_csv['cancer']

# Hold out 25% of the images for testing, then 25% of the remainder for
# validation. Stratifying on the label keeps the proportion of `cancer`
# values the same in every partition.
X_train_total, X_test, y_train_total, y_test = train_test_split(
    X, y, test_size=.25, random_state=SPLIT_SEED, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_total, y_train_total, test_size=.25,
    random_state=SPLIT_SEED, stratify=y_train_total
)

# Backward-compatible alias for the original, misspelled variable name.
y_tetst = y_test

In [18]:
# One batch size shared by all three loaders.
bs = 64

trainset = XRayDataset('data', X_train.tolist())
testset = XRayDataset('data', X_test.tolist())
valset = XRayDataset('data', X_val.tolist())

# Reshuffle the training samples every epoch; the evaluation loaders keep a
# fixed order.
train_loader = DataLoader(trainset, batch_size=bs, shuffle=True)
test_loader = DataLoader(testset, batch_size=bs)
val_loader = DataLoader(valset, batch_size=bs)

In [11]:
# Pull a single batch from the training loader to inspect what the dataset yields.
sample = next(iter(train_loader))

In [12]:
# Shape of the first image in the batch (sample[0] holds the images, sample[1] the labels).
sample[0][0].shape

torch.Size([1, 256, 128])

In [13]:
from models import ResNet

In [14]:
# Build the project's ResNet (imported from `models`) with the settings used
# for this experiment.
resnet = ResNet(
    depth=56,
    block_name='BottleNeck',
    num_classes=2,
    device='cpu',
)

In [15]:
import pytorch_lightning as pl 
import torch.nn as nn
import torch 

In [23]:
class PLResNet(pl.LightningModule):
    """LightningModule that trains the wrapped network with cross-entropy loss.

    Args:
        resnet: Module mapping a batch of images to per-class logits.
    """

    def __init__(self, resnet):
        super().__init__()
        self.resnet = resnet
        self.criterion = nn.CrossEntropyLoss()
        # Held as a submodule so it follows the model to its device and
        # accumulates over the whole validation epoch, instead of a fresh
        # CPU-only metric being constructed for every batch.
        self.val_accuracy = Accuracy(task='binary')

    def forward(self, x):
        """Return the wrapped network's logits for batch ``x``."""
        return self.resnet(x)

    def training_step(self, batch, batch_idx):
        """Compute, log and return the cross-entropy loss for one training batch."""
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss and update the epoch-level accuracy.

        Logs ``accuracy`` (aggregated over the epoch by the metric object) and
        ``val_loss``; returns the batch loss.
        """
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        pred = torch.argmax(logits, dim=1)
        self.val_accuracy(pred, y)
        # Logging the metric object lets Lightning compute/reset it per epoch.
        self.log('accuracy', self.val_accuracy, on_step=False, on_epoch=True)
        self.log('val_loss', loss)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters (lr=1e-3)."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

In [24]:
# Wrap the network in its LightningModule and fit it with a default-configured
# Trainer, validating on `val_loader`.
trainer = pl.Trainer()
pl_resnet = PLResNet(resnet)
trainer.fit(
    model=pl_resnet,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name      | Type             | Params
-----------------------------------------------
0 | resnet    | ResNet           | 589 K 
1 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
589 K     Trainable params
0         Non-trainable params
589 K     Total params
2.358     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [5]:
# Separate load of the training metadata, used by the encoding and split cells below.
temp = pd.read_csv('data/train.csv')

In [16]:
# Give every distinct `implant` value an integer code, numbered in order of
# first appearance, then encode the column with that lookup table.
unique_vals = pd.unique(temp['implant'])
tmp_dct = dict(zip(unique_vals, range(len(unique_vals))))
target = temp['implant'].map(tmp_dct.__getitem__)

In [19]:
# Check which integer codes the encoded target actually contains.
pd.unique(target)

array([0, 1])

In [23]:
from sklearn.model_selection import ShuffleSplit

In [26]:
# The image_id column; this is the sequence handed to the splitter below.
image_ids = temp['image_id']

In [28]:
# Two random 75/25 splits; seeded so the same indices come back on every run.
rs = ShuffleSplit(n_splits=2, test_size=.25, random_state=42)

In [43]:
# Take the first of the generated splits.
# NOTE(review): `split` yields a pair of index arrays (train indices, test
# indices), so `x` / `y` are positions into `image_ids`, not features / labels.
x, y = next(rs.split(image_ids))

13677