In [2]:
import os 

In [3]:
# Move up one directory so the relative 'data/...' paths used below resolve.
# Guarded on the presence of ./data so that re-running this cell does not
# climb one more directory each time it is executed.
if not os.path.isdir('data'):
    os.chdir('../')

In [4]:
from pydicom import dcmread
import pandas as pd 
import numpy as np
from processing import MammographyPreprocessor
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm, trange
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torch
from torchmetrics import Accuracy

In [7]:
# Collect the path of every sub-directory of data/train_images, then load the
# train/test metadata tables.
# `with` closes the scandir iterator as soon as the listing has been built.
with os.scandir('data/train_images') as entries:
    training_data_paths = [entry.path for entry in entries if entry.is_dir()]
train_csv = pd.read_csv('data/train.csv')
test_csv = pd.read_csv('data/test.csv')

In [8]:
# Display the training metadata table.
train_csv

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
0,2,10006,462822612,L,CC,61.0,0,0,0,,0,,29,False
1,2,10006,1459541791,L,MLO,61.0,0,0,0,,0,,29,False
2,2,10006,1864590858,R,MLO,61.0,0,0,0,,0,,29,False
3,2,10006,1874946579,R,CC,61.0,0,0,0,,0,,29,False
4,2,10011,220375232,L,CC,55.0,0,0,0,0.0,0,,21,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54701,1,9973,1729524723,R,MLO,43.0,0,0,0,1.0,0,C,49,False
54702,1,9989,63473691,L,MLO,60.0,0,0,0,,0,C,216,False
54703,1,9989,1078943060,L,CC,60.0,0,0,0,,0,C,216,False
54704,1,9989,398038886,R,MLO,60.0,0,0,0,0.0,0,C,216,True


In [9]:
# Re-read the training metadata from disk, rebinding `train_csv` to a fresh frame.
train_csv = pd.read_csv('data/train.csv')

In [10]:
class XRayDataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs.

    Patient ids and labels are looked up in ``{base_dir}/train.csv``; images are
    read from ``{base_dir}/train_images/{patient_id}/{image_id}.png``.

    Args:
        base_dir: Directory containing ``train.csv`` and ``train_images/``.
        image_ids: Iterable of image ids (list, array or pandas Series) that
            make up this dataset, in the order they should be served.
    """

    def __init__(self, base_dir, image_ids):
        self.base_dir = base_dir
        # Materialise as a plain list so __getitem__ indexes by position.
        # A pandas Series is indexed by *label*, which raises KeyError or
        # returns the wrong id once its index is no longer 0..n-1 (for
        # example after a shuffle/split or a re-index).
        self.image_ids = list(image_ids)
        # Use image_id as the lookup index while keeping it as a column.
        self.df = pd.read_csv(f'{base_dir}/train.csv').set_index('image_id', drop=False)

    def __len__(self):
        """Return the number of image ids in this dataset."""
        return len(self.image_ids)

    def __getitem__(self, i):
        """Return the ``i``-th sample.

        Returns:
            A tuple ``(image, label)``: ``image`` is a float tensor holding the
            pixel values divided by 255 with a leading axis of size 1 added;
            ``label`` is a long tensor holding the row's ``cancer`` value.
        """
        image_id = self.image_ids[i]
        row = self.df.loc[image_id]  # one lookup shared by both fields
        path = f"{self.base_dir}/train_images/{row['patient_id']}/{image_id}.png"
        # The context manager releases the file handle once the pixels are copied out.
        with Image.open(path) as xray:
            pixels = np.array(xray)
        image = torch.tensor(pixels / 255, dtype=torch.float)[None, :]
        label = torch.tensor(row['cancer'], dtype=torch.long)
        return image, label

In [11]:
# Dataset over every image listed in train.csv. The ids are handed over as a
# plain list because XRayDataset indexes `image_ids` by position; a pandas
# Series is indexed by label and breaks once `train_csv` is re-indexed.
train_dataset = XRayDataset('data', train_csv['image_id'].tolist())

In [12]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/val/test partition is identical on every run.
SPLIT_SEED = 42

X = train_csv['image_id']
y = train_csv['cancer']
# 75% / 25% hold-out split, then a further 75% / 25% train/validation split.
# Stratifying on the label keeps the class ratio the same in every partition.
X_train_total, X_test, y_train_total, y_test = train_test_split(
    X, y, test_size=.25, stratify=y, random_state=SPLIT_SEED)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_total, y_train_total, test_size=.25, stratify=y_train_total,
    random_state=SPLIT_SEED)
y_tetst = y_test  # alias keeping the original (misspelled) name available
# NOTE(review): rows are split per image, so images sharing a patient_id can
# land in different partitions — consider a grouped split.

In [13]:
# Wrap each split in a dataset and a loader; `bs` is the single batch size
# used by all three loaders.
bs = 64
trainset = XRayDataset('data', X_train.tolist())
testset = XRayDataset('data', X_test.tolist())
valset = XRayDataset('data', X_val.tolist())
# Only the training loader shuffles, so every epoch sees a different batch
# order; the evaluation loaders keep a fixed order.
train_loader = DataLoader(trainset, batch_size=bs, shuffle=True)
test_loader = DataLoader(testset, batch_size=bs)
val_loader = DataLoader(valset, batch_size=bs)

In [14]:
# Pull one batch from the training loader to inspect it.
sample = next(iter(train_loader))

In [15]:
# Shape of the first element of the batch's first component (the image tensors).
sample[0][0].shape

torch.Size([1, 256, 128])

In [16]:
from models import ResNet

In [17]:
# Instantiate the project's ResNet (imported from `models`) with num_classes=2 on device='cpu'.
# NOTE(review): the meaning of depth=56 / block_name='BottleNeck' is defined by models.ResNet, not visible here.
resnet = ResNet(depth=56, block_name='BottleNeck', num_classes=2, device='cpu')

In [None]:
import pytorch_lightning as pl 
import torch.nn as nn
import torch 

In [18]:
class PLResNet(pl.LightningModule):
    """LightningModule wrapper that trains a network with cross-entropy loss.

    Args:
        resnet: The module to train. Its output is passed to
            ``nn.CrossEntropyLoss`` and arg-maxed over dim 1 for predictions.
    """

    def __init__(self, resnet):
        super().__init__()
        self.resnet = resnet
        self.criterion = nn.CrossEntropyLoss()
        # Created once and registered as a submodule, so it follows the module
        # to the training device and accumulates over the whole validation
        # epoch instead of being rebuilt (on CPU) for every batch.
        self.val_accuracy = Accuracy(task='binary')

    def forward(self, x):
        """Return the wrapped network's output for ``x``."""
        return self.resnet(x)

    def training_step(self, batch, batch_idx):
        """Compute, log and return the cross-entropy loss for one training batch."""
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute the loss and update the accuracy metric for one validation batch."""
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        pred = torch.argmax(logits, dim=1)
        self.val_accuracy(pred, y)
        # Logging the metric object lets Lightning compute and reset it per epoch.
        self.log('accuracy', self.val_accuracy, on_epoch=True)
        self.log('val_loss', loss, on_epoch=True)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with lr=1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

In [19]:
# Wrap the network in the LightningModule and fit it, passing train_loader for
# training and val_loader for validation.
# NOTE(review): pl.Trainer() is created with no arguments, so epoch count,
# devices, etc. follow Lightning's defaults — confirm that is intended.
pl_resnet = PLResNet(resnet)
trainer = pl.Trainer()
trainer.fit(pl_resnet, train_loader, val_loader)

In [20]:
# Separate copy of the training metadata, read from disk again.
temp = pd.read_csv('data/train.csv')

In [21]:
# Give every distinct 'implant' value an integer code, numbered in order of
# first appearance, then re-express the column with those codes.
unique_vals = pd.unique(temp['implant'])
tmp_dct = dict(zip(unique_vals, range(len(unique_vals))))
target = temp['implant'].map(lambda val: tmp_dct[val])

In [22]:
# Check which integer codes occur in the encoded column.
pd.unique(target)

array([0, 1])

In [23]:
from sklearn.model_selection import ShuffleSplit

In [24]:
# The 'image_id' column of `temp`, used as the input of the split below.
image_ids = temp['image_id']

In [25]:
# Random splitter holding out 25% of the rows; the fixed random_state makes
# the drawn indices reproducible across runs.
rs = ShuffleSplit(n_splits=2, test_size=.25, random_state=42)

In [26]:
# Take the first split produced by `rs`: `x` receives the train indices and
# `y` the test indices.
# NOTE(review): this rebinds `y`, which an earlier cell uses for the 'cancer' column.
x, y = next(rs.split(image_ids))


In [35]:
# Index the metadata by image id (mutates `train_csv` in place; the 'image_id'
# column itself is kept, so the index and a column share the same name).
train_csv.index = train_csv['image_id']


In [28]:
# Keep only the rows whose `target_col` equals `target_val`, then collect
# the image ids of those rows.
target_col = 'cancer'
target_val = 1
is_target = train_csv[target_col].isin([target_val])
tmp = train_csv.loc[is_target]
total_ids = tmp.loc[:, 'image_id']

In [30]:
from sklearn.model_selection import ShuffleSplit

# Single random 80/20 split of the rows in `tmp`; the fixed random_state makes
# the drawn positions reproducible across runs.
rs = ShuffleSplit(n_splits=1, test_size=.2, random_state=42)
# The returned arrays are integer positions into `tmp` (use them with .iloc).
# NOTE(review): despite the `neg_` prefix, `tmp` was filtered to the rows
# where target_col == target_val (cancer == 1) in the preceding cell.
neg_train, neg_test = next(rs.split(tmp['image_id']))

In [34]:
# Show the rows of `tmp` at the positions drawn for the training side of the split.
tmp.iloc[neg_train]

Unnamed: 0_level_0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
138972646,1,21923,138972646,R,MLO,64.0,1,1,1,0.0,0,B,49,False
159974570,1,1963,159974570,L,MLO,67.0,1,1,1,0.0,0,B,49,False
1878879697,1,26700,1878879697,R,MLO,54.0,1,1,1,0.0,0,D,49,False
1926447510,1,11094,1926447510,L,CC,74.0,1,1,1,0.0,0,A,49,False
1749776218,1,19486,1749776218,L,CC,75.0,1,1,1,0.0,0,B,49,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2026358482,1,12305,2026358482,L,CC,43.0,1,1,0,0.0,0,A,49,False
1075848024,2,60617,1075848024,L,MLO,68.0,1,1,0,,0,,21,False
146985323,1,4953,146985323,R,CC,65.0,1,1,0,0.0,0,B,49,False
1510269247,2,8403,1510269247,L,MLO,68.0,1,1,0,,0,,21,False


In [84]:
# Keep only the rows of train_csv that satisfy every (column, allowed values)
# pair in `to_mimic`, by AND-ing one boolean mask per pair.
to_mimic = [('invasive', [1]), ('density', ['A', 'B']), ('cancer', [1])]
keep = np.ones(len(train_csv), dtype=bool)
for col_name, val in to_mimic:
    keep &= train_csv[col_name].isin(val).to_numpy()
tmp = train_csv[keep]

In [85]:
# Display the filtered rows.
tmp

Unnamed: 0_level_0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
388811999,1,10130,388811999,L,MLO,71.0,1,1,1,0.0,0,B,49,False
613462606,1,10130,613462606,L,CC,71.0,1,1,1,0.0,0,B,49,False
1360338805,1,10130,1360338805,L,CC,71.0,1,1,1,0.0,0,B,49,False
1672636630,1,10130,1672636630,L,MLO,71.0,1,1,1,0.0,0,B,49,False
195400299,1,10589,195400299,L,MLO,74.0,1,1,1,0.0,0,B,170,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302179668,1,9029,302179668,R,MLO,83.0,1,1,1,0.0,0,B,49,False
1906118149,1,9029,1906118149,R,CC,83.0,1,1,1,0.0,0,B,49,False
203061242,1,9559,203061242,L,CC,76.0,1,1,1,0.0,0,B,49,False
1047452753,1,9559,1047452753,L,MLO,76.0,1,1,1,0.0,0,B,49,False
