In [None]:
!pip install lightning

In [12]:
# Overview of data distribution

import pandas as pd
import numpy as np
import os
import pydicom

train_labels = pd.read_csv('/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_train_labels.csv')

#-------------------------------
print('Target distribution')
n_zeros = len(train_labels[train_labels['Target']==0])
n_ones = len(train_labels[train_labels['Target']==1])
n_else = len(train_labels)-n_zeros-n_ones
print(f"Ones: {n_ones}, Zeros: {n_zeros}, else: {n_else}")

#------------------------------
print('bounding box position distribution')
bboxes = train_labels[['x','y','width','height']].dropna()
x_info = {
    'min': np.min(bboxes['x']),
    'mean': np.mean(bboxes['x']),
    'max': np.max(bboxes['x']),
    'std': np.std(bboxes['x'])
}
y_info = {
    'min': np.min(bboxes['y']),
    'mean': np.mean(bboxes['y']),
    'max': np.max(bboxes['y']),
    'std': np.std(bboxes['y'])
}
w_info = {
    'min': np.min(bboxes['width']),
    'mean': np.mean(bboxes['width']),
    'max': np.max(bboxes['width']),
    'std': np.std(bboxes['width'])
}
h_info = {
    'min': np.min(bboxes['height']),
    'mean': np.mean(bboxes['height']),
    'max': np.max(bboxes['height']),
    'std': np.std(bboxes['height'])
}
print(f"bbox x distribution - {x_info}")
print(f"bbox y distribution - {y_info}")
print(f"bbox w distribution - {w_info}")
print(f"bbox h distribution - {h_info}")

#-----------------------------
print('pixel data distribution')
patient100 = train_labels['patientId'].sample(100)
p_info = {'min':1e+10,'mean':0,'max':0,'std':0}
for pid in patient100:
    dcm_root_path = '/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_train_images'
    dcm_path = os.path.join(dcm_root_path,f'{pid}.dcm')
    img = pydicom.read_file(dcm_path).pixel_array
    p_info['min'] = min(p_info['min'],np.min(img))
    p_info['mean'] += np.mean(img)
    p_info['max'] = max(p_info['max'],np.max(img))
    p_info['std'] += np.std(img)
    
p_info['mean'] /= 100
p_info['std'] /= 100

print(f"pixel value distribution - {p_info}")


Target distribution
Ones: 9555, Zeros: 20672, else: 0
bounding box position distribution
bbox x distribution - {'min': 2.0, 'mean': 394.04772370486654, 'max': 835.0, 'std': 204.56346707193563}
bbox y distribution - {'min': 2.0, 'mean': 366.83956043956044, 'max': 881.0, 'std': 148.93269371624748}
bbox w distribution - {'min': 40.0, 'mean': 218.4713762428048, 'max': 528.0, 'std': 59.28637267069208}
bbox h distribution - {'min': 45.0, 'mean': 329.2697017268446, 'max': 942.0, 'std': 157.7424995927731}
pixel data distribution
pixel value distribution - {'min': 0, 'mean': 126.79892159461976, 'max': 255, 'std': 57.961084378973744}


In [1]:
import lightning as L

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.transforms import functional as F


class CustomTransform(nn.Module):
    # totensor
    # random horizontal flip
    # random crop
    # normalize
    def __init__(self, size, random=True):
        self.size = size
        self.p = 0.5
        self.mean = 126
        self.std = 57
        self.random=random
        
    def forward(self, data):
        # split image and bbox label
        img, bbox = data
        width = img.shape[-1]
        # numpy to tensor
        img = torch.from_numpy(img, dtype=torch.float32)
        if self.random:
            # random horizontal flip
            if torch.rand(1) < self.p:
                img = F.hflip(img)
                bbox[0] = width-bbox[0]-bbox[2]
            # random crop (did not implement padding)
            i, j, h, w = transforms.RandomCrop.get_params(img, self.size)
            img = F.crop(img,i,j,h,w)
            bbox[0] = bbox[0] - j if bbox[0] > j else 0
            bbox[1] = bbox[1] - i if bbox[1] > i else 0
            bbox[2] = w-bbox[0]-1 if bbox[0]+bbox[2] >= w else bbox[2]
            bbox[3] = h-bbox[1]-1 if bbox[1]+bbox[3] >= h else bbox[3]
        # normalize
        img = F.normalize(img, self.mean, self.std)
        
        return img, bbox
    
    def __repr__(self):
        out = "Custom Transform to transform both the image and the bbox\n"
        out += "\ttorch.from_numpy()\n"
        if self.random:
            out += f"\tRandomHorizontalFlip(p={self.p})\n"
            out += f"\tRandomCrop({self.size}, padding=False)\n"
        out += f"\tNormalize(mean={self.mean},std={self.std})"
        return out
        
        
class CustomDataset(Dataset):
    def __init__(self, root, df, transform):
        super(MyDataset).__init__()
        self.root = root
        self.df = df
        self.transform = transform
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # dtype of float32 implemented only
        # convert bbox to tensor
        bbox = torch.Tensor(row[1:5], dtype=torch.float32)
        label = torch.Tensor(row[5], dtype=torch.float32)
        
        pid = row[0]
        dcm_path = os.path.join(self.root,f'{pid}.dcm')
        img = pydicom.read_file(dcm_path).pixel_array
        
        img, bbox = self.transform((img,bbox))
        
        return pid, img, bbox, label
        
    

class PDCDataModule(L.LightningDataModule):
    def __init__(self, data_dir='./', batch_size=1,num_workers=0):
        super().__init__()
        self.data_dir = data_dir
        if isinstance(batch_size,int):
            self.tr_batch_size=batch_size
            self.val_batch_size=batch_size
        elif len(batch_size) == 2:
            self.tr_batch_size=batch_size[0]
            self.val_batch_size=batch_size[1]
        else:
            raise ValueError(batch_size) 
        self.num_workers=num_workers
        
    def prepare_data(self, fold=0, random_seed=42)
        # read full dataframe
        full_df = pd.read_csv(os.path.join(
            self.data_dir,'stage_2_train_labels.csv'))
        df_0 = full_df[full_df['Target']==0]
        df_1 = full_df[full_df['Target']==1]
        # apply undersampling to target==0
        df_00 = df_0.sample(frac=0.25,random_state=random_seed)
        df_01 = df_0.drop(df_00.index).sample(n=len(df_00),random_state=random_seed)
        df_10 = df_1.sample(frac=0.5,random_state=random_seed)
        df_11 = df_1.drop(df_10.index)
        # Train Test Split: split the dataframe
        if fold == 0:
            self.tr_df = pd.concat((df_00,df_10))
            self.val_df = pd.concat((df_01,df_11))
        elif fold == 1:
            self.tr_df = pd.concat((df_01,df_11))
            self.val_df = pd.concat((df_00,df_10))
        else:
            raise ValueError('fold should be either 0 or 1')
        
    def setup(self, size):
        self.tr_dset = CustomDataset(
            root=os.path.join(self.data_dir,'stage_2_train_images'),
            df=self.tr_df,
            transform=CustomTransform(size))
        self.val_dset = CustomDataset(
            root=os.path.join(self.data_dir,'stage_2_train_images'),
            df=self.val_df,
            transform=CustomTransform(size, random=False))
    
    def train_dataloader(self):
        return DataLoader(
                    dataset=self.tr_dset, 
                    batch_size=self.tr_batch_size, 
                    suffle=True, num_workers=self.num_workers)
    def val_dataloader(self):
        return DataLoader(
                    dataset=self.val_dset, 
                    batch_size=self.val_batch_size, 
                    shuffle=False, num_workers=self.num_workers)
    

SyntaxError: invalid syntax (2449199879.py, line 7)