In [None]:
# all_slow

## Setup

In [1]:
#| include: false
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [4]:
from fastai.vision.all import *
from fastai.callback.tensorboard import TensorBoardCallback

import random
import cv2
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

import torch
import albumentations as alb

import segmentation_models_pytorch as smp

from steel_segmentation.utils import get_train_df
from steel_segmentation.transforms import SteelDataBlock, SteelDataLoaders
from steel_segmentation.losses import MultiClassesSoftBCEDiceLoss, LossEnabler
from steel_segmentation.metrics import ModDiceMulti
from steel_segmentation.optimizer import opt_func

In [5]:
def seed_everything(seed=69):
    """
    Seed every random number generator used in this notebook.

    Seeds `random`, `os.environ["PYTHONHASHSEED"]`, `numpy` and `torch`
    (the CPU generator and every CUDA device), and configures
    `torch.backends.cudnn` for deterministic behaviour.

    Args:
        seed: integer seed shared by all generators.
    """
    # Imported locally so the function does not depend on `os` leaking in
    # through the `fastai.vision.all` star import.
    import os

    random.seed(seed)
    # Only affects Python processes started after this point; the hash seed
    # of the running interpreter is fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    # `torch.cuda.manual_seed` alone leaves the CPU generator unseeded, which
    # is the one used for e.g. weight initialisation of modules built on CPU.
    torch.manual_seed(seed)
    # Safe to call without a GPU: the call is silently ignored in that case.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False

seed_everything()

Training parameters:

In [13]:
# Settings shared by the rest of the notebook.
path = Path("../data")  # directory that holds the dataset
size = (224, 512)       # crop size, unpacked as (height, width) by the augmentations
bs, epochs = 16, 30     # batch size for the dataloaders / number of epochs for `fit`
lr = 3e-4               # learning rate passed to `learner.fit`

## Data loading

In [14]:
# Load the training annotations with the project helper.
# NOTE(review): `only_faulty` / `pivot` presumably keep only images with at
# least one defect and spread `ClassId` into one column per class (the
# describe() output shows columns 1-4, `n` and `ClassIds`) — confirm in
# `steel_segmentation.utils.get_train_df`.
df = get_train_df(path, only_faulty=True, pivot=True)
# Summary statistics for every column, including the non-numeric ones.
df.describe(include="all")

ClassId,1,2,3,4,n,ClassIds
count,890,245,5078,789,6578.0,6578.0
unique,890,245,5078,789,,9.0
top,29102 12 29346 24 29602 24 29858 24 30114 24 30370 24 30626 24 30882 24 31139 23 31395 23 31651 23 31907 23 32163 23 32419 23 32675 23 77918 27 78174 55 78429 60 78685 64 78941 68 79197 72 79452 77 79708 81 79964 85 80220 89 80475 94 80731 98 80987 102 81242 105 81498 105 81754 104 82010 104 82265 105 82521 31 82556 69 82779 27 82818 63 83038 22 83080 57 83297 17 83342 50 83555 13 83604 44 83814 8 83866 37 84073 3 84128 31 84390 25 84652 18 84918 8 85239 10 85476 29 85714 47 85960 57 86216 57 86471 58 86727 58 86983 58 87238 59 87494 59 87750 59 88005 60 88261 60 88517 60 88772 61 89028 53...,145658 7 145901 20 146144 33 146386 47 146629 60 146872 73 147115 86 147364 93 147620 93 147876 93 148132 93 148388 93 148644 93 148900 93 149156 93 149412 93 149668 46,18661 28 18863 82 19091 110 19347 110 19603 110 19859 110 20115 110 20371 110 20627 110 20883 110 21139 110 21395 110 21651 110 21962 55 293125 251 293381 251 293637 251 293893 251 294149 251 294405 251 294661 251 294917 251 295173 251 295429 251 295685 251 295941 251 296197 251 296453 251 296709 251 296965 251 297221 251 297477 251 297733 251 297989 251 298245 251 298564 188 298945 63,131973 1 132228 4 132483 6 132738 8 132993 11 133248 13 133503 16 133757 19 134012 22 134267 24 134522 26 134777 29 135032 31 135287 34 135542 36 135796 40 136050 43 136304 46 136558 50 136812 54 137066 56 137320 59 137574 61 137828 63 138082 65 138336 68 138590 70 138845 71 139101 71 139356 73 139612 73 139868 73 140123 74 140379 74 140634 75 140890 75 141145 77 141400 78 141654 80 141909 81 142164 82 142418 84 142673 85 142928 86 143182 88 143437 89 143692 90 143946 93 144201 94 144456 95 144710 97 144965 98 145220 99 145474 101 145729 103 145983 105 146237 107 146491 109 146745 112 1469...,,3.0
freq,1,1,1,1,,4691.0
mean,,,,,1.064457,
std,,,,,0.24682,
min,,,,,1.0,
25%,,,,,1.0,
50%,,,,,1.0,
75%,,,,,1.0,


In [15]:
def get_train_aug(height, width):
    """Build the albumentations pipeline used for training samples.

    Steps, in order:
      1. a random crop of `height` x `width` (always applied);
      2. with probability 0.5, one of a vertical or a horizontal flip;
      3. with probability 0.5, a brightness/contrast jitter with both
         limits set to (-0.2, 0.2).

    Returns the steps wrapped in a single `alb.Compose`.
    """
    crop = alb.RandomCrop(height, width, p=1.0)
    flip = alb.OneOf(
        [alb.VerticalFlip(p=0.5), alb.HorizontalFlip(p=0.5)],
        p=0.5,
    )
    jitter = alb.RandomBrightnessContrast(
        brightness_limit=(-0.2, 0.2),
        contrast_limit=(-0.2, 0.2),
        p=0.5,
    )
    return alb.Compose([crop, flip, jitter])

def get_valid_aug(height, width):
    """Build the albumentations pipeline used for validation samples.

    The only step is a `height` x `width` crop. It is an `alb.RandomCrop`,
    so the region seen for a given validation image is not fixed.
    """
    return alb.Compose([alb.RandomCrop(height, width, p=1.0)])

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Both pipelines crop to `size`, unpacked as (height, width).
train_aug, valid_aug = get_train_aug(*size), get_valid_aug(*size)

# Project wrappers that turn the annotation dataframe into dataloaders.
block = SteelDataBlock(
    path,
    train_aug=train_aug,
    valid_aug=valid_aug,
)
dls = SteelDataLoaders(block, df, bs=bs, device=device)

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ..\aten\src\ATen\native\BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


In [16]:
# Sanity check: pull a single batch and compare input and target shapes.
xb, yb = dls.one_batch()
print(xb.shape, yb.shape)

torch.Size([16, 3, 224, 512]) torch.Size([16, 4, 224, 512])


## Model

In [17]:
# U-Net with a ResNet-18 encoder initialised from ImageNet weights and four
# output channels. No final activation is attached (`activation=None`); the
# loss and the metric below are both configured to work on logits.
model = smp.Unet(
    encoder_name="resnet18",
    encoder_weights="imagenet",
    classes=4,
    activation=None,
)

# Per-class positive weights to counter the class imbalance.
criterion = BCEWithLogitsLossFlat(
    pos_weight=torch.tensor([2.0, 2.0, 1.0, 1.5]),
)

# Rebinds the `opt_func` imported from `steel_segmentation.optimizer`: the
# fastai `RAdam` is used directly, so no PyTorch optimizer wrapper is needed.
opt_func = RAdam

model_dir = Path("../models")
metrics = [ModDiceMulti(with_logits=True)]

In [23]:
# Bundle data, model, loss, optimizer and metric into a fastai Learner.
# `LossEnabler` is the project callback; the training-loop listing shows it
# running on the `after_pred` event.
learner = Learner(
    dls=dls, model=model,
    loss_func=criterion, opt_func=opt_func,
    metrics=metrics, model_dir=model_dir,
    cbs=[LossEnabler],
)

  warn(f"You are shadowing an attribute ({name}) that exists in the learner. Use `self.learn.{name}` to avoid this")


In [24]:
# Layer-by-layer overview of the model: output shapes, parameter counts and
# trainable flags.
learner.summary()

  return np.nanmean(binary_dice_scores)


Unet (Input shape: 16)
Layer (type)         Output Shape         Param #    Trainable 
                     16 x 64 x 112 x 256 
Conv2d                                    9408       True      
BatchNorm2d                               128        True      
ReLU                                                           
MaxPool2d                                                      
Conv2d                                    36864      True      
BatchNorm2d                               128        True      
ReLU                                                           
Conv2d                                    36864      True      
BatchNorm2d                               128        True      
Conv2d                                    36864      True      
BatchNorm2d                               128        True      
ReLU                                                           
Conv2d                                    36864      True      
BatchNorm2d                            

In [25]:
# List which callbacks are attached to each event of the training loop.
learner.show_training_loop()

Start Fit
   - before_fit     : [TrainEvalCallback, Recorder, ProgressCallback]
  Start Epoch Loop
     - before_epoch   : [Recorder, ProgressCallback]
    Start Train
       - before_train   : [TrainEvalCallback, Recorder, ProgressCallback]
      Start Batch Loop
         - before_batch   : []
         - after_pred     : [LossEnabler]
         - after_loss     : []
         - before_backward: []
         - before_step    : []
         - after_step     : []
         - after_cancel_batch: []
         - after_batch    : [TrainEvalCallback, Recorder, ProgressCallback]
      End Batch Loop
    End Train
     - after_cancel_train: [Recorder]
     - after_train    : [Recorder, ProgressCallback]
    Start Valid
       - before_validate: [TrainEvalCallback, Recorder, ProgressCallback]
      Start Batch Loop
         - **CBs same as train batch**: []
      End Batch Loop
    End Valid
     - after_cancel_validate: [Recorder]
     - after_validate : [Recorder, ProgressCallback]
  End Epoch Loop


Logging with the TensorBoardCallback:

In [27]:
# logging info
log_dir = Path("../logs") / f"unet_resnet_bce_epochs{epochs}_lr{lr}"
log_dir

Path('../logs/unet_resnet_bce_epochs30_lr0.0003')

In [29]:
# Callbacks handed to `learner.fit` for this run.
train_cbs = [
    # Write training logs (including predictions and the traced model, but
    # no projector data) under `log_dir`.
    TensorBoardCallback(
        log_dir=log_dir,
        log_preds=True,
        trace_model=True,
        projector=False,
    ),
    # NOTE(review): fastai appears to count `n_acc` in samples rather than
    # batches; with bs=16 that would mean an optimizer step every second
    # batch — confirm that 24 is the intended value.
    GradientAccumulation(n_acc=24),
    # Checkpoint on `valid_loss`, named after the log directory, with the
    # optimizer state included.
    SaveModelCallback(
        monitor="valid_loss",
        fname=log_dir.name,
        with_opt=True,
    ),
]

In [32]:
# Train for `epochs` epochs with learning rate `lr`, passing the logging,
# gradient-accumulation and checkpoint callbacks from `train_cbs`.
learner.fit(epochs, lr=lr, cbs=train_cbs)

epoch,train_loss,valid_loss,mod_dice_multi,time
0,0.051801,0.040153,0.527716,04:04
