In [1]:
import detectron2
from pathlib import Path
import random, cv2, os
import matplotlib.pyplot as plt
import numpy as np
import pycocotools.mask as mask_util
# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor, DefaultTrainer
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer, ColorMode
from detectron2.data import MetadataCatalog, DatasetCatalog
from detectron2.data.datasets import register_coco_instances
from detectron2.utils.logger import setup_logger
from detectron2.evaluation.evaluator import DatasetEvaluator
from detectron2.structures import polygons_to_bitmask
from detectron2.evaluation import inference_on_dataset, print_csv_format
from detectron2.utils import comm
from detectron2.modeling.meta_arch.rcnn import GeneralizedRCNN
from detectron2.engine import BestCheckpointer
from detectron2.checkpoint import DetectionCheckpointer
from glob import glob
from src.swin.swint.config import add_swinl_384_config
from src.evaluator import MAPIOUEvaluator
import torch

# Set up the detectron2 logger; the recorded cell output shows the returned
# Logger named "detectron2" at DEBUG level.
setup_logger()

<Logger detectron2 (DEBUG)>

In [2]:
# Register the three LIVECell splits as COCO-format datasets. The train and val
# splits read images from the same folder; the test split has its own folder.
dataDir = Path('LIVECell_dataset_2021/images')
register_coco_instances(
    'sartorius_train',
    {},
    'LIVECell_dataset_2021/annotations/LIVECell/livecell_coco_train.json',
    dataDir/'livecell_train_val_images',
)
register_coco_instances(
    'sartorius_val',
    {},
    'LIVECell_dataset_2021/annotations/LIVECell/livecell_coco_val.json',
    dataDir/'livecell_train_val_images',
)
register_coco_instances(
    'sartorius_test',
    {},
    'LIVECell_dataset_2021/annotations/LIVECell/livecell_coco_test.json',
    dataDir/'livecell_test_images',
)

# Fetching the train split from the catalog loads the annotation file
# (the recorded log shows this taking about 10 seconds).
metadata = MetadataCatalog.get('sartorius_train')
train_ds = DatasetCatalog.get('sartorius_train')

# YAML config merged into the training config later in the notebook; its file
# name is also used to build the best-checkpoint file prefix in Trainer.
# (The throwaway `cfg = get_cfg()` that used to live here was removed: the
# config is built from scratch in the configuration cell before any use.)
config_name = "lib/swin/configs/SwinT/mask_rcnn_swint_T_FPN_3x.yaml"

[32m[12/22 11:07:01 d2.data.datasets.coco]: [0mLoading LIVECell_dataset_2021/annotations/LIVECell/livecell_coco_train.json takes 10.29 seconds.
[32m[12/22 11:07:01 d2.data.datasets.coco]: [0mLoaded 3253 images in COCO format from LIVECell_dataset_2021/annotations/LIVECell/livecell_coco_train.json


In [3]:
class Trainer(DefaultTrainer):
    """DefaultTrainer that evaluates with MAPIOUEvaluator and keeps the best checkpoint."""

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        """Return a MAPIOUEvaluator for ``dataset_name``.

        ``cfg`` and ``output_folder`` are accepted to match the DefaultTrainer
        interface but are not used here.
        """
        return MAPIOUEvaluator(dataset_name)

    def build_hooks(self):
        """Return the default hooks plus a BestCheckpointer tracking ``segm/AP``.

        The checkpoint file prefix is ``<config file stem>_best``, derived from
        the notebook-level ``config_name``.
        """
        hooks = super().build_hooks()
        # Use splitext to drop the extension: str.rstrip(".yaml") strips any
        # trailing run of the characters ".", "y", "a", "m", "l" rather than the
        # suffix, which would mangle stems such as "..._small.yaml".
        config_stem = os.path.splitext(os.path.basename(config_name))[0]
        hooks.append(
            BestCheckpointer(
                # Read the period from the trainer's own config rather than the
                # notebook-global `cfg`, so the hook follows the config this
                # trainer was actually built with.
                self.cfg.TEST.EVAL_PERIOD,
                self.checkpointer,
                # NOTE(review): assumes MAPIOUEvaluator reports its score under
                # "segm" -> "AP" — verify against src/evaluator.
                'segm/AP',
                file_prefix=f'{config_stem}_best',
            )
        )
        return hooks


In [7]:
# Build the training config: detectron2 defaults, then the Swin-specific keys,
# then the YAML file, then the notebook overrides below. Each key is set exactly
# once; earlier versions of this cell assigned several keys twice with
# conflicting values (e.g. BATCH_SIZE_PER_IMAGE 256 then 128) — the values kept
# here are the ones that were actually in effect.
cfg = get_cfg()
add_swinl_384_config(cfg)
cfg.merge_from_file(config_name)

# Datasets.
# NOTE(review): the LIVECell test split is part of TRAIN — presumably deliberate
# (validation uses the separate val split); confirm.
cfg.DATASETS.TRAIN = ("sartorius_train", "sartorius_test")
cfg.DATASETS.TEST = ("sartorius_val",)

# Initial weights loaded when training starts without resuming.
cfg.MODEL.WEIGHTS = "lib/swin/swin_large_patch4_window12_384_22kto1k_d2.pth"

# Data loading and input augmentation.
# NOTE(review): the recorded run of this notebook ended with a CUDA
# out-of-memory error on a 23.70 GiB GPU using these settings.
cfg.SOLVER.IMS_PER_BATCH = 2
cfg.DATALOADER.NUM_WORKERS = 10
cfg.INPUT.CROP.ENABLED = True
cfg.INPUT.CROP.SIZE = [0.85, 0.97]

# Model heads.
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1
cfg.MODEL.RETINANET.NUM_CLASSES = 1
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = .5
cfg.MODEL.RPN.BBOX_REG_LOSS_TYPE = "ciou"
cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_TYPE = "ciou"

# Solver.
cfg.SOLVER.BASE_LR = 0.02
cfg.SOLVER.AMP.ENABLED = True
cfg.SOLVER.CHECKPOINT_PERIOD = 1000
# Evaluate every 1000 iterations. With 2 images per batch that is 2000 images,
# i.e. less than one pass over the 3253-image train split alone.
cfg.TEST.EVAL_PERIOD = 1000

# Checkpoints are written here.
cfg.OUTPUT_DIR = 'swin_l_384_weights'
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

In [9]:
# Build the trainer from the config assembled in the configuration cell and
# start a fresh run: with resume=False the recorded log reports
# "Starting training from iteration 0".
# NOTE(review): this line was previously annotated "without data augmentation",
# but the configuration cell sets cfg.INPUT.CROP.ENABLED = True — presumably
# random cropping is applied during training; confirm.
trainer = Trainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()

[32m[12/22 11:08:45 d2.engine.defaults]: [0mModel:
GeneralizedRCNN(
  (backbone): FPN(
    (fpn_lateral2): Conv2d(192, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral3): Conv2d(384, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral4): Conv2d(768, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral5): Conv2d(1536, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (top_block): LastLevelMaxPool()
    (bottom_up): SwinTransformer(
      (patch_embed): PatchEmbed(
        (proj): Conv2d(3, 192, kernel_size=(4, 4), stride=(4, 4))
        (norm): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
      )
      (pos_drop): Dropout(p

Some model parameters or buffers are not found in the checkpoint:
[34mbackbone.bottom_up.norm0.{bias, weight}[0m
[34mbackbone.bottom_up.norm1.{bias, weight}[0m
[34mbackbone.bottom_up.norm2.{bias, weight}[0m
[34mbackbone.bottom_up.norm3.{bias, weight}[0m
[34mbackbone.fpn_lateral2.{bias, weight}[0m
[34mbackbone.fpn_lateral3.{bias, weight}[0m
[34mbackbone.fpn_lateral4.{bias, weight}[0m
[34mbackbone.fpn_lateral5.{bias, weight}[0m
[34mbackbone.fpn_output2.{bias, weight}[0m
[34mbackbone.fpn_output3.{bias, weight}[0m
[34mbackbone.fpn_output4.{bias, weight}[0m
[34mbackbone.fpn_output5.{bias, weight}[0m
[34mproposal_generator.rpn_head.anchor_deltas.{bias, weight}[0m
[34mproposal_generator.rpn_head.conv.{bias, weight}[0m
[34mproposal_generator.rpn_head.objectness_logits.{bias, weight}[0m
[34mroi_heads.box_head.0.fc1.{bias, weight}[0m
[34mroi_heads.box_head.0.fc2.{bias, weight}[0m
[34mroi_heads.box_head.1.fc1.{bias, weight}[0m
[34mroi_heads.box_head.1.fc2.{bia

[32m[12/22 11:09:11 d2.engine.train_loop]: [0mStarting training from iteration 0


  for poly in cropped:
  for poly in cropped:
  for poly in cropped:
  for poly in cropped:
  for poly in cropped:
  max_size = (max_size + (stride - 1)) // stride * stride
  for poly in cropped:
  for poly in cropped:


[32m[12/22 11:09:12 d2.utils.memory]: [0mAttempting to copy inputs of <function pairwise_iou at 0x7f2adf0bc940> to CPU due to CUDA OOM




[4m[5m[31mERROR[0m [32m[12/22 11:09:14 d2.engine.train_loop]: [0mException during training:
Traceback (most recent call last):
  File "/home/nmark/kaggle/sartorius/detectron_sartorius/venv/lib/python3.8/site-packages/detectron2/engine/train_loop.py", line 149, in train
    self.run_step()
  File "/home/nmark/kaggle/sartorius/detectron_sartorius/venv/lib/python3.8/site-packages/detectron2/engine/defaults.py", line 494, in run_step
    self._trainer.run_step()
  File "/home/nmark/kaggle/sartorius/detectron_sartorius/venv/lib/python3.8/site-packages/detectron2/engine/train_loop.py", line 395, in run_step
    loss_dict = self.model(data)
  File "/home/nmark/kaggle/sartorius/detectron_sartorius/venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/nmark/kaggle/sartorius/detectron_sartorius/venv/lib/python3.8/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 154, in forward
    features = 

RuntimeError: CUDA out of memory. Tried to allocate 92.00 MiB (GPU 0; 23.70 GiB total capacity; 20.47 GiB already allocated; 94.56 MiB free; 20.94 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF