
# Assignment 3

This is a template notebook for Assignment 3.


## Install dependencies and initialization

In [None]:
import os
p = os.getenv('PATH')
ld = os.getenv('LD_LIBRARY_PATH')
os.environ['PATH'] = f"/usr/local/cuda-11.1/bin:{p}"
os.environ['LD_LIBRARY_PATH'] = f"/usr/local/cuda-11.1/lib64:{ld}"
# change pytorch to 1.9.0 compiled with cuda 11.1
!pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html


# install dependencies: 
!pip install pyyaml==5.1 pycocotools>=2.0.1
# !pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.6/index.html
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu111/torch1.9/index.html

!pip install opencv-python
!pip install h5py
!pip install ipywidgets
!pip install -U albumentations --no-binary qudida,albumentations
!pip install setuptools==59.5.0

In [None]:
!pwd # shows current directory
!ls  # shows all files in this directory
!nvidia-smi # shows the specs and the current status of the allocated GPU

In [None]:
# import some common libraries
# Since I use an external computer and not colab
# from google.colab.patches import cv2_imshow
import cv2
from sklearn.metrics import jaccard_score
from PIL import Image, ImageDraw
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import datetime
import random
import json
import cv2
import csv
import os

# import some common pytorch utilities
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torch.autograd import Variable
import torch.nn.functional as F
import torch.nn as nn
import torch

# import some common detectron2 utilities
import detectron2
from detectron2 import model_zoo
from detectron2.config import get_cfg
from detectron2.structures import BoxMode
from detectron2.engine import DefaultTrainer
from detectron2.engine import DefaultPredictor
from detectron2.utils.logger import setup_logger
from detectron2.utils.visualizer import ColorMode
from detectron2.utils.visualizer import Visualizer
from detectron2.data import build_detection_test_loader
from detectron2.data import MetadataCatalog, DatasetCatalog
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
setup_logger(output="output")

In [None]:
# Make sure that GPU is available for your notebook. 
# Otherwise, you need to update the settings in Runtime -> Change runtime type -> Hardware accelerator
torch.cuda.is_available()

In [None]:
# You need to mount your google drive in order to load the data:
# from google.colab import drive
# drive.mount('/content/drive')
# Put all the corresponding data files in a data folder and put the data folder in a same directory with this notebook.
# Also create an output directory for your files such as the trained models and the output images.

In [None]:
# Define the location of current directory, which should contain data/train, data/test, and data/train.json.
# TODO: approx 1 line
# BASE_DIR = '/content/drive/My Drive/Colab Notebooks/03-cnn-detection-segmentation'
BASE_DIR = '.'
OUTPUT_DIR = '{}/output'.format(BASE_DIR)
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# import h5py
# data_dirs = '{}/data'.format(BASE_DIR)

# train_dir = os.path.join(data_dirs, 'train')
# test_dir = os.path.join(data_dirs, 'test')

# train_images = os.listdir(train_dir)
# test_images = os.listdir(test_dir)

# fileName = 'data.h5'
# with h5py.File(fileName, "w") as out:
#     train = out.create_group("train")
#     for image in train_images:
#         file_name = os.path.join(data_dirs, "train", image)
#         img = cv2.imread(file_name)
#         train.create_dataset(
#             name = image,
#             data = img,
#             shape = img.shape,
#             maxshape = img.shape,
#             dtype="u1",
#             compression="gzip"
#         )
#     test = out.create_group("test")
#     for image in test_images:
#         file_name = os.path.join(data_dirs, "test", image)
#         img = cv2.imread(file_name)
#         test.create_dataset(
#             name = image,
#             data = img,
#             shape = img.shape,
#             maxshape = img.shape,
#             dtype="u1",
#             compression="gzip"
#         )
# with h5py.File('data.h5', "r") as read:
#     print(type(read['train']["P0167.png"]))

## Part 1: Object Detection

### Data Loader

In [None]:
from torch.utils import data

# File extensions that are treated as images.
IMAGE_EXTENSIONS = (".png", ".jpeg", ".jpg")
# Fixed seed so the train/val split is identical across kernel restarts.
SPLIT_SEED = 42

data_dirs = '{}/data'.format(BASE_DIR)

# os.listdir returns entries in arbitrary order; sorting makes the split depend
# only on SPLIT_SEED and not on the file system.
TRAIN_IMAGES = sorted(
    name for name in os.listdir('{}/{}'.format(data_dirs, "train"))
    if name.endswith(IMAGE_EXTENSIONS)
)
total_len = len(TRAIN_IMAGES)
train_len = int(0.80 * total_len)
val_len = total_len - train_len

# 80/20 split of the labelled images; both results are torch Subset objects
# that iterate over file names.
TRAIN_IMAGES_SPLIT, VAL_IMAGES_SPLIT = data.random_split(
    TRAIN_IMAGES,
    [train_len, val_len],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

TEST_IMAGES = sorted(
    name for name in os.listdir('{}/{}'.format(data_dirs, "test"))
    if name.endswith(IMAGE_EXTENSIONS)
)

In [None]:
'''
# This function should return a list of data samples in which each sample is a dictionary. 
# Make sure to select the correct bbox_mode for the data
# For the test data, you only have access to the images, therefore, the annotations should be empty.
# Other values could be obtained from the image files.
# TODO: approx 35 lines
'''

def get_detection_data(set_name):
  """Build the list of detectron2-style sample dicts for one split.

  Args:
    set_name: one of 'train', 'val' or 'test'.  'train' and 'val' both read
      images from ``data/train`` (they are the two halves of the split);
      'test' reads from ``data/test`` and carries no annotations.

  Returns:
    A list of dicts with the keys ``file_name``, ``height``, ``width``,
    ``image_id`` and ``annotations``.  Each annotation holds the ``bbox``
    and ``segmentation`` copied from ``train.json``, ``bbox_mode`` set to
    ``BoxMode.XYWH_ABS`` and ``category_id`` 0.

  Raises:
    ValueError: if ``set_name`` is not a known split.
    FileNotFoundError: if an image listed for the split cannot be read.
  """
  if set_name not in ('train', 'val', 'test'):
    raise ValueError(
        "set_name must be 'train', 'val' or 'test', got {!r}".format(set_name))

  data_dirs = '{}/data'.format(BASE_DIR)

  # Group the annotation records by image file name.  Only the labelled
  # splits need them, so the test split does not touch train.json at all.
  train_records = {}
  if set_name != 'test':
    with open('{}/train.json'.format(data_dirs)) as json_file:
      train_json = json.load(json_file)
    for train_record in train_json:
      train_records.setdefault(train_record['file_name'], []).append(train_record)

  # Validation images live in the train folder.
  image_dir = '{}/{}'.format(data_dirs, 'train' if set_name == 'val' else set_name)
  IMAGES_LIST = {
      'train': TRAIN_IMAGES_SPLIT,
      'val': VAL_IMAGES_SPLIT,
      'test': TEST_IMAGES,
  }[set_name]

  dataset = []
  for idx, image_name in enumerate(IMAGES_LIST):
    file_name = '{}/{}'.format(image_dir, image_name)
    if not file_name.endswith((".png", ".jpg", ".jpeg")):
      continue

    image = cv2.imread(file_name)
    if image is None:
      # cv2.imread signals failure by returning None instead of raising.
      raise FileNotFoundError('Could not read image: {}'.format(file_name))
    height, width = image.shape[:2]

    # train_records is empty for the test split, so annotations stay empty.
    annotations = [
        {
            "bbox": train_record['bbox'],
            "segmentation": train_record['segmentation'],
            "bbox_mode": BoxMode.XYWH_ABS,
            "category_id": 0
        }
        for train_record in train_records.get(image_name, [])
    ]

    dataset.append({
        'file_name': file_name,
        'height': height,
        'width': width,
        'image_id': idx,
        'annotations': annotations,
    })
  return dataset

In [None]:
'''
# Remember to add your dataset to DatasetCatalog and MetadataCatalog
# Consider "data_detection_train" and "data_detection_test" for registration
# You can also add an optional "data_detection_val" for your validation by splitting the training data
# TODO: approx 5 lines
'''

# Start from an empty catalog so this cell can be re-run without
# "already registered" errors.
DatasetCatalog.clear()

# Register the three splits under "data_detection_<split>"; each one shares
# the single "plane" class.  The default argument binds the split name at
# definition time so every lambda loads its own split.
for _split in ("train", "val", "test"):
    _dataset_name = "data_detection_{}".format(_split)
    DatasetCatalog.register(_dataset_name, lambda d=_split: get_detection_data(d))
    MetadataCatalog.get(_dataset_name).set(thing_classes=["plane"])

# Metadata handle reused by the visualisation cells.
plane_metadata = MetadataCatalog.get("data_detection_train")

In [None]:
'''
# Visualize some samples using Visualizer to make sure that the function works correctly
# TODO: approx 5 lines
'''
import matplotlib.pyplot as plt


# Draw the ground-truth annotations on up to three random training images.
dataset_dicts = get_detection_data("train")
# Never ask for more samples than exist (random.sample raises otherwise).
for d in random.sample(dataset_dicts, min(3, len(dataset_dicts))):
    print(d["file_name"])
    img = cv2.imread(d["file_name"])
    # OpenCV loads images as BGR; the channel flip hands RGB to the Visualizer.
    visualizer = Visualizer(img[:, :, ::-1], metadata=plane_metadata, scale=0.5)
    out = visualizer.draw_dataset_dict(d)
    plt.figure(figsize=(5,5))
    # get_image() already returns RGB, which is what imshow expects, so no
    # further channel conversion is needed.
    plt.imshow(out.get_image(), aspect="auto")
    plt.show()


In [None]:
import matplotlib.pyplot as plt


# Draw the ground-truth annotations on up to three random validation images.
dataset_dicts = get_detection_data("val")
# Never ask for more samples than exist (random.sample raises otherwise).
for d in random.sample(dataset_dicts, min(3, len(dataset_dicts))):
    print(d["file_name"])
    img = cv2.imread(d["file_name"])
    # OpenCV loads images as BGR; the channel flip hands RGB to the Visualizer.
    visualizer = Visualizer(img[:, :, ::-1], metadata=plane_metadata, scale=0.5)
    out = visualizer.draw_dataset_dict(d)
    plt.figure(figsize=(5,5))
    # get_image() already returns RGB, which is what imshow expects, so no
    # further channel conversion is needed.
    plt.imshow(out.get_image(), aspect="auto")
    plt.show()

### Set Configs

In [None]:
'''
# Set the configs for the detection part in here.
# TODO: approx 15 lines
'''
# Baseline detector: Faster R-CNN (ResNet-101 + FPN) fine-tuned from the COCO
# model-zoo checkpoint on the single-class plane dataset.
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml"))
cfg.DATASETS.TRAIN = ("data_detection_train",)
cfg.DATASETS.TEST = ("data_detection_val",)
cfg.DATALOADER.NUM_WORKERS = 1
cfg.SOLVER.IMS_PER_BATCH = 2  
cfg.SOLVER.BASE_LR = 0.00025  
cfg.SOLVER.MAX_ITER = 500    
cfg.SOLVER.STEPS = []        # no learning-rate decay milestones
# cfg.MODEL.DEVICE="cuda"
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml")
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512   # The "RoIHead batch size"; kept at 512 here (a smaller value such as 128 would train faster)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1  # only one class (plane). (see https://detectron2.readthedocs.io/tutorials/datasets.html#update-the-config-for-new-datasets)
cfg.TEST.EVAL_PERIOD = 100

# Checkpoints, logs and evaluation files of the baseline run go here.
cfg.OUTPUT_DIR = "{}/output/detection_baseline/".format(BASE_DIR)
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)



In [None]:
from detectron2.data import build_detection_train_loader, detection_utils as utils, transforms as T

import h5py
import torch
import copy

# ``data.h5`` is an optional cache of pre-decoded images.  The cell that builds
# it is commented out in this notebook, so the file may not exist; in that
# case ``read``/``train`` are None and the mapper decodes images from disk.
H5_CACHE_PATH = "data.h5"
read = h5py.File(H5_CACHE_PATH, "r") if os.path.exists(H5_CACHE_PATH) else None
train = read['train'] if read is not None else None
# Private alias used by custom_mapper, so that rebinding the generic name
# ``train`` elsewhere in the notebook cannot break the mapper.
_h5_train_images = train


def custom_mapper(dataset_dict):
    """Turn one dataset dict into the training input format.

    The image is taken from the HDF5 cache when it is available there and is
    read with OpenCV from ``dataset_dict["file_name"]`` otherwise.  It is then
    resized with ``ResizeShortestEdge`` using the ``cfg.INPUT`` training sizes,
    and the annotations are transformed accordingly.

    Args:
        dataset_dict: a sample dict with ``file_name`` and ``annotations``.

    Returns:
        A dict with ``image`` (CHW tensor), ``instances``, and the original
        ``width`` / ``height`` of the image.

    Raises:
        FileNotFoundError: if the image is in neither the cache nor on disk.
    """
    # The caller's dict must stay untouched ("annotations" is popped below).
    dataset_dict = copy.deepcopy(dataset_dict)
    file_name = os.path.basename(dataset_dict["file_name"])

    if _h5_train_images is not None and file_name in _h5_train_images:
        image = _h5_train_images[file_name][()]
    else:
        image = cv2.imread(dataset_dict["file_name"])
        if image is None:
            raise FileNotFoundError(
                "Could not read image: {}".format(dataset_dict["file_name"]))

    height, width = image.shape[:2]

    auginput = T.AugInput(image)
    # Applying the augmentation modifies auginput in place and returns the
    # transform that has to be replayed on the annotations.
    transform = T.ResizeShortestEdge(
            cfg.INPUT.MIN_SIZE_TRAIN, cfg.INPUT.MAX_SIZE_TRAIN, cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING
        )(auginput)
    # HWC -> CHW
    image = torch.from_numpy(auginput.image.transpose(2, 0, 1))
    annos = [
        utils.transform_instance_annotations(annotation, [transform], image.shape[1:])
        for annotation in dataset_dict.pop("annotations")
    ]

    return {
        "image": image,
        "instances": utils.annotations_to_instances(annos, image.shape[1:]),
        "width": width,
        "height": height,
    }

class PlaneDetectionTrainer(DefaultTrainer):
    """DefaultTrainer that scores its test datasets with COCO metrics."""

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        """Create the COCO evaluator for ``dataset_name``.

        Args:
            cfg: the detectron2 config of the run.
            dataset_name: name of the registered dataset to evaluate.
            output_folder: where evaluation files are written; defaults to
                ``cfg.OUTPUT_DIR``.

        Returns:
            A ``COCOEvaluator`` bound to ``dataset_name``.
        """
        if output_folder is None:
            output_folder = cfg.OUTPUT_DIR
        os.makedirs(output_folder, exist_ok=True)
        # Evaluate the dataset that was requested instead of a hard-coded
        # name, so changing cfg.DATASETS.TEST keeps the metrics in sync.
        return COCOEvaluator(dataset_name, output_dir=output_folder)



In [None]:
import time
start = time.process_time()

### Training

In [None]:
import torch
torch.cuda.empty_cache()

trainer = PlaneDetectionTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()

In [None]:
print(time.process_time() - start)

### Evaluation and Visualization

In [None]:
%load_ext tensorboard
%tensorboard --logdir output/detection_baseline

In [None]:
'''
# After training the model, you need to update cfg.MODEL.WEIGHTS
# Define a DefaultPredictor
'''
final_model_path = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
cfg.MODEL.WEIGHTS = final_model_path
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.6
region_predictor = DefaultPredictor(cfg)


In [None]:
'''
# Visualize the output for 3 random test samples
# TODO: approx 10 lines
'''
from detectron2.utils.visualizer import ColorMode
import matplotlib.pyplot as plt

# Run the baseline detector on up to three random test images, show the
# predictions and save the rendered images.
dataset_dicts = get_detection_data("test")
SAMPLE_IMAGES_DIR = f"{OUTPUT_DIR}/detection_test_images_sample"
os.makedirs(SAMPLE_IMAGES_DIR, exist_ok=True)
# Never ask for more samples than exist (random.sample raises otherwise).
for d in random.sample(dataset_dicts, min(3, len(dataset_dicts))):
    im = cv2.imread(d["file_name"])
    outputs = region_predictor(im)  # format is documented at https://detectron2.readthedocs.io/tutorials/models.html#model-output-format
    # OpenCV loads BGR; the channel flip hands RGB to the Visualizer.
    v = Visualizer(im[:, :, ::-1],
                   metadata=plane_metadata,
                   scale=0.5,
                   instance_mode=ColorMode.IMAGE_BW   # grayscale rendering mode; its mask-based effect only applies to segmentation models
    )
    out = v.draw_instance_predictions(outputs["instances"].to("cpu"))
    rendered_rgb = out.get_image()
    # imshow expects RGB, which is what get_image() returns.
    plt.imshow(rendered_rgb, aspect="auto")
    plt.show()
    # cv2.imwrite expects BGR, hence the channel flip before saving.
    cv2.imwrite(f"{SAMPLE_IMAGES_DIR}/{os.path.basename(d['file_name'])}", rendered_rgb[:, :, ::-1])

In [None]:
'''
# Use COCOEvaluator and build_detection_train_loader
# You can save the output predictions using inference_on_dataset
# TODO: approx 5 lines
'''
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import build_detection_test_loader

evaluator = COCOEvaluator("data_detection_val", output_dir=OUTPUT_DIR)
val_loader = build_detection_test_loader(cfg, "data_detection_val")
inference_on_dataset(region_predictor.model, val_loader, evaluator)


### Improvements



In [None]:
import torch

from detectron2.data import DatasetMapper, build_detection_train_loader, transforms as T

# Improved detector: Faster R-CNN with a ResNeXt-101 32x8d + FPN backbone,
# trained longer than the baseline with a stepped learning-rate schedule.
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml"))
cfg.DATASETS.TRAIN = ("data_detection_train",)
cfg.DATASETS.TEST = ("data_detection_val",)
cfg.DATALOADER.NUM_WORKERS = 1
cfg.SOLVER.IMS_PER_BATCH = 2  
cfg.SOLVER.BASE_LR = 0.0025  
cfg.SOLVER.MAX_ITER = 3000    
cfg.SOLVER.STEPS = [1000, 2000]     # learning-rate schedule milestones (iterations)
cfg.SOLVER.CHECKPOINT_PERIOD = 100
# cfg.MODEL.DEVICE="cuda"
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml")

# Default seems to work better
# cfg.MODEL.PIXEL_MEAN = MEAN.tolist()
cfg.MODEL.PIXEL_STD = [57.375, 57.120, 58.395] #ImageNet std as mentioned in detectron2 docs
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512   # The "RoIHead batch size"; kept at 512 here (a smaller value such as 128 would train faster)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1  # only one class (plane). (see https://detectron2.readthedocs.io/tutorials/datasets.html#update-the-config-for-new-datasets)
cfg.TEST.EVAL_PERIOD = 100
# Checkpoints, logs and evaluation files of the improved run go here.
cfg.OUTPUT_DIR = "{}/output/final_detection".format(BASE_DIR)
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

torch.cuda.empty_cache()

class PlaneDetectionTrainer(DefaultTrainer):
    """DefaultTrainer with custom training augmentations and COCO evaluation."""

    @classmethod
    def build_train_loader(cls, cfg):
        """Build the training loader with crop, brightness and flip augmentations.

        Passing ``augmentations`` explicitly replaces the augmentation list
        the DatasetMapper would otherwise derive from ``cfg``.
        """
        return build_detection_train_loader(cfg,
                mapper = DatasetMapper(cfg, is_train=True, augmentations=[
                    T.RandomCrop(
                        crop_type="absolute_range",
                        crop_size=(512, 1024)
                    ),
                    T.RandomBrightness(0.9, 1.2),
                    T.RandomFlip(horizontal=True, vertical=False),
                    T.RandomFlip(horizontal=False, vertical=True)
                ]))

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        """Create the COCO evaluator for ``dataset_name``.

        Args:
            cfg: the detectron2 config of the run.
            dataset_name: name of the registered dataset to evaluate.
            output_folder: where evaluation files are written; defaults to
                ``cfg.OUTPUT_DIR``.

        Returns:
            A ``COCOEvaluator`` bound to ``dataset_name``.
        """
        if output_folder is None:
            output_folder = cfg.OUTPUT_DIR
        os.makedirs(output_folder, exist_ok=True)
        # Evaluate the dataset that was requested instead of a hard-coded
        # name, so changing cfg.DATASETS.TEST keeps the metrics in sync.
        return COCOEvaluator(dataset_name, output_dir=output_folder)


In [None]:
trainer = PlaneDetectionTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()

In [None]:
%load_ext tensorboard
%tensorboard --logdir output/final_detection

In [None]:
'''
# After training the model, you need to update cfg.MODEL.WEIGHTS
# Define a DefaultPredictor
'''
final_model_path = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
cfg.MODEL.WEIGHTS = final_model_path
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.6
region_predictor = DefaultPredictor(cfg)

In [None]:
'''
# Visualize the output for 3 random test samples
# TODO: approx 10 lines
'''
from detectron2.utils.visualizer import ColorMode
import matplotlib.pyplot as plt

# Run the improved detector on up to three random test images and show the
# predictions.
dataset_dicts = get_detection_data("test")
# Never ask for more samples than exist (random.sample raises otherwise).
for d in random.sample(dataset_dicts, min(3, len(dataset_dicts))):
    print(d["file_name"])
    im = cv2.imread(d["file_name"])
    outputs = region_predictor(im)  # format is documented at https://detectron2.readthedocs.io/tutorials/models.html#model-output-format
    # OpenCV loads BGR; the channel flip hands RGB to the Visualizer.
    v = Visualizer(im[:, :, ::-1],
                   metadata=plane_metadata,
                   scale=0.5,
                   instance_mode=ColorMode.IMAGE_BW   # grayscale rendering mode; its mask-based effect only applies to segmentation models
    )
    out = v.draw_instance_predictions(outputs["instances"].to("cpu"))
    # get_image() already returns RGB, which is what imshow expects.
    plt.imshow(out.get_image(), aspect="auto")
    plt.show()


In [None]:
'''
# Use COCOEvaluator and build_detection_train_loader
# You can save the output predictions using inference_on_dataset
# TODO: approx 5 lines
'''
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import build_detection_test_loader

evaluator = COCOEvaluator("data_detection_val", output_dir=OUTPUT_DIR)
val_loader = build_detection_test_loader(cfg, "data_detection_val")
inference_on_dataset(region_predictor.model, val_loader, evaluator)

In [None]:
# https://github.com/Mr-TalhaIlyas/EMPatches
# https://github.com/dovahcrow/patchify.py


def get_bbox_iou(bbox1, bbox2):
    """Intersection-over-union of two boxes given as ``[x, y, width, height]``.

    Args:
        bbox1: first box, ``[x, y, w, h]`` with (x, y) the top-left corner.
        bbox2: second box, same format.

    Returns:
        The IoU as a float in [0, 1]; 0.0 when both boxes have zero area.
    """
    # Use the builtins module explicitly: ``__builtins__`` is only a module in
    # ``__main__`` (it is a dict in imported modules), and this stays immune
    # to ``max``/``min`` being shadowed in the notebook namespace.
    import builtins

    a1, b1, w1, h1 = bbox1
    a2 = a1 + w1
    b2 = b1 + h1

    x1, y1, w2, h2 = bbox2
    x2 = x1 + w2
    y2 = y1 + h2

    # Box 1 corners: (a1, b1) - (a2, b2); box 2 corners: (x1, y1) - (x2, y2).
    intersection_w = builtins.max(0, builtins.min(a2, x2) - builtins.max(a1, x1))
    intersection_h = builtins.max(0, builtins.min(b2, y2) - builtins.max(b1, y1))

    intersection = intersection_w * intersection_h
    union = (h1 * w1) + (h2 * w2) - intersection

    # Degenerate (zero-area) boxes would otherwise divide by zero.
    if union <= 0:
        return 0.0
    return intersection / union


# Suppressing overlapping bounding boxes using greedy NMS
def supress_bounding_boxes(pred_boxes, confidence_scores, iou_threshold=0.50):
    """Greedy non-maximum suppression over corner-format boxes.

    Boxes are visited from highest to lowest score; a box is kept unless its
    IoU with an already kept box exceeds ``iou_threshold``.

    Args:
        pred_boxes: sequence of ``[x_min, y_min, x_max, y_max]`` boxes.
        confidence_scores: one score per box, same order as ``pred_boxes``.
        iou_threshold: overlap above which the lower-scored box is dropped.

    Returns:
        ``(final_preds, final_scores)``: the kept boxes and their scores,
        index-aligned and sorted by descending score.
    """
    final_preds = []
    final_scores = []
    order = np.argsort(confidence_scores)[::-1]
    for o in order:
        bbox = pred_boxes[o]
        score = confidence_scores[o]
        # get_bbox_iou works on [x, y, w, h], so convert from corner format.
        bbox_coco = [bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1]]

        insert = True
        for pred in final_preds:
            pred_coco = [pred[0], pred[1], pred[2] - pred[0], pred[3] - pred[1]]
            if get_bbox_iou(pred_coco, bbox_coco) > iou_threshold:
                insert = False
                break

        if insert:
            # Box and score are always appended together (this also covers
            # the very first box) so both lists stay index-aligned.
            final_preds.append(bbox)
            final_scores.append(score)
    return final_preds, final_scores

def calculate_patch_bboxes(
    image_height,
    image_width,
    patch_size = (800, 800),
    overlap = (0, 0),
) :
    """Tile an image with fixed-size patches and return their bounding boxes.

    Patches are laid out row by row.  A patch that would cross the right or
    bottom border is shifted back inside the image (so it overlaps its
    neighbour); when the image is smaller than a patch, the patch is clipped
    to the image.

    Args:
        image_height: height of the image in pixels.
        image_width: width of the image in pixels.
        patch_size: ``(patch_height, patch_width)``.
        overlap: ``(y_overlap, x_overlap)`` in pixels between consecutive
            patches; each value must be smaller than the patch size.

    Returns:
        A list of ``[x_min, y_min, x_max, y_max]`` boxes.

    Raises:
        ValueError: if the patch size is not positive or the overlap is not
            in ``[0, patch size)`` (either would make the tiling loop forever).
    """
    # Use the builtins module explicitly: ``__builtins__`` is only a module in
    # ``__main__``, and this stays immune to ``max``/``min`` being shadowed.
    import builtins

    patch_height, patch_width = patch_size
    y_overlap, x_overlap = overlap
    if patch_height <= 0 or patch_width <= 0:
        raise ValueError("patch_size must be positive, got {}".format(patch_size))
    if not (0 <= y_overlap < patch_height and 0 <= x_overlap < patch_width):
        raise ValueError("overlap must satisfy 0 <= overlap < patch_size, got {}".format(overlap))

    patch_bboxes = []
    y_max = y_min = 0
    while y_max < image_height:
        x_min = x_max = 0
        y_max = y_min + patch_height
        while x_max < image_width:
            x_max = x_min + patch_width
            if y_max > image_height or x_max > image_width:
                # Shift the patch back so it ends at the image border.
                xmax = builtins.min(image_width, x_max)
                ymax = builtins.min(image_height, y_max)
                xmin = builtins.max(0, xmax - patch_width)
                ymin = builtins.max(0, ymax - patch_height)
                patch_bboxes.append([xmin, ymin, xmax, ymax])
            else:
                patch_bboxes.append([x_min, y_min, x_max, y_max])
            x_min = x_max - x_overlap
        y_min = y_max - y_overlap
    return patch_bboxes


#### Evaluating patches

In [None]:
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

import time

# COCO-format ground truth written by the COCOEvaluator of the final run.
gt = "./output/final_detection/data_detection_val_coco_format.json"

with open(gt) as gt_file:
    gt_data = json.load(gt_file)
results = []
start = time.process_time()
for file in gt_data["images"]:
    img = cv2.imread(file["file_name"])
    all_predictions = []
    all_scores = []

    image_height, image_width = img.shape[0:2]
    patch_coordinates = calculate_patch_bboxes(image_height, image_width, (800, 800))
    for p in patch_coordinates:
        # Patch boxes are [x_min, y_min, x_max, y_max]; numpy indexes rows (y)
        # first and columns (x) second.
        x_min, y_min, x_max, y_max = p
        patch = img[y_min:y_max, x_min:x_max, :]
        output = region_predictor(patch)

        pred_boxes = output["instances"].pred_boxes
        confidence_scores = output["instances"].scores
        for patch_box, confidence_score in zip(pred_boxes, confidence_scores):
            bx0, by0, bx1, by1 = patch_box.tolist()
            # Shift from patch coordinates back to full-image coordinates.
            all_predictions.append([bx0 + x_min, by0 + y_min, bx1 + x_min, by1 + y_min])
            all_scores.append(confidence_score.item())

    # Patches can overlap, so the same plane may be detected twice.
    pred_boxes, scores = supress_bounding_boxes(all_predictions, all_scores)
    for (bx0, by0, bx1, by1), score in zip(pred_boxes, scores):
        results.append({
            "id": len(results),
            "image_id": file["id"],
            # COCO results use [x, y, width, height].
            "bbox": [bx0, by0, bx1 - bx0, by1 - by0],
            "category_id": 0,
            "score": score
        })

if results:
    gt_ = COCO(gt)
    dt_ = gt_.loadRes(results)
    cocoEval = COCOeval(gt_,dt_,"bbox")
    cocoEval.evaluate()
    cocoEval.accumulate()
    cocoEval.summarize()
else:
    # COCO.loadRes cannot handle an empty result list.
    print("No detections were produced; skipping COCO evaluation.")


In [None]:
'''
# Visualize the output for 3 random test samples
# TODO: approx 10 lines
'''
from detectron2.utils.visualizer import ColorMode
import matplotlib.pyplot as plt

# Patch-wise inference on up to three random test images: predict on every
# patch, merge the boxes with NMS and draw them on the full image.
dataset_dicts = get_detection_data("test")
SAMPLE_IMAGES_DIR = f"{OUTPUT_DIR}/detection_test_images_sample"
os.makedirs(SAMPLE_IMAGES_DIR, exist_ok=True)
# Never ask for more samples than exist (random.sample raises otherwise).
for d in random.sample(dataset_dicts, min(3, len(dataset_dicts))):
    img = cv2.imread(d["file_name"])

    all_predictions = []
    all_scores = []

    image_height, image_width = img.shape[0:2]
    patch_coordinates = calculate_patch_bboxes(image_height, image_width, (800, 800))
    for p in patch_coordinates:
        # Patch boxes are [x_min, y_min, x_max, y_max]; numpy indexes rows (y)
        # first and columns (x) second.
        x_min, y_min, x_max, y_max = p
        patch = img[y_min:y_max, x_min:x_max, :]
        output = region_predictor(patch)

        pred_boxes = output["instances"].pred_boxes
        confidence_scores = output["instances"].scores
        for patch_box, confidence_score in zip(pred_boxes, confidence_scores):
            bx0, by0, bx1, by1 = patch_box.tolist()
            # Shift from patch coordinates back to full-image coordinates.
            all_predictions.append([bx0 + x_min, by0 + y_min, bx1 + x_min, by1 + y_min])
            all_scores.append(confidence_score.item())

    # OpenCV loads BGR; the channel flip hands RGB to the Visualizer.
    visualizer = Visualizer(img[:, :, ::-1], metadata=plane_metadata, scale=0.5, instance_mode=ColorMode.IMAGE_BW)

    pred_boxes, _ = supress_bounding_boxes(all_predictions, all_scores)
    # Start from the plain image so there is something to show even when no
    # box survives (otherwise ``out`` would be undefined or stale).
    out = visualizer.get_output()
    for box in pred_boxes:
        out = visualizer.draw_box(box)
    plt.figure(figsize=(10,10))
    # get_image() already returns RGB, which is what imshow expects.
    plt.imshow(out.get_image(), aspect="auto")
    plt.show()

## Part 2: Semantic Segmentation

### Data Loader

In [None]:
'''
# Write a function that returns the cropped image and corresponding mask regarding the target bounding box
# idx is the index of the target bbox in the data
# high-resolution image could be passed or could be load from data['file_name']
# You can use the mask attribute of detectron2.utils.visualizer.GenericMask 
#     to convert the segmentation annotations to binary masks
# TODO: approx 10 lines
'''
from detectron2.utils.visualizer import GenericMask

def get_instance_sample(img, bbox, seg):
  """Crop one annotated instance and its binary mask out of an image.

  Args:
    img: the full image as an array whose first two axes are (height, width).
    bbox: the instance box as ``[x, y, width, height]``.
    seg: the segmentation annotation, converted to a full-size binary mask
      through ``GenericMask``.

  Returns:
    ``(obj_img, obj_mask)``: the image crop and the mask crop, both cut to
    the same rectangle, which is always at least one pixel wide and high.
  """
  x1, y1, w, h = bbox

  height, width = img.shape[0:2]
  binary_mask = GenericMask(seg, height, width).mask

  # Round to pixel indices, then clamp to the image.  Without clamping, a
  # negative start index wraps around in numpy and a box that rounds to zero
  # width or height gives an empty crop that cannot be resized later.
  left = round(x1)
  top = round(y1)
  right = left + round(w)
  bottom = top + round(h)

  x1 = int(np.clip(left, 0, width - 1))
  y1 = int(np.clip(top, 0, height - 1))
  x2 = int(np.clip(right, x1 + 1, width))
  y2 = int(np.clip(bottom, y1 + 1, height))

  obj_img = img[y1:y2, x1:x2]
  obj_mask = binary_mask[y1:y2, x1:x2]
  return obj_img, obj_mask

def get_transforms(mean=None, std=None):
  """Return the image transform pipeline, which only converts to a tensor.

  ``mean`` and ``std`` are accepted but not used by this pipeline.
  """
  # ToTensor converts the image to a tensor and switches it from
  # channels-last to channels-first.
  return transforms.Compose([transforms.ToTensor()])

In [None]:
# Per-channel mean and standard deviation of the training images, on pixel
# values scaled to [0, 1].  Channels are in OpenCV's BGR order.  Every image
# has the same weight regardless of its size.
train_set = DatasetCatalog.get("data_detection_train")
means = []
square_means = []
for sample in train_set:
    im = cv2.imread(sample["file_name"]).astype(float) / 255
    means.append(im.mean(axis=(0, 1)))
    square_means.append((im ** 2).mean(axis=(0, 1)))
MEAN = np.mean(means, axis=0)

# The image-averaged E[(x - MEAN)^2] equals mean(E[x^2]) - MEAN^2, so one pass
# over the images is enough.  Clamp at 0 to absorb rounding error.
variance = np.maximum(np.mean(square_means, axis=0) - MEAN ** 2, 0)
STD = np.sqrt(variance)

In [None]:
'''
# We have provided a template data loader for your segmentation training
# You need to complete the __getitem__() function before running the code
# You may also need to add data augmentation or normalization in here
'''
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms.functional as F

# import h5py
# import os

# read = h5py.File('data.h5', "r")

class PlaneDataset(Dataset):
  """Dataset of single-plane crops and their binary masks.

  Every annotation of every image becomes one sample.  Images are decoded
  once in ``__init__`` and kept in memory; crops are cut out, resized to
  128x128, randomly flipped/rotated and normalised in ``__getitem__``.
  """

  def __init__(self, set_name, data_list):
    """Index all annotated instances of ``data_list``.

    Args:
      set_name: name of the split (stored, not otherwise used here).
      data_list: list of sample dicts with ``file_name`` and ``annotations``
        (each annotation providing ``bbox`` and ``segmentation``).
    """
    self.set_name = set_name
    self.data = data_list
    # One [file_name, bbox, segmentation] entry per instance.
    self.instance_map = []
    # Decoded images, keyed by file name.
    self.images = {}
    for d in self.data:
      file_name = d["file_name"]
      # Membership test instead of "== None": comparing a cached numpy array
      # with None is elementwise and cannot be used as a condition.
      if file_name not in self.images:
        self.images[file_name] = cv2.imread(file_name)
      for annotation in d['annotations']:
        self.instance_map.append(
            [file_name, annotation["bbox"], annotation["segmentation"]])

  # You can change the value of length to a small number like 10 for debugging
  # of your training procedure and overfitting.
  # Make sure to use the correct length for the final training.
  def __len__(self):
    return len(self.instance_map)

  def numpy_to_tensor(self, img, mask):
    """Convert a crop and its mask to tensors with shared random augmentation.

    Args:
      img: image crop as a numpy array (channels-last).
      mask: binary mask as a 2-D numpy array.

    Returns:
      ``(img, mask)``: the normalised channels-first image tensor and the 2-D
      float mask tensor, both with the same flips and rotation applied.
    """
    # NOTE: F must be torchvision.transforms.functional here.
    img = F.to_tensor(img)
    mask = torch.tensor(mask, dtype=torch.float)
    # Adding a dummy channel dimension so the transforms accept the mask.
    mask = torch.unsqueeze(mask, 0)

    # Draw the random parameters once and apply them functionally so image
    # and mask receive exactly the same transformation.
    hflip_percent = random.random()
    vflip_percent = random.random()
    rotation_angle = random.choice([-180, -90, 0, 90, 180])

    if hflip_percent < 0.5:
      img = F.hflip(img)
      mask = F.hflip(mask)

    if vflip_percent < 0.5:
      img = F.vflip(img)
      mask = F.vflip(mask)

    if rotation_angle != 0:
      img = F.rotate(img, rotation_angle)
      mask = F.rotate(mask, rotation_angle)

    # Read the global statistics without assigning to the names MEAN / STD:
    # an assignment would make them local to this method and raise
    # UnboundLocalError on the first read.
    img = F.normalize(img, mean=MEAN.tolist(), std=STD.tolist())
    mask = torch.squeeze(mask, 0)

    return img, mask

  def __getitem__(self, idx):
    """Return the ``(image, mask)`` tensors of instance ``idx``.

    The instance is cropped with ``get_instance_sample`` and both the image
    and the mask are resized to 128x128 before augmentation.
    """
    if torch.is_tensor(idx):
      idx = idx.tolist()
    file_name, bbox, seg = self.instance_map[idx]
    img = self.images[file_name]

    obj_img, obj_mask = get_instance_sample(img, bbox, seg)
    # Nearest-neighbour interpolation keeps the mask strictly binary.
    img = cv2.resize(obj_img, (128, 128), interpolation=cv2.INTER_NEAREST)
    mask = cv2.resize(obj_mask, (128, 128), interpolation=cv2.INTER_NEAREST)
    img, mask = self.numpy_to_tensor(img, mask)
    return img, mask

def get_plane_dataset(set_name='train'):
    """Build the PlaneDataset for the registered ``data_detection_<set_name>`` split."""
    catalog_key = "data_detection_{}".format(set_name)
    return PlaneDataset(set_name, DatasetCatalog.get(catalog_key))
        
    
    # return loader, dataset

### Network

In [None]:
# import torch.nn.functional as F
import torch.nn as nn
import torch

'''
# convolution module as a template layer consists of conv2d layer, batch normalization, and relu activation
'''
class conv(nn.Module):
    """3x3 convolution (padding 1), optionally followed by BatchNorm and ReLU.

    Args:
        in_ch: number of input channels.
        out_ch: number of output channels.
        activation: when True, append BatchNorm2d and an in-place ReLU;
            when False, the block is the bare convolution.
    """

    def __init__(self, in_ch, out_ch, activation=True):
        super().__init__()
        stages = [nn.Conv2d(in_ch, out_ch, 3, padding=1)]
        if activation:
            stages.append(nn.BatchNorm2d(out_ch))
            stages.append(nn.ReLU(inplace=True))
        self.layer = nn.Sequential(*stages)

    def forward(self, x):
        return self.layer(x)

'''
# downsampling module equal to a conv module followed by a max-pool layer
'''
class down(nn.Module):
    """Downsampling block: a ``conv`` module followed by 2x2 max pooling."""

    def __init__(self, in_ch, out_ch):
        super().__init__()
        stages = [conv(in_ch, out_ch), nn.MaxPool2d(2)]
        self.layer = nn.Sequential(*stages)

    def forward(self, x):
        return self.layer(x)

'''
# upsampling module equal to a upsample function followed by a conv module
'''
class up(nn.Module):
    """Upsampling block: 2x upsampling followed by a ``conv`` module.

    Args:
        in_ch: number of input channels (kept unchanged by the upsampling).
        out_ch: number of channels produced by the convolution.
        bilinear: use bilinear interpolation when True, otherwise a
            stride-2 transposed convolution.
    """

    def __init__(self, in_ch, out_ch, bilinear=False):
        super().__init__()
        self.up = (
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
            if bilinear
            else nn.ConvTranspose2d(in_ch, in_ch, 2, stride=2)
        )
        self.conv = conv(in_ch, out_ch)

    def forward(self, x):
        return self.conv(self.up(x))

'''
# the main model which you need to complete by using above modules.
# you can also modify the above modules in order to improve your results.
'''
class MyModel(nn.Module):
    """U-Net style encoder/decoder producing a 1-channel logit map.

    Encoder: five stages of two `conv` blocks (32, 64, 128, 256, 512
    channels) with a 2x2 max-pool + dropout between stages.
    Decoder: four stages of bilinear 2x upsample + dropout, channel-wise
    concatenation with the matching encoder feature map, then two `conv`
    blocks. A final 1x1 convolution maps to a single channel of raw logits
    (no activation, as expected by BCEWithLogitsLoss).

    The input is pooled four times and upsampled four times, so height and
    width must be divisible by 16 for the skip concatenations to line up.
    """

    def __init__(self):
        super(MyModel, self).__init__()

        # NOTE: `drop_out` and `down` are not used in forward(). They are kept
        # only so that state_dict keys stay compatible with checkpoints that
        # were saved from this class.
        self.drop_out = nn.Dropout(p=0.2)

        # Encoder
        self.input_conv1 = conv(3, 32)
        self.input_conv2 = conv(32, 32)
        self.input_conv3 = conv(32, 64)
        self.input_conv4 = conv(64, 64)
        self.input_conv5 = conv(64, 128)
        self.input_conv6 = conv(128, 128)
        self.input_conv7 = conv(128, 256)
        self.input_conv8 = conv(256, 256)
        self.input_conv9 = conv(256, 512)
        self.input_conv10 = conv(512, 512)

        self.pool = nn.Sequential(
            nn.MaxPool2d(2, 2),
            nn.Dropout(p=0.2)
        )

        self.down = down(4, 8)

        # Decoder
        self.up = nn.Sequential(
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True),
            nn.Dropout(0.2)
        )
        self.output_conv1 = conv(512+256, 256)
        self.output_conv2 = conv(256, 256)
        self.output_conv3 = conv(256+128, 128)
        self.output_conv4 = conv(128, 128)
        self.output_conv5 = conv(128+64, 64)
        self.output_conv6 = conv(64, 64)
        self.output_conv7 = conv(64+32, 32)
        self.output_conv8 = conv(32, 32)

        # Final layer: no activation so the output stays as logits for the loss.
        self.final_conv = nn.Conv2d(32, 1, 1)

    def forward(self, input):
        # Encoder. The skip tensors are kept attached to the autograd graph:
        # detaching them (as `.clone().detach()` did) blocks the gradient
        # from flowing back to the encoder through the skip connections.
        y = self.input_conv1(input)
        y = self.input_conv2(y)
        concat_32 = y

        y = self.input_conv3(self.pool(y))
        y = self.input_conv4(y)
        concat_64 = y

        y = self.input_conv5(self.pool(y))
        y = self.input_conv6(y)
        concat_128 = y

        y = self.input_conv7(self.pool(y))
        y = self.input_conv8(y)
        concat_256 = y

        y = self.input_conv9(self.pool(y))
        y = self.input_conv10(y)

        # Decoder: upsample, concatenate the same-resolution encoder features
        # along the channel axis, then refine with two conv blocks.
        y = self.up(y)
        y = self.output_conv1(torch.cat([y, concat_256], 1))
        y = self.output_conv2(y)

        y = self.up(y)
        y = self.output_conv3(torch.cat([y, concat_128], 1))
        y = self.output_conv4(y)

        y = self.up(y)
        y = self.output_conv5(torch.cat([y, concat_64], 1))
        y = self.output_conv6(y)

        y = self.up(y)
        y = self.output_conv7(torch.cat([y, concat_32], 1))
        y = self.output_conv8(y)

        output = self.final_conv(y)

        return output

### Training

In [None]:
'''
# The following is a basic training procedure to train the network
# You need to update the code to get the best performance
# TODO: approx ? lines
'''
# Hyperparameters
num_epochs = 60
batch_size = 4
learning_rate = 5e-3
weight_decay = 1e-5
val_losses = []    # average per-batch validation loss, one float per epoch
train_losses = []  # average per-batch training loss, one float per epoch

model = MyModel().cuda()  # build the network and move it to the GPU

train_dataset = get_plane_dataset('train')
val_dataset = get_plane_dataset('val')

trainloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=4,
                         pin_memory=True, shuffle=True)
# Validation only averages the loss; a fixed order keeps epochs comparable.
valloader = DataLoader(val_dataset, batch_size=batch_size, num_workers=4,
                       pin_memory=True, shuffle=False)

crit = nn.BCEWithLogitsLoss()  # the model outputs raw logits
optim = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
# Learning rate is multiplied by 0.1 after epochs 10, 20 and 40.
scheduler = torch.optim.lr_scheduler.MultiStepLR(optim, milestones=[10, 20, 40], gamma=0.1)

# Checkpoints are written below; make sure the target directory exists.
os.makedirs('{}/output'.format(BASE_DIR), exist_ok=True)

# The context manager closes the log even if training is interrupted.
with open('segmentation_train.txt', 'w') as train_logs:
    for epoch in range(num_epochs):

        model.train()
        train_loss = 0.0
        for (img, mask) in tqdm(trainloader):
            img = img.cuda()
            mask = mask.cuda()
            pred = torch.squeeze(model(img), 1)  # (B, 1, H, W) -> (B, H, W)

            loss = crit(pred, mask)

            optim.zero_grad()
            loss.backward()
            optim.step()
            # .item() yields a plain float, so the logs and the loss lists
            # hold numbers rather than tensors.
            train_loss += loss.item()
        scheduler.step()

        val_loss = 0.0
        model.eval()
        with torch.no_grad():
            for (img, mask) in tqdm(valloader):
                img = img.cuda()
                mask = mask.cuda()
                pred = torch.squeeze(model(img), 1)
                val_loss += crit(pred, mask).item()

        avg_train_loss = train_loss / len(trainloader)
        avg_val_loss = val_loss / len(valloader)
        epoch_summary = f"Epoch: {epoch}, Avg Batch Train Loss: {avg_train_loss}, Avg Batch Validation Loss: {avg_val_loss}"
        print(epoch_summary)
        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)
        train_logs.write(epoch_summary + "\n")
        # Intermediate checkpoint every 10 epochs (0, 10, 20, ...).
        if (epoch % 10) == 0:
            torch.save(model.state_dict(), '{}/output/{}_segmentation_model.pth'.format(BASE_DIR, epoch))

# Save the final model
torch.save(model.state_dict(), '{}/output/final_segmentation_model.pth'.format(BASE_DIR))


In [None]:
import matplotlib.pyplot as plt

# Plot the per-epoch training and validation losses.
# The x axis is derived from the recorded history: a hard-coded epoch count
# makes plt.plot raise as soon as the number of trained epochs differs.
epochs = range(len(train_losses))

plt.figure(figsize=(10,10))
plt.plot(epochs, val_losses, label="Validation Loss")
plt.plot(epochs, train_losses, label="Training loss")
plt.xticks(ticks=range(0, len(train_losses) + 5, 5))
plt.xlabel("Epoch")
plt.ylabel("Average batch loss")
plt.legend()
plt.show()

### Evaluation and Visualization

In [None]:
'''
# Before starting the evaluation, you need to set the model mode to eval
# You may load the trained model again, in case if you want to continue your code later
# TODO: approx 15 lines
'''

# Rebuild the network, restore the final weights, and switch to evaluation
# mode (this freezes the batch-norm statistics and disables dropout).
segmentation_weights = '{}/output/final_segmentation_model.pth'.format(BASE_DIR)
model = MyModel().cuda()
model.load_state_dict(torch.load(segmentation_weights))
model.eval()

def get_ious(set_name):
    """Compute and report the mean per-image IoU of `model` on one split.

    Every image of the split is pushed through the global `model`; the logits
    are binarised and compared against the ground-truth mask. The image count
    and mean IoU are printed and written to ``log_iou_<Set_name>.txt``.

    set_name: split name passed to get_plane_dataset (e.g. "train", "val").
    """
    dataset = get_plane_dataset(set_name)
    batch_size = 32
    loader = DataLoader(dataset, batch_size=batch_size, num_workers=0,
                        pin_memory=False, shuffle=False)

    per_batch_ious = []
    with torch.no_grad():
        for (img, mask) in tqdm(loader):
            img = img.cuda()
            mask = mask.cuda()
            pred = torch.squeeze(model(img), 1)
            # The cut-off is the midpoint of the logit range of the whole batch.
            # NOTE(review): this makes each image's mask depend on the other
            # images in its batch; a fixed logit threshold of 0 (sigmoid 0.5)
            # may be what is wanted - confirm before changing reported numbers.
            threshold = (torch.max(pred) + torch.min(pred)) / 2
            pred = (pred >= threshold)
            intersection = torch.logical_and(mask, pred).sum(dim=(1, 2))
            union = torch.logical_or(mask, pred).sum(dim=(1, 2))
            # An empty union (empty prediction and empty ground truth) would
            # give 0/0 = NaN and poison the mean; count it as a perfect match.
            batch_ious = intersection / union.clamp(min=1)
            batch_ious[union == 0] = 1.0
            per_batch_ious.append(batch_ious)

    img_count = sum(len(batch) for batch in per_batch_ious)
    avg_iou = torch.cat(per_batch_ious).mean().item() if img_count else float("nan")
    set_name = set_name.capitalize()
    print(f"{set_name} #images: {img_count}, Mean IoU: {avg_iou}")
    # The context manager closes the file; no explicit close() is needed.
    with open(f"log_iou_{set_name}.txt", "w") as writer:
        writer.write(f"{set_name}: # images: {img_count}, Mean IoU: {avg_iou}")
# Report the mean IoU on the training split, then on the validation split.
for split_name in ("train", "val"):
    get_ious(split_name)

In [None]:
'''
# Visualize 3 sample outputs
# TODO: approx 5 lines
'''
from torchvision.transforms import functional as F
from torchvision.utils import save_image
import matplotlib.pyplot as plt

model = MyModel().cuda()
model.load_state_dict(torch.load('{}/output/final_segmentation_model.pth'.format(BASE_DIR)))
model = model.eval()  # evaluation mode freezes the batch-norm layers and disables dropout

test_data = DatasetCatalog.get("data_detection_{}".format("test"))

# Show up to MAX_SAMPLES detected crops (left column) next to their predicted
# binary masks (right column).
MAX_SAMPLES = 3
num_displayed = 0
plt.figure(figsize=(10,15))
with torch.no_grad():
    for d in random.sample(test_data, MAX_SAMPLES):
        if num_displayed == MAX_SAMPLES:
            break
        img = cv2.imread(d["file_name"])  # OpenCV loads images in BGR order
        regions = region_predictor(img)["instances"].pred_boxes

        for region in regions:
            if num_displayed == MAX_SAMPLES:
                break
            x1, y1, x2, y2 = (int(v) for v in region.cpu().numpy())

            region_img = img[y1:y2, x1:x2]
            if region_img.size == 0:
                # Degenerate box: cv2.resize cannot handle an empty crop.
                continue
            region_img = cv2.resize(region_img, (256, 256), interpolation=cv2.INTER_NEAREST)
            plt.subplot(3, 2, 2 * num_displayed + 1)
            # Convert for display only; matplotlib expects RGB. The network
            # input below is left exactly as before.
            plt.imshow(cv2.cvtColor(region_img, cv2.COLOR_BGR2RGB))
            plt.title("Detected region")

            region_tensor = F.to_tensor(region_img).unsqueeze(0).cuda()
            pred = model(region_tensor)
            # Binarise at the midpoint of the logit range of this crop.
            threshold = (torch.max(pred) + torch.min(pred)) / 2
            pred = torch.squeeze(pred >= threshold)
            plt.subplot(3, 2, 2 * num_displayed + 2)
            plt.imshow(pred.cpu().numpy(), cmap="gray")
            plt.title("Predicted mask")
            num_displayed += 1
        

## Part 3: Instance Segmentation

In this part, you need to obtain the instance segmentation results for the test data by using the trained segmentation model in the previous part and the detection model in Part 1.

### Get Prediction

In [None]:
'''
# Define a new function to obtain the prediction mask by passing a sample data
# For this part, you need to use all the previous parts (predictor, get_instance_sample, data preprocessings, etc)
# It is better to keep everything (as well as the output of this funcion) on gpu as tensors to speed up the operations.
# pred_mask is the instance segmentation result and should have different values for different planes.
# TODO: approx 35 lines
'''

# Restore the trained segmentation network for instance-mask prediction.
# Evaluation mode freezes the batch-norm statistics and disables dropout.
segmentation_weights = '{}/output/final_segmentation_model.pth'.format(BASE_DIR)
model = MyModel().cuda()
model.load_state_dict(torch.load(segmentation_weights))
model.eval()


def get_prediction_mask(data):
    """Predict the instance-segmentation mask for one dataset record.

    The image is split into 800x800 patches, the detector
    (`region_predictor`) is run on every patch, the boxes are shifted back
    to image coordinates and filtered by `supress_bounding_boxes`. Each
    surviving box is cropped, resized to 128x128, segmented with `model`,
    and the binary result is pasted back with the instance id
    ``region_index + 1``.

    data: dict with a "file_name" key and, optionally, "annotations" (each
        with a "bbox" read here as x, y, w, h and a "segmentation").

    Returns (img, gt_mask, pred_mask) as CUDA tensors: the image as read by
    OpenCV, the binary ground-truth mask (all zero when no annotations are
    given), and the predicted mask holding 0 for background and a distinct
    integer value per detected instance.
    """
    # Local alias: the notebook binds the name `F` to different modules in
    # different cells, so do not rely on whichever cell ran last.
    from torchvision.transforms import functional as TF

    img = cv2.imread(data["file_name"])
    if img is None:
        raise FileNotFoundError("Could not read image: {}".format(data["file_name"]))

    image_height, image_width = img.shape[0:2]
    pred_mask = np.zeros((image_height, image_width))
    gt_mask = np.zeros((image_height, image_width))

    with torch.no_grad():
        all_predictions = []
        all_scores = []

        # Patches are indexed below as (x_start, x_end, y_start, y_end).
        patch_coordinates = calculate_patch_bboxes(image_height, image_width, (800, 800))
        for p in patch_coordinates:
            patch = img[p[2]: p[3], p[0]: p[1], :]
            output = region_predictor(patch)
            pred_boxes = output["instances"].pred_boxes
            confidence_scores = output["instances"].scores
            for patch_box, confidence_score in zip(pred_boxes, confidence_scores):
                patch_box = patch_box.tolist()
                # Shift the box from patch coordinates to image coordinates.
                all_predictions.append([patch_box[0] + p[0], patch_box[1] + p[2],
                                        patch_box[2] + p[0], patch_box[3] + p[2]])
                all_scores.append(confidence_score.item())

        regions, _ = supress_bounding_boxes(all_predictions, all_scores)

        for region_id, region in enumerate(regions):
            # Clip to the image so the crop and the paste-back shapes agree.
            x1 = min(max(int(round(region[0])), 0), image_width)
            y1 = min(max(int(round(region[1])), 0), image_height)
            x2 = min(max(int(round(region[2])), 0), image_width)
            y2 = min(max(int(round(region[3])), 0), image_height)
            if x2 <= x1 or y2 <= y1:
                # Degenerate box: nothing to segment, and cv2.resize would fail.
                continue

            region_img = cv2.resize(img[y1:y2, x1:x2], (128, 128), interpolation=cv2.INTER_NEAREST)
            region_tensor = TF.to_tensor(region_img).unsqueeze(0).cuda()
            pred = model(region_tensor)
            # Binarise at the midpoint of the logit range of this crop.
            threshold = (torch.max(pred) + torch.min(pred)) / 2
            pred = torch.squeeze(pred >= threshold)
            pred = cv2.resize(pred.cpu().numpy().astype('float32'), (x2 - x1, y2 - y1),
                              interpolation=cv2.INTER_LINEAR)
            # Linear interpolation produces fractional values on the border;
            # re-binarise so the mask only ever holds whole instance ids.
            foreground = pred >= 0.5
            # Write only the foreground pixels so the background of this box
            # does not erase instances already pasted by overlapping boxes.
            pred_mask[y1:y2, x1:x2][foreground] = region_id + 1

    # Ground truth is independent of the detections, so it is rasterised once
    # here - also when nothing was detected or the key is absent.
    for annotation in (data.get("annotations") or []):
        x, y, w, h = annotation["bbox"]
        gtx1 = round(x)
        gty1 = round(y)
        gtx2 = gtx1 + round(w)
        gty2 = gty1 + round(h)
        image_mask = GenericMask(annotation["segmentation"], image_height, image_width).mask
        region_mask = image_mask[gty1:gty2, gtx1:gtx2]
        # Union with what is already there so overlapping boxes do not clear
        # each other's pixels.
        gt_mask[gty1:gty2, gtx1:gtx2] = np.maximum(gt_mask[gty1:gty2, gtx1:gtx2], region_mask)

    img = torch.tensor(img, device=torch.device('cuda'))
    gt_mask = torch.tensor(gt_mask, device=torch.device('cuda'))
    pred_mask = torch.tensor(pred_mask, device=torch.device('cuda'))
    return img, gt_mask, pred_mask  # gt_mask is all zero when the ground truth is not given.


### Visualization and Submission

In [None]:
'''
# Visualise the output prediction as well as the GT Mask and Input image for a sample input
# TODO: approx 10 lines
'''
# Show the input image, ground-truth mask and predicted instance mask for one
# random training sample.
my_data_list = DatasetCatalog.get("data_detection_{}".format('train'))
sample = np.random.choice(my_data_list)

img, gt_mask, pred_mask = get_prediction_mask(sample)
plt.figure(figsize=(20, 15))

plt.subplot(1, 3, 1)
# The image comes from cv2.imread (BGR); convert so matplotlib shows true colours.
plt.imshow(cv2.cvtColor(img.cpu().numpy(), cv2.COLOR_BGR2RGB))
plt.title("Input image")
plt.subplot(1, 3, 2)
plt.imshow(gt_mask.cpu().numpy(), cmap="gray")
plt.title("Ground-truth mask")
plt.subplot(1, 3, 3)
plt.imshow(pred_mask.cpu().numpy(), cmap="gray")
plt.title("Predicted instance mask")
plt.show()

In [None]:
# Show the input image and predicted instance mask for three random test
# samples (one figure per sample).
my_data_list = DatasetCatalog.get("data_detection_{}".format('test'))
for _sample_index in range(3):
    sample = np.random.choice(my_data_list)

    img, _, pred_mask = get_prediction_mask(sample)
    plt.figure(figsize=(20, 15))

    plt.subplot(1, 2, 1)
    # The image comes from cv2.imread (BGR); convert so matplotlib shows true colours.
    plt.imshow(cv2.cvtColor(img.cpu().numpy(), cv2.COLOR_BGR2RGB))
    plt.title("Input image")
    plt.subplot(1, 2, 2)
    plt.imshow(pred_mask.cpu().numpy(), cmap="gray")
    plt.title("Predicted instance mask")
    plt.show()

In [None]:
'''
# ref: https://www.kaggle.com/rakhlin/fast-run-length-encoding-python
# https://www.kaggle.com/c/airbus-ship-detection/overview/evaluation
'''
def rle_encoding(x):
    '''
    Run-length encode a binary mask.

    x: pytorch tensor (any device), 1 - mask, 0 - background
    Returns the runs as a space-separated string "start length start length ..."
    where `start` is the index into the flattened (row-major) tensor exactly
    as produced here, or an empty list when the mask has no foreground pixel.
    '''
    dots = torch.where(torch.flatten(x.long()) == 1)[0]
    if len(dots) == 0:
        return []
    # Helper tensors follow the input's device instead of assuming CUDA, so
    # the function also works on CPU tensors.
    device = dots.device
    # Positions (within `dots`) where a new run starts: index 0, plus every
    # place where consecutive foreground indices are not adjacent.
    breaks = torch.where(dots[1:] != dots[:-1] + 1)[0] + 1
    run_starts = torch.cat((torch.tensor([0], device=device, dtype=torch.long), breaks))
    start_pixels = dots[run_starts]
    # Run lengths are the gaps between successive run starts (closed by len(dots)).
    boundaries = torch.cat((run_starts, torch.tensor([len(dots)], device=device, dtype=torch.long)))
    run_lengths = boundaries[1:] - boundaries[:-1]
    # Interleave as start, length, start, length, ...
    runs = torch.stack((start_pixels, run_lengths), dim=1).flatten().cpu().tolist()
    return ' '.join(str(i) for i in runs)

In [None]:
'''
# You need to upload the csv file on kaggle
# The speed of your code in the previous parts highly affects the running time of this part
'''

preddic = {"ImageId": [], "EncodedPixels": []}


def _append_predictions(samples, start=0):
    """Append one RLE row per predicted instance of samples[start:] to `preddic`.

    An image with no predicted instance contributes a single row with an
    empty encoding.
    """
    for i in tqdm(range(start, len(samples)), position=0, leave=True):
        sample = samples[i]
        # Image id = file name without directory and 4-character extension.
        sample['image_id'] = sample['file_name'].split("/")[-1][:-4]
        _, _, pred_mask = get_prediction_mask(sample)
        # Drop the background value explicitly: counting unique values would
        # mislabel an image fully covered by one instance as "empty".
        instance_ids = torch.unique(pred_mask)
        instance_ids = instance_ids[instance_ids != 0]
        if len(instance_ids) == 0:
            preddic['ImageId'].append(sample['image_id'])
            preddic['EncodedPixels'].append([])
            continue
        for index in instance_ids:
            encPix = rle_encoding(pred_mask == index)
            preddic['ImageId'].append(sample['image_id'])
            preddic['EncodedPixels'].append(encPix)


# Predictions for the training data (train + val splits).
# NOTE(review): the loop starts at sample 185, so the first 185 records are
# not written - confirm this offset is intended and not a leftover.
my_data_list = DatasetCatalog.get("data_detection_{}".format('train')) + DatasetCatalog.get("data_detection_{}".format('val'))
_append_predictions(my_data_list, start=185)

# Predictions for the test set.
my_data_list = DatasetCatalog.get("data_detection_{}".format('test'))
_append_predictions(my_data_list)

# The context manager closes the CSV even if serialisation fails.
with open("{}/pred.csv".format(BASE_DIR), 'w') as pred_file:
    pd.DataFrame(preddic).to_csv(pred_file, index=False)


## Part 4: Mask R-CNN

For this part, follow the same procedure as in Part 2, but with the Mask R-CNN configs; the other parts are generally the same as in Part 2.

### Data Loader

In [None]:
# dataset = DatasetCatalog.get("data_detection_train")
# from torch.utils import data
# total_len = len(dataset)
# train_len = int(0.85 * total_len)
# val_len = total_len - train_len
# train_dataset, val_dataset = data.random_split(dataset, [train_len, val_len])

## Reusing the same Dataloaders as in Part 1

### Network

In [None]:
# Baseline Mask R-CNN configuration: COCO-pretrained R50-FPN fine-tuned on the
# plane dataset.
MASK_RCNN_CONFIG = "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file(MASK_RCNN_CONFIG))

# Model: start from the COCO checkpoint; the dataset has a single class.
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(MASK_RCNN_CONFIG)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1

# Data
cfg.DATASETS.TRAIN = ("data_detection_train",)
cfg.DATASETS.TEST = ("data_detection_val",)
cfg.DATALOADER.NUM_WORKERS = 0

# Solver: 500 iterations at a constant learning rate (no decay steps).
cfg.SOLVER.IMS_PER_BATCH = 2  # images per training batch
cfg.SOLVER.BASE_LR = 0.00025
cfg.SOLVER.MAX_ITER = 500
cfg.SOLVER.STEPS = []

cfg.OUTPUT_DIR = "{}/output/maskrcnn".format(BASE_DIR)
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)




### Training

In [None]:
# Train Mask R-CNN with the stock detectron2 trainer.
maskrcnn_trainer = DefaultTrainer(cfg)
# resume=False: start from cfg.MODEL.WEIGHTS instead of resuming from a
# previous checkpoint in cfg.OUTPUT_DIR.
maskrcnn_trainer.resume_or_load(resume=False)
maskrcnn_trainer.train()

### Evaluation and Visualization

In [None]:
# Point the config at the weights written by the training run above.
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.6                             # test-time score threshold (0.7 was also tried)

# Predictor used by the Mask R-CNN evaluation and visualisation cells.
mask_predictor = DefaultPredictor(cfg)

In [None]:
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import build_detection_test_loader

# COCO-style evaluation of the Mask R-CNN model on the validation split.
# The model under test must be the Mask R-CNN predictor built above, not the
# Part 1 detector (`region_predictor`).
evaluator = COCOEvaluator("data_detection_val", output_dir=OUTPUT_DIR)
val_loader = build_detection_test_loader(cfg, "data_detection_val")
inference_on_dataset(mask_predictor.model, val_loader, evaluator)

In [None]:

from detectron2.utils.visualizer import ColorMode
import matplotlib.pyplot as plt

# Draw the Mask R-CNN predictions for three random test images.
dataset_dicts = get_detection_data("test")
SAMPLE_IMAGES_DIR = f"{OUTPUT_DIR}/detection_test_images_sample"
os.makedirs(SAMPLE_IMAGES_DIR, exist_ok=True)

picked_records = random.sample(dataset_dicts, 3)
for d in picked_records:
    im = cv2.imread(d["file_name"])
    # Output format: https://detectron2.readthedocs.io/tutorials/models.html#model-output-format
    outputs = mask_predictor(im)
    # The Visualizer is given the image with its channel order reversed.
    # IMAGE_BW removes the colours of unsegmented pixels (segmentation models only).
    v = Visualizer(
        im[:, :, ::-1],
        metadata=plane_metadata,
        scale=0.5,
        instance_mode=ColorMode.IMAGE_BW,
    )
    out = v.draw_instance_predictions(outputs["instances"].to("cpu"))
    rendered_flipped = out.get_image()[:, :, ::-1]
    plt.imshow(cv2.cvtColor(rendered_flipped, cv2.COLOR_BGR2RGB), aspect="auto")
    plt.show()

### Improvements

In [None]:
# Improved Mask R-CNN configuration: longer schedule with learning-rate decay.
MASK_RCNN_CONFIG = "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file(MASK_RCNN_CONFIG))

# Model: start from the COCO checkpoint; the dataset has a single class.
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(MASK_RCNN_CONFIG)
# NOTE(review): these are the ImageNet std values from the detectron2 config
# docs; confirm they are appropriate for this pretrained checkpoint.
cfg.MODEL.PIXEL_STD = [57.375, 57.120, 58.395]
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1

# Data
cfg.DATASETS.TRAIN = ("data_detection_train",)
cfg.DATASETS.TEST = ("data_detection_val",)
cfg.DATALOADER.NUM_WORKERS = 0

# Solver: 3000 iterations, with learning-rate decay steps at 1500 and 2500.
cfg.SOLVER.IMS_PER_BATCH = 2  # images per training batch
cfg.SOLVER.BASE_LR = 0.0005
cfg.SOLVER.MAX_ITER = 3000
cfg.SOLVER.STEPS = [1500, 2500]

cfg.OUTPUT_DIR = "{}/output/maskrcnn".format(BASE_DIR)
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

In [None]:
from detectron2.data import DatasetMapper, build_detection_train_loader, transforms as T

class PlaneDetectionTrainer(DefaultTrainer):
    """DefaultTrainer with training-time augmentation and a COCO evaluator."""

    @classmethod
    def build_train_loader(cls, cfg):
        """Return a train loader whose mapper applies random crop, brightness and flips."""
        augmentations = [
            T.RandomCrop(
                crop_type="absolute_range",
                crop_size=(512, 1024)
            ),
            T.RandomBrightness(0.9, 1.1),
            T.RandomFlip(horizontal=True, vertical=False),
            T.RandomFlip(horizontal=False, vertical=True)
        ]
        mapper = DatasetMapper(cfg, is_train=True, augmentations=augmentations)
        return build_detection_train_loader(cfg, mapper=mapper)

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        """Return a COCOEvaluator for `dataset_name`.

        Results go to `output_folder`, defaulting to cfg.OUTPUT_DIR (created
        if missing).
        """
        if output_folder is None:
            output_folder = cfg.OUTPUT_DIR
            os.makedirs(output_folder, exist_ok=True)
        # Evaluate the dataset that was asked for rather than a hard-coded
        # split; with cfg.DATASETS.TEST = ("data_detection_val",) this is the
        # same dataset as before.
        return COCOEvaluator(dataset_name, output_dir=output_folder)

In [None]:
# Train Mask R-CNN with the augmented trainer defined above.
maskrcnn_trainer = PlaneDetectionTrainer(cfg)
# resume=False: start from cfg.MODEL.WEIGHTS instead of resuming from a
# previous checkpoint in cfg.OUTPUT_DIR.
maskrcnn_trainer.resume_or_load(resume=False)
maskrcnn_trainer.train()

In [None]:
%load_ext tensorboard
%tensorboard --logdir output/mask_rcnn

In [None]:
# Point the config at the weights written by the training run above.
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.01                            # test-time score threshold (0.7 was also tried)

# Predictor used by the evaluation and visualisation cells that follow.
mask_predictor = DefaultPredictor(cfg)

In [None]:
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import build_detection_test_loader
# NOTE(review): `mask_predictor` was built before this assignment, so the
# threshold change presumably does not reach `mask_predictor.model` - confirm,
# and rebuild the predictor if a threshold of 0 is wanted for evaluation.
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0
# COCO-style evaluation of the improved Mask R-CNN model on the validation split.
evaluator = COCOEvaluator("data_detection_val", output_dir=OUTPUT_DIR)
val_loader = build_detection_test_loader(cfg, "data_detection_val")
inference_on_dataset(mask_predictor.model, val_loader, evaluator)

In [None]:

from detectron2.utils.visualizer import ColorMode
import matplotlib.pyplot as plt

# Draw the improved Mask R-CNN predictions for three random test images.
dataset_dicts = get_detection_data("test")
SAMPLE_IMAGES_DIR = f"{OUTPUT_DIR}/detection_test_images_sample"
os.makedirs(SAMPLE_IMAGES_DIR, exist_ok=True)

picked_records = random.sample(dataset_dicts, 3)
for d in picked_records:
    im = cv2.imread(d["file_name"])
    # Output format: https://detectron2.readthedocs.io/tutorials/models.html#model-output-format
    outputs = mask_predictor(im)
    # The Visualizer is given the image with its channel order reversed.
    # IMAGE_BW removes the colours of unsegmented pixels (segmentation models only).
    v = Visualizer(
        im[:, :, ::-1],
        metadata=plane_metadata,
        scale=0.5,
        instance_mode=ColorMode.IMAGE_BW,
    )
    out = v.draw_instance_predictions(outputs["instances"].to("cpu"))
    rendered_flipped = out.get_image()[:, :, ::-1]
    plt.imshow(cv2.cvtColor(rendered_flipped, cv2.COLOR_BGR2RGB), aspect="auto")
    plt.show()

In [None]:
# Marker printed when the notebook has run to completion.
print("End of Project 3")