# Multi-Class Object Detection using Faster RCNN

In [None]:
# # install dependencies: (use cu101 because colab has CUDA 10.1)
# !pip install -U torch==1.5 torchvision==0.6 -f https://download.pytorch.org/whl/cu101/torch_stable.html 
# !pip install cython pyyaml==5.1
# !pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
# import torch, torchvision
# print(torch.__version__, torch.cuda.is_available())
# !gcc --version

In [None]:
# Check the PyTorch installation: print the installed version and whether CUDA is usable.
import torch, torchvision
print(torch.__version__, torch.cuda.is_available())
# NOTE(review): the disabled check below expects torch 1.9, while the commented-out
# install commands in this notebook pin torch==1.5 -- confirm which version is intended.
# assert torch.__version__.startswith("1.9")   # please manually install torch 1.9 if Colab changes its default version

In [None]:
# !pip install detectron2==0.1.3 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.5/index.html

In [None]:
# Assumption: matplotlib, numpy, opencv are installed

In [None]:
# Some basic setup:
# Setup detectron2 logger
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

# import some common libraries
import matplotlib.pyplot as plt
from natsort import natsorted
import numpy as np
import glob
import os, json, cv2, random
import pandas as pd
from tqdm import tqdm

# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog, DatasetCatalog

### Text Language Identification

In [None]:
# Display the image at the given URL inline (800x400); as the cell's last
# expression, the Image object is rendered as the cell output.
from IPython.display import Image
Image(url="https://miro.medium.com/max/2000/1*2c5DytyDMLlYma3nqYCbLg.png", width=800, height=400)

Here is the link to the dataset:

https://github.com/aakarsh7599/Text-Detection-using-Detectron2/blob/master/Custom_Dataset_Format_to_COCO_Format_Conversion.ipynb

(https://drive.google.com/file/d/1gZW8WiQz5UYPXo97nmcP7AI8dHH1yqPM/view?usp=sharing)

### Raw annotations in the dataset

* This dataset consists of 428 real images, stored in the `Images` folder; the annotation for each image is stored in the `Annotations` folder. Of these, 401 images are used for training and the remaining 27 images are used for validation.
* The annotation file for an image has the same name as the image, differing only in its extension. For example, if the image is named "1.jpg", then the corresponding annotation file is "1.txt". (Note that the conversion code below builds image file names with a `.jpeg` extension.)
* The annotation files are stored in the following format:
 *  The number of lines in an annotation text file equals the number of bounding boxes present in that image.
 *  Each line represents a single bounding box, in the format `x1, x2, x3, x4, y1, y2, y3, y4, Language`, where (x1, y1) is the top-left corner, (x2, y2) the top-right, (x3, y3) the bottom-right, and (x4, y4) the bottom-left.
 *  The points are listed in clockwise order, starting from the top-left point.

We must change this format to **COCO** format to make it compatible with detectron2, i.e. this form:

![](json_coco_annotation_example.png)

### Convert the raw annotations to COCO-friendly annotation format

In [None]:
# Language label -> class id. The ids are written to coco_records.json as
# strings, exactly as before; get_board_dicts casts them to int when loading.
cat_dict = {"HINDI": "0", "ENGLISH": "1", "OTHER": "2"}
header_list = ["x1", "x2", "x3", "x4", "y1", "y2", "y3", "y4", "category_id"]
record_columns = ["image_id", "file_name", "height", "width", "annotations"]


def _read_annotations(filepath):
    """Parse one annotation text file into a list of COCO-style box dicts.

    Each line of the file describes one box; the columns are named by
    ``header_list``. Every returned dict has the keys ``bbox``
    (``[x, y, width, height]``), ``bbox_mode`` (1, i.e. BoxMode.XYWH_ABS) and
    ``category_id`` (the label mapped through ``cat_dict``; labels that are not
    in ``cat_dict`` are passed through unchanged).

    An empty file yields an empty list (an image without boxes).
    """
    try:
        df = pd.read_csv(filepath, header=None, index_col=False, names=header_list)
    except pd.errors.EmptyDataError:
        return []

    # Box spanned by corners 1 and 3. Width/height are absolute differences, so
    # the origin must be the smaller coordinate of the two corners; otherwise a
    # line whose corners are swapped would get a box anchored at the wrong corner.
    boxes = pd.DataFrame({
        "x": df[["x1", "x3"]].min(axis=1),
        "y": df[["y1", "y3"]].min(axis=1),
        "width": (df["x1"] - df["x3"]).abs(),
        "height": (df["y1"] - df["y3"]).abs(),
    })

    # Replace the string labels with their class ids
    category_ids = df["category_id"].replace(cat_dict)

    return [
        {"bbox": bbox, "bbox_mode": 1, "category_id": category_id}
        for bbox, category_id in zip(boxes.to_numpy().tolist(), category_ids.tolist())
    ]


for split in ["Train", "Val"]:
    split_dir = f"data/Text Detection Dataset/{split}"

    # list the annotation text files (one file for each image)
    file_list = natsorted(glob.glob(f"{split_dir}/Annotations/*.txt"))

    # One plain dict per image; the DataFrame is built once at the end
    # (DataFrame.append was removed in pandas 2.0 and copied the frame on every call).
    records = []
    for image_id, filepath in enumerate(tqdm(file_list, desc=split)):
        # The image shares the annotation file's base name, with a .jpeg extension
        file_name = os.path.splitext(os.path.basename(filepath))[0] + ".jpeg"
        image_path = f"{split_dir}/Images/{file_name}"

        # cv2.imread signals failure by returning None instead of raising
        img = cv2.imread(image_path)
        if img is None:
            raise FileNotFoundError(
                f"Could not read image '{image_path}' for annotation file '{filepath}'"
            )
        height, width = img.shape[:2]

        records.append({
            "image_id": image_id,
            "file_name": file_name,
            "height": height,
            "width": width,
            "annotations": _read_annotations(filepath),
        })

    # `json_records` stays defined after the loop (holding the last split),
    # because a later cell inspects it.
    json_records = pd.DataFrame(records, columns=record_columns)
    json_records.to_json(f"{split_dir}/coco_records.json", orient="records")

In [None]:
# Gather the category id of every box in json_records (which, after the
# conversion loop, holds the last split processed) and show the distinct values.
all_ids = []
for image_boxes in json_records['annotations'].tolist():
    all_ids.extend(box['category_id'] for box in image_boxes)
pd.Series(all_ids).unique()

### Register the Datasets

In [None]:
# Reminder about a known problem in the raw data: some bbox lines carry more
# coordinate values than the format allows.
print("A few annotation text files have some mistakes, they have excessive coordinate values in the bbox lines (10 numbers instead of 8 numbers)")

In [None]:
from detectron2.structures import BoxMode

def get_board_dicts(imgdir, valid_category_ids=(0, 1, 2)):
    """Load the records of one dataset split in the form detectron2 consumes.

    Reads ``<imgdir>/coco_records.json``, rewrites every record's ``file_name``
    to the image path under ``<imgdir>/Images`` and, for every annotation, sets
    ``bbox_mode`` to ``BoxMode.XYWH_ABS`` and casts ``category_id`` to ``int``.

    Annotations whose category id is not in ``valid_category_ids`` are kept as
    they are, but a message naming the image and the offending id is printed so
    the faulty annotation line can be found and corrected.

    Args:
        imgdir: Directory of the split; must contain ``coco_records.json`` and
            an ``Images`` sub-directory.
        valid_category_ids: Category ids considered correct.

    Returns:
        The list of record dicts loaded from the JSON file, modified as
        described above.

    Raises:
        ValueError: If a ``category_id`` cannot be converted to ``int``.
    """
    json_file = os.path.join(imgdir, "coco_records.json")  # Fetch the json file
    with open(json_file, encoding="utf-8") as f:
        dataset_dicts = json.load(f)

    for record in dataset_dicts:
        record["file_name"] = os.path.join(imgdir, "Images", record["file_name"])
        for bbox in record["annotations"]:
            bbox["bbox_mode"] = BoxMode.XYWH_ABS  # Setting the required Box Mode
            try:
                bbox["category_id"] = int(bbox["category_id"])
            except ValueError as err:
                # Typically a label that was never mapped to a class id
                raise ValueError(
                    f"Non-numeric category id {bbox['category_id']!r} in {record['file_name']}"
                ) from err
            if bbox["category_id"] not in valid_category_ids:
                print(
                    f"Annotation mistake in {record['file_name']}: unexpected category id "
                    f"{bbox['category_id']} (the bbox line probably has excessive coordinate values)"
                )
    return dataset_dicts

from detectron2.data import DatasetCatalog, MetadataCatalog

# Registering the Datasets.
# DatasetCatalog.register rejects a name that is already registered, so names
# that already exist are skipped; this keeps the cell safe to re-run. The
# registered lambda looks up get_board_dicts when it is called, so a later
# redefinition of that function is still picked up.
for split in ["Train", "Val"]:
    dataset_name = f"board_language_{split}"
    if dataset_name not in DatasetCatalog.list():
        # `split=split` binds the current value; a plain closure would only see the last one
        DatasetCatalog.register(dataset_name, lambda split=split: get_board_dicts(f"data/Text Detection Dataset/{split}"))
    MetadataCatalog.get(dataset_name).set(thing_classes=["HINDI", "ENGLISH", "OTHER"])
board_metadata = MetadataCatalog.get("board_language_Train")

#### Visualise some examples

In [None]:
# Sanity check: overlay the ground-truth boxes on three randomly chosen training images
dataset_dicts = get_board_dicts("data/Text Detection Dataset/Train")
for d in random.sample(dataset_dicts, 3):
    bgr_image = cv2.imread(d["file_name"])
    # Reverse the channel axis before handing the image to the Visualizer
    rgb_image = bgr_image[:, :, ::-1]
    annotated = Visualizer(rgb_image, metadata=board_metadata, scale=0.5).draw_dataset_dict(d)
    plt.imshow(annotated.get_image())
    plt.show()

### Data Augmentation

In [None]:
# from detectron2.data import detection_utils as utils
# import detectron2.data.transforms as T
# import copy

# def custom_mapper(dataset_dict):
#     dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
#     image = utils.read_image(dataset_dict["file_name"], format="BGR")
#     transform_list = [
#         T.Resize((800,600)),
#         T.RandomBrightness(0.8, 1.8),
#         T.RandomContrast(0.6, 1.3),
#         T.RandomSaturation(0.8, 1.4),
#         T.RandomRotation(angle=[90, 90]),
#         T.RandomLighting(0.7),
#         T.RandomFlip(prob=0.4, horizontal=False, vertical=True),
#     ]
#     image, transforms = T.apply_transform_gens(transform_list, image)
#     dataset_dict["image"] = torch.as_tensor(image.transpose(2, 0, 1).astype("float32"))

#     annos = [
#         utils.transform_instance_annotations(obj, transforms, image.shape[:2])
#         for obj in dataset_dict.pop("annotations")
#         if obj.get("iscrowd", 0) == 0
#     ]
#     instances = utils.annotations_to_instances(annos, image.shape[:2])
#     dataset_dict["instances"] = utils.filter_empty_instances(instances)
#     return dataset_dict

## Training

In [None]:
from detectron2.engine import DefaultTrainer
from detectron2.evaluation import COCOEvaluator
from detectron2.data import build_detection_test_loader, build_detection_train_loader

class CocoTrainer(DefaultTrainer):
    """DefaultTrainer whose evaluator is a COCOEvaluator."""

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        """Return a COCOEvaluator for ``dataset_name``.

        When ``output_folder`` is None, the "coco_eval" directory is created
        (if it does not exist yet) and used as the evaluator's output directory.
        """
        if output_folder is None:
            output_folder = "coco_eval"
            os.makedirs(output_folder, exist_ok=True)
        return COCOEvaluator(dataset_name, cfg, False, output_dir=output_folder)
    
#     # Data Augmentation overloader
#     @classmethod
#     def build_train_loader(cls, cfg):
#         return build_detection_train_loader(cfg, mapper=custom_mapper)

#### Set the Detectron Configurations

In [None]:
from detectron2.engine import DefaultTrainer
from detectron2.config import get_cfg
import os

# A single model-zoo entry supplies both the base configuration and the
# pretrained weights that training starts from
zoo_entry = "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file(zoo_entry))
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(zoo_entry)

# Data: train and validation sets, and the number of data-loading workers
cfg.DATASETS.TRAIN = ("board_language_Train",)
cfg.DATASETS.TEST = ("board_language_Val",)
cfg.DATALOADER.NUM_WORKERS = 4

# Solver
cfg.SOLVER.IMS_PER_BATCH = 4
cfg.SOLVER.BASE_LR = 0.0125
cfg.SOLVER.MAX_ITER = 1500

# ROI heads: three classes (HINDI, ENGLISH, OTHER)
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 256
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 3

# Evaluation period, then make sure the output directory exists
cfg.TEST.EVAL_PERIOD = 500
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

In [None]:
# Build the trainer from cfg, load the starting weights and run training.
# NOTE(review): resume=False presumably starts from cfg.MODEL.WEIGHTS instead of
# the last checkpoint in cfg.OUTPUT_DIR -- confirm against the detectron2 docs.
trainer = CocoTrainer(cfg) 
trainer.resume_or_load(resume=False)
trainer.train()

In [None]:
%load_ext tensorboard
%tensorboard --logdir output

### Inference on example images

In [None]:
from detectron2.utils.visualizer import ColorMode

# Inference settings: weights saved as model_final.pth in the output directory,
# a 0.8 score threshold, and the validation set as the test dataset
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.8
cfg.DATASETS.TEST = ("board_language_Val", )

predictor = DefaultPredictor(cfg)

# Predict on three random validation images and draw the detections
dataset_dicts = get_board_dicts("data/Text Detection Dataset/Val")
for d in random.sample(dataset_dicts, 3):
    im = cv2.imread(d["file_name"])
    outputs = predictor(im)
    instances = outputs["instances"].to("cpu")
    drawer = Visualizer(im[:, :, ::-1], metadata=board_metadata, scale=0.8, instance_mode=ColorMode.IMAGE)
    rendered = drawer.draw_instance_predictions(instances)
    plt.imshow(rendered.get_image())
    plt.show()

In [None]:
# import the COCO Evaluator to use the COCO Metrics
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import build_detection_test_loader

# Evaluate the predictor's model on the validation set with the COCO evaluator
val_name = "board_language_Val"
evaluator = COCOEvaluator(val_name, cfg, False, output_dir="output/")
val_loader = build_detection_test_loader(cfg, val_name)

# Last expression of the cell, so the returned results are shown as its output
inference_on_dataset(predictor.model, val_loader, evaluator)