In [None]:
import glob
import shutil
import json
from pathlib import Path

from collections import Counter

import pandas as pd
import xml.etree.ElementTree as ET

import cv2
import numpy as np
from PIL import Image

# LabelImg to COCO format

LabelImg creates annotation files in the **Pascal VOC** format by default, while most recent
pipelines expect the labels in COCO format.

1. Pascal VOC format -> bounding-box coordinates are represented as `(left_top, right_bottom)` corner points.
2. The LabelImg tool produces the Pascal VOC format.
3. COCO expects all file names to be numeric.
4. COCO files

In [None]:
# Dataset directories. `template_test` is the folder whose XML annotations are
# parsed and renamed in the cells below.
# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
train_dir = "/home/haridas/projects/mystique/data/pic2card_dataset_nov_6/train/"
test_dir = "/home/haridas/projects/mystique/data/pic2card_dataset_nov_6/val/"
template_test = "/home/haridas/projects/mystique/data/train_and_test-2020-Jun-05-coco/templates_test_data_coco/"

In [None]:
# Parse a single annotation file and grab its <filename> element so it can be
# inspected and edited in the next cell.
tree = ET.parse(f"{template_test}/1.xml")
root = tree.getroot()
fn_child = root.find("filename")

In [None]:
fn_child.text = "1.jpg"

In [None]:
# print(ET.tostring(root).decode('utf8'))

In [None]:

def renamefn_to_intfn(data_dir, start=1000):
    """
    Rename every non-numeric annotation/image pair in a LabelImg
    (Pascal VOC) folder to consecutive integer file names and update the
    <filename> / <path> references inside the XML accordingly.

    Annotation files whose name is already numeric are left untouched.

    @param data_dir: Directory holding the Pascal VOC XML files generated by
                     LabelImg together with the images they reference.
    @param start: First integer file name to try. Numbers that are already
                  taken inside `data_dir` are skipped, so re-running the
                  function never overwrites previously renamed files.
    """
    pp = Path(data_dir)
    # Snapshot and sort the listing first: files are renamed while we iterate,
    # and sorting makes the numbering deterministic.
    for fn in sorted(glob.glob(f"{data_dir}/*.xml")):
        p = Path(fn)
        if p.stem.isdigit():
            continue

        root = ET.parse(fn).getroot()
        fn_child = root.find("filename")
        path_child = root.find("path")
        img_fn = fn_child.text

        img_path = pp / f"{img_fn}"
        assert img_path.exists(), f"image referenced by {p.name} not found: {img_path}"
        img_ext = img_fn.split(".")[-1]

        # Never clobber an existing annotation or image: advance to a free number.
        while (pp / f"{start}.xml").exists() or (pp / f"{start}.{img_ext}").exists():
            start += 1
        new_xml = pp / f"{start}.xml"
        new_img = pp / f"{start}.{img_ext}"

        # Update the file references stored inside the annotation.
        fn_child.text = new_img.name
        if path_child is not None:
            path_child.text = str(new_img)

        # Write the new annotation before touching the old files so a failure
        # here leaves the original pair intact.
        new_xml.write_bytes(ET.tostring(root))
        img_path.rename(new_img)
        p.unlink()

        start += 1

In [None]:
renamefn_to_intfn(template_test)

## Coco Category Check

Ensure the dataset has the correct labels and a consistent category-ID mapping across the train/val/test splits.

In [None]:
# COCO annotation JSON files for the train / val / test splits.
# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
train_ann_file = "/home/haridas/projects/mystique/data/pic2card_dataset_03_mar_2021/train_coco.json"
val_ann_file = "/home/haridas/projects/mystique/data/pic2card_dataset_03_mar_2021/val_coco.json"
test_ann_file = "/home/haridas/projects/mystique/data/pic2card_dataset_03_mar_2021/test_coco.json"

In [None]:
def bbox_aspect_ratio(bbox):
    """
    Width / height ratio of a bounding box in COCO convention.

    @param bbox: sequence `(x, y, width, height)`
    @return: `width / height`, or NaN for a degenerate box with zero height
             (so one bad annotation does not abort a whole `.apply`).
    """
    _, _, width, height = bbox
    if height == 0:
        return float("nan")
    return width / height


def bbox_area_distribution(ann_file):
    """
    Load the annotations of a COCO JSON file into a DataFrame and add an
    `aspect_ratio` column computed from each row's `bbox`.

    @param ann_file: path to a COCO annotation JSON file
    @return: DataFrame with one row per entry of `annotations`
    """
    # Context manager so the file handle is closed deterministically.
    with open(ann_file) as ann_fp:
        ann = json.load(ann_fp)
    bbox_areas = pd.DataFrame.from_records(ann["annotations"])
    bbox_areas["aspect_ratio"] = bbox_areas["bbox"].apply(bbox_aspect_ratio)
    return bbox_areas

def image_meta(ann_file):
    """
    Load the `images` section of a COCO JSON file into a DataFrame.

    @param ann_file: path to a COCO annotation JSON file
    @return: DataFrame with one row per entry of `images`
    """
    # Context manager so the file handle is closed deterministically.
    with open(ann_file) as ann_fp:
        ann = json.load(ann_fp)
    return pd.DataFrame.from_records(ann["images"])

    
# Build the per-annotation and per-image DataFrames for the train split.
bbox_area = bbox_area_distribution(train_ann_file)
train_image_df = image_meta(train_ann_file)

In [None]:
train_image_df.describe(percentiles=np.arange(0, 1, 0.1))

In [None]:
# Show the annotation row(s) whose `area` equals 458216.
# NOTE(review): hard-coded value, presumably picked from an earlier inspection — confirm.
bbox_area[bbox_area.area == 458216]

In [None]:
np.arange(0, 1, 0.1)

In [None]:
bbox_area.aspect_ratio.hist(bins=np.arange(0, 15, 1), xlabelsize=10, figsize=(20,10))
# bbox_area.aspect_ratio.plot(kind="pie", xticks=[0, 0.5, 0.1, 1.5], xlim=(0, 1.5))

In [None]:
# area_bbox.hist(column="area", bins=1000)

In [None]:
# area_bbox.hist?

In [None]:
def check_category_id(ann_file):
    """
    Print a quick sanity report for a COCO annotation file: the number of
    distinct image file names and the annotation count per
    `(category_id, category_name)`.

    A category id used by an annotation but missing from `categories` is
    reported with the name `None` instead of raising, so the inconsistency
    is visible in the output.

    @param ann_file: path to a COCO annotation JSON file
    @return: the `categories` list of the annotation file
    """
    # Context manager so the file handle is closed deterministically.
    with open(ann_file) as ann_fp:
        ann = json.load(ann_fp)
    cat_map = {cat["id"]: cat["name"] for cat in ann["categories"]}
    n_images = len({img["file_name"] for img in ann["images"]})
    print(f"Number of images: {n_images}")
    counts = Counter(a["category_id"] for a in ann["annotations"])
    print({(cat_id, cat_map.get(cat_id)): n for cat_id, n in counts.items()})
    return ann["categories"]

check_category_id(train_ann_file)

In [None]:
check_category_id(val_ann_file)

In [None]:
check_category_id(test_ann_file)

In [None]:
# `ann` only exists inside the helper functions above, so load the annotations
# explicitly here; otherwise this cell fails on a fresh kernel.
# NOTE(review): assumes the train split was the intended input — confirm.
with open(train_ann_file) as ann_fp:
    ann = json.load(ann_fp)
Counter(i["category_id"] for i in ann["annotations"])

# Label statistics

In [None]:
train_df = pd.read_csv("/home/haridas/projects/mystique/data/train_and_test-2020-Jun-05-coco/train_label.csv")

In [None]:
train_df.groupby("filename").count().describe()

## Morph font weight analysis

In [None]:
image = Image.open("/tmp/pic2card.png")

In [None]:
bbox_coord = (27.74326380342245, 58.549769282341, 284.94907945394516, 86.0945338010788)

In [None]:
cropped_img = image.crop(bbox_coord)

In [None]:
cropped_img

In [None]:
c_img = np.asarray(cropped_img)

In [None]:
# The array comes from a PIL image, whose channel order is RGB(A), so use the
# RGB conversion code (the BGR one would swap the red/blue weights).
gray = cv2.cvtColor(c_img, cv2.COLOR_RGB2GRAY)

In [None]:
Image.fromarray(gray)

In [None]:
# Inverse binary threshold: pixels with gray value <= 200 become 255, the rest 0.
_, bin_img = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)

In [None]:
Image.fromarray(bin_img)

In [None]:
# Number of non-zero (foreground) pixels in the binarised crop.
area_img = np.count_nonzero(bin_img)
area_img

In [None]:
# All-zero uint8 image with the same shape as the binary crop; used below as the
# skeleton accumulator.
skel_img = np.zeros(bin_img.shape, np.uint8)

In [None]:
kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))

In [None]:
Image.fromarray(bin_img)

In [None]:
# Pass the structuring element built above; a bare `(3, 3)` tuple is interpreted
# as kernel *data*, not as a 3x3 kernel size.
morph_open = cv2.morphologyEx(bin_img, cv2.MORPH_OPEN, kernel)

In [None]:
Image.fromarray(morph_open)

In [None]:
# Residue of the opening: pixels set in bin_img but not in morph_open.
tmp_img =cv2.subtract(bin_img, morph_open)

In [None]:
Image.fromarray(tmp_img)

In [None]:
eroded = cv2.erode(bin_img, kernel)

In [None]:
Image.fromarray(eroded)

In [None]:
Image.fromarray(skel_img)

In [None]:
# Merge this step's residue into the skeleton accumulator.
skel_img = cv2.bitwise_or(skel_img, tmp_img)

In [None]:
Image.fromarray(skel_img)

In [None]:
def get_weight(image: Image.Image, coords) -> tuple:
    """
    Estimate the stroke thickness ("weight") of the text inside a crop of
    the image by skeletonising it with morphological operations.

    @param image: input PIL image
    @param coords: crop box handed to `image.crop`
    @return: tuple `(thickness, images)`. `thickness` is the foreground
             area divided by the skeleton length, rounded to 2 decimals
             (0.0 when the crop contains no foreground pixels). `images`
             is the list of intermediate PIL images: the binarised crop
             followed by the accumulated skeleton after every iteration.
    """
    cropped_image = image.crop(coords)
    c_img = np.asarray(cropped_image)
    # NOTE: an earlier, disabled step rescaled crops whose height/width ratio
    # was below 1 before thresholding; it is intentionally not applied here.

    # PIL yields RGB(A) channel order, so convert with the RGB code; the BGR
    # code would swap the red/blue luminance weights.
    gray = cv2.cvtColor(c_img, cv2.COLOR_RGB2GRAY)
    # Inverse binary threshold: pixels <= 200 become foreground (255).
    _, img = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
    area_of_img = np.count_nonzero(img)

    # Empty skeleton accumulator and the 3x3 cross used for every morph step.
    skel = np.zeros(img.shape, np.uint8)
    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))

    images = [Image.fromarray(img)]
    # Repeatedly erode; whatever an opening would remove at each step belongs
    # to the skeleton. Stop once the image is completely eroded.
    while True:
        morph_open = cv2.morphologyEx(img, cv2.MORPH_OPEN, kernel)
        temp = cv2.subtract(img, morph_open)
        eroded = cv2.erode(img, kernel)
        skel = cv2.bitwise_or(skel, temp)
        img = eroded.copy()

        images.append(Image.fromarray(skel))
        if cv2.countNonZero(img) == 0:
            break

    # Skeleton pixels are 255, so sum / 255 is the length of the text lines.
    area_of_skel = np.sum(skel) / 255
    if area_of_skel == 0:
        # Blank crop: no strokes at all, avoid a 0/0 division.
        return 0.0, images
    # width of line = area of the line / length of the line
    thickness = round(area_of_img / area_of_skel, 2)
    return thickness, images

In [None]:
think, inter_images = get_weight(image, bbox_coord)

In [None]:
inter_images[0]

In [None]:
# `np.count_nonzero` needs an array argument; count the skeleton pixels of the
# last intermediate image.
# NOTE(review): intended argument is a guess — confirm.
np.count_nonzero(np.asarray(inter_images[-1]))

In [None]:
np.asarray(inter_images[-1])

In [None]:
np.count_nonzero(bin_img)

In [None]:
bin_img.shape

In [None]:
proc_img = np.asarray(inter_images[-1])

In [None]:
Image.fromarray(proc_img)

In [None]:
proc_img[proc_img.nonzero()]

## Image Augmentation with Albumentations