In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
import os
from os import makedirs
from os.path import isfile, join
from pathlib import Path, PosixPath
import cv2
import bs4
from bs4 import BeautifulSoup
from shutil import copy2
from tqdm import tqdm
from tqdm.notebook import tqdm as ntqdm
from random import shuffle

from numpy.typing import ArrayLike
from typing import Any, Iterable, List, Tuple

# Notebook-wide default matplotlib figure size: [width, height] in inches.
plt.rcParams['figure.figsize'] = [20, 15]

In [2]:
"""
YOLO dataset layout: images and labels live in parallel train/val/test folders.

dataset.yaml
├── images
│   ├── test
│   ├── train
│   └── val
└── labels
    ├── test
    ├── train
    └── val
    

"""


class Bbox:
    def __init__(self, xyxy: list | tuple, label: int):
        self.x1 = int(xyxy[0])
        self.y1 = int(xyxy[1])
        self.x2 = int(xyxy[2])
        self.y2 = int(xyxy[3])
        self.label = label
        # Fixme: Adapt label to COCO format in future.
        self.pt1 = (self.x1, self.y1)
        self.pt2 = (self.x2, self.y2)
        self.width = abs(self.x2 - self.x1)
        self.height = abs(self.y2 - self.y1)

    def to_coco(self) -> List[int]:
        return [self.x1, self.y1, self.width, self.height]

    def to_yolo(self, img_width: int, img_height: int) -> List[float]:
        x_cen = self.x1 + self.width/2
        y_cen = self.y1 + self.height/2
        x_cen = x_cen / img_width
        y_cen = y_cen / img_height
        width = self.width / img_width
        height = self.height / img_height
        return [x_cen, y_cen, width, height]
    
    def draw(self, img: ArrayLike, color: tuple = (255, 0, 0)) -> ArrayLike:
        img = cv2.rectangle(img, self.pt1, self.pt2, color, 2)
        return img
    
class Segment:
    def __init__(self, input_annot: List[float | int], label: int):
        """
        Args:
            input_annot(list): Segmentation mask points in form of "x1 y1 x2 y2 ... xn yn"
            label(int): The digit denoting the label of the item.
        """
        self.input_annot = input_annot
        self.pts = self.chunk(input_annot)
        self.label = label
    
    def chunk(self, arr: List) -> List[ArrayLike]:
        return [np.array(arr[i:i + 2]).reshape((-1,1,2)).astype(np.int32) for i in range(0, len(arr), 2)]

    def segment_to_yolo(self, img_w, img_h):
        text = ""
        for i, p in enumerate(self.input_annot):
            if i % 2 == 0: # x points / img_w
                text += f" {p/img_w:.3f}"
            else: # y points / img_h
                text += f" {p/img_h:.3f}"
        return text
    
    def get_bbox(self) -> Bbox:
        Xs = [int(item) for i, item in enumerate(self.input_annot) if i%2 == 0]
        Ys = [int(item) for i, item in enumerate(self.input_annot) if i%2 == 1]
        x1, y1 = max(Xs), max(Ys)
        x2, y2 = min(Xs), min(Ys)
        bbox = Bbox([x1, y1, x2, y2], self.label)
        return bbox
    
    def draw(self, img: ArrayLike, color: tuple = (0, 255, 0), draw_bbox = True) -> ArrayLike:
        img = cv2.drawContours(img, self.pts, -1, color, 3)
        bbox: Bbox = self.get_bbox()
        if draw_bbox:
            img = bbox.draw(img, color=color)
        return img

    
class ImageAnnot:
    # Fixme: Adapt the framework for training bbox annotated project. right now it is decent for segmentation project.
    # Decouple Segment with Bbox.
    def __init__(self, img_path: str | PosixPath):
        """
        img_path(str | pathlib.PosixPath): image path
        
        """
        self.img_path = Path(img_path)
        assert self.img_path.is_file()
        self.name = self.img_path.name
        self.segments = []
        self.img_w, self.img_h = self.get_img_size()
    
    def get_img_size(self):
        img = cv2.imread(self.img_path.as_posix())
        return img.shape[1], img.shape[0]
        
    def add_segment(self, segment: Segment) -> None:
        self.segments.append(segment)

    def get_yolo_format(self, only_bboxes=True):
        """
        the first five numbers still encode the class index and 
        bounding box information. The rest of the numbers encode 
        the boundary of the object that we are trying to segment. 
        Starting from the 6th number, we have space-separated 
        x-y coordinates for each point on the boundary of the 
        object for the segmentation mask.
        
        """
        temp = ""
        for i, item in enumerate(self.segments):
            label = item.label
            x_cen, y_cen, w, h = item.get_bbox().to_yolo(self.img_w, self.img_h)
            temp = f"{label:.3f} {x_cen:.3f} {y_cen:.3f} {w:.3f} {h:.3f}"
            
            if not only_bboxes:
                segment_txt = item.segment_to_yolo(self.img_w, self.img_h)
                temp += segment_txt

            if i != len(self.segments)-1:
                temp += '\n'
        return temp

    def save_labels(self, save_path: str = "base_dir", train: bool = True, only_bboxes: bool = True):
        output = self.get_yolo_format(only_bboxes=only_bboxes)
        file_name = self.img_path.stem
        
        if train:
            img_path = join(save_path, 'images', 'train')
            label_path = join(save_path, 'labels', 'train')
        else:
            img_path = join(save_path, 'images', 'val')
            label_path = join(save_path, 'labels', 'val')

        makedirs(label_path, exist_ok=True)
        makedirs(img_path, exist_ok=True)
        
        label_path = join(label_path, file_name + '.txt')
        copy2(self.img_path.as_posix(), img_path)
        with open(label_path, 'w') as file:
            file.write(output)
        
    def get_coco_format(self):
        pass

    def img_show(self, color: tuple = (0, 255, 0)):
        img = cv2.imread(self.img_path.as_posix())
        img = cv2.cvtColor(img, 4)
        for segment in self.segments:
            img = segment.draw(img, color=color, draw_bbox=True)
        return img

In [6]:
# Extract the annotated video frames as images before processing their annotations.
# Covers both segmentation (polygon) and detection (bounding-box) labels in one pass.

"""
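Per-image annotation structure (JSON-style), keyed by image path:
{
    '/path/to/image_name.png':
        [
            {
                'bbox': [x1, y1, x2, y2],
                'segmentation': [x1, y1, x2, y2, ..., xn, yn],
                'label': 'ball'
            },
            {
                'bbox': [x1, y1, x2, y2],
                'segmentation': [x1, y1, x2, y2, ..., xn, yn],
                'label': 'ball'
            }
        ]
}

Note: cvat_prepare_images_annots below parses the CVAT XML export directly;
it does not read or write this JSON structure.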



"""

def cvat_prepare_images_annots(data_file: str, video_file: str, output_path: str, only_bboxes: bool = False):
    """Extract annotated frames from a video and write YOLO labels for them.

    Every ``<track>`` tag of the CVAT XML file that contains a ``<polygon>``
    is turned into one image plus one label file:

    * the frame is decoded from ``video_file`` and cached as a PNG under
      ``<output_path>/frames`` (existing PNGs are reused);
    * ``ImageAnnot.save_labels`` then copies it to
      ``<output_path>/images/train`` and writes the label to
      ``<output_path>/labels/train``.

    Frames are staged in their own folder so that ``output_path`` itself can
    be passed to ``save_labels`` as the dataset root. Passing the
    ``images/train`` folder instead would nest a second ``images/train`` and
    a ``labels/train`` tree inside it.

    Args:
        data_file(str): Path to the CVAT XML annotation file.
        video_file(str): Path to the annotated video.
        output_path(str): Root directory of the generated dataset.
        only_bboxes(bool): Forwarded to ``ImageAnnot.save_labels``.

    Raises:
        AssertionError: If ``data_file`` does not exist or the video cannot be opened.
        RuntimeError: If a required frame cannot be read from the video.
    """
    assert os.path.isfile(data_file), f"the file {data_file} does not exist"
    with open(data_file) as file:
        data = file.read()
    cap = cv2.VideoCapture(video_file)
    assert cap.isOpened(), "video is not accessible"
    frames_dir = join(output_path, 'frames')
    makedirs(frames_dir, exist_ok=True)

    try:
        # Read tags and extract the annotated frames.
        bs_data = BeautifulSoup(data, "xml")
        image_tags = bs_data.find_all('track')
        print(len(image_tags))
        annots_tags = [i for i in image_tags if i.polygon is not None]

        for img in tqdm(annots_tags, desc='generating the frames'):
            # NOTE(review): the track's `id` attribute is used as the frame
            # number. CVAT video exports normally store the frame index on
            # each shape's `frame` attribute -- confirm against the export.
            fno = int(img['id'])
            file_name = f'{Path(video_file).stem}_frame_{fno}'
            file_path = join(frames_dir, file_name + '.png')
            if not Path(file_path).is_file():
                cap.set(cv2.CAP_PROP_POS_FRAMES, fno)
                status, frame = cap.read()
                if not status:
                    raise RuntimeError(f"could not read frame {fno} from {video_file}")
                cv2.imwrite(file_path, frame)

            img_annot = ImageAnnot(file_path)
            # Convert to yolo
            polygons = [tag for tag in img.children if tag.name == 'polygon']
            for poly in polygons:
                # "x1,y1;x2,y2;..." -> [x1, y1, x2, y2, ...] truncated to int.
                points = poly['points']
                points = [int(float(item)) for item in points.replace(';', ' ').replace(',', ' ').split()]
                segment = Segment(points, label=1)
                img_annot.add_segment(segment)

            img_annot.save_labels(save_path=output_path, train=True, only_bboxes=only_bboxes)
    finally:
        # Always release the capture, even if a frame fails part-way through.
        cap.release()

# Same conversion for video 8 (currently disabled):
# cvat_prepare_images_annots(
#     data_file='../input/videos/train/annotations/8.xml',
#     video_file='../input/videos/train/videos/8.mp4',
#     output_path='../new/8',
#     only_bboxes=True,
# )

# Convert video 9: CVAT annotations + video -> frames and bbox-only labels.
data_file, video_file, output_path = (
    '../input/videos/train/annotations/9.xml',
    '../input/videos/train/videos/9.webm',
    '../new/9',
)

cvat_prepare_images_annots(
    data_file=data_file,
    video_file=video_file,
    output_path=output_path,
    only_bboxes=True,
)


178


generating the frames: 100%|█████████████████████████████████████████████████████████| 178/178 [00:27<00:00,  6.45it/s]


In [4]:
xml = '../input/videos/8.xml'
vid = '../input/videos/8.mp4'

with open(xml, 'r') as f:
    data = f.read()

# Parse the annotation file with BeautifulSoup's XML parser.
bs_data = BeautifulSoup(data, "xml")

# Collect every <image> tag and keep those that contain at least one <polygon>.
images = bs_data.find_all('image')
annotated_images = [i for i in images if i.polygon is not None]
print(len(images))
print(len(annotated_images))


# Inspect the first annotated image. XML attributes must be read with item
# access: `t.id` would look for a child <id> tag and evaluate to None.
t = annotated_images[0]
width = t['width']
height = t['height']
frame_id = t['id']
points = t.polygon['points']


5001
319


In [5]:
# Display the tag stored in `t` (the first annotated <image>) as raw XML.
t

<image height="1080" id="15050" name="frame_015050" width="1920">
<polygon label="ball" occluded="0" points="525.30,589.54;524.71,579.28;524.71,574.58;525.59,569.89;527.65,565.78;530.87,562.26;533.81,558.16;535.86,554.05;540.55,552.88;544.95,551.70;549.36,550.82;554.05,550.82;558.74,551.12;561.68,554.64;565.49,557.57;568.72,560.80;570.18,565.78;571.06,570.48;572.24,574.88;571.94,579.57;571.65,584.26;570.77,589.25;569.30,593.94;568.13,598.34;566.37,602.74;562.56,605.38;558.16,606.85;553.46,608.61;549.06,610.08;544.07,611.25;539.38,611.54;534.69,610.37;531.75,606.85;529.99,602.74;527.65,598.64" source="manual" z_order="0">
</polygon>
</image>

In [6]:
# Load the CVAT export at ../input/annotations.xml and report how many
# <image> tags exist versus how many of them carry a <polygon>.
annotations_path = Path('../input/annotations.xml')
data = annotations_path.read_text()

bs_data = BeautifulSoup(data, "xml")

images = bs_data.find_all('image')
annotated_images = [tag for tag in images if tag.polygon is not None]
for count in (len(images), len(annotated_images)):
    print(count)

3955
25


In [26]:
def text2pts(text_points):
    """Convert a points string such as ``"x1,y1;x2,y2"`` into a flat list of ints.

    Separators ``;`` and ``,`` are treated like whitespace; every number is
    parsed as a float and truncated to int. An empty string yields ``[]``.
    """
    # Iterate over the tokens themselves. Wrapping the split result in another
    # list would hand the whole token list to float() and raise TypeError.
    tokens = text_points.replace(';', ' ').replace(',', ' ').split()
    return [int(float(item)) for item in tokens]


def find_children(tag, name):
    """Return ``(name, label, points)`` for each direct child of ``tag`` whose tag name is ``name``.

    ``label`` and ``points`` are read from the child's attributes of the same
    names, so every matching child must define both.
    """
    return [(name, child['label'], child['points']) for child in tag.children if child.name == name]
# `image` is not defined anywhere else in this notebook, so bind it explicitly
# to the tag being inspected; otherwise this cell fails with NameError on a
# fresh kernel.
image = annotated_images[-1]
polygons = find_children(image, 'polygon')
# NOTE(review): find_children reads a `points` attribute from every match;
# confirm that <box> tags in this export actually carry one.
boxes = find_children(image, 'box')

[<polygon label="ball" occluded="0" points="466.10,131.80;463.98,129.67;461.85,127.73;460.79,125.25;460.61,122.60;461.68,120.12;463.09,117.82;465.21,116.05;467.51,114.46;470.52,113.58;473.53,114.29;476.00,115.52;478.13,117.65;480.42,119.42;481.49,121.89;481.13,124.54;479.19,126.67;476.36,127.73;473.53,127.55;470.87,127.73;470.17,130.38;468.04,132.33;465.39,132.50" source="manual" z_order="0">
 </polygon>,
 <polygon label="ball" occluded="0" points="467.52,186.20;460.19,184.37;453.58,182.90;445.15,181.80;437.81,179.23;432.31,175.93;427.54,171.17;426.81,164.56;430.11,159.43;435.98,155.76;442.58,154.29;450.65,152.46;456.15,152.09;463.49,152.09;470.09,153.19;476.32,153.19;482.93,152.83;490.63,152.46;496.50,152.46;504.20,152.83;509.70,155.03;515.20,157.96;519.24,162.00;521.80,167.86;522.17,174.83;521.07,180.70;515.94,186.20;508.23,189.50;502.00,192.07;495.76,193.91;490.26,194.64;484.03,196.11;478.16,196.11;472.29,193.17" source="manual" z_order="0">
 </polygon>]

In [28]:
# List every <polygon> tag found under the last annotated image.
list(annotated_images[-1].find_all('polygon'))

<image height="720" id="2651" name="frame_002651" width="1280">
<polygon label="ball" occluded="0" points="466.10,131.80;463.98,129.67;461.85,127.73;460.79,125.25;460.61,122.60;461.68,120.12;463.09,117.82;465.21,116.05;467.51,114.46;470.52,113.58;473.53,114.29;476.00,115.52;478.13,117.65;480.42,119.42;481.49,121.89;481.13,124.54;479.19,126.67;476.36,127.73;473.53,127.55;470.87,127.73;470.17,130.38;468.04,132.33;465.39,132.50" source="manual" z_order="0">
</polygon>
<polygon label="ball" occluded="0" points="467.52,186.20;460.19,184.37;453.58,182.90;445.15,181.80;437.81,179.23;432.31,175.93;427.54,171.17;426.81,164.56;430.11,159.43;435.98,155.76;442.58,154.29;450.65,152.46;456.15,152.09;463.49,152.09;470.09,153.19;476.32,153.19;482.93,152.83;490.63,152.46;496.50,152.46;504.20,152.83;509.70,155.03;515.20,157.96;519.24,162.00;521.80,167.86;522.17,174.83;521.07,180.70;515.94,186.20;508.23,189.50;502.00,192.07;495.76,193.91;490.26,194.64;484.03,196.11;478.16,196.11;472.29,193.17" source="ma

In [14]:
# Flatten the "x,y;x,y;..." points string of `t`'s first polygon into a list
# of integer coordinates (each value parsed as float, then truncated).
point_tokens = t.polygon['points'].replace(';', ' ').replace(',', ' ').split(' ')
xys = list(map(int, map(float, point_tokens)))
xys

[525,
 589,
 524,
 579,
 524,
 574,
 525,
 569,
 527,
 565,
 530,
 562,
 533,
 558,
 535,
 554,
 540,
 552,
 544,
 551,
 549,
 550,
 554,
 550,
 558,
 551,
 561,
 554,
 565,
 557,
 568,
 560,
 570,
 565,
 571,
 570,
 572,
 574,
 571,
 579,
 571,
 584,
 570,
 589,
 569,
 593,
 568,
 598,
 566,
 602,
 562,
 605,
 558,
 606,
 553,
 608,
 549,
 610,
 544,
 611,
 539,
 611,
 534,
 610,
 531,
 606,
 529,
 602,
 527,
 598]