mindee · fg-mindee · Jun 10, 2021 · Jun 5, 2021 · Jun 5, 2021 · Jun 5, 2021
diff --git a/doctr/datasets/cord.py b/doctr/datasets/cord.py
@@ -28,6 +28,7 @@ class CORD(VisionDataset):
     Args:
         train: whether the subset should be the training one
         sample_transforms: composable transformations that will be applied to each image
+        rotated_bbox: whether polygons should be considered as rotated bounding boxes (instead of straight ones)
         **kwargs: keyword arguments from `VisionDataset`.
     """
     TRAIN = ('https://github.com/mindee/doctr/releases/download/v0.1.1/cord_train.zip',
@@ -53,6 +54,7 @@ def __init__(
         self.train = train
         self.sample_transforms = sample_transforms
         for img_path in os.listdir(self.root):
+            # File existence check
             if not os.path.exists(os.path.join(self.root, img_path)):
                 raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}")
             stem = Path(img_path).stem
@@ -64,23 +66,25 @@ def __init__(
                         if len(word["text"]) > 0:
                             x = word["quad"]["x1"], word["quad"]["x2"], word["quad"]["x3"], word["quad"]["x4"]
                             y = word["quad"]["y1"], word["quad"]["y2"], word["quad"]["y3"], word["quad"]["y4"]
-                            if not rotated_bbox:
-                                # Reduce 8 coords to 4
-                                left, right = min(x), max(x)
-                                top, bot = min(y), max(y)
-                                _targets.append((word["text"], [left, top, right, bot]))
-                            else:
+                            if rotated_bbox:
                                 x, y, w, h, alpha = fit_rbbox(np.array([
                                     [x[0], y[0]],
                                     [x[1], y[1]],
                                     [x[2], y[2]],
                                     [x[3], y[3]],
-                                ], np.float32))
-                                _targets.append((word["text"], [x, y, w, h, alpha]))
+                                ], dtype=np.float32))
+                                box = [x, y, w, h, alpha]
+                            else:
+                                # Reduce 8 coords to 4
+                                box = [min(x), min(y), max(x), max(y)]
+                            _targets.append((word['text'], box))
 
             text_targets, box_targets = zip(*_targets)
 
-            self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np.int), labels=text_targets)))
+            self.data.append((
+                img_path,
+                dict(boxes=np.asarray(box_targets, dtype=np.int).clip(min=0), labels=text_targets)
+            ))
 
     def extra_repr(self) -> str:
         return f"train={self.train}"
diff --git a/doctr/datasets/detection.py b/doctr/datasets/detection.py
@@ -27,6 +27,7 @@ class DetectionDataset(AbstractDataset):
         img_folder: folder with all the images of the dataset
         label_folder: folder with all the corresponding labels (stem needs to be identical)
         sample_transforms: composable transformations that will be applied to each image
+        rotated_bbox: whether polygons should be considered as rotated bounding boxes (instead of straight ones)
     """
     def __init__(
         self,
@@ -40,17 +41,18 @@ def __init__(
 
         self.data: List[Tuple[str, Dict[str, Any]]] = []
         for img_path in os.listdir(self.root):
+            # File existence check
             if not os.path.exists(os.path.join(self.root, img_path)):
                 raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}")
             with open(os.path.join(label_folder, img_path + '.json'), 'rb') as f:
                 boxes = json.load(f)
             bboxes = np.asarray(boxes["boxes_1"] + boxes["boxes_2"] + boxes["boxes_3"], dtype=np.float32)
-            if not rotated_bbox:
-                # Switch to xmin, ymin, xmax, ymax
-                bboxes = np.concatenate((bboxes.min(axis=1), bboxes.max(axis=1)), axis=1)
-            else:
+            if rotated_bbox:
                 # Switch to rotated rects
                 bboxes = np.asarray([list(fit_rbbox(box)) for box in bboxes], dtype=np.float32)
+            else:
+                # Switch to xmin, ymin, xmax, ymax
+                bboxes = np.concatenate((bboxes.min(axis=1), bboxes.max(axis=1)), axis=1)
 
             is_ambiguous = [False] * (len(boxes["boxes_1"]) + len(boxes["boxes_2"])) + [True] * len(boxes["boxes_3"])
             self.data.append((img_path, dict(boxes=bboxes, flags=np.asarray(is_ambiguous))))

diff --git a/doctr/datasets/funsd.py b/doctr/datasets/funsd.py
@@ -27,6 +27,7 @@ class FUNSD(VisionDataset):
     Args:
         train: whether the subset should be the training one
         sample_transforms: composable transformations that will be applied to each image
+        rotated_bbox: whether polygons should be considered as rotated bounding boxes (instead of straight ones)
         **kwargs: keyword arguments from `VisionDataset`.
     """
 
@@ -53,6 +54,7 @@ def __init__(
         self.root = os.path.join(self._root, subfolder, 'images')
         self.data: List[Tuple[str, Dict[str, Any]]] = []
         for img_path in os.listdir(self.root):
+            # File existence check
             if not os.path.exists(os.path.join(self.root, img_path)):
                 raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}")
             stem = Path(img_path).stem

diff --git a/doctr/datasets/ocr.py b/doctr/datasets/ocr.py
@@ -24,6 +24,7 @@ class OCRDataset(AbstractDataset):
         img_folder: local path to image folder (all jpg at the root)
         label_file: local path to the label file
         sample_transforms: composable transformations that will be applied to each image
+        rotated_bbox: whether polygons should be considered as rotated bounding boxes (instead of straight ones)
         **kwargs: keyword arguments from `VisionDataset`.
     """
 
@@ -47,6 +48,7 @@ def __init__(
         for file_dic in data:
             # Get image path
             img_name = Path(os.path.basename(file_dic["raw-archive-filepath"])).stem + '.jpg'
+            # File existence check
             if not os.path.exists(os.path.join(self.root, img_name)):
                 raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")
 
@@ -60,15 +62,14 @@ def __init__(
             for box in file_dic["coordinates"]:
                 if rotated_bbox:
                     x, y, w, h, alpha = fit_rbbox(np.asarray(box, dtype=np.float32))
+                    box = [x, y, w, h, alpha]
                     is_valid.append(w > 0 and h > 0)
-                    if is_valid[-1]:
-                        box_targets.append([x, y, w, h, alpha])
                 else:
                     xs, ys = zip(*box)
                     box = [min(xs), min(ys), max(xs), max(ys)]
                     is_valid.append(box[0] < box[2] and box[1] < box[3])
-                    if is_valid[-1]:
-                        box_targets.append(box)
+                if is_valid[-1]:
+                    box_targets.append(box)
 
             text_targets = [word for word, _valid in zip(file_dic["string"], is_valid) if _valid]
             self.data.append((img_name, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets)))
diff --git a/doctr/datasets/recognition.py b/doctr/datasets/recognition.py
@@ -39,6 +39,7 @@ def __init__(
         with open(labels_path) as f:
             labels = json.load(f)
         for img_path in os.listdir(self.root):
+            # File existence check
             if not os.path.exists(os.path.join(self.root, img_path)):
                 raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}")
             label = labels.get(img_path)

diff --git a/doctr/datasets/sroie.py b/doctr/datasets/sroie.py
@@ -27,6 +27,7 @@ class SROIE(VisionDataset):
     Args:
         train: whether the subset should be the training one
         sample_transforms: composable transformations that will be applied to each image
+        rotated_bbox: whether polygons should be considered as rotated bounding boxes (instead of straight ones)
         **kwargs: keyword arguments from `VisionDataset`.
     """
 
@@ -55,6 +56,7 @@ def __init__(
         self.root = os.path.join(self._root, 'images')
         self.data: List[Tuple[str, Dict[str, Any]]] = []
         for img_path in os.listdir(self.root):
+            # File existence check
             if not os.path.exists(os.path.join(self.root, img_path)):
                 raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}")
             stem = Path(img_path).stem