Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature] Add Dataset Preparer #1484

Merged
merged 20 commits into from
Nov 2, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dataset_zoo/wildreceipt/metafile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ Paper:
}
'
Data:
Website: https://github.com/cs-chan/Total-Text-Dataset
Website: https://download.openmmlab.com/mmocr/data/wildreceipt.tar
Language:
- English
Scene:
Expand Down
66 changes: 63 additions & 3 deletions mmocr/datasets/preparers/data_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class BaseDataConverter:
dumper (Dict): Config dict for dumping the dataset files.
nproc (int): Number of processes to process the data.
task (str): Task of the dataset.
dataset_name (str): Dataset name.
delete (Optional[List]): A list of files to be deleted after
conversion.
"""
Expand All @@ -37,17 +38,21 @@ def __init__(self,
dumper: Dict,
nproc: int,
task: str,
delete: Optional[List] = None):
dataset_name: str,
delete: Optional[List] = None,
config_path: str = 'configs/'):
xinke-wang marked this conversation as resolved.
Show resolved Hide resolved
assert isinstance(nproc, int) and nproc > 0, \
'nproc must be a positive integer.'
self.splits = splits
self.data_root = data_root
self.nproc = nproc
self.task = task
self.dataset_name = dataset_name
self.delete = delete
self.config_path = config_path
self.img_dir = f'{task}_imgs'
parser.update(dict(nproc=nproc))
dumper.update(dict(task=task))
dumper.update(dict(task=task, dataset_name=dataset_name))
self.parser = DATA_PARSERS.build(parser)
self.dumper = DATA_DUMPERS.build(dumper)
gather_type = gatherer.pop('type')
Expand All @@ -62,6 +67,7 @@ def __init__(self,
def __call__(self):
"""Process the data."""
# Convert and dump annotations to MMOCR format
dataset_config = dict()
for split in self.splits:
print(f'Parsing {split} split...')
# Gather the info such as file names required by parser
Expand All @@ -78,9 +84,49 @@ def __call__(self):
samples = track_parallel_progress(func, samples, nproc=self.nproc)
samples = self.add_meta(samples)
# Dump annotation files
self.dumper.dump(samples, self.data_root, split)
dataset_config[split] = self.dumper.dump(samples, self.data_root,
split)
self.generate_dataset_config(dataset_config)
self.clean()

def generate_dataset_config(self, dataset_config: Dict) -> None:
    """Generate the dataset config file for the prepared dataset.

    The dataset config is a python file that contains the dataset
    information: the data root variable plus one ``dict`` per split.

    Examples:
        Generated dataset config
        >>> ic15_rec_data_root = 'data/icdar2015/'
        >>> ic15_rec_train = dict(
        >>>     type='OCRDataset',
        >>>     data_root=ic15_rec_data_root,
        >>>     ann_file='textrecog_train.json',
        >>>     test_mode=False,
        >>>     pipeline=None)
        >>> ic15_rec_test = dict(
        >>>     type='OCRDataset',
        >>>     data_root=ic15_rec_data_root,
        >>>     ann_file='textrecog_test.json',
        >>>     test_mode=True,
        >>>     pipeline=None)

    Args:
        dataset_config (Dict): A dict that maps each split name to the
            dataset config string produced by the dumper for that split.
    """
    import os

    if self.task == 'kie':
        # KIE dataset configs are not supported yet.
        return
    cfg_path = osp.join(self.config_path, self.task, '_base_', 'datasets',
                        f'{self.dataset_name}.py')
    is_new_cfg = not osp.exists(cfg_path)
    # Create the parent directory so a fresh checkout does not crash.
    os.makedirs(osp.dirname(cfg_path), exist_ok=True)
    # Open the file once instead of once per split.
    with open(cfg_path, 'a') as f:
        if is_new_cfg:
            # The data root variable is referenced by every split config.
            f.write(f'{self.dataset_name}_{self.task}_data_root = '
                    f"'{self.data_root}'\n")
        for split in self.splits:
            cfg = dataset_config.get(split)
            # Some dumpers (e.g. the openset dumper) return no config
            # string; skip those instead of writing ``None``.
            if cfg:
                f.write(cfg)

@abstractmethod
def pack_instance(self, sample: Tuple, split: str) -> Dict:
"""Pack the parsed annotation info to an MMOCR format instance.
Expand Down Expand Up @@ -178,6 +224,7 @@ class TextDetDataConverter(BaseDataConverter):
gatherer (Dict): Config dict for gathering the dataset files.
parser (Dict): Config dict for parsing the dataset files.
dumper (Dict): Config dict for dumping the dataset files.
dataset_name (str): Name of the dataset.
nproc (int): Number of processes to process the data.
delete (Optional[List]): A list of files to be deleted after
conversion. Defaults to ['annotations'].
Expand All @@ -189,6 +236,7 @@ def __init__(self,
gatherer: Dict,
parser: Dict,
dumper: Dict,
dataset_name: str,
nproc: int,
delete: List = ['annotations']) -> None:
super().__init__(
Expand All @@ -197,6 +245,7 @@ def __init__(self,
gatherer=gatherer,
parser=parser,
dumper=dumper,
dataset_name=dataset_name,
nproc=nproc,
delete=delete,
task='textdet')
Expand Down Expand Up @@ -272,6 +321,7 @@ class TextSpottingDataConverter(BaseDataConverter):
gatherer (Dict): Config dict for gathering the dataset files.
parser (Dict): Config dict for parsing the dataset files.
dumper (Dict): Config dict for dumping the dataset files.
dataset_name (str): Name of the dataset.
nproc (int): Number of processes to process the data.
delete (Optional[List]): A list of files to be deleted after
conversion. Defaults to ['annotations'].
Expand All @@ -283,6 +333,7 @@ def __init__(self,
gatherer: Dict,
parser: Dict,
dumper: Dict,
dataset_name: str,
nproc: int,
delete: List = ['annotations']) -> None:
super().__init__(
Expand All @@ -291,6 +342,7 @@ def __init__(self,
gatherer=gatherer,
parser=parser,
dumper=dumper,
dataset_name=dataset_name,
nproc=nproc,
delete=delete,
task='textspotting')
Expand Down Expand Up @@ -368,6 +420,7 @@ class TextRecogDataConverter(BaseDataConverter):
gatherer (Dict): Config dict for gathering the dataset files.
parser (Dict): Config dict for parsing the dataset annotations.
dumper (Dict): Config dict for dumping the dataset files.
dataset_name (str): Name of the dataset.
nproc (int): Number of processes to process the data.
delete (Optional[List]): A list of files to be deleted after
conversion. Defaults to ['annotations'].
Expand All @@ -379,6 +432,7 @@ def __init__(self,
gatherer: Dict,
parser: Dict,
dumper: Dict,
dataset_name: str,
nproc: int,
delete: List = ['annotations']):
super().__init__(
Expand All @@ -387,6 +441,7 @@ def __init__(self,
gatherer=gatherer,
parser=parser,
dumper=dumper,
dataset_name=dataset_name,
nproc=nproc,
task='textrecog',
delete=delete)
Expand Down Expand Up @@ -436,6 +491,7 @@ class TextRecogCropConverter(TextRecogDataConverter):
gatherer (Dict): Config dict for gathering the dataset files.
parser (Dict): Config dict for parsing the dataset annotations.
dumper (Dict): Config dict for dumping the dataset files.
dataset_name (str): Name of the dataset.
nproc (int): Number of processes to process the data.
long_edge_pad_ratio (float): The ratio of padding the long edge of the
cropped image. Defaults to 0.1.
Expand All @@ -451,6 +507,7 @@ def __init__(self,
gatherer: Dict,
parser: Dict,
dumper: Dict,
dataset_name: str,
nproc: int,
long_edge_pad_ratio: float = 0.1,
short_edge_pad_ratio: float = 0.05,
Expand All @@ -461,6 +518,7 @@ def __init__(self,
gatherer=gatherer,
parser=parser,
dumper=dumper,
dataset_name=dataset_name,
nproc=nproc,
delete=delete)
self.ignore = self.parser.ignore
Expand Down Expand Up @@ -540,6 +598,7 @@ def __init__(self,
gatherer: Dict,
parser: Dict,
dumper: Dict,
dataset_name: str,
nproc: int,
delete: Optional[List] = None,
merge_bg_others: bool = False,
Expand All @@ -555,6 +614,7 @@ def __init__(self,
gatherer=gatherer,
parser=parser,
dumper=dumper,
dataset_name=dataset_name,
nproc=nproc,
task='kie',
delete=delete)
Expand Down
1 change: 1 addition & 0 deletions mmocr/datasets/preparers/data_obtainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class NaiveDataObtainer:
files (list[dict]): A list of file information.
cache_path (str): The path to cache the downloaded files.
data_root (str): The root path of the dataset.
task (str): The task of the dataset.
"""

def __init__(self, files: List[Dict], cache_path: str, data_root: str,
Expand Down
4 changes: 3 additions & 1 deletion mmocr/datasets/preparers/data_preparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def __init__(self,
cfg_path = osp.join(cfg_path, dataset_name)
self.nproc = nproc
self.task = task
self.dataset_name = dataset_name
self.parse_meta(cfg_path)
self.parse_cfg(cfg_path)

Expand Down Expand Up @@ -104,7 +105,8 @@ def parse_cfg(self, cfg_path: str) -> None:
cfg.data_obtainer.update(task=self.task)
self.data_obtainer = DATA_OBTAINERS.build(cfg.data_obtainer)
if 'data_converter' in cfg:
cfg.data_converter.update(dict(nproc=self.nproc))
cfg.data_converter.update(
dict(nproc=self.nproc, dataset_name=self.dataset_name))
self.data_converter = DATA_CONVERTERS.build(cfg.data_converter)

@property
Expand Down
34 changes: 31 additions & 3 deletions mmocr/datasets/preparers/dumpers/dumpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,29 +11,55 @@
@DATA_DUMPERS.register_module()
class JsonDumper:
    """Dump annotations to a json file and build the dataset config snippet.

    Args:
        task (str): Task name, used as the annotation file prefix
            (e.g. ``textdet_train.json``).
        dataset_name (str): Dataset name, used as the variable name prefix
            in the generated dataset config.
    """

    def __init__(self, task: str, dataset_name: str) -> None:
        self.task = task
        self.dataset_name = dataset_name

    def dump(self, data: Dict, data_root: str, split: str) -> str:
        """Dump data to json file and return its dataset config string.

        Args:
            data (Dict): Data to be dumped.
            data_root (str): Root directory of data.
            split (str): Split of data.

        Returns:
            str: String of dataset config.

        Examples:
            The returned dataset config
            >>> ic15_rec_train = dict(
            >>>     type='OCRDataset',
            >>>     data_root=ic15_rec_data_root,
            >>>     ann_file='textrecog_train.json',
            >>>     test_mode=False,
            >>>     pipeline=None)
        """
        dst_file = osp.join(data_root, f'{self.task}_{split}.json')
        mmengine.dump(data, dst_file)

        prefix = f'{self.dataset_name}_{self.task}'
        lines = [
            f'\n{prefix}_{split} = dict(',
            "    type='OCRDataset',",
            f'    data_root={prefix}_data_root,',
            f"    ann_file='{osp.basename(dst_file)}',",
        ]
        # Training detection splits filter out empty / tiny instances;
        # evaluation splits are flagged as test mode instead.
        if split == 'train' and self.task == 'textdet':
            lines.append(
                '    filter_cfg=dict(filter_empty_gt=True, min_size=32),')
        elif split in ['test', 'val']:
            lines.append('    test_mode=True,')
        lines.append('    pipeline=None)')
        return '\n'.join(lines) + '\n'


@DATA_DUMPERS.register_module()
class WildreceiptOpensetDumper:
    """Dump annotations to the openset txt format used by Wildreceipt.

    Args:
        task (str): Task name of the dataset.
    """

    def __init__(self, task: str) -> None:
        self.task = task

    def dump(self, data: List, data_root: str, split: str) -> None:
        """Dump data to ``openset_{split}.txt``.

        Args:
            data (List): Annotation lines to be dumped.
            data_root (str): Root directory of data.
            split (str): Split of data.

        Returns:
            None: No dataset config string is generated for openset KIE
            data, so ``None`` is returned (the return annotation was
            previously ``str``, which never matched the actual value).
        """
        list_to_file(osp.join(data_root, f'openset_{split}.txt'), data)
        return None
3 changes: 2 additions & 1 deletion mmocr/datasets/preparers/parsers/coco_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@ class COCOTextDetAnnParser(BaseParser):
"""COCO Text Detection Parser.

Args:
xinke-wang marked this conversation as resolved.
Show resolved Hide resolved
data_root (str): The root path of the dataset. Defaults to None.
nproc (int): The number of processes to parse the annotation. Defaults
to 1.
"""

def __init__(self, data_root=None, nproc: int = 1) -> None:
def __init__(self, data_root: str = None, nproc: int = 1) -> None:
    """Create the parser and forward options to the base parser.

    Args:
        data_root (str, optional): The root path of the dataset; may be
            ``None``. NOTE(review): the annotation reads ``str`` but the
            default is ``None`` — should be ``Optional[str]``; confirm the
            file imports ``Optional`` before changing.
        nproc (int): The number of processes to parse the annotation.
            Defaults to 1.
    """

    super().__init__(nproc=nproc, data_root=data_root)

Expand Down
14 changes: 8 additions & 6 deletions mmocr/datasets/preparers/parsers/icdar_txt_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class ICDARTxtTextDetAnnParser(BaseParser):
'utf-8-sig'.
nproc (int): The number of processes to parse the annotation. Defaults
to 1.
remove_flag (List[str], Optional): Used to remove redundant strings in
remove_strs (List[str], Optional): Used to remove redundant strings in
the transcription. Defaults to None.
"""

Expand All @@ -33,12 +33,12 @@ def __init__(self,
format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans',
encoding: str = 'utf-8-sig',
nproc: int = 1,
remove_flag: Optional[List[str]] = None) -> None:
remove_strs: Optional[List[str]] = None) -> None:
self.sep = separator
self.format = format
self.encoding = encoding
self.ignore = ignore
self.remove_flag = remove_flag
self.remove_strs = remove_strs
super().__init__(nproc=nproc)

def parse_file(self, file: Tuple, split: str) -> Tuple:
Expand All @@ -48,11 +48,13 @@ def parse_file(self, file: Tuple, split: str) -> Tuple:
for anno in self.loader(txt_file, self.sep, self.format,
self.encoding):
anno = list(anno.values())
if self.remove_strs is not None:
for flag in self.remove_strs:
for i in range(len(anno)):
if flag in anno[i]:
anno[i] = anno[i].replace(flag, '')
poly = list(map(float, anno[0:-1]))
text = anno[-1]
if self.remove_flag is not None:
for flag in self.remove_flag:
text = text.replace(flag, '')
instances.append(
dict(poly=poly, text=text, ignore=text == self.ignore))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def test_json_dumpers(self):
task_name='textdet',
category=[dict(id=0, name='text')]))

dumper = JsonDumper(task)
dumper = JsonDumper(task, dataset_name='test')
dumper.dump(fake_data, self.root.name, split)
with open(osp.join(self.root.name, f'{task}_{split}.json'), 'r') as f:
data = json.load(f)
Expand Down