Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature] Add Dataset Preparer #1484

Merged
merged 20 commits into from
Nov 2, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dataset_zoo/wildreceipt/metafile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ Paper:
}
'
Data:
Website: https://github.com/cs-chan/Total-Text-Dataset
Website: https://download.openmmlab.com/mmocr/data/wildreceipt.tar
Language:
- English
Scene:
Expand Down
66 changes: 63 additions & 3 deletions mmocr/datasets/preparers/data_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class BaseDataConverter:
dumper (Dict): Config dict for dumping the dataset files.
nproc (int): Number of processes to process the data.
task (str): Task of the dataset.
dataset_name (str): Dataset name.
delete (Optional[List]): A list of files to be deleted after
conversion.
"""
Expand All @@ -37,17 +38,21 @@ def __init__(self,
dumper: Dict,
nproc: int,
task: str,
delete: Optional[List] = None):
dataset_name: str,
delete: Optional[List] = None,
config_path: str = 'configs/'):
xinke-wang marked this conversation as resolved.
Show resolved Hide resolved
assert isinstance(nproc, int) and nproc > 0, \
'nproc must be a positive integer.'
self.splits = splits
self.data_root = data_root
self.nproc = nproc
self.task = task
self.dataset_name = dataset_name
self.delete = delete
self.config_path = config_path
self.img_dir = f'{task}_imgs'
parser.update(dict(nproc=nproc))
dumper.update(dict(task=task))
dumper.update(dict(task=task, dataset_name=dataset_name))
self.parser = DATA_PARSERS.build(parser)
self.dumper = DATA_DUMPERS.build(dumper)
gather_type = gatherer.pop('type')
Expand All @@ -62,6 +67,7 @@ def __init__(self,
def __call__(self):
"""Process the data."""
# Convert and dump annotations to MMOCR format
dataset_config = dict()
for split in self.splits:
print(f'Parsing {split} split...')
# Gather the info such as file names required by parser
Expand All @@ -78,9 +84,49 @@ def __call__(self):
samples = track_parallel_progress(func, samples, nproc=self.nproc)
samples = self.add_meta(samples)
# Dump annotation files
self.dumper.dump(samples, self.data_root, split)
dataset_config[split] = self.dumper.dump(samples, self.data_root,
split)
self.generate_dataset_config(dataset_config)
self.clean()

def generate_dataset_config(self, dataset_config: Dict) -> None:
    """Generate the dataset config file for the prepared dataset.

    The dataset config is a python file that contains the dataset
    information: the data root variable plus one ``dict`` per split.

    Examples:
        Generated dataset config
        >>> ic15_rec_data_root = 'data/icdar2015/'
        >>> ic15_rec_train = dict(
        >>>     type='OCRDataset',
        >>>     data_root=ic15_rec_data_root,
        >>>     ann_file='textrecog_train.json',
        >>>     test_mode=False,
        >>>     pipeline=None)
        >>> ic15_rec_test = dict(
        >>>     type='OCRDataset',
        >>>     data_root=ic15_rec_data_root,
        >>>     ann_file='textrecog_test.json',
        >>>     test_mode=True,
        >>>     pipeline=None)

    Args:
        dataset_config (Dict): A dict that maps each split name to the
            dataset config string produced by the dumper for that split.
    """
    import os

    if self.task == 'kie':
        # KIE dataset configs are not supported yet.
        return
    cfg_path = osp.join(self.config_path, self.task, '_base_', 'datasets',
                        f'{self.dataset_name}.py')
    is_new_cfg = not osp.exists(cfg_path)
    # Create the parent directory so a fresh checkout does not crash.
    os.makedirs(osp.dirname(cfg_path), exist_ok=True)
    # Open the file once instead of once per split.
    with open(cfg_path, 'a') as f:
        if is_new_cfg:
            # The data root variable is referenced by every split config.
            f.write(f'{self.dataset_name}_{self.task}_data_root = '
                    f"'{self.data_root}'\n")
        for split in self.splits:
            cfg = dataset_config.get(split)
            # Some dumpers (e.g. the openset dumper) return no config
            # string; skip those instead of writing ``None``.
            if cfg:
                f.write(cfg)

@abstractmethod
def pack_instance(self, sample: Tuple, split: str) -> Dict:
"""Pack the parsed annotation info to an MMOCR format instance.
Expand Down Expand Up @@ -178,6 +224,7 @@ class TextDetDataConverter(BaseDataConverter):
gatherer (Dict): Config dict for gathering the dataset files.
parser (Dict): Config dict for parsing the dataset files.
dumper (Dict): Config dict for dumping the dataset files.
dataset_name (str): Name of the dataset.
nproc (int): Number of processes to process the data.
delete (Optional[List]): A list of files to be deleted after
conversion. Defaults to ['annotations'].
Expand All @@ -189,6 +236,7 @@ def __init__(self,
gatherer: Dict,
parser: Dict,
dumper: Dict,
dataset_name: str,
nproc: int,
delete: List = ['annotations']) -> None:
super().__init__(
Expand All @@ -197,6 +245,7 @@ def __init__(self,
gatherer=gatherer,
parser=parser,
dumper=dumper,
dataset_name=dataset_name,
nproc=nproc,
delete=delete,
task='textdet')
Expand Down Expand Up @@ -272,6 +321,7 @@ class TextSpottingDataConverter(BaseDataConverter):
gatherer (Dict): Config dict for gathering the dataset files.
parser (Dict): Config dict for parsing the dataset files.
dumper (Dict): Config dict for dumping the dataset files.
dataset_name (str): Name of the dataset.
nproc (int): Number of processes to process the data.
delete (Optional[List]): A list of files to be deleted after
conversion. Defaults to ['annotations'].
Expand All @@ -283,6 +333,7 @@ def __init__(self,
gatherer: Dict,
parser: Dict,
dumper: Dict,
dataset_name: str,
nproc: int,
delete: List = ['annotations']) -> None:
super().__init__(
Expand All @@ -291,6 +342,7 @@ def __init__(self,
gatherer=gatherer,
parser=parser,
dumper=dumper,
dataset_name=dataset_name,
nproc=nproc,
delete=delete,
task='textspotting')
Expand Down Expand Up @@ -368,6 +420,7 @@ class TextRecogDataConverter(BaseDataConverter):
gatherer (Dict): Config dict for gathering the dataset files.
parser (Dict): Config dict for parsing the dataset annotations.
dumper (Dict): Config dict for dumping the dataset files.
dataset_name (str): Name of the dataset.
nproc (int): Number of processes to process the data.
delete (Optional[List]): A list of files to be deleted after
conversion. Defaults to ['annotations'].
Expand All @@ -379,6 +432,7 @@ def __init__(self,
gatherer: Dict,
parser: Dict,
dumper: Dict,
dataset_name: str,
nproc: int,
delete: List = ['annotations']):
super().__init__(
Expand All @@ -387,6 +441,7 @@ def __init__(self,
gatherer=gatherer,
parser=parser,
dumper=dumper,
dataset_name=dataset_name,
nproc=nproc,
task='textrecog',
delete=delete)
Expand Down Expand Up @@ -436,6 +491,7 @@ class TextRecogCropConverter(TextRecogDataConverter):
gatherer (Dict): Config dict for gathering the dataset files.
parser (Dict): Config dict for parsing the dataset annotations.
dumper (Dict): Config dict for dumping the dataset files.
dataset_name (str): Name of the dataset.
nproc (int): Number of processes to process the data.
long_edge_pad_ratio (float): The ratio of padding the long edge of the
cropped image. Defaults to 0.1.
Expand All @@ -451,6 +507,7 @@ def __init__(self,
gatherer: Dict,
parser: Dict,
dumper: Dict,
dataset_name: str,
nproc: int,
long_edge_pad_ratio: float = 0.1,
short_edge_pad_ratio: float = 0.05,
Expand All @@ -461,6 +518,7 @@ def __init__(self,
gatherer=gatherer,
parser=parser,
dumper=dumper,
dataset_name=dataset_name,
nproc=nproc,
delete=delete)
self.ignore = self.parser.ignore
Expand Down Expand Up @@ -540,6 +598,7 @@ def __init__(self,
gatherer: Dict,
parser: Dict,
dumper: Dict,
dataset_name: str,
nproc: int,
delete: Optional[List] = None,
merge_bg_others: bool = False,
Expand All @@ -555,6 +614,7 @@ def __init__(self,
gatherer=gatherer,
parser=parser,
dumper=dumper,
dataset_name=dataset_name,
nproc=nproc,
task='kie',
delete=delete)
Expand Down
1 change: 1 addition & 0 deletions mmocr/datasets/preparers/data_obtainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class NaiveDataObtainer:
files (list[dict]): A list of file information.
cache_path (str): The path to cache the downloaded files.
data_root (str): The root path of the dataset.
task (str): The task of the dataset.
"""

def __init__(self, files: List[Dict], cache_path: str, data_root: str,
Expand Down
4 changes: 3 additions & 1 deletion mmocr/datasets/preparers/data_preparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def __init__(self,
cfg_path = osp.join(cfg_path, dataset_name)
self.nproc = nproc
self.task = task
self.dataset_name = dataset_name
self.parse_meta(cfg_path)
self.parse_cfg(cfg_path)

Expand Down Expand Up @@ -104,7 +105,8 @@ def parse_cfg(self, cfg_path: str) -> None:
cfg.data_obtainer.update(task=self.task)
self.data_obtainer = DATA_OBTAINERS.build(cfg.data_obtainer)
if 'data_converter' in cfg:
cfg.data_converter.update(dict(nproc=self.nproc))
cfg.data_converter.update(
dict(nproc=self.nproc, dataset_name=self.dataset_name))
self.data_converter = DATA_CONVERTERS.build(cfg.data_converter)

@property
Expand Down
34 changes: 31 additions & 3 deletions mmocr/datasets/preparers/dumpers/dumpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,29 +11,55 @@
@DATA_DUMPERS.register_module()
class JsonDumper:
    """Dump annotations to a json file and build the dataset config snippet.

    Args:
        task (str): Task name, used as the annotation file prefix
            (e.g. ``textdet_train.json``).
        dataset_name (str): Dataset name, used as the variable name prefix
            in the generated dataset config.
    """

    def __init__(self, task: str, dataset_name: str) -> None:
        self.task = task
        self.dataset_name = dataset_name

    def dump(self, data: Dict, data_root: str, split: str) -> str:
        """Dump data to json file and return its dataset config string.

        Args:
            data (Dict): Data to be dumped.
            data_root (str): Root directory of data.
            split (str): Split of data.

        Returns:
            str: String of dataset config.

        Examples:
            The returned dataset config
            >>> ic15_rec_train = dict(
            >>>     type='OCRDataset',
            >>>     data_root=ic15_rec_data_root,
            >>>     ann_file='textrecog_train.json',
            >>>     test_mode=False,
            >>>     pipeline=None)
        """
        dst_file = osp.join(data_root, f'{self.task}_{split}.json')
        mmengine.dump(data, dst_file)

        prefix = f'{self.dataset_name}_{self.task}'
        lines = [
            f'\n{prefix}_{split} = dict(',
            "    type='OCRDataset',",
            f'    data_root={prefix}_data_root,',
            f"    ann_file='{osp.basename(dst_file)}',",
        ]
        # Training detection splits filter out empty / tiny instances;
        # evaluation splits are flagged as test mode instead.
        if split == 'train' and self.task == 'textdet':
            lines.append(
                '    filter_cfg=dict(filter_empty_gt=True, min_size=32),')
        elif split in ['test', 'val']:
            lines.append('    test_mode=True,')
        lines.append('    pipeline=None)')
        return '\n'.join(lines) + '\n'


@DATA_DUMPERS.register_module()
class WildreceiptOpensetDumper:
    """Dump annotations to the openset txt format used by Wildreceipt.

    Args:
        task (str): Task name of the dataset.
    """

    def __init__(self, task: str) -> None:
        self.task = task

    def dump(self, data: List, data_root: str, split: str) -> None:
        """Dump data to ``openset_{split}.txt``.

        Args:
            data (List): Annotation lines to be dumped.
            data_root (str): Root directory of data.
            split (str): Split of data.

        Returns:
            None: No dataset config string is generated for openset KIE
            data, so ``None`` is returned (the return annotation was
            previously ``str``, which never matched the actual value).
        """
        list_to_file(osp.join(data_root, f'openset_{split}.txt'), data)
        return None
3 changes: 2 additions & 1 deletion mmocr/datasets/preparers/parsers/coco_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@ class COCOTextDetAnnParser(BaseParser):
"""COCO Text Detection Parser.

Args:
xinke-wang marked this conversation as resolved.
Show resolved Hide resolved
data_root (str): The root path of the dataset. Defaults to None.
nproc (int): The number of processes to parse the annotation. Defaults
to 1.
"""

def __init__(self, data_root=None, nproc: int = 1) -> None:
def __init__(self, data_root: str = None, nproc: int = 1) -> None:
    """Create the parser and forward options to the base parser.

    Args:
        data_root (str, optional): The root path of the dataset; may be
            ``None``. NOTE(review): the annotation reads ``str`` but the
            default is ``None`` — should be ``Optional[str]``; confirm the
            file imports ``Optional`` before changing.
        nproc (int): The number of processes to parse the annotation.
            Defaults to 1.
    """

    super().__init__(nproc=nproc, data_root=data_root)

Expand Down
14 changes: 8 additions & 6 deletions mmocr/datasets/preparers/parsers/icdar_txt_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class ICDARTxtTextDetAnnParser(BaseParser):
'utf-8-sig'.
nproc (int): The number of processes to parse the annotation. Defaults
to 1.
remove_flag (List[str], Optional): Used to remove redundant strings in
remove_strs (List[str], Optional): Used to remove redundant strings in
the transcription. Defaults to None.
"""

Expand All @@ -33,12 +33,12 @@ def __init__(self,
format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans',
encoding: str = 'utf-8-sig',
nproc: int = 1,
remove_flag: Optional[List[str]] = None) -> None:
remove_strs: Optional[List[str]] = None) -> None:
self.sep = separator
self.format = format
self.encoding = encoding
self.ignore = ignore
self.remove_flag = remove_flag
self.remove_strs = remove_strs
super().__init__(nproc=nproc)

def parse_file(self, file: Tuple, split: str) -> Tuple:
Expand All @@ -48,11 +48,13 @@ def parse_file(self, file: Tuple, split: str) -> Tuple:
for anno in self.loader(txt_file, self.sep, self.format,
self.encoding):
anno = list(anno.values())
if self.remove_strs is not None:
for flag in self.remove_strs:
for i in range(len(anno)):
if flag in anno[i]:
anno[i] = anno[i].replace(flag, '')
poly = list(map(float, anno[0:-1]))
text = anno[-1]
if self.remove_flag is not None:
for flag in self.remove_flag:
text = text.replace(flag, '')
instances.append(
dict(poly=poly, text=text, ignore=text == self.ignore))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def test_json_dumpers(self):
task_name='textdet',
category=[dict(id=0, name='text')]))

dumper = JsonDumper(task)
dumper = JsonDumper(task, dataset_name='test')
dumper.dump(fake_data, self.root.name, split)
with open(osp.join(self.root.name, f'{task}_{split}.json'), 'r') as f:
data = json.load(f)
Expand Down