# Generate labels from Transkribus data

This file creates the label files from the original scans and xml files. 
Two parameters need to be specified:

```
cfg_input_folder: folder where the Transkribus scans are located
cfg_output_folder: folder where the output will be saved
```

The program assumes the following file structure:
```
cfg_input_folder/
|-- folder 1
|   |-- page
|   |   | image1.xml
|   |   | image2.xml
|   |   | ...
|   | image1.png
|   | image2.png
|   | ...
|-- folder 2
| ...
```

In [1]:
import os, sys
sys.path.append('C:\\Users\\matthias\\Documents\\myProjects\\baseline_segmentation')

import shutil
from tqdm import tqdm_notebook as tqdm
from src.utils.generate_segmentation_labels import XMLParserBaselines
from src.utils.utils import load_class_dict
from distutils.dir_util import copy_tree

In [2]:
max_side_length = 1024
thickness = round(2.0*max_side_length/256)-6 #-6 for 1024 #-2 for 512
dot_thickness = round(2.0*max_side_length/256)-5 #-5 for 1024 #-2 for 512
class_file = os.path.join('..', 'data', 'class_files', 'classes_btsb.txt')
classes, colors, color_dict = load_class_dict(class_file)

print('thickness: {}'.format(thickness))
print('dot_thickness: {}'.format(dot_thickness))
print('classes:\n')
for c in classes:
    print('{:20s} {}'.format(c, color_dict[c]))

thickness: 2
dot_thickness: 3
classes:

bg                   [0, 0, 0]
text                 [125, 125, 125]
sp_ep_border         [125, 0, 125]
baselines            [0, 255, 0]
end_points           [0, 0, 255]
start_points         [255, 0, 0]


In [3]:
cfg_input_folder = os.path.join('..', 'data', 'cBAD-ICDAR2019')
cfg_output_folder = os.path.join('..', 'data', 'cBAD_' + str(max_side_length) + '_squared')
cfg_pad = False

In [4]:
if not os.path.isdir(os.path.join(cfg_output_folder)):
    os.mkdir(cfg_output_folder)
shutil.copy(class_file, os.path.join(cfg_output_folder, 'classes_btsb.txt'))

'..\\data\\cBAD_1024_squared\\classes_btsb.txt'

In [5]:
for root, directories, filenames in os.walk(cfg_input_folder):
    if (root.split(os.sep)[-1] == 'page'):
        print('Processing ' + root[:-4])
        for file in tqdm(filenames):
            if not os.path.isdir(os.path.join(cfg_output_folder, root.split(os.sep)[-2])):
                os.mkdir(os.path.join(cfg_output_folder, root.split(os.sep)[-2]))
            xml_parser = XMLParserBaselines(xml_filename=os.path.join(root, file),
                                            input_folder=root[:-4], 
                                            output_folder=os.path.join(cfg_output_folder, root.split(os.sep)[-2]),
                                            size_parameter=max_side_length,
                                            class_file=class_file)
            #xml_parser.scale(max_side_length)
            xml_parser.save_as_mask(pad=cfg_pad, thickness=thickness, dot_thickness=dot_thickness)

Processing ..\data\cBAD-ICDAR2019\eval\


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=755.0), HTML(value='')))


Processing ..\data\cBAD-ICDAR2019\test\


HBox(children=(FloatProgress(value=0.0, max=1511.0), HTML(value='')))


Processing ..\data\cBAD-ICDAR2019\train\


HBox(children=(FloatProgress(value=0.0, max=755.0), HTML(value='')))


