In [24]:
# Directory holding the raw IAM data used below (lines.txt, lines/, split folders).
ROOT = "../../../diploma/data"
# Name of the split folder inside ROOT that provides train.txt / valid.txt / test.txt.
SPLIT = "IAM-D"

In [47]:
# Only the line-level data and the IAM-D split folder are used in this notebook.

# IAM-D split definition: https://github.com/shonenkov/IAM-Splitting
# IAM data itself: https://fki.tic.heia-fr.ch/databases/iam-handwriting-database (not available in Russia)
!ls "{ROOT}/"

[34mIAM-B[m[m         [34marchive[m[m       [31mforms.txt[m[m     [31mlines.txt[m[m     [34msplits[m[m
[34mIAM-D[m[m         [34mforms[m[m         [34mlines[m[m         [31msentences.txt[m[m [31mwords.txt[m[m


In [48]:
!ls "{ROOT}/IAM-D"

test.txt  train.txt valid.txt


In [45]:
import re
import os

In [34]:
# Split definition files come from "https://github.com/shonenkov/IAM-Splitting".
def _read_split_ids(stage):
    """Return the stripped, non-empty lines of `{ROOT}/{SPLIT}/{stage}.txt` as a list."""
    # `with` closes the file deterministically instead of leaking the handle.
    with open(f'{ROOT}/{SPLIT}/{stage}.txt') as split_file:
        stripped = (line.strip() for line in split_file)
        # Blank lines (e.g. a trailing empty line) are not ids, so drop them.
        return [img_id for img_id in stripped if img_id]


train_ids = _read_split_ids('train')
valid_ids = _read_split_ids('valid')
test_ids = _read_split_ids('test')

# Set versions give O(1) membership checks in make_stage.
unique_train_ids = set(train_ids)
unique_valid_ids = set(valid_ids)
unique_test_ids = set(test_ids)

# Every split file must list each id at most once.
for stage_name, ids, unique_ids in [
    ('train', train_ids, unique_train_ids),
    ('valid', valid_ids, unique_valid_ids),
    ('test', test_ids, unique_test_ids),
]:
    assert len(ids) == len(unique_ids), f'duplicate ids in {stage_name} split'


def make_stage(img_id):
    """Return 'train', 'valid' or 'test' for `img_id`, or 'unknown' if no split lists it."""
    # Splits are checked in train -> valid -> test order; the first hit wins.
    split_lookup = (
        ('train', unique_train_ids),
        ('valid', unique_valid_ids),
        ('test', unique_test_ids),
    )
    for stage_name, members in split_lookup:
        if img_id in members:
            return stage_name
    return 'unknown'

In [35]:
# Number of ids listed in the train / valid / test split files, in that order.
print(list(map(len, (train_ids, valid_ids, test_ids))))

[9652, 1840, 1861]


In [37]:
# Parse `{ROOT}/lines.txt` into one record per text line.
# A data row is matched as: <sample id> <word token> <six integer fields> <text>.
LINE_PATTERN = re.compile(
    r'([\S]+)\s\w+\s\d+\s\d+\s\d+\s\d+\s\d+\s\d+\s([\w\W]+)'
)

dataset = []

# `with` guarantees the index file is closed once parsing finishes (or fails).
with open(f'{ROOT}/lines.txt') as lines_file:
    for line_no, line in enumerate(lines_file, start=1):
        line = line.strip()
        # Skip blank lines and '#'-prefixed lines.
        if not line or line.startswith('#'):
            continue

        # `search` yields the same first match that `findall(...)[0]` would,
        # but lets us report a malformed row instead of raising a bare IndexError.
        match = LINE_PATTERN.search(line)
        if match is None:
            raise ValueError(
                f'{ROOT}/lines.txt:{line_no}: unexpected line format: {line!r}'
            )
        sample_id, text = match.groups()

        # A sample id has three '-'-separated parts, e.g. 'a01-000u-00':
        # folder 'a01', subfolder 'a01-000u', and a trailing part that is not used.
        folder, subfolder, _ = sample_id.split('-')
        subfolder = f'{folder}-{subfolder}'

        dataset.append({
            'sample_id': sample_id,
            # '|' characters in the transcription are replaced with spaces.
            'text': text.replace('|', ' ').strip(),
            'path': f'iam/images/{folder}/{subfolder}/{sample_id}.png',
            'stage': make_stage(sample_id),
        })

In [40]:
import pandas as pd

In [42]:
# Tabulate the parsed records, keyed by sample id, and preview the first three rows.
marking = pd.DataFrame(data=dataset).set_index(keys='sample_id')
marking.iloc[:3]

Unnamed: 0_level_0,text,path,stage
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a01-000u-00,A MOVE to stop Mr. Gaitskell from,iam/images/a01/a01-000u/a01-000u-00.png,train
a01-000u-01,nominating any more Labour life Peers,iam/images/a01/a01-000u/a01-000u-01.png,train
a01-000u-02,is to be made at a meeting of Labour,iam/images/a01/a01-000u/a01-000u-02.png,train


In [43]:
# Number of samples assigned to each stage.
marking.loc[:, 'stage'].value_counts()

train    9652
test     1861
valid    1840
Name: stage, dtype: int64

In [46]:
# makedirs creates the intermediate ./prepared_datasets/iam directory as well.
os.makedirs('./prepared_datasets/iam/images', exist_ok=True)

In [50]:
%%time

!cp -r "{ROOT}/lines/" ./prepared_datasets/iam/images/

CPU times: user 45.9 ms, sys: 29.4 ms, total: 75.3 ms
Wall time: 9.98 s


In [52]:
# Persist the marking table (index column included) next to the images folder.
marking.to_csv(path_or_buf='./prepared_datasets/iam/marking.csv')

In [55]:
!cd ./prepared_datasets/iam && ls

[34mimages[m[m      marking.csv


In [None]:
!cd ./prepared_datasets && tar -czvf iam.tar.gz iam > /dev/null

In [67]:
# Report the on-disk size of the archive in human-readable units.
!du -h ./prepared_datasets/iam.tar.gz

657M	./prepared_datasets/iam.tar.gz
