# IAM Dataset

_Published_ -> The database was first published at ICDAR 1999.  


About dataset
------------------
The database contains forms of unconstrained handwritten text, which were scanned at a resolution of 300dpi and saved as PNG images with 256 gray levels. 

The IAM Handwriting Database 3.0 is structured as follows:
- 657 writers contributed samples of their handwriting
- 1'539 pages of scanned text
- 5'685 isolated and labeled sentences
- 13'353 isolated and labeled text lines
- 115'320 isolated and labeled words

The words have been extracted from pages of scanned text using an automatic segmentation scheme and were verified manually. 
- Paper name - Automatic Segmentation of the IAM Off-line Database for Handwritten English Text
- Authors - Matthias Zimmermann, Horst Bunke
- Link - http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.652.1885&rep=rep1&type=pdf



Reading dataset
----------------
- __forms.txt__
  - format: a01-000u 000 2 prt 7 5 52 36
  - a01-000u  -> form id
  - 000       -> writer id
  - 2         -> number of sentences
  - prt       -> word segmentation
    - prt: some lines correctly segmented
    - all: all lines correctly segmented
  - 7 5       -> 5 of 7 lines are correctly segmented into words
  - 52 36     -> the form contains 52 words, 36 are in lines which have been correctly segmented

In [None]:
# import
import os
import glob
import shutil
from itertools import islice
from collections import defaultdict

In [None]:
# Create a dictionary to store each writer and its form
# writer_form maps a writer id (string) to the list of form ids by that writer.
writer_form = defaultdict(list)
forms_file_path = "D:\\dataset\\IAM\\forms.txt"
with open(forms_file_path) as f:
    # The first 16 lines of the file are skipped as a header.
    for line in islice(f, 16, None):
        # Whitespace split (instead of split(' ')) tolerates repeated spaces
        # and strips the trailing newline.
        fields = line.split()
        # Skip blank or truncated lines rather than failing with IndexError.
        if len(fields) < 2:
            continue
        form_id, writer = fields[0], fields[1]
        writer_form[writer].append(form_id)
list(writer_form.items())

In [None]:
# Print every writer with its number of forms, most forms first, and count
# how many writers share each form count.
print("Writer id \t No. of form")
no_of_form_no_of_writer = defaultdict(int)
forms_per_writer = [(wid, len(forms)) for wid, forms in writer_form.items()]
# sorted() is stable, so writers with equal counts keep their insertion order.
for wid, form_count in sorted(forms_per_writer, key=lambda pair: pair[1], reverse=True):
    print(f"{wid}\t\t\t{form_count}")
    no_of_form_no_of_writer[form_count] += 1

In [None]:
# Histogram: for each form count (ascending), how many writers have it.
print("No. of form \t No. of Writer")
for form_count in sorted(no_of_form_no_of_writer):
    writer_count = no_of_form_no_of_writer[form_count]
    print(f"{form_count}\t\t\t{writer_count}")

In [None]:
#function for extracting all image of a writer to one folder
def getWriterData( writer_id, writer_form_dict, source_path, dest_path):
    '''Copy every file of one writer's forms into ``dest_path/<writer_id>``.

    Args:
        writer_id: Writer identifier; converted with ``str()`` before lookup,
            so it must match the keys of ``writer_form_dict`` textually
            (e.g. ``'000'``, not ``0``).
        writer_form_dict: Mapping of writer id -> list of form ids.
        source_path: Root folder laid out as
            ``<source_path>/<form prefix>/<form id>/<files>``, where the
            form prefix is the part of the form id before the first ``-``.
        dest_path: Folder that receives one sub-folder per writer. It is
            created (including missing parents) if it does not exist.

    Returns:
        True if the writer's files were copied, False if the writer id is
        unknown or has no forms.
    '''
    # makedirs(exist_ok=True) also creates missing parents and avoids the
    # check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(dest_path, exist_ok=True)
    writer_id = str(writer_id)

    # Use .get() rather than indexing: indexing a defaultdict would insert
    # an empty entry for every unknown writer id and corrupt later stats.
    form_ids = writer_form_dict.get(writer_id)
    if not form_ids:
        print("Invalid Writer id")
        return False

    dest_fol_path = os.path.join(dest_path, writer_id)
    os.makedirs(dest_fol_path, exist_ok=True)

    for form_id in form_ids:
        parent_fol = form_id.split("-")[0]
        form_dir = os.path.join(source_path, parent_fol, form_id)
        for file_name in os.listdir(form_dir):
            # os.path.join keeps the copy working on any path separator.
            shutil.copy(os.path.join(form_dir, file_name),
                        os.path.join(dest_fol_path, file_name))
    print("Extracted successfully writer ",writer_id)
    return True

In [None]:
# Extract the word images of writers 150-154 into the experiment folder.
sourcepath = 'D:\\dataset\\IAM\\words'
destpath = 'D:\\dataset\\exp\\5_10'
wid_list = [str(number) for number in range(150, 155)]
for writer in wid_list:
    getWriterData(writer, writer_form, sourcepath, destpath)

In [None]:
# Rename every entry of the experiment folder to a zero-padded sequential
# index (000, 001, ...).
# glob gives no ordering guarantee, so sort to make the index assigned to
# each folder reproducible across runs and file systems.
paths = sorted(glob.glob("D:\\dataset\\exp\\5_10\\*"))
for i,path in enumerate(paths):
    print(i,path)
    new_path = "D:\\dataset\\exp\\5_10\\" + str(i).zfill(3)
    # Skip entries that already carry their target name (e.g. on a re-run).
    if path != new_path:
        os.rename(path, new_path)