# IAM Dataset

_Published_ -> The database was first published at ICDAR 1999.  


About dataset
------------------
The database contains forms of unconstrained handwritten text, which were scanned at a resolution of 300dpi and saved as PNG images with 256 gray levels. 

The IAM Handwriting Database 3.0 is structured as follows:
- 657 writers contributed samples of their handwriting
- 1,539 pages of scanned text
- 5,685 isolated and labeled sentences
- 13,353 isolated and labeled text lines
- 115,320 isolated and labeled words

The words have been extracted from pages of scanned text using an automatic segmentation scheme and were verified manually. 
- Paper name - Automatic Segmentation of the IAM Off-line Database for Handwritten English Text
- Authors - Matthias Zimmermann, Horst Bunke
- Link - http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.652.1885&rep=rep1&type=pdf


File format
======

- __forms.txt__
  - format: a01-000u 000 2 prt 7 5 52 36
  - a01-000u  -> form id
  - 000       -> writer id
  - 2         -> number of sentences
  - prt       -> word segmentation
    - prt: some lines correctly segmented
    - all: all lines correctly segmented
  - 7 5       -> 5 of 7 lines are correctly segmented into words
  - 52 36     -> the form contains 52 words, 36 are in lines which have been correctly segmented

# Reading dataset

In [None]:
# import
import os
import glob
import shutil
import numpy as np
from PIL import Image
from itertools import islice
from collections import defaultdict
from matplotlib import pyplot as plt
%matplotlib inline

Create a dictionary from forms.txt that stores, for each writer, the forms they wrote.

In [None]:
# Build a mapping: writer id -> list of the form ids written by that writer
writer_form = defaultdict(list)
forms_file_path = "D:\\dataset\\IAM\\forms.txt"
with open(forms_file_path) as forms_file:
    # The first 16 lines are skipped (presumably the file's header block)
    for record in islice(forms_file, 16, None):
        fields = record.split(' ')
        # Column 0 is the form id, column 1 the writer id
        writer_form[fields[1]].append(fields[0])
# Notebook display: preview the first five (writer, forms) pairs
list(writer_form.items())[:5]

In [None]:
# Print every writer with its number of forms (largest count first) and
# tally how many writers share each form count.
print("Writer id \t No. of form")
no_of_form_no_of_writer = defaultdict(int)
ranked_writers = sorted(writer_form.items(), key=lambda kv: len(kv[1]), reverse=True)
for writer_key, form_ids in ranked_writers:
    form_total = len(form_ids)
    print(f"{writer_key}\t\t\t{form_total}")
    no_of_form_no_of_writer[form_total] += 1

In [None]:
# Tabulate the distribution: number of forms -> number of writers who
# wrote exactly that many forms, ordered by form count.
print("No. of form \t No. of Writer")
distribution = sorted(no_of_form_no_of_writer.items())
for form_total, writer_total in distribution:
    print(f"{form_total}\t\t\t{writer_total}")

As we can observe here, more than half of the writers have written only one form, so the number of forms per writer is very unevenly distributed. This could be a challenge: because most writers have contributed only a few forms, we have little data to train our model for those writers.

# Extracting data

__The dataset is available as forms, sentences, lines and words. The form_id and writer_id are the same across all of these data types.__

In [None]:
#Extract all word image of a writer to destination folder
def getWriterWordData( writer_id, writer_form_dict, source_path, dest_path):
    '''Copy every word file written by one writer into dest_path/<writer_id>.

    The source folder is expected to be laid out as
    source_path/<prefix>/<form_id>/<files>, where <prefix> is the part of the
    form id before the first "-" (e.g. "a01" for "a01-000u").

    writer_id        : writer to extract; converted with str(), so ids that
                       are zero-padded in the mapping (e.g. "000") must be
                       passed as strings.
    writer_form_dict : mapping of writer id (str) -> list of form ids.
    source_path      : root folder of the word data.
    dest_path        : root output folder, created if missing.

    Returns True when the files were copied, False (after printing
    "Invalid Writer id") when the writer has no forms in the mapping.
    '''
    # makedirs also creates missing parent folders and tolerates existing ones
    os.makedirs(dest_path, exist_ok=True)
    writer_id = str(writer_id)

    # .get() keeps a defaultdict from silently gaining an empty entry
    # for every unknown writer id that is looked up.
    form_ids = writer_form_dict.get(writer_id)
    if not form_ids:
        print("Invalid Writer id")
        return False

    dest_fol_path = os.path.join(dest_path, writer_id)
    os.makedirs(dest_fol_path, exist_ok=True)

    for form_id in form_ids:
        parent_fol = form_id.split("-")[0]
        fol_path = os.path.join(source_path, parent_fol, form_id)
        for fname in os.listdir(fol_path):
            # os.path.join instead of a hard-coded "\\" keeps this portable
            shutil.copy(os.path.join(fol_path, fname),
                        os.path.join(dest_fol_path, fname))
    print("Extracted successfully writer ",writer_id)
    return True
    
#Extract all form image of a writer to destination folder
def getWriterFormData( writer_id, writer_form_dict, source_path, dest_path):
    '''Copy every form image written by one writer into dest_path/<writer_id>.

    Each form is read from source_path/<form_id>.png.

    writer_id        : writer to extract; converted with str(), so ids that
                       are zero-padded in the mapping (e.g. "000") must be
                       passed as strings.
    writer_form_dict : mapping of writer id (str) -> list of form ids.
    source_path      : folder holding the form images.
    dest_path        : root output folder, created if missing.

    Returns True when the images were copied, False (after printing
    "Invalid Writer id") when the writer has no forms in the mapping.
    '''
    # makedirs also creates missing parent folders and tolerates existing ones
    os.makedirs(dest_path, exist_ok=True)
    writer_id = str(writer_id)

    # .get() keeps a defaultdict from silently gaining an empty entry
    # for every unknown writer id that is looked up.
    form_list = writer_form_dict.get(writer_id)
    if not form_list:
        print("Invalid Writer id")
        return False

    dest_fol_path = os.path.join(dest_path, writer_id)
    os.makedirs(dest_fol_path, exist_ok=True)

    for form in form_list:
        form_img = form + ".png"
        # os.path.join instead of a hard-coded "\\" keeps this portable
        shutil.copy(os.path.join(source_path, form_img),
                    os.path.join(dest_fol_path, form_img))
    print("Extracted successfully writer ",writer_id)
    return True

In [None]:
# extracting data
# Copy the form images of each selected writer from the IAM forms folder
# into a per-writer sub-folder of the experiment directory.
sourcepath = 'D:\\dataset\\IAM\\forms'
destpath = 'D:\\dataset\\exp\\forms_10'
wid_list = [
    150, 151, 152, 153, 154, 384,
    551, 552, 588, 635, 670, 671,
]
for wid in wid_list:
    getWriterFormData(wid, writer_form, sourcepath, destpath)

## Compression
__The images in the dataset have unequal heights, so we compress each image by resizing it to a fixed height in pixels. The width is scaled by the same factor so that the aspect ratio of the image doesn't change.__

In [None]:
# compress func
import os
from PIL import Image 

def compress(img_path, new_height, comp_img_path):
    '''
    Resize an image to a fixed height, keeping its aspect ratio, and save it
    under the same file name in another folder.

    Input :
        img_path      - path of the image to resize
        new_height    - height of the output image in pixels
        comp_img_path - folder the resized image is written to
                        (created, including parents, if missing)
    Output : new file path (comp_img_path joined with the source file name)
    '''
    os.makedirs(comp_img_path, exist_ok=True)
    # Accept both "\\" and "/" separators; splitting on "\\" alone would keep
    # the whole path as the "file name" for "/"-style paths and make the
    # output overwrite the source image.
    fname = os.path.basename(img_path.replace("\\", "/"))
    new_path = os.path.join(comp_img_path, fname)
    # The context manager closes the source file handle once resizing is done
    with Image.open(img_path) as img:
        hpercent = new_height / float(img.size[1])
        # Keep at least 1 pixel of width so very narrow images still resize
        wsize = max(1, int(float(img.size[0]) * hpercent))
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
        resized = img.resize((wsize, new_height), Image.LANCZOS)
    resized.save(new_path)
    return new_path

In [None]:
# compressing images
# paths = glob.glob("D:\\dataset\\exp\\all\\*\\*.png")
# for path in paths:
#     l = path.split("\\")
#     new_path = "D:\\dataset\\exp\\all_comp_128\\" + l[4]
#     compress(path,128,new_path)

## Removing unwanted data

In [None]:
# Binarization function
def binarize(x):
    '''Return 1 for a value strictly greater than 200, otherwise 0.'''
    return 1 if x > 200 else 0
# Element-wise version of binarize, applicable to whole numpy arrays
vect_binarize = np.vectorize(binarize)

In [None]:
# removing unwanted data
# Flag every compressed image in which fewer than 60% of the pixels are
# brighter than gray level 200, and copy it to a review folder. The
# originals are left in place; their paths are kept in del_files_path_list.
del_files_path_list = list()
count = 0
temp_dir = "D:\\dataset\\exp\\temp"
for path in glob.glob("D:\\dataset\\exp\\all_comp_128\\*\\*.png"):
    # Close the file handle as soon as the pixel data has been copied out
    with Image.open(path) as im:
        im_np = np.array(im)
    # Share of pixels above the threshold (same result as applying the
    # element-wise binarize function and averaging, but vectorized)
    ratio = np.sum(im_np > 200)/np.size(im_np)
    if(ratio < 0.6):
        # Without the folder, shutil.copy would write one plain file named
        # "temp" and overwrite it on every iteration.
        os.makedirs(temp_dir, exist_ok=True)
        shutil.copy(path, temp_dir)
        del_files_path_list.append(path)
        count+=1
        print(count,end=" ")

In [None]:
# removing unwanted data
# Remove from the compressed dataset every image whose file name is present
# in the review folder (temp). Each image is backed up to temp2 before it is
# deleted.
delete_it_list = [
    # basename instead of a hard-coded path component index
    os.path.basename(flagged_path)
    for flagged_path in glob.glob("D:\\dataset\\exp\\temp\\*.png")
]
# Set built once for O(1) membership tests inside the loop
delete_it_names = set(delete_it_list)
backup_dir = "D:\\dataset\\exp\\temp2"

count = 0
for path in glob.glob("D:\\dataset\\exp\\all_comp_128\\*\\*.png"):
    fname = os.path.basename(path)
    if(fname in delete_it_names):
        # The folder must exist: otherwise shutil.copy writes a single file
        # named "temp2" and each backup overwrites the previous one before
        # the original is removed.
        os.makedirs(backup_dir, exist_ok=True)
        shutil.copy(path, backup_dir)
        os.remove(path)
        count+=1
        print(count,end=" ")