<a href="https://colab.research.google.com/github/nihal0619/Project-3200/blob/main/Project_3200.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Download the IAM Words archive and unpack it into ./data.
# Every step is safe to re-run: `unzip -o` overwrites without prompting and
# `mkdir -p` does not fail when the directories already exist.
!wget -q https://git.io/J0fjL -O IAM_Words.zip
!unzip -qq -o IAM_Words.zip
!mkdir -p data/words
!tar -xf IAM_Words/words.tgz -C data/words
!mv IAM_Words/words.txt data


In [2]:
# Preview the first 30 lines of the annotation file (format header + first records).
!head -n 30 data/words.txt

#--- words.txt ---------------------------------------------------------------#
#
# iam database word information
#
# format: a01-000u-00-00 ok 154 1 408 768 27 51 AT A
#
#     a01-000u-00-00  -> word id for line 00 in form a01-000u
#     ok              -> result of word segmentation
#                            ok: word was correctly
#                            er: segmentation of word can be bad
#
#     154             -> graylevel to binarize the line containing this word
#     1               -> number of components for this word
#     408 768 27 51   -> bounding box around this word in x,y,w,h format
#     AT              -> the grammatical tag for this word, see the
#                        file tagset.txt for an explanation
#     A               -> the transcription for this word
#
a01-000u-00-00 ok 154 408 768 27 51 AT A
a01-000u-00-01 ok 154 507 766 213 48 NN MOVE
a01-000u-00-02 ok 154 796 764 70 50 TO to
a01-000u-00-03 ok 154 919 757 166 78 VB stop
a01-000u-00-04 ok 154 118

In [3]:
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
from tensorflow import keras

import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import os

# Seed the global NumPy and TensorFlow RNGs. The NumPy seed makes the later
# `np.random.shuffle` of the word list repeatable across runs.
np.random.seed(seed=42)
tf.random.set_seed(seed=42)

In [4]:
base_path = "data"
words_list = []

# Read the annotation file inside a context manager so the handle is closed
# as soon as the lines have been loaded.
with open(f"{base_path}/words.txt", "r") as words_file:
    words = words_file.readlines()

for line in words:
    # Skip blank lines and the "#"-prefixed format header.
    if not line.strip() or line[0] == "#":
        continue

    # The second space-separated field is the word-segmentation result;
    # entries flagged "err" are dropped.
    if line.split(" ")[1] != "err":
        words_list.append(line)

# Shuffle in place (uses the global NumPy RNG, so the order depends on the seed).
np.random.shuffle(words_list)

# Last expression of the cell so the number of kept entries is displayed.
len(words_list)

In [5]:
# 90% of the shuffled entries go to training; the remaining 10% is halved
# into validation and test.
split_idx = int(0.9 * len(words_list))
train_samples, test_samples = words_list[:split_idx], words_list[split_idx:]

val_split_idx = int(0.5 * len(test_samples))
validation_samples, test_samples = (
    test_samples[:val_split_idx],
    test_samples[val_split_idx:],
)

# Sanity check: the three splits partition the full list.
assert len(words_list) == len(train_samples) + len(validation_samples) + len(test_samples)

for summary in (
    f"total training samples: {len(train_samples)}",
    f"total validation samples : {len(validation_samples)}",
    f"total test samples: {len(test_samples)}",
):
    print(summary)


total training samples: 86810
total validation samples : 4823
total test samples: 4823


In [6]:
# Directory holding the word images: the "words" folder under `base_path`.
base_image_path = os.path.join(base_path, "words")

def get_image_paths_and_labels(samples):
    """Map annotation lines to image paths, keeping only non-empty image files.

    Args:
        samples: iterable of raw annotation lines; the first space-separated
            field of each line is the word id (e.g. ``a01-000u-00-00``).

    Returns:
        A ``(paths, corrected_samples)`` tuple of equal-length lists: the image
        path of every kept entry and the matching annotation line with its
        trailing newline removed.

    Raises:
        OSError: propagated from ``os.path.getsize`` when an image file
            cannot be accessed.
    """
    paths = []
    corrected_samples = []
    for file_line in samples:
        image_name = file_line.strip().split(" ")[0]

        # Images are stored as
        #   <base_image_path>/<part1>/<part1>-<part2>/<part1>-<part2>-...png
        # where the parts are the "-"-separated pieces of the word id.
        id_parts = image_name.split("-")
        top_dir = id_parts[0]
        form_dir = top_dir + "-" + id_parts[1]
        img_path = os.path.join(base_image_path, top_dir, form_dir, image_name + ".png")

        # Zero-byte image files are skipped.
        if os.path.getsize(img_path) == 0:
            continue

        paths.append(img_path)
        corrected_samples.append(file_line.split("\n")[0])

    return paths, corrected_samples


# Resolve image paths and annotation lines for each split, dropping empty images.
train_img_paths, train_labels = get_image_paths_and_labels(samples=train_samples)
validation_img_paths, validation_labels = get_image_paths_and_labels(
    samples=validation_samples
)
test_img_paths, test_labels = get_image_paths_and_labels(samples=test_samples)
 
      

In [7]:
# Determine the maximum label length and the size of the vocabulary in the training data.
# The transcription is the last space-separated field of each annotation line.
train_labels_cleaned = [entry.split(" ")[-1].strip() for entry in train_labels]

# Every distinct character that occurs in a training transcription.
characters = set("".join(train_labels_cleaned))

# default=0 keeps max_len at 0 when there are no training labels.
max_len = max(map(len, train_labels_cleaned), default=0)

print("max length ", max_len)
print("Vocab size ", len(characters))

# Check some label samples.
train_labels_cleaned[0:10]

max length  21
Vocab size  78


['sure',
 'he',
 'during',
 'of',
 'booty',
 'gastronomy',
 'boy',
 'The',
 'and',
 'in']

In [8]:

def clean_lables(labels):
    """Extract the transcription from each annotation line.

    Args:
        labels: iterable of annotation lines whose last space-separated
            field is the word transcription.

    Returns:
        A list with the whitespace-stripped last field of every line,
        in the same order as ``labels``.
    """
    return [entry.split(" ")[-1].strip() for entry in labels]


# Reduce the validation and test annotation lines to bare transcriptions.
validation_labels_cleaned = clean_lables(labels=validation_labels)
test_labels_cleaned = clean_lables(labels=test_labels)

In [9]:

AUTOTUNE = tf.data.AUTOTUNE

# Map characters to integer ids.
# `characters` is a set of str, whose iteration order can differ between
# interpreter runs (string hash randomisation). Sorting fixes the vocabulary
# order so the character <-> id mapping is identical on every run.
char_to_num = StringLookup(vocabulary=sorted(characters), mask_token=None)

# Map integer ids back to the original characters, reusing the exact
# vocabulary of the forward lookup so both directions stay consistent.
num_to_char = StringLookup(
    vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True
)