In [1]:
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
from tensorflow import keras
from tensorflow.keras.callbacks import ModelCheckpoint

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import tensorflow as tf

np.random.seed(42)
tf.random.set_seed(42)

ModuleNotFoundError: No module named 'tensorflow'

In [3]:
words_file_path = "./words.txt"

In [4]:
with open(words_file_path, 'r') as file:
    words = file.readlines()

words_list = []

for line in words:
  if line[0]=="#":
    continue
  if line.split(" ")[1] != "err":  # we do not need to deal with errored entries.
    words_list.append(line)

print(len(words_list))
print(words_list[:5])

np.random.shuffle(words_list)

96456
['a01-000u-00-00 ok 154 408 768 27 51 AT A\n', 'a01-000u-00-01 ok 154 507 766 213 48 NN MOVE\n', 'a01-000u-00-02 ok 154 796 764 70 50 TO to\n', 'a01-000u-00-03 ok 154 919 757 166 78 VB stop\n', 'a01-000u-00-04 ok 154 1185 754 126 61 NPT Mr.\n']


In [5]:
split_idx = int(0.9 * len(words_list))
train_samples = words_list[:split_idx]
test_samples = words_list[split_idx:]

val_split_idx = int(0.5 * len(test_samples))
validation_samples = test_samples[:val_split_idx]
test_samples = test_samples[val_split_idx:]

assert len(words_list) == len(train_samples) + len(validation_samples) + len(test_samples)

print(f"Total data samples: {len(words_list)}")
print(f"Total training samples: {len(train_samples)}")
print(f"Total validation samples: {len(validation_samples)}")
print(f"Total test samples: {len(test_samples)}")

Total data samples: 96456
Total training samples: 86810
Total validation samples: 4823
Total test samples: 4823


In [6]:
base_image_path ="./words"

def get_image_paths_and_labels(samples):
  paths = []
  corrected_samples = []
  for(i,file_line) in enumerate(samples):
    line_split = file_line.strip()
    line_split = line_split.split(" ")
    #each file_line is as "a01-000u-00-03 ok 154 919 757 166 78 VB stop"
    # part1/part1-part2/part1-part2-part3.png

    image_name = line_split[0]
    partI = image_name.split("-")[0]
    partII = image_name.split("-")[1]
    img_path = os.path.join(
        base_image_path, partI , partI + "-" + partII, image_name + ".png"
    )


    if os.path.getsize(img_path):
      paths.append(img_path)
      corrected_samples.append(file_line.split("\n")[0])

  return paths, corrected_samples


train_img_paths, train_labels = get_image_paths_and_labels(train_samples)
validation_img_paths, validation_labels = get_image_paths_and_labels(validation_samples)
test_img_paths, test_labels = get_image_paths_and_labels(test_samples)

In [7]:
print(train_img_paths[0:10])
print(train_labels[0:10])

['./words\\e04\\e04-030\\e04-030-04-08.png', './words\\k02\\k02-102\\k02-102-05-03.png', './words\\a01\\a01-082u\\a01-082u-01-04.png', './words\\m01\\m01-000\\m01-000-07-00.png', './words\\g01\\g01-031\\g01-031-07-06.png', './words\\f07\\f07-081b\\f07-081b-01-06.png', './words\\n03\\n03-082\\n03-082-04-03.png', './words\\g06\\g06-018c\\g06-018c-04-05.png', './words\\g06\\g06-011j\\g06-011j-06-06.png', './words\\f04\\f04-024\\f04-024-01-06.png']
['e04-030-04-08 ok 170 1489 1499 120 39 JJ sure', 'k02-102-05-03 ok 182 836 1623 69 52 PP3A he', 'a01-082u-01-04 ok 172 1582 1043 234 88 IN during', 'm01-000-07-00 ok 196 339 1998 75 107 INO of', 'g01-031-07-06 ok 152 1912 2038 167 59 NN booty', 'f07-081b-01-06 ok 168 1366 924 350 88 NN gastronomy', 'n03-082-04-03 ok 165 992 1414 118 135 NN boy', 'g06-018c-04-05 ok 182 1298 1438 96 58 ATI The', 'g06-011j-06-06 ok 182 1222 1785 146 95 CC and', 'f04-024-01-06 ok 183 1104 981 60 70 IN in']


In [8]:
# finding the maximum length and the size of the vocabulary in the training data.
train_labels_cleaned = []
characters = set()
max_len = 0

for label in train_labels:
  label = label.split(" ")[-1].strip()
  for char in label:
    characters.add(char)

  max_len = max(max_len, len(label))
  train_labels_cleaned.append(label)

print("Maximum Length:",max_len)
print("Vocab size:",len(characters))

#checking

train_labels_cleaned[:10]


Maximum Length: 21
Vocab size: 78


['sure',
 'he',
 'during',
 'of',
 'booty',
 'gastronomy',
 'boy',
 'The',
 'and',
 'in']

In [9]:
def clean_labels(labels):
  cleaned_labels = []
  for label in labels:
    label = label.split(" ")[-1].strip()
    cleaned_labels.append(label)

  return cleaned_labels

validation_labels_cleaned= clean_labels(validation_labels)
test_labels_cleaned = clean_labels(test_labels)

In [10]:
print(characters)
print(len(characters))

{'T', 'F', 'c', '"', 'd', 'B', '!', 'h', 'A', 'J', 'j', 'K', 'l', ',', 'H', 'v', '1', 'W', 'P', 'N', 'U', 'I', '9', 'x', 'f', 's', 'z', 'm', 'p', 'C', 'E', 'r', '#', 'G', '-', 'Q', 'M', 'g', 'D', 'y', '5', "'", '?', 'S', 'V', '&', 'Z', 'i', 'O', 't', '8', 'w', '.', '2', 'n', '6', 'e', '3', 'u', 'X', 'L', 'k', '7', 'a', ')', 'q', 'b', '/', '(', 'o', '+', ':', ';', '0', '*', 'R', 'Y', '4'}
78


In [11]:
AUTOTUNE = tf.data.AUTOTUNE
#TensorFlow will automatically decide the optimal values for characters.

#mapping character to integer
char_to_num = StringLookup(vocabulary = list(characters), mask_token=None)

#mapping integers back to original characters.
num_to_char = StringLookup(vocabulary = char_to_num.get_vocabulary(), mask_token=None, invert = True)




In [12]:
print(char_to_num(tf.constant(["A"])))
#here A got value 76
# the values nott in the characters  list gets value 0


tf.Tensor([9], shape=(1,), dtype=int64)


In [13]:
def distortion_free_resize(image, img_size):
  w,h = img_size
  image = tf.image.resize(image, size=(h,w),preserve_aspect_ratio=True)  # size paraeter takes height first and then width
  #The resulting image might not be exactly (h,w) pixels but will fit within these dimensions without any distortion.
  # if any pixel left we will use padding

  #checking the padding height and width
  pad_height = h-tf.shape(image)[0]
  pad_width = w-tf.shape(image)[1]

  #Only necessary if you want to do some amount of padding on both sides.
  if pad_height % 2 !=0:
    height=pad_height//2
    pad_height_top = height +1
    pad_height_bottom = height
  else:
    pad_height_top = pad_height_bottom = pad_height // 2

  if pad_width %2 != 0:
    width = pad_width //2
    pad_width_left = width +1
    pad_width_right = width
  else:
    pad_width_left = pad_width_right = pad_width //2

  image = tf.pad(
      image,
      paddings = [
          [pad_height_top, pad_height_bottom],
          [pad_width_left, pad_width_right],
          [0,0],
      ],

  )
  image = tf.transpose(image, perm=[1,0,2])
  image = tf.image.flip_left_right(image)
  # because tf.resize uses (h,w) way
  return image


In [14]:
batch_size = 32
padding_token = 99
image_width = 128
image_height = 32

def preprocessing_image(image_path, img_size=(image_width, image_height)):
  image = tf.io.read_file(image_path)
  image = tf.image.decode_png(image,channels=1) # decode the png_encoded images into tensor , channel 1 for gray scale
  image = distortion_free_resize(image,img_size)
  image = tf.cast(image, tf.float32)/255.0  # data type conversion in tensor
  return image

def vectorize_label(label):
  label = char_to_num(tf.strings.unicode_split(label, input_encoding="UTF-8"))
  length = tf.shape(label)[0]
  pad_amount = max_len-length
  label = tf.pad(label, paddings=[[0,pad_amount]], constant_values = padding_token)
  return label

def process_images_labels(image_path, label):
  image = preprocessing_image(image_path)
  label = vectorize_label(label)
  return {"image":image, "label":label}

def prepare_dataset(image_paths, labels):
  dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels)).map(process_images_labels, num_parallel_calls=AUTOTUNE)
  return dataset.batch(batch_size).cache().prefetch(AUTOTUNE)

In [15]:
train_ds = prepare_dataset(train_img_paths, train_labels_cleaned)
validation_ds = prepare_dataset(validation_img_paths, validation_labels_cleaned)
test_ds = prepare_dataset(test_img_paths, test_labels_cleaned)