In [1]:
!git clone https://github.com/omlgg/Online-Handwriting-recognition-using-Transformer.git

Cloning into 'Online-Handwriting-recognition-using-Transformer'...
remote: Enumerating objects: 28, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 28 (delta 2), reused 6 (delta 1), pack-reused 18 (from 1)[K
Receiving objects: 100% (28/28), 69.41 MiB | 15.63 MiB/s, done.
Resolving deltas: 100% (2/2), done.
Updating files: 100% (13/13), done.


In [2]:
%cd Online-Handwriting-recognition-using-Transformer

/content/Online-Handwriting-recognition-using-Transformer


# Preprocess Data

In [3]:
import pandas as pd
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import os
import numpy as np
from preprocessing import *
from features_extraction import *

from glob import glob

In [4]:
# extract strokes (x and y cordinate) from xml file and return a list of strokes as ink object [mark pen up with 1]
def extract_strokes(sample):
  tree = ET.parse(sample)
  r = tree.getroot()
  stroke_set = r.find("StrokeSet")
  strokes = []
  for stroke_node in stroke_set:
      for point in stroke_node:
          x = int(point.attrib['x'])
          y = int(point.attrib['y'])
          time = float(point.attrib['time'])
          strokes.append([x,y,0])
      strokes[-1][-1] = 1 # pen-up
  return strokes

In [5]:
#  extract handwriting from the raw data
%cd data
!tar -xf "ascii-all.tar.gz"
!tar -xf "lineStrokes-all.tar.gz"
%cd ..

/content/Online-Handwriting-recognition-using-Transformer/data
/content/Online-Handwriting-recognition-using-Transformer


In [6]:
PATH= 'data/lineStrokes'
xml_files = [y for x in os.walk(PATH) for y in glob(os.path.join(x[0], '*.xml'))]
PATH= 'data/ascii'
txt_files = [y for x in os.walk(PATH) for y in glob(os.path.join(x[0], '*.txt'))]

In [7]:
#  Clean up the text and format it to be ready for preprocessing
data=[]
for txt_file in txt_files:
  with open(txt_file) as f:
    lines = f.readlines()
    try:
      indx = lines.index('CSR:\n')
    except:
      # print(lines)
      # print(txt_file)
      indx = lines.index('CSR: \n')
      # break
    lines = lines[indx+2:]
    for i,line in enumerate(lines):
      xml_file = txt_file.replace('ascii','lineStrokes').replace('.txt','')
      xml_file = xml_file + f'-{i+1:02}.xml'
      data.append({'file_path':xml_file,'transcript':line.replace('\n','')})
print(data)



In [8]:
#  Convert the list to dataframe to easily process the data
df = pd.DataFrame(data)
df['exists'] = df['file_path'].apply(lambda x :os.path.exists(x))

In [9]:
df['exists'].value_counts()

Unnamed: 0_level_0,count
exists,Unnamed: 1_level_1
True,12187
False,1021


In [10]:
df.head()

Unnamed: 0,file_path,transcript,exists
0,data/lineStrokes/g06/g06-183/g06-183z-01.xml,This happy but impermanent State,True
1,data/lineStrokes/g06/g06-183/g06-183z-02.xml,of affairs was brought about by a,True
2,data/lineStrokes/g06/g06-183/g06-183z-03.xml,very careful application of the,True
3,data/lineStrokes/g06/g06-183/g06-183z-04.xml,homoeopathic system. At that,True
4,data/lineStrokes/g06/g06-183/g06-183z-05.xml,"date the doctrine that ""likes",True


In [11]:
# Keep only the files that exist
data =df[df['exists']]

# Filter for testing
data = data.head(256)

In [12]:
#  name the preprocessing methods and features extraction
NORM_ARGS = ["origin","smooth", "slope", "resample", "slant", "height"]
FEAT_ARGS = ["x_cor","y_cor","penup","dir", "curv", "vic_aspect", "vic_curl", "vic_line", "vic_slope", "bitmap"]

In [13]:
#  iterate over the data, preporcess and extract the features, then save the data to binary file
!mkdir data/bin_files
from tqdm import tqdm
for i,sample in tqdm(data.iterrows()):
  strokes = extract_strokes(sample['file_path'])
  strokes = np. array(strokes)
  ink = preprocess_handwriting(strokes, NORM_ARGS)
  feat = calculate_feature_vector_sequence(ink, FEAT_ARGS)
  outfilename = sample['file_path'].split('/')[-1].replace('.xml','.bin')
  outfilename= 'data/bin_files/'+ outfilename
  feat.tofile(outfilename)

256it [01:51,  2.31it/s]


In [14]:
data.head()

Unnamed: 0,file_path,transcript,exists
0,data/lineStrokes/g06/g06-183/g06-183z-01.xml,This happy but impermanent State,True
1,data/lineStrokes/g06/g06-183/g06-183z-02.xml,of affairs was brought about by a,True
2,data/lineStrokes/g06/g06-183/g06-183z-03.xml,very careful application of the,True
3,data/lineStrokes/g06/g06-183/g06-183z-04.xml,homoeopathic system. At that,True
4,data/lineStrokes/g06/g06-183/g06-183z-05.xml,"date the doctrine that ""likes",True


# Train model

In [15]:
import os
import random
from glob import glob
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
from model import Transformer
from utils import VectorizeChar, DisplayOutputs, CustomSchedule, path_to_features , wer, cer
import pandas as pd

In [16]:

def set_seeds(seed=42):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)

set_seeds()

In [17]:
# to tokenize the text
vectorizer = VectorizeChar(100)

In [18]:
max_target_len = 100  # all transcripts in out data are < 100 characters
print("vocab size", len(vectorizer.get_vocabulary()))

vocab size 95


In [19]:
#  utilities for data preparation for training and testing [Convert to TF Dataset]
def encode(path,txt):
  """Encode a text into a sequence of vectors."""
  txt = txt.numpy().decode('utf8')
  y = tf.convert_to_tensor(vectorizer(txt),dtype=tf.int64)
  x = path_to_features(path)
  return x,y
def tf_encode(path,txt):
  """ util py function to be used with tensors."""
  x,y = tf.py_function(encode, [path,txt], [tf.float32,tf.int64])
  return x,y
def create_tf_dataset(data, batch_size=4):
  """Create a tf.data.Dataset from the given data."""
  dataset = tf.data.Dataset.from_tensor_slices((np.array(data["filename"].values),np.array(data["transcript"].values)))
  dataset = dataset.map(tf_encode, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  dataset = dataset.padded_batch(batch_size, padded_shapes=([None,20], [None]))
  dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
  dataset = dataset.map(lambda x, y: {"source": x, "target": y})
  return dataset


## Create and train end to end model

In [21]:
#df = pd.read_excel('data/iam.xlsx')
#df['filename'] = df['file_path'].apply(lambda x: "data/content/bin_files/"+x.split('/')[-1].replace('.xml','.bin'))
#print(df)

# Testing
df = data.copy()
df['filename'] = df['file_path'].apply(lambda x: "data/bin_files/"+x.split('/')[-1].replace('.xml','.bin'))
print(df)

# split data into train and test
msk = np.random.rand(len(df)) < 0.95
train_data = df[msk]
dev_data = df[~msk]
# create tf dataset for training and validation
ds = create_tf_dataset(train_data, batch_size=32)
val_ds = create_tf_dataset(dev_data, batch_size=8)

                                        file_path  \
0    data/lineStrokes/g06/g06-183/g06-183z-01.xml   
1    data/lineStrokes/g06/g06-183/g06-183z-02.xml   
2    data/lineStrokes/g06/g06-183/g06-183z-03.xml   
3    data/lineStrokes/g06/g06-183/g06-183z-04.xml   
4    data/lineStrokes/g06/g06-183/g06-183z-05.xml   
..                                            ...   
313  data/lineStrokes/b04/b04-363/b04-363z-01.xml   
314  data/lineStrokes/b04/b04-363/b04-363z-02.xml   
315  data/lineStrokes/b04/b04-363/b04-363z-03.xml   
316  data/lineStrokes/b04/b04-363/b04-363z-04.xml   
317  data/lineStrokes/b04/b04-363/b04-363z-05.xml   

                            transcript  exists                        filename  
0     This happy but impermanent State    True  data/bin_files/g06-183z-01.bin  
1    of affairs was brought about by a    True  data/bin_files/g06-183z-02.bin  
2      very careful application of the    True  data/bin_files/g06-183z-03.bin  
3         homoeopathic system. At that 

In [22]:
len(train_data),len(dev_data)

(238, 18)

In [23]:
# to display the output of the model after each epoch for the first batch of the validation set
batch = next(iter(val_ds))
# The vocabulary to convert predicted indices into characters
idx_to_char = vectorizer.get_vocabulary()
display_cb = DisplayOutputs(
    batch, idx_to_char, target_start_token_idx=2, target_end_token_idx=3
)  # set the arguments as per vocabulary index for '<' and '>'

# create the model and compile it
model = Transformer(
    num_hid=100,
    num_head=2,
    num_feed_forward=256,
    target_maxlen=max_target_len,
    num_layers_enc=4,
    num_layers_dec=1,
    num_classes=95,
)
loss_fn = tf.keras.losses.CategoricalCrossentropy(
    from_logits=True, label_smoothing=0.1,
)

learning_rate = CustomSchedule(
    init_lr=0.00001,
    lr_after_warmup=0.001,
    final_lr=0.00001,
    warmup_epochs=15,
    decay_epochs=85,
    steps_per_epoch=len(ds),
)
optimizer = keras.optimizers.Adam(learning_rate)
# optimizer = keras.optimizers.Adam()
model.compile(optimizer=optimizer, loss=loss_fn)


In [24]:
model.summary()

In [28]:
import keras
import tensorflow as tf

# Use GPU
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 18173072750026274980
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 14626652160
locality {
  bus_id: 1
  links {
  }
}
incarnation: 784452693517377072
physical_device_desc: "device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5"
xla_global_id: 416903419
]


In [29]:
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))

Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5



In [30]:
#  train the model
print(ds)
print(val_ds)
#ds = tf.convert_to_tensor(ds)
history = model.fit(ds, validation_data=val_ds, callbacks=[display_cb], epochs=20)

<_MapDataset element_spec={'source': TensorSpec(shape=(None, None, 20), dtype=tf.float32, name=None), 'target': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}>
<_MapDataset element_spec={'source': TensorSpec(shape=(None, None, 20), dtype=tf.float32, name=None), 'target': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}>
Epoch 1/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10s/step - loss: 3.4989



target:     <should be treated by likes",>
prediction: <l ]Kl m[ed p%iUJH^(pCl qB<p{iUta.>

target:     <In midApril Anglesey>
prediction: <88yKl m[ed ]IhUp{YkpCl qB<p{iUta.>

target:     <In midapril Anglesey moved his family>
prediction: <l ]Kl m[ed p%iUJH^(pCl qB<p{iUta.>

target:     <beauty of the place quite>
prediction: <88yKl m[ed p%iUp{YkpCl qB<p{iUta.>

target:     <taking Clarence, who was now>
prediction: <l ]Kl m[ed p%iUp{YkO=k!Yl{p{iUta.>

target:     <this to be wondered at, for even today,>
prediction: <88yKl m[ed ]IhUJH^(pCl qB<p{iUta.>

target:     <except for them, Whitehall should>
prediction: <l ]Kl m[ed p%iUJH^(pCl qB<p{iUta.>

target:     <be deserted. It is their protest>
prediction: <l ]Kl m[ed p%iUJH^(pCl qB<p{iUta.>

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 13s/step - loss: 3.4860 - val_loss: 3.2972
Epoch 2/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step - los

In [31]:
# Evaluate the model
test_dataset = create_tf_dataset(dev_data, batch_size=32)
idx_to_char = vectorizer.get_vocabulary()

result=[]
for batch_idx, batch in enumerate( test_dataset):
  print(batch_idx)
  source = batch["source"]
  target = batch["target"].numpy()
  bs = tf.shape(source)[0]
  preds = model.generate(source, 2)
  preds = preds.numpy()
  for i in range(bs):
      target_text = "".join([idx_to_char[_] for _ in target[i, :]])
      prediction = ""
      for idx in preds[i, :]:
          prediction += idx_to_char[idx]
          if idx == 3:
              break
      result.append({"target":target_text.replace('-',''),"prediction":prediction})
      # print(f"target:     {target_text.replace('-','')}")
      # print(f"prediction: {prediction}\n")

0




In [32]:
# Convert the result to a pandas dataframe
prediction_result = pd.DataFrame(result)
prediction_result.head()

Unnamed: 0,target,prediction
0,"<should be treated by likes"",>",<the erede the tome athere the tes te>
1,<In midApril Anglesey>,<the erede the tome athere the>
2,<In midapril Anglesey moved his family>,<the erede the tome athere the tes te>
3,<beauty of the place quite>,<the erede the tome athere the>
4,"<taking Clarence, who was now>",<the erede the tome athere the>


In [33]:
#  Normalize the results
prediction_result['prediction'] = prediction_result['prediction'].apply(lambda x:x.replace('<','').replace('>',''))
prediction_result['target'] = prediction_result['target'].apply(lambda x:x.replace('<','').replace('>',''))

In [34]:
#  Calculate the accuracy of the model using character error rate (CER) and word error rate (WER)
prediction_result['wer'] = prediction_result.apply(lambda x: wer(x.target, x.prediction) , axis=1)
prediction_result['cer'] = prediction_result.apply(lambda x: cer(x.target, x.prediction) , axis=1)
print(f" cer {prediction_result['cer'].mean():.5f} wer {prediction_result['wer'].mean():.5f}")

 cer 0.88589 wer 1.00000


In [36]:
#  Write the results to a excel file
!mkdir Results
prediction_result.to_excel('Results/iam_results_01.xlsx')

In [37]:
#  Save the model
model.save_weights('Models/iam80.h5')

ValueError: The filename must end in `.weights.h5`. Received: filepath=Models/iam80.h5