In [18]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from matplotlib import interactive
import cv2
import svgwrite
from scipy import interpolate
from sklearn.preprocessing import LabelEncoder
from cairosvg import svg2png
import pickle

In [35]:
# --- Configuration -----------------------------------------------------------
# resampling: when truthy, every stroke is linearly resampled to `timesteps`
# points before it is rendered; otherwise strokes keep their native length.
resampling = 0
timesteps = 30 # (if resampling = 1)

# download the training and validation data from https://github.com/emreaksan/deepwriting
folder = '/media/matthewvowels/Storage/data/HandWriting/'

# NOTE: allow_pickle=True lets the archive execute arbitrary code while it is
# being loaded -- only use it on a file you trust.
# The context manager closes the underlying .npz file handle once all arrays
# have been copied into a plain dict.
with np.load(os.path.join(folder, 'deepwriting_training.npz'), allow_pickle=True) as archive:
    data_dict = dict(archive)

## out filename (written next to the input data, derived from `folder` so the
## directory only has to be changed in one place):
if resampling:
    filename = os.path.join(folder, 'char_sequences_INTERPOLATED_{}_training.pickle'.format(timesteps))
else:
    filename = os.path.join(folder, 'char_sequences_training.pickle')



In [36]:
def undo_normalization(sample, detrend_sample=False, mean=None, std=None, normalized=None):
    """Return a de-normalised copy of the first three columns of ``sample``.

    Modified from https://github.com/emreaksan/deepwriting/blob/master/dataset_hw.py

    Args:
        sample: 2-D array with at least three columns. It is not modified.
        detrend_sample: accepted for signature compatibility; not used here.
        mean: offset added after scaling. Defaults to the notebook-level
            ``norm_mean``.
        std: scale applied before the offset. Defaults to the notebook-level
            ``norm_std``.
        normalized: whether ``sample`` is normalised and must be rescaled.
            Defaults to the notebook-level ``normalization`` flag.

    Returns:
        A new array holding ``sample[:, :3] * std + mean`` (or an unscaled copy
        when ``normalized`` is falsy), with column 2 restored to its original
        values from ``sample``.
    """
    # Fall back to the notebook globals only for values the caller did not
    # supply, so the function can also be used (and tested) on its own.
    if normalized is None:
        normalized = normalization

    sample_copy = np.copy(sample[:, :3])
    if normalized:
        if std is None:
            std = norm_std
        if mean is None:
            mean = norm_mean
        sample_copy = sample_copy * std + mean

    # Column 2 must not be rescaled: put the original values back.
    sample_copy[:, 2] = sample[:, 2]
    return sample_copy

def stroke_to_png_sequence(stroke, img_size, resampling, timesteps):
    """Render a pen stroke as a sequence of progressively drawn square images.

    Modified from https://github.com/emreaksan/deepwriting/blob/master/dataset_hw.py

    Args:
        stroke: 2-D array whose first two columns are per-step (dx, dy)
            offsets; any further columns are ignored.
        img_size: side length, in pixels, of every output frame.
        resampling: when truthy, the trajectory is linearly resampled to
            ``timesteps`` points before drawing.
        timesteps: number of points (and therefore frames) after resampling.
            Only used when ``resampling`` is truthy.

    Returns:
        ``uint8`` array of shape ``(n_points, img_size, img_size)`` with the
        trajectory drawn as 255 on a 0 background. Frame ``j`` contains all
        segments up to point ``j``; frame 0 already contains the first segment
        (point 0 -> point 1) and is therefore identical to frame 1.

    Raises:
        ValueError: if the stroke has fewer than two points, or if
            ``timesteps`` is below 2 while resampling.
    """
    # Accumulate the offsets into absolute pen positions.
    abs_ = np.cumsum(np.asarray(stroke)[:, :2], axis=0)
    if abs_.shape[0] < 2:
        raise ValueError('stroke must contain at least two points')

    if resampling:
        if timesteps < 2:
            raise ValueError('timesteps must be at least 2 when resampling')
        # Both time axes span [0, timesteps], so the new sample positions
        # never fall outside the interpolation range.
        t_old = np.linspace(0, timesteps, abs_.shape[0])
        t_new = np.linspace(0, timesteps, timesteps)
        abs_ = np.stack(
            [interpolate.interp1d(t_old, abs_[:, col])(t_new) for col in (0, 1)],
            axis=1,
        )

    # Scale by the larger of the two extents so the aspect ratio is kept.
    r_x = abs_[:, 0].max() - abs_[:, 0].min()
    r_y = abs_[:, 1].max() - abs_[:, 1].min()
    major = 0 if r_x >= r_y else 1   # axis with the larger extent
    minor = 1 - major
    extent = r_x if major == 0 else r_y
    if extent == 0:
        # All points coincide; avoid 0/0 (NaN pixel coordinates).
        extent = 1.0

    abs_ = (abs_ - abs_[:, major].min()) / extent
    abs_ = abs_ * (0.9 - 0.1) + 0.1          # major axis now spans [0.1, 0.9]
    abs_[:, minor] += abs(abs_[:, minor].min())
    abs_ *= (img_size - 4)
    # Centre the minor axis around the middle of the image.
    abs_[:, minor] = abs_[:, minor] - abs_[:, minor].mean() + (img_size - 1) / 2

    # Plain Python ints: cv2 does not accept every numpy scalar type as a point.
    pixels = [(int(px), int(py)) for px, py in abs_.astype('int')]

    # Draw incrementally on one white canvas and snapshot it after every new
    # segment. The lines are solid black (thickness 2, lineType 8), so this
    # yields the same frames as redrawing the whole prefix for each frame.
    canvas = np.full((img_size, img_size), 255, dtype=np.uint8)
    img_sequence = np.empty((len(pixels), img_size, img_size), dtype=np.uint8)

    cv2.line(canvas, pixels[0], pixels[1], 0, 2, 8)
    img_sequence[0] = canvas
    for j in range(1, len(pixels)):
        cv2.line(canvas, pixels[j - 1], pixels[j], 0, 2, 8)
        img_sequence[j] = canvas

    # Invert: ink becomes 255 on a 0 background.
    img_sequence = 255 - img_sequence

    return img_sequence


In [37]:
# Show which arrays the archive contains, followed by the character alphabet.
for array_name in data_dict:
    print(array_name)
print(data_dict['alphabet'])

# The stroke sequences are stored under 'samples' when that key exists and
# under 'strokes' otherwise.
samples = data_dict['samples' if 'samples' in data_dict else 'strokes']
print(samples.shape)

# Per-sentence label arrays and the alphabet they refer to.
char_labels = data_dict['char_labels']
eoc_labels = data_dict['eoc_labels']
alphabet = data_dict['alphabet']
text = data_dict['texts']

# Normalisation statistics; `normalization` records whether the preprocessing
# list mentions that the data was normalised.
norm_mean = data_dict['mean']
norm_std = data_dict['std']
normalization = 'normalization' in data_dict['preprocessing']
print(normalization)

# Integer label <-> character mapping. LabelEncoder orders its classes by
# sorting, so the codes do not follow the order printed for `alphabet` above.
char_encoder = LabelEncoder()
char_encoder.fit(alphabet)
char_encoder.inverse_transform(list(range(6)))

word_labels
eow_labels
max
char_labels
sow_labels
min
mean
strokes
eoc_labels
soc_labels
alphabet
subject_labels
texts
std
preprocessing
['' '0' '1' '2' '3' '4' '5' '6' '7' '8' '9' 'a' 'b' 'c' 'd' 'e' 'f' 'g'
 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y'
 'z' 'A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q'
 'R' 'S' 'T' 'U' 'V' 'W' 'X' 'Y' 'Z' "'" '.' ',' '-' '(' ')' '/']
(34577,)
True


array(['', "'", '(', ')', ',', '-'], dtype='<U1')

In [38]:
### Pull out sequences which are completed with ONE stroke ###

# Sentences have different lengths, so they are kept in a plain list:
# np.asarray on a ragged list raises ValueError on NumPy >= 1.24.
unnormed_samples = [undo_normalization(sample) for sample in samples]

single_strokes = []
single_stroke_labels = []

for sentence_n, sentence in enumerate(eoc_labels):
    # Segment boundaries: index 0 followed by every position flagged as
    # end-of-character. A flat integer array avoids calling int() on
    # one-element arrays, which NumPy has deprecated.
    seg_array = np.concatenate(([0], np.flatnonzero(np.asarray(sentence) == 1)))

    for start, stop in zip(seg_array[:-1], seg_array[1:]):
        # NOTE(review): the points start at `start` but the labels at
        # `start + 1`; kept exactly as in the original -- confirm intended.
        character = unnormed_samples[sentence_n][start:stop]
        char_lab = char_labels[sentence_n][start + 1:stop]

        # Keep the segment only if column 2 never equals 1. Column 2 is the
        # one undo_normalization leaves unscaled and is presumably the pen-up
        # flag -- confirm. Testing the whole array (`1 not in character`)
        # would also reject segments whose x/y offset happens to be exactly 1.
        has_pen_lift = bool(np.any(character[:, 2] == 1))
        if not has_pen_lift and len(char_lab) > 0:
            single_strokes.append(character)
            single_stroke_labels.append(char_lab[0])

single_stroke_labels = np.asarray(single_stroke_labels)
print(len(single_stroke_labels))
print(len(single_strokes))

84474
84474


In [39]:
# individually normalize each stroke
# Both offset columns of a stroke are divided by the larger of their two
# maxima, so x and y keep their relative scale.
eps = 10e-8
for i in range(len(single_strokes)):
    stroke = single_strokes[i]
    p = stroke[:,0].max()
    q = stroke[:,1].max()

    # max() also covers p == q, which the former `if p > q / elif q > p`
    # chain silently left un-normalised.
    scale = max(p, q)

    # A non-positive maximum would either mirror the stroke (negative divisor)
    # or blow it up (divisor close to eps), so such strokes are left untouched.
    if scale > 0:
        # Writes in place, as before: the list entry itself is updated.
        stroke[:, :2] = stroke[:, :2] / (eps + scale)

In [40]:
# remove the punctuation strokes
# Sort all strokes by their encoded label so equal characters sit together.
inds = np.argsort(single_stroke_labels)
single_stroke_labels = single_stroke_labels[inds]

# Strokes differ in length, so fill a 1-D object array element by element;
# np.asarray on a ragged list raises ValueError on NumPy >= 1.24.
sorted_strokes = np.empty(len(inds), dtype=object)
for dst, src in enumerate(inds):
    sorted_strokes[dst] = single_strokes[src]

# Keep only strokes whose decoded character is a letter or a digit. This
# replaces a hard-coded `[12:]` slice, which removes the punctuation strokes
# only if exactly 12 of them are present.
decoded_chars = char_encoder.inverse_transform(single_stroke_labels)
keep = np.array([str(ch).isalnum() for ch in decoded_chars], dtype=bool)
single_stroke_labels = single_stroke_labels[keep]
single_strokes = sorted_strokes[keep]

In [41]:
# Render every remaining stroke and group the resulting image sequences by
# character: dict_ maps a character to the list of its rendered sequences.
image_size = 64
dict_ = {}

# Decode all labels in one call instead of once per stroke.
decoded_labels = char_encoder.inverse_transform(single_stroke_labels)

for i, (raw_label, stroke) in enumerate(zip(decoded_labels, single_strokes)):
    label = str(raw_label)
    seq = stroke_to_png_sequence(stroke, image_size, resampling, timesteps)

    # Append in place; `dict_[label] + [seq]` copies the whole list each time.
    dict_.setdefault(label, []).append(seq)

    # Progress report every 1000 strokes.
    if i % 1000 == 0:
        print(label+'_' + str(i))

0_0
M_1000
W_2000
a_3000
a_4000
a_5000
a_6000
b_7000
c_8000
d_9000
d_10000
e_11000
e_12000
e_13000
e_14000
e_15000
e_16000
e_17000
e_18000
e_19000
e_20000
e_21000
e_22000
e_23000
e_24000
e_25000
e_26000
e_27000
e_28000
f_29000
g_30000
h_31000
h_32000
h_33000
h_34000
h_35000
i_36000
i_37000
i_38000
i_39000
i_40000
i_41000
i_42000
k_43000
l_44000
l_45000
l_46000
m_47000
m_48000
n_49000
n_50000
n_51000
n_52000
n_53000
n_54000
n_55000
o_56000
o_57000
o_58000
o_59000
o_60000
o_61000
p_62000
r_63000
r_64000
r_65000
r_66000
r_67000
r_68000
r_69000
r_70000
s_71000
s_72000
s_73000
s_74000
t_75000
t_76000
t_77000
t_78000
u_79000
u_80000
v_81000
w_82000
y_83000
y_84000


In [42]:
# Persist the character -> image-sequence mapping.
print('saving into ', filename)
with open(filename, 'wb') as sink:
    pickle.dump(dict_, sink, protocol=pickle.HIGHEST_PROTOCOL)

# Load the file straight back into `b` to confirm that it can be read.
with open(filename, 'rb') as source:
    b = pickle.load(source)

saving into  /media/matthewvowels/Storage/data/HandWriting/char_sequences_training.pickle
