In [1]:
import numpy as np
import pandas as pd
# from tensorflow.keras.preprocessing.
import matplotlib.pyplot as plt
import os
import cv2
import re
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
from collections import namedtuple
import lmdb

In [5]:
Sample = namedtuple('Sample', 'gt_text, file_path')
Batch = namedtuple('Batch', 'imgs, gt_texts, batch_size')
data_dir = "/Users/nicolacave/dsi_galvanize/capstones/capstone3/handwriting_reader/data"
f = open(data_dir + "/" + 'gt/words.txt')
chars = set()
bad_samples_reference = ['a01-117-05-02', 'r06-022-03-05']  # known broken images in IAM dataset
for line in f:
    # ignore comment line
    if not line or line[0] == '#':
        continue

    line_split = line.strip().split(' ')
    assert len(line_split) >= 9

    # filename: part1-part2-part3 --> part1/part1-part2/part1-part2-part3.png
    file_name_split = line_split[0].split('-')
    file_name_subdir1 = file_name_split[0]
    file_name_subdir2 = f'{file_name_split[0]}-{file_name_split[1]}'
    file_base_name = line_split[0] + '.png'
    file_name = data_dir + "/" + 'img' + "/" + file_name_subdir1 + "/" + file_name_subdir2 + "/" + file_base_name

    if line_split[0] in bad_samples_reference:
        print('Ignoring known broken image:', file_name)
        continue

    # GT text are columns starting at 9
    gt_text = ' '.join(line_split[8:])
    chars = chars.union(set(list(gt_text)))

    # put sample into list
    samples = []
    samples.append(Sample(gt_text, file_name))
    

# split into training and validation set: 95% - 5%
data_split = 0.95
split_idx = int(data_split * len(samples))
train_samples = samples[:split_idx]
validation_samples = samples[split_idx:]

# put words into lists
train_words = [x.gt_text for x in train_samples]
validation_words = [x.gt_text for x in validation_samples]

# start with train set
#train_set()

# list of all chars in dataset
char_list = sorted(list(chars))
print(samples)

Ignoring known broken image: /Users/nicolacave/dsi_galvanize/capstones/capstone3/handwriting_reader/data/img/a01/a01-117/a01-117-05-02.png
Ignoring known broken image: /Users/nicolacave/dsi_galvanize/capstones/capstone3/handwriting_reader/data/img/r06/r06-022/r06-022-03-05.png
[Sample(gt_text='?', file_path='/Users/nicolacave/dsi_galvanize/capstones/capstone3/handwriting_reader/data/img/r06/r06-143/r06-143-04-10.png')]


In [12]:
print(train_words)
print(train_samples)

[]
[]


In [None]:
# pd.read_csv('../data/gt/words.txt', delimiter=" ", skiprows=18, header=None)
# data.columns = ["seq", "date", "Hour", "GMT","userID","text"]

In [None]:
DATADIR = "/Users/nicolacave/dsi_galvanize/capstones/capstone3/handwriting_reader/data/img"

CATEGORIES = [i for i in os.listdir(DATADIR) if not i == ".DS_Store"]
print(CATEGORIES)

In [None]:
# DATADIR = "/Users/nicolacave/dsi_galvanize/capstones/capstone3/handwriting_reader/data/img"

# CATEGORIES = [i for i in os.listdir(DATADIR) if not i == ".DS_Store"]
# print(CATEGORIES)

for category in CATEGORIES:  # do dogs and cats
    path = os.path.join(DATADIR,category)  # create path to dogs and cats
    #print(path)
    #if os.path.isdir(path):
    for folder in os.listdir(path):
        #print(f"folder: {folder}")
        for img in os.listdir(path+"/"+folder):
            #print(f"img: {img}")
            #print(path+"/"+folder+"/"+img)
            #if not img.startswith('.'):
            img_array = cv2.imread(os.path.join(path,folder,img) ,cv2.IMREAD_GRAYSCALE)  # convert to array
                #print(img_array)
            #plt.imshow(img_array, cmap='gray')  # graph it
            #plt.show()  # display!

        #break  # we just want one for now so break
    #break  #...and one more!

In [50]:
training_data = []
#print(len(CATEGORIES))
def create_training_data():
    print(len(CATEGORIES)*0.8)
    for category in CATEGORIES[:int(len(CATEGORIES)*0.8)]:  # do dogs and cats
        path = os.path.join(DATADIR,category)  # create path to dogs and cats
        class_num = CATEGORIES.index(category)  # get the classification  (0 or a 1). 0=dog 1=cat
        for folder in os.listdir(path):
            for img in os.listdir(path+"/"+folder):
        #for img in tqdm(os.listdir(path)):  # iterate over each image per dogs and cats
                try:
                    img_array = cv2.imread(os.path.join(path,img),cv2.IMREAD_GRAYSCALE)  # convert to array
                    new_array = cv2.resize(img_array, (128, 128), interpolation=cv2.INTER_AREA)  # resize to normalize data size
                    training_data.append([new_array, class_num])  # add this to our training_data
                except Exception as e:  # in the interest in keeping the output clean...
                    print(e)
                    #pass
                #except OSError as e:
                #    print("OSErrroBad img most likely", e, os.path.join(path,img))
                #except Exception as e:
                #    print("general exception", e, os.path.join(path,img))

create_training_data()


print(len(training_data))

In [None]:
print(len(training_data))

In [None]:
model = keras.models.Sequential([
    keras.layers.Conv2D(64, 7, activation="relu", padding="same",input_shape=[137,236,1]),#input shape: [rows, columns, channels]
    keras.layers.MaxPooling2D(2),
    keras.layers.Conv2D(128, 3, activation="relu", padding="same"),
    keras.layers.Conv2D(128, 3, activation="relu", padding="same"),
    keras.layers.MaxPooling2D(2),
    keras.layers.Conv2D(256, 3, activation="relu", padding="same"),
    keras.layers.Conv2D(256, 3, activation="relu", padding="same"),
    keras.layers.MaxPooling2D(2),
    keras.layers.Flatten(),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(10, activation="softmax")
])

model.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

In [None]:
model.fit(training_data)