In [None]:
# Installing Necessary Packages
!sudo apt-get install-y libjpeg-dev zlib1g-dev graphviz 
%pip install scikit-image python-docx pymupdf opencv-python torch torchvision tens

# Importing Necessary Modules
import fitz
import cv2
import csv
import os
from docx import Document
import string
import requests
import pandas as pd
import numpy as np
from PIL import Image, ImageOps
from IPython.display import clear_output as cls
from glob import glob
from tqdm import tqdm
import tensorflow.data as tfd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks
from tensorflow.keras.utils import plot_model

# Downloading the Dataset on Colab/Local Notebook
!pip install gdown
!gdown 'http://drive.google.com/uc?id=10NX_UbV2HMbPEO2fvKYAIXOOOec0g38g'  # Virtuosa Dataset
!gdown 'http://drive.google.com/uc?id=1x6FS3z4WhsHS7s38a2oSH8JnLQ_u_f21'  # Perfecto Dataset
!gdown 'http://drive.google.com/uc?id=1r7TjJ9RjNZHxAzKhd4uOaRWanrQXIqw8'  # utils.py

# Image Augmentation Functions
from utils import gaussian_noise_aug, rotation_aug

training_data = './training_data'

# Run each augmentation helper from utils.py on the training folder, in order,
# and report when it has finished.
# NOTE(review): presumably each helper writes augmented copies into the folder
# it is given - confirm against utils.py.
for augment, done_message in (
    (rotation_aug, "Image augmentation by Rotation completed."),
    (gaussian_noise_aug, "Image augmentation by Gaussian Noise completed."),
):
    augment(training_data)
    print(done_message)

# Creating CSV Files
from utils import create_csv_from_folder

# Image folder -> CSV file pairs for the training and the test data.
training_data, train_csv_path = './training_data', './training_data.csv'
test_data1, test_csv_path1 = "./testing_data1", './testing_data1.csv'

# NOTE(review): later cells read FILENAME and IDENTITY columns from these CSVs;
# presumably create_csv_from_folder produces them - confirm against utils.py.
for image_folder, csv_path in (
    (training_data, train_csv_path),
    (test_data1, test_csv_path1),
):
    create_csv_from_folder(image_folder, csv_path)

# Splitting Train and Validation Dataset
# Read labels strictly as text: without this, pandas turns labels such as "NA"
# into NaN and purely numeric labels into numbers, which breaks the string
# handling below.
df = pd.read_csv('training_data.csv', dtype={'IDENTITY': str}, keep_default_na=False)

# Keep only the part of each label that precedes the first underscore.
df['IDENTITY'] = df['IDENTITY'].str.split('_').str[0]

# First 80% of the rows (in file order) for training, the remainder for validation.
TRAIN_SIZE = int(df.shape[0] * 0.8)
df_train = df.iloc[:TRAIN_SIZE]
# Start exactly at TRAIN_SIZE; starting at TRAIN_SIZE + 1 silently dropped one row.
df_valid = df.iloc[TRAIN_SIZE:]

df_train.to_csv('train.csv', index=False)
df_valid.to_csv('valid.csv', index=False)

# Configuration Parameters

# Input image geometry; IMAGE_SIZE is ordered (width, height).
IMG_WIDTH, IMG_HEIGHT = 200, 50
IMAGE_SIZE = (IMG_WIDTH, IMG_HEIGHT)

# Training schedule.
BATCH_SIZE = 16
EPOCHS = 15
LEARNING_RATE = 1e-3

# Default callbacks: early stopping plus a best-only checkpoint named after the model.
MODEL_NAME = 'SpanishOCR'
CALLBACKS = [
    callbacks.EarlyStopping(patience=10, restore_best_weights=True),
    callbacks.ModelCheckpoint(filepath=f"{MODEL_NAME}.h5", save_best_only=True),
]

# Seed the NumPy and TensorFlow random number generators.
np.random.seed(2569)
tf.random.set_seed(2569)

# CSV files and image folders of each split; validation images live in the
# training folder.
train_csv_path, valid_csv_path, test_csv_path = './train.csv', './valid.csv', './testing_data1.csv'
train_image_dir = valid_image_dir = './training_data'
test_image_dir = './testing_data1'

# Let tf.data choose parallelism and prefetch depth.
AUTOTUNE = tf.data.AUTOTUNE

# Extracting Unique Characters
# Every label of `df` as a plain Python string.
labels = list(map(str, df['IDENTITY'].to_numpy()))

# The character inventory is the set of all characters that occur in any label.
unique_chars = set("".join(labels))
n_classes = len(unique_chars)
print(f"Total number of unique characters: {n_classes}")
print(f"Unique Characters:\n{unique_chars}")

# The longest label fixes the length every encoded label is padded to.
MAX_LABEL_LENGTH = max(len(text) for text in labels)
print(f"Maximum length of a label: {MAX_LABEL_LENGTH}")

# Mapping Characters to Numeric Values
# Sort the characters before building the lookup: iteration order of a set of
# strings changes between interpreter sessions, so an unsorted vocabulary would
# assign different ids on every run and would not match previously saved weights.
char_to_num = layers.StringLookup(
    vocabulary=sorted(unique_chars),
    mask_token=None
)
# Inverse lookup (id -> character) sharing exactly the same vocabulary.
num_to_char = layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),
    mask_token=None,
    invert=True
)

# Image Loading and Preprocessing
def load_image(image_path: str):
    '''Reads an image file and returns it as a model-ready tensor.

    The file is decoded to a single channel, converted to float32, resized to
    (IMG_HEIGHT, IMG_WIDTH) and finally transposed so that the width axis
    comes first, giving a tensor of shape (IMG_WIDTH, IMG_HEIGHT, 1).
    '''
    raw_bytes = tf.io.read_file(image_path)
    pixels = tf.image.decode_jpeg(contents=raw_bytes, channels=1)
    pixels = tf.image.convert_image_dtype(image=pixels, dtype=tf.float32)
    pixels = tf.image.resize(images=pixels, size=(IMG_HEIGHT, IMG_WIDTH))
    # Swap height and width so the first axis runs along the text line.
    return tf.transpose(pixels, perm=[1, 0, 2])

# Encoding Single Sample
def encode_single_sample(image_path: str, label: str):
    '''Builds one dataset element from an image path and its text label.

    Returns a dict with the preprocessed image under 'image' and, under
    'label', the character ids of the label right-padded to
    MAX_LABEL_LENGTH.
    '''
    label_chars = tf.strings.unicode_split(label, input_encoding='UTF-8')
    label_ids = char_to_num(label_chars)

    # Right-pad so that every label has the same length and can be batched.
    # NOTE(review): the pad value is n_classes. With one OOV slot at id 0 the
    # characters presumably occupy ids 1..n_classes, so the pad value would
    # coincide with the id of the last vocabulary character - confirm against
    # the CTC layer of the model before changing it.
    missing = MAX_LABEL_LENGTH - tf.shape(label_ids)[0]
    padded_ids = tf.pad(label_ids, paddings=[[0, missing]], constant_values=n_classes)

    return {'image': load_image(image_path), 'label': padded_ids}

# Creating TensorFlow Datasets
def _read_split(csv_path):
    '''Loads one split CSV, keeping the IDENTITY labels as plain text.'''
    # dtype/keep_default_na stop pandas from turning labels such as "NA" into
    # NaN or numeric-looking labels into numbers.
    return pd.read_csv(csv_path, dtype={'IDENTITY': str}, keep_default_na=False)


def _make_dataset(frame, shuffle_buffer=None):
    '''Builds a batched, prefetched dataset of encoded samples from a split frame.'''
    dataset = tf.data.Dataset.from_tensor_slices(
        (np.array(frame['FILENAME'].to_list()), np.array(frame['IDENTITY'].to_list()))
    )
    if shuffle_buffer is not None:
        dataset = dataset.shuffle(shuffle_buffer)
    return (
        dataset
        .map(encode_single_sample, num_parallel_calls=AUTOTUNE)
        .batch(BATCH_SIZE)
        .prefetch(AUTOTUNE)
    )


# The split frames and the shuffle buffer size were used below without ever
# being defined; load them from the CSV paths set in the configuration.
# NOTE(review): FILENAME is passed straight to tf.io.read_file, so it must hold
# paths that resolve from the working directory. If create_csv_from_folder only
# writes bare file names, prefix them with train_image_dir / test_image_dir.
# NOTE(review): the test labels are used as stored; the underscore stripping
# applied to the training labels is not applied to them - confirm intent.
train_csv = _read_split(train_csv_path)
valid_csv = _read_split(valid_csv_path)
test_csv = _read_split(test_csv_path)
train_size = len(train_csv)

# Only the training split is shuffled (buffer spans the whole split).
train_ds = _make_dataset(train_csv, shuffle_buffer=train_size)
valid_ds = _make_dataset(valid_csv)
test_ds = _make_dataset(test_csv)

# Resizing Images in Folder
def resize_images_in_folder(input_folder, new_size=(200, 50)):
    '''Saves a resized PNG copy of every image found directly in a folder.

    Each readable image is resized to `new_size` (width, height) and written
    back into `input_folder` under its original base name with a '.png'
    extension. A source that is already a '.png' is overwritten; sources with
    another extension are left in place next to the new file.

    Sub-directories are ignored, and files that cannot be opened as images
    are reported and skipped instead of aborting the whole run.
    '''
    for filename in os.listdir(input_folder):
        source_path = os.path.join(input_folder, filename)
        if not os.path.isfile(source_path):
            continue
        try:
            with Image.open(source_path) as img:
                resized_img = img.resize(new_size)
        except OSError as err:
            # Pillow signals unreadable or non-image files with OSError subclasses.
            print(f"Skipping {source_path}: {err}")
            continue
        output_filename = os.path.splitext(filename)[0] + '.png'
        resized_img.save(os.path.join(input_folder, output_filename))

# NOTE(review): the CSV files were created before this step, so any newly
# written '.png' copies are not listed in them - confirm the intended order.
training_image_dir = "./training_data"
resize_images_in_folder(training_image_dir)
test_image_dir = "./testing_data1"
resize_images_in_folder(test_image_dir)

# Displaying Images
def show_images(data, GRID=(4, 4), FIGSIZE=(25, 8), cmap='binary_r', model=None, decode_pred=None):
    '''Plots the first batch of a dataset as a grid of labelled images.

    Args:
        data: Iterable of batches; each batch is a mapping with 'image' and
            'label' entries.
        GRID: (rows, columns) of the subplot grid. At most rows * columns
            samples of the batch are drawn.
        FIGSIZE: Figure size handed to matplotlib.
        cmap: Colormap used to draw the images.
        model: Optional model; when given together with `decode_pred`, its
            prediction is shown next to the true label.
        decode_pred: Optional callable that turns raw model output into a
            sequence of decoded texts.
    '''
    plt.figure(figsize=FIGSIZE)
    n_rows, n_cols = GRID
    batch = next(iter(data))
    images, labels = batch['image'], batch['label']

    # Never draw more samples than the grid has cells; plt.subplot raises
    # once the index exceeds rows * columns.
    capacity = n_rows * n_cols
    for index, (image, label) in enumerate(zip(images[:capacity], labels[:capacity])):
        # Turn the id sequence back into text and drop the "[UNK]" tokens.
        text_label = num_to_char(label)
        text_label = tf.strings.reduce_join(text_label).numpy().decode('UTF-8')
        text_label = text_label.replace("[UNK]", "").strip()

        plt.subplot(n_rows, n_cols, index + 1)
        # Swap the first two axes back before drawing.
        plt.imshow(tf.transpose(image, perm=[1, 0, 2]), cmap=cmap)
        plt.axis('off')

        if model is not None and decode_pred is not None:
            pred = model.predict(tf.expand_dims(image, axis=0))
            pred = decode_pred(pred)[0]
            plt.title(f"True: {text_label}\nPred: {pred}")
        else:
            plt.title(text_label)
    plt.show()

show_images(data=train_ds, cmap='gray')

# Compiling and Training the Model
# NOTE(review): `ocr_model` is not defined in the visible part of this notebook;
# it must be built before this cell runs. No loss is passed to compile(), so the
# model presumably adds its loss internally - confirm.
# Use the shared LEARNING_RATE constant instead of repeating the literal value.
ocr_model.compile(optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE))

# Halve the learning rate after 3 epochs without val_loss improvement, down to 1e-6.
lr_scheduler = callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=3, verbose=1, min_lr=1e-6
)
history = ocr_model.fit(
    train_ds,
    validation_data=valid_ds,
    epochs=EPOCHS,
    callbacks=[
        callbacks.EarlyStopping(patience=5, restore_best_weights=True),
        lr_scheduler
    ]
)

# Plot only the loss curves: ReduceLROnPlateau also logs the learning rate into
# the history, and that column does not belong on an axis labelled as loss.
history_df = pd.DataFrame(history.history)
loss_columns = [name for name in history_df.columns if name.endswith('loss')]
history_df[loss_columns].plot(figsize=(8, 5))
plt.legend(fontsize=15)
plt.xlabel("Epochs")
plt.ylabel("CTC Loss Score")
plt.title("Learning Curve", fontsize=15)
plt.grid()
plt.show()

# Inference Model
# Build a model that maps the first input of the trained model to the output of
# its "dense_1" layer.
# NOTE(review): `ocr_model` and `decode_pred` are not defined in the visible
# part of this notebook; both must exist before this cell runs.
image_input = ocr_model.input[0]
dense_layer = ocr_model.get_layer(name="dense_1")
inference_model = keras.Model(inputs=image_input, outputs=dense_layer.output)

# Predict on the test dataset and print the first ten decoded results.
test_predictions = inference_model.predict(test_ds)
print(decode_pred(test_predictions)[:10])

[sudo] password for psyduck: 