# Inference Notebook
This notebook is used for training the OCR engine (or loading a custom model), inference, and evaluation.

In [None]:
# Optional, for development only: IPython's autoreload extension, which
# re-imports edited local modules without restarting the kernel.
# Left disabled (commented out) by default.
#% load_ext autoreload
#% autoreload 2

In [2]:
import os

# Select the GPU for this kernel through CUDA_VISIBLE_DEVICES (index "1" here);
# the original note lists 2, 3, 4 or 5 as the other indices to use.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

In [3]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import evaluate
from pathlib import Path
sns.set_theme(style="whitegrid")
import datetime
from Levenshtein import ratio

import sys

# Make the project's ``src`` directory (one level above the notebook's working
# directory) importable, so that local modules can be imported afterwards.
module_path = os.path.abspath(os.path.join('..'))
src_path = Path(module_path).joinpath("src").as_posix()
# Guard on the exact entry that gets appended. Checking ``module_path`` instead
# never matches the ``src`` entry, so each re-run of the cell would append a
# duplicate to sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)

from custom_model import CustomTrOCR

# Load and Transform Data

In [4]:
# Create the CustomTrOCR wrapper. This single instance is reused by the rest of
# the notebook for data loading/preparation, model loading and training.
cm = CustomTrOCR()

In [None]:
# Load the extracted line data through the wrapper and report its dimensions.
df = cm.load_data("../data/extracted/lines_data_cw.json")
print("Shape of full provided data set {}".format(df.shape))
# Show the first five rows as the cell output.
df.head(5)

In [None]:
# Run the wrapper's preprocessing step; `df` is rebound to the prepared data.
df = cm.prepare_data(
    outlier_threshold=110,
    strip_whitespaces=True,
)
# Persist the preprocessed data so it can be reused for EDA.
df.to_json("../data/extracted/preprocessed_lines_data_cw.json")
# Show the first five rows as the cell output.
df.head(5)

______
# Load base model

In [None]:
###################################################################
### Execute this cell only once to get the untrained base stage ###
###################################################################
# One-time setup: uncomment the two calls below, run the cell once, then
# comment them out again. Afterwards the base model can be loaded from the
# local directory instead of being fetched again.

# Load the model from Hugging Face
#cm.load_model('microsoft/trocr-base-stage1', source='huggingface')

# Save the model to a local directory
#cm.dump_model("../models/trocr_base_stage1")

In [4]:
# List the models currently available in the models directory
cm.list_current_models()

../models/Custom_Split_64b_10e_ep5
../models/Custom_Split_64b_10e_ep3
../models/Custom_Split_64b_10e_ep6
../models/Custom_Split_64b_10e_ep8
../models/Custom_Split_64b_10e_ep4
../models/Custom_Split_32b_5e_50subsample_ep0
../models/Custom_Split_32b_5e_50subsample_ep4
../models/Custom_Split_64b_10e_ep9
../models/Custom_Split_32b_5e_50subsample_ep2
../models/Custom_Split_64b_10e_ep0
../models/Custom_Split_32b_5e_50subsample_ep3
../models/Custom_Split_64b_10e_ep1
../models/Custom_Split_64b_10e_ep2
../models/Custom_Split_64b_10e_ep7
../models/trocr_base_stage1
../models/Custom_Split_32b_5e_50subsample_ep1


In [None]:
# Load the base model from the local directory. This requires that the model
# was previously fetched from Hugging Face and saved to this path.
cm.load_model("../models/trocr_base_stage1")

_____
# Fine Tuning
Skip this section if you want to use an already trained custom model.

In [6]:
# Fine-tuning configuration: run name, output directory and number of epochs.
MODEL_NAME = "Custom_Split_64b_10e_XXX"
SAVE_DIR = "../models/" + MODEL_NAME
N_EPOCHS = 10

# Guard against reusing the name of the master model. The assertion message
# must be a plain string: a print(...) call in that position evaluates to
# None, so the raised AssertionError would carry no message.
assert MODEL_NAME != "Custom_Split_64b_10e", "Do not overwrite master model!"

In [None]:
# Fine-tune the model held by `cm` for N_EPOCHS epochs, passing SAVE_DIR as the
# save location.
# NOTE(review): test_size / val_size are presumably fractions of the data set
# (10 % each) — confirm against CustomTrOCR.train_model.
cm.train_model(test_size=0.1,
               val_size=0.1,
               batch_size=64,
               shuffle=True,
               epochs=N_EPOCHS,
               save_dir=SAVE_DIR,
               take_subsample=False,               # False: train on the full dataset
               #subsample_size=0.5,                # presumably only relevant when take_subsample=True — TODO confirm
               remove_chinese_letter=True,
               use_custom_train_test_split=True)

_______
# Evaluate Training

In [None]:
# Plot the training history stored on `cm`: loss on the left axis, character
# error rate on the right. Each history is plotted against its 0-based index
# (labelled as the epoch).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Loss plot settings
sns.lineplot(x=np.arange(0, len(cm.history_loss)),
             y=cm.history_loss,
             ax=axes[0])
axes[0].set_title("Cross Entropy Loss after each Epoch")
axes[0].set_xlabel("Epoch")
axes[0].set_ylabel("Loss")

# CER plot settings
sns.lineplot(x=np.arange(0, len(cm.history_cer)),
             y=cm.history_cer,
             ax=axes[1],
             color="red")
axes[1].set_title("Character Error Rate (CER) after each Epoch")
axes[1].set_xlabel("Epoch")
axes[1].set_ylabel("CER")

# Apply the layout BEFORE saving; otherwise the PNG on disk is written with the
# un-tightened layout and differs from the figure shown in the notebook.
fig.tight_layout()

# Save fig in img dir. Colons are removed from the timestamp so that the file
# name is also valid on file systems that do not allow ':'.
timestamp = str(datetime.datetime.now()).replace(':', '')
fig.savefig(f"../img/{MODEL_NAME}_eval_{timestamp}.png", dpi=256)

__________
```
For the evaluation of the test set, please go to the evaluation notebook.
```