In [None]:
!pip install datasets wandb dataclasses typing pandas torch transformers jiwer sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 11.9 MB/s 
[?25hCollecting wandb
  Downloading wandb-0.12.21-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 54.6 MB/s 
[?25hCollecting dataclasses
  Downloading dataclasses-0.6-py3-none-any.whl (14 kB)
Collecting typing
  Downloading typing-3.7.4.3.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 7.8 MB/s 
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 37.2 MB/s 
[?25hCollecting jiwer
  Downloading jiwer-2.3.0-py3-none-any.whl (15 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 57.3 MB/s 
[?25hColle

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from datasets import load_metric, load_dataset
import torch
from transformers import (
    TrOCRProcessor,
    AutoFeatureExtractor,
    AutoTokenizer,
    VisionEncoderDecoderModel,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    default_data_collator,
    HfArgumentParser,
    EarlyStoppingCallback
)
import wandb
from dataclasses import dataclass, field
from typing import Optional
import os
from torch.utils.data import Dataset
import pandas as pd
from PIL import Image

In [None]:
class OCRDataset(Dataset):
    """Map-style dataset yielding processor-ready image/label pairs for OCR fine-tuning.

    Args:
        df: Table-like object indexed as ``df[column][idx]``. It must provide an
            ``image_path`` column (path of the image file) and a ``words``
            column (target text: a string, or a sequence of word strings).
        processor: Callable image processor that also exposes a ``tokenizer``
            attribute (e.g. a ``TrOCRProcessor``).
        transforms: Callable applied to the RGB PIL image before it is handed
            to the processor. Defaults to the identity.
        max_target_length: Exact length every label sequence is padded or
            truncated to.
    """

    def __init__(self, df, processor, transforms=lambda x:x, max_target_length=128):
        self.df = df
        self.processor = processor
        self.transforms = transforms
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Build one training example.

        Returns:
            dict with ``pixel_values`` (the processor output without its
            leading batch dimension) and ``labels`` (1-D tensor of
            ``max_target_length`` token ids, padding replaced by -100).
        """
        text = self.df['words'][idx]
        # A list of words would be tokenized as a *batch* (one row per word),
        # giving 2-D labels of varying height that cannot be collated. Join it
        # into a single target string instead.
        if not isinstance(text, str):
            text = " ".join(str(word) for word in text)

        # Open inside a context manager so the file handle is released as soon
        # as the pixel data has been converted.
        with Image.open(self.df['image_path'][idx]) as raw_image:
            image = raw_image.convert("RGB")
        image = self.transforms(image)
        pixel_values = self.processor(image, return_tensors="pt").pixel_values

        tokenizer = self.processor.tokenizer
        # truncation=True is required: padding="max_length" alone only pads, so
        # longer targets would yield labels of different lengths that the
        # default collator cannot stack.
        labels = tokenizer(text,
                           padding="max_length",
                           truncation=True,
                           max_length=self.max_target_length).input_ids
        # important: make sure that PAD tokens are ignored by the loss function
        pad_token_id = tokenizer.pad_token_id
        labels = [-100 if token_id == pad_token_id else token_id for token_id in labels]

        # squeeze(0) drops only the batch dimension added by return_tensors="pt".
        return {"pixel_values": pixel_values.squeeze(0), "labels": torch.tensor(labels)}

In [None]:
# Two ways to build the model (see the model-loading cell):
#   * `model_name` set      -> load that pretrained checkpoint as-is;
#   * `model_name = None`   -> assemble a new model from `encoder` + `decoder`.
# `encoder` and `decoder` are always defined so that switching `model_name` to
# None does not raise a NameError when the fallback branch runs.
encoder = "facebook/deit-base-distilled-patch16-384"
decoder = "asafaya/bert-base-arabic"
model_name = "microsoft/trocr-base-handwritten"

In [None]:
# Hyper-parameters and run settings for the Seq2SeqTrainer.
training_args = Seq2SeqTrainingArguments(
    # --- what to run / where to write ---
    output_dir="./",
    do_train=True,
    do_eval=True,
    do_predict=True,
    seed=42,
    report_to="wandb",
    # --- schedule: evaluate, checkpoint and log once per epoch ---
    num_train_epochs=5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # --- batching / precision ---
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=True,
    # --- optimizer ---
    # NOTE(review): 5e-3 looks high for fine-tuning a pretrained checkpoint — confirm.
    learning_rate=5e-3,
    weight_decay=0.005,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    # --- evaluation / model selection ---
    # Metrics are computed on generated sequences; the best checkpoint is the
    # one with the lowest "cer" (the key returned by compute_metrics).
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="cer",
    greater_is_better=False,
)

In [None]:
# Load the "darentang/sroie" dataset; the recorded run yields a DatasetDict
# with `train` and `test` splits.
dataset = load_dataset("darentang/sroie")

Downloading builder script:   0%|          | 0.00/4.25k [00:00<?, ?B/s]

Downloading and preparing dataset sroie/sroie to /root/.cache/huggingface/datasets/darentang___sroie/sroie/1.0.0/26ed9374c9a15a1d2f44fd8886f679076e1a1fd7da2d53726d6e58a99436c506...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/456M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset sroie downloaded and prepared to /root/.cache/huggingface/datasets/darentang___sroie/sroie/1.0.0/26ed9374c9a15a1d2f44fd8886f679076e1a1fd7da2d53726d6e58a99436c506. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Display the available splits with their feature names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'words', 'bboxes', 'ner_tags', 'image_path'],
        num_rows: 626
    })
    test: Dataset({
        features: ['id', 'words', 'bboxes', 'ner_tags', 'image_path'],
        num_rows: 347
    })
})

In [None]:
# Load the processor and the vision encoder-decoder model.
if model_name is not None:
    # A checkpoint name was given: processor and model come from the same place.
    processor = TrOCRProcessor.from_pretrained(model_name)
    model = VisionEncoderDecoderModel.from_pretrained(model_name)
else:
    # No checkpoint: combine a separately pretrained encoder and decoder.
    feature_extractor = AutoFeatureExtractor.from_pretrained(encoder)
    tokenizer = AutoTokenizer.from_pretrained(decoder)
    processor = TrOCRProcessor(feature_extractor, tokenizer)
    model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(encoder, decoder)

    #fn_kwargs = dict(
    #    processor = processor,
    #)
    #df = dataset.map(preprocess,fn_kwargs=fn_kwargs,remove_columns=["id"])

# Materialise both splits as DataFrames. The dataset has only `train` and
# `test` splits, so no separate evaluation frame is built.
# Rows are shuffled with a fixed seed (the one configured in `training_args`)
# so the row order, and with it the order of the saved predictions, is
# reproducible across runs. The index is reset so that positional indices
# used by OCRDataset match the row labels.
df_train = (
    pd.DataFrame(dataset['train'])
    .sample(frac=1, random_state=training_args.seed)
    .reset_index(drop=True)
)
df_pred = (
    pd.DataFrame(dataset['test'])
    .sample(frac=1, random_state=training_args.seed)
    .reset_index(drop=True)
)


def transformer(image):
    """Identity image transform (placeholder for augmentation)."""
    return image


train_dataset = OCRDataset(df=df_train,
                           processor=processor,
                           max_target_length=128,
                           transforms=transformer)

predict_dataset = OCRDataset(df=df_pred,
                             processor=processor,
                             max_target_length=128,
                             transforms=transformer)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Predict dataset size: {len(predict_dataset)}")

# Special-token ids, taken from the processor's tokenizer.
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.eos_token_id = processor.tokenizer.sep_token_id

# Keep the top-level vocab size in sync with the decoder's.
model.config.vocab_size = model.config.decoder.vocab_size

# Beam-search parameters used when generating text.
for _option, _value in (
    ("max_length", 64),
    ("early_stopping", True),
    ("no_repeat_ngram_size", 3),
    ("length_penalty", 2.0),
    ("num_beams", 4),
):
    setattr(model.config, _option, _value)

Downloading:   0%|          | 0.00/228 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.03k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Train dataset size: 626
Predict dataset size: 347


In [None]:
# Character error rate metric; used by compute_metrics to score predictions.
cer_metric = load_metric("cer")

def compute_metrics(pred):
    """Compute the character error rate (CER) of generated text against the references.

    Args:
        pred: Object with a ``predictions`` attribute (generated token ids, or
            a tuple whose first element holds them) and a ``label_ids``
            attribute (reference token ids in which ignored positions are -100).

    Returns:
        dict: ``{"cer": <value returned by cer_metric.compute>}``.
    """
    pad_token_id = processor.tokenizer.pad_token_id

    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]
    # -100 is not a valid token id and cannot be decoded; map it to PAD so it
    # is dropped with the other special tokens. np.where returns new arrays, so
    # the caller's `pred.label_ids` / `pred.predictions` are left unmodified.
    pred_ids = np.asarray(pred_ids)
    pred_ids = np.where(pred_ids == -100, pad_token_id, pred_ids)
    labels_ids = np.asarray(pred.label_ids)
    labels_ids = np.where(labels_ids == -100, pad_token_id, labels_ids)

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer}

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [None]:
# Build the trainer.
# `training_args` requests per-epoch evaluation, best-model selection on "cer"
# and early stopping; all of these need an evaluation set, and without one the
# run fails at the first evaluation. The dataset ships no validation split, so
# the held-out test split is used here.
# NOTE(review): selecting checkpoints on the test split biases the reported
# test CER; carve a validation subset out of the training data if an unbiased
# test score is needed.
trainer = Seq2SeqTrainer(
        model=model,
        tokenizer=processor.feature_extractor,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=predict_dataset,
        data_collator=default_data_collator,
        callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

Using cuda_amp half precision backend


In [None]:
# --- Train: run fine-tuning, then log and persist the training metrics/state.
print("Training model")
train_result = trainer.train()
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

# --- Predict: generate text for the prediction split and record its metrics.
print("Predicting")
predict_results = trainer.predict(
    predict_dataset,
    metric_key_prefix="predict",
)
metrics = predict_results.metrics

trainer.log_metrics("predict", metrics)
trainer.save_metrics("predict", metrics)

# Only the main process decodes and writes the predictions file.
if trainer.is_world_process_zero():
    if training_args.predict_with_generate:
        predictions = processor.batch_decode(
            predict_results.predictions,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        predictions = [pred.strip() for pred in predictions]
        # Write next to the other run artifacts instead of a hard-coded path.
        output_prediction_file = os.path.join(
            training_args.output_dir, "generated_predictions.txt"
        )
        print(predictions)
        # Explicit UTF-8: decoded text may contain non-ASCII characters and the
        # platform default encoding is locale-dependent.
        with open(output_prediction_file, "w", encoding="utf-8") as writer:
            writer.write("\n".join(predictions))

***** Running training *****
  Num examples = 626
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 395
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Training model


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


RuntimeError: ignored

In [None]:
# `df_pred` has no 'image' column (only 'image_path'), so open the file to display it.
image = Image.open(df_pred['image_path'][4]).convert("RGB")
image

In [None]:
# The reference text lives in the 'words' column; there is no 'text' column.
df_pred['words'][4]

In [None]:
# Predictions follow the row order of `df_pred`, so index 4 is the prediction
# for the same sample inspected in the two preceding cells.
predictions[4]