In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install datasets wandb dataclasses typing pandas torch transformers jiwer sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 7.5 MB/s 
[?25hCollecting wandb
  Downloading wandb-0.12.19-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 63.1 MB/s 
[?25hCollecting dataclasses
  Downloading dataclasses-0.6-py3-none-any.whl (14 kB)
Collecting typing
  Downloading typing-3.7.4.3.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 7.6 MB/s 
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 33.0 MB/s 
[?25hCollecting jiwer
  Downloading jiwer-2.3.0-py3-none-any.whl (15 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 50.4 MB/s 
[?25hCollec

In [None]:
from datasets import load_metric, load_dataset
import torch
from transformers import (
    TrOCRProcessor,
    AutoFeatureExtractor,
    AutoTokenizer,
    VisionEncoderDecoderModel,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    default_data_collator,
    HfArgumentParser,
)
import wandb
from dataclasses import dataclass, field
from typing import Optional
import os
from torch.utils.data import Dataset
import pandas as pd

In [None]:
class OCRDataset(Dataset):
    """Map-style dataset pairing images with tokenized target text.

    Args:
        df: DataFrame with an ``image`` column (objects exposing
            ``.convert("RGB")``, presumably PIL images) and a ``text`` column.
        processor: Callable that turns an image into an object carrying
            ``pixel_values``; must also expose a ``tokenizer`` attribute that
            is used to encode the text.
        transforms: Callable applied to the RGB image before the processor.
            Defaults to the identity.
        max_target_length: Length of every returned label sequence. Shorter
            texts are padded, longer ones are truncated.
    """

    def __init__(self, df, processor, transforms=lambda x:x, max_target_length=128):
        self.df = df
        self.processor = processor
        self.transforms = transforms
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{"pixel_values": tensor, "labels": tensor}`` for row ``idx``.

        ``idx`` is a position in ``[0, len(self))``, independent of the
        DataFrame's index labels.
        """
        # Positional access: a shuffled or filtered frame keeps its old index
        # labels, so label-based lookup would ignore the row order or raise
        # KeyError for missing labels.
        text = self.df['text'].iloc[idx]
        # prepare image (i.e. resize + normalize)
        image = self.df['image'].iloc[idx].convert("RGB")
        image = self.transforms(image)
        pixel_values = self.processor(image, return_tensors="pt").pixel_values
        # Encode the text; truncation guarantees a fixed label length so that
        # samples can be stacked into a batch.
        labels = self.processor.tokenizer(text,
                                          padding="max_length",
                                          truncation=True,
                                          max_length=self.max_target_length).input_ids
        # important: make sure that PAD tokens are ignored by the loss function
        pad_token_id = self.processor.tokenizer.pad_token_id
        labels = [label if label != pad_token_id else -100 for label in labels]

        # Drop only the leading batch dimension added by the processor.
        return {"pixel_values": pixel_values.squeeze(0), "labels": torch.tensor(labels)}

In [None]:
# Hub identifiers for the two halves used when assembling a model from scratch.
encoder, decoder = (
    "facebook/deit-base-distilled-patch16-384",
    "asafaya/bert-base-arabic",
)
# Hub identifier of a ready-made checkpoint; the loading cell checks it against None.
model_name = "microsoft/trocr-small-handwritten"

In [None]:
# Hyper-parameters and bookkeeping options for the Seq2SeqTrainer.
training_args = Seq2SeqTrainingArguments(
        predict_with_generate=True,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        # Mixed precision needs a CUDA device; fall back to fp32 when none is present.
        fp16=torch.cuda.is_available(),
        adam_beta1=0.9,
        adam_beta2=0.98,
        adam_epsilon=1e-08,
        num_train_epochs=10,
        weight_decay=0.005,
        learning_rate=5e-5,
        seed=42,
        report_to="wandb",
        load_best_model_at_end=True,
        metric_for_best_model="cer",
        # CER is an error rate: lower is better. Without this flag the trainer
        # treats a higher metric as better and would keep the worst checkpoint.
        greater_is_better=False,
        do_train=True,
        do_eval=True,
        do_predict=True,
        output_dir="./",
    )

In [None]:
# Load the "gagan3012/OnlineKhatt" dataset; later cells read its
# 'train', 'dev' and 'test' splits.
dataset = load_dataset("gagan3012/OnlineKhatt")

Downloading:   0%|          | 0.00/953 [00:00<?, ?B/s]

Using custom data configuration gagan3012--OnlineKhatt-e66084177a1d53f0


Downloading and preparing dataset image_folder/default (download: 257.30 MiB, generated: -4491153882 bytes, post-processed: Unknown size, total: -4221360242 bytes) to /root/.cache/huggingface/datasets/gagan3012___parquet/gagan3012--OnlineKhatt-e66084177a1d53f0/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/13.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/243M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.8M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/gagan3012___parquet/gagan3012--OnlineKhatt-e66084177a1d53f0/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Load a processor (image preprocessing + tokenizer) and a model that belong
# together: label ids produced by the tokenizer are fed to the decoder, so both
# must share one vocabulary.
if model_name is None:
    # Assemble a new encoder-decoder from two separately pretrained halves and
    # pair it with the decoder's own tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(decoder)
    feature_extractor = AutoFeatureExtractor.from_pretrained(encoder)
    processor = TrOCRProcessor(feature_extractor, tokenizer)
    model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(encoder, decoder)

else:
    # Use a ready-made checkpoint: take the processor AND the weights from the
    # same place. Combining this checkpoint's tokenizer with an unrelated
    # `decoder` can yield label ids that the decoder's embedding table does
    # not contain.
    processor = TrOCRProcessor.from_pretrained(model_name)
    model = VisionEncoderDecoderModel.from_pretrained(model_name)

    #fn_kwargs = dict(
    #    processor = processor,
    #)
    #df = dataset.map(preprocess,fn_kwargs=fn_kwargs,remove_columns=["id"])

def _shuffled_frame(split, seed=42):
    """Return ``split`` as a DataFrame whose rows are in a seeded random order.

    The index is reset after shuffling so that positions and index labels
    agree; otherwise label-based row access would silently undo the shuffle.
    """
    return pd.DataFrame(split).sample(frac=1, random_state=seed).reset_index(drop=True)


df_train = _shuffled_frame(dataset['train'])
df_eval = _shuffled_frame(dataset['dev'])
df_pred = _shuffled_frame(dataset['test'])

# Identity transform: images go to the processor unchanged.
transformer = lambda x: x


def _make_ocr_dataset(df):
    """Wrap ``df`` in an OCRDataset using the shared processor and settings."""
    return OCRDataset(df=df,
                      processor=processor,
                      max_target_length=128,
                      transforms=transformer)


train_dataset = _make_ocr_dataset(df_train)
eval_dataset = _make_ocr_dataset(df_eval)
predict_dataset = _make_ocr_dataset(df_pred)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")
print(f"Predict dataset size: {len(predict_dataset)}")

def _apply_generation_config(config, tokenizer):
    """Copy special-token ids from ``tokenizer`` onto ``config`` and set beam-search options."""
    # Special tokens: CLS starts decoding, SEP ends it, PAD is the padding id.
    config.decoder_start_token_id = tokenizer.cls_token_id
    config.pad_token_id = tokenizer.pad_token_id
    config.eos_token_id = tokenizer.sep_token_id

    # make sure vocab size is set correctly
    config.vocab_size = config.decoder.vocab_size

    # set beam search parameters
    beam_search_options = (
        ("max_length", 64),
        ("early_stopping", True),
        ("no_repeat_ngram_size", 3),
        ("length_penalty", 2.0),
        ("num_beams", 4),
    )
    for option, value in beam_search_options:
        setattr(config, option, value)


_apply_generation_config(model.config, processor.tokenizer)

Downloading:   0%|          | 0.00/276 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/327 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/238 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/68.0k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/334M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/deit-base-distilled-patch16-384 were not used when initializing DeiTModel: ['cls_classifier.bias', 'distillation_classifier.weight', 'cls_classifier.weight', 'distillation_classifier.bias']
- This IS expected if you are initializing DeiTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DeiTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DeiTModel were not initialized from the model checkpoint at facebook/deit-base-distilled-patch16-384 and are newly initialized: ['deit.pooler.dense.bias', 'deit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference

Downloading:   0%|          | 0.00/491 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/424M [00:00<?, ?B/s]

Some weights of BertLMHeadModel were not initialized from the model checkpoint at asafaya/bert-base-arabic and are newly initialized: ['bert.encoder.layer.5.crossattention.self.query.weight', 'bert.encoder.layer.3.crossattention.self.key.weight', 'bert.encoder.layer.6.crossattention.output.dense.bias', 'bert.encoder.layer.11.crossattention.output.dense.bias', 'bert.encoder.layer.9.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.11.crossattention.self.value.bias', 'bert.encoder.layer.10.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.10.crossattention.output.dense.weight', 'bert.encoder.layer.7.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.8.crossattention.self.query.weight', 'bert.encoder.layer.7.crossattention.self.query.weight', 'bert.encoder.layer.3.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.6.crossattention.self.query.weight', 'bert.encoder.layer.7.crossattention.self.query.bias', 'bert.encoder.layer.10.crossattention.self.value.

Train dataset size: 7648
Eval dataset size: 425
Predict dataset size: 425


In [None]:
# `!export ...` runs in a throwaway subshell and never reaches the kernel, so
# set the variable on this process instead. It makes CUDA kernel launches
# synchronous, which gives usable tracebacks for device-side errors; it should
# be set before the first CUDA call.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
# Metric object for "cer"; compute_metrics calls its .compute() method.
cer_metric = load_metric(path="cer")

def compute_metrics(pred):
    """Compute the "cer" metric between decoded predictions and decoded labels.

    Args:
        pred: Object with ``predictions`` (predicted token ids) and
            ``label_ids`` (reference token ids, with -100 marking positions
            ignored by the loss).

    Returns:
        ``{"cer": value}`` where ``value`` is whatever ``cer_metric.compute``
        returns for the decoded strings.
    """
    pad_token_id = processor.tokenizer.pad_token_id

    # Work on a copy: the caller still owns `pred.label_ids`, and overwriting
    # its -100 markers in place would corrupt it for any later use.
    labels_ids = pred.label_ids.copy()
    labels_ids[labels_ids == -100] = pad_token_id

    pred_str = processor.batch_decode(pred.predictions, skip_special_tokens=True)
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer}

In [None]:
# Gather the constructor arguments first, then build the trainer in one call.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
    # The feature extractor is passed in the `tokenizer` slot
    # (presumably so it is saved alongside checkpoints - TODO confirm).
    tokenizer=processor.feature_extractor,
)
trainer = Seq2SeqTrainer(**trainer_kwargs)

RuntimeError: ignored

In [None]:
# --- Training: run the loop, then log and persist its metrics and the trainer state.
print("Training model")
train_result = trainer.train()
metrics = train_result.metrics
for report in (trainer.log_metrics, trainer.save_metrics):
    report("train", metrics)
trainer.save_state()

# --- Evaluation on the eval dataset given to the trainer.
print("Evaluating model")
metrics = trainer.evaluate(metric_key_prefix="eval")
for report in (trainer.log_metrics, trainer.save_metrics):
    report("eval", metrics)
# --- Prediction on the held-out split.
print("Predicting")
predict_results = trainer.predict(
    predict_dataset,
    metric_key_prefix="predict",
)
metrics = predict_results.metrics
# The whole predict dataset is used (this notebook has no `data_args` object
# and no sub-sampling option), so the sample count is simply its length.
metrics["predict_samples"] = len(predict_dataset)

trainer.log_metrics("predict", metrics)
trainer.save_metrics("predict", metrics)

# Decode the generated token ids and write one prediction per line.
if trainer.is_world_process_zero():
    if training_args.predict_with_generate:
        predictions = processor.batch_decode(
            predict_results.predictions,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        predictions = [pred.strip() for pred in predictions]
        # Write next to the other run artefacts; this notebook defines no
        # `model_args` object, so the trainer's output directory is used.
        output_prediction_file = os.path.join(
            training_args.output_dir, "generated_predictions.txt"
        )
        print(predictions)
        # Explicit UTF-8 so non-ASCII transcriptions are written correctly
        # regardless of the platform's default encoding.
        with open(output_prediction_file, "w", encoding="utf-8") as writer:
            writer.write("\n".join(predictions))

***** Running training *****
  Num examples = 7648
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 9560


Training model


RuntimeError: ignored