In [1]:
# Load the TensorBoard notebook extension so the `%tensorboard` magic can be used in this notebook.
%load_ext tensorboard

In [2]:
from transformers import (
    T5ForConditionalGeneration,
    AutoModelForSeq2SeqLM,
    AutoModel,
    DefaultDataCollator,
    DataCollatorForSeq2Seq,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    pipeline
)

from transformers.integrations import TensorBoardCallback

from datetime import datetime
from datasets import load_dataset, Dataset, DatasetDict,set_caching_enabled, concatenate_datasets
import nlpaug.augmenter.char as nac
import torch
from torch.utils.tensorboard import SummaryWriter

import yaml
import pandas as pd
import numpy as np
from tqdm import tqdm
from datasets import set_caching_enabled
# `set_caching_enabled` is deprecated in `datasets`; `disable_caching()` is the
# supported way to switch the on-disk cache off.
# NOTE(review): caching is presumably disabled so repeated `.map` calls with
# random augmentation are recomputed instead of reloaded — confirm.
from datasets import disable_caching
disable_caching()


  from .autonotebook import tqdm as notebook_tqdm
  set_caching_enabled(False)


In [3]:
# Read the experiment configuration. The encoding is pinned so that decoding
# the file does not depend on the platform's default locale.
with open("experiments.yml", encoding="utf-8") as stream:
  runtime_conf = yaml.safe_load(stream)

In [4]:
# Pull the two top-level sections out of the loaded configuration.
global_vars, experiment_params = (
    runtime_conf["global"],
    runtime_conf["training"],
)

In [5]:
# Run-wide settings, all read from the "global" section of the config.
EXPERIMENT_NAME = global_vars["experiment_name"]
CHECKPOINT = global_vars["checkpoint"]  # checkpoint id handed to `from_pretrained`
LABELS = global_vars["labels"]
MAX_LENGTH = global_vars["max_length"]

# Dataset size and augmentation settings.
N_SAMPLES = global_vars["n_samples"]
N_AUGS = global_vars["n_augs"]
AUG_PARAMS = global_vars["augmentation_params"]  # handed to `nac.OcrAug`

In [6]:
# Load the pretrained conditional-generation model and its tokenizer from the
# same checkpoint.
# NOTE(review): `num_labels` is forwarded as a config override; confirm it is
# actually needed for a conditional-generation model.
model = T5ForConditionalGeneration.from_pretrained(CHECKPOINT, num_labels=LABELS, max_length=MAX_LENGTH)
# A tokenizer takes its length limit as `model_max_length`; a bare `max_length`
# keyword is not a tokenizer init argument and would be silently ignored.
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT, model_max_length=MAX_LENGTH)

In [7]:
def with_noisy_lines(row, src_col: str) -> dict:
    """Pair a clean text line with an OCR-noised copy of it.

    Args:
        row: Mapping-like record (for example one example passed by
            ``Dataset.map``).
        src_col: Key of ``row`` that holds the clean sentence.

    Returns:
        A dict with ``text`` (the original sentence) and ``tgt_col`` (the
        augmented sentence).
    """
    # AUG_PARAMS comes from the YAML config. When it is a mapping of options it
    # has to be unpacked into keyword arguments; passed positionally it would
    # land in OcrAug's first positional parameter instead of configuring the
    # augmenter.
    if isinstance(AUG_PARAMS, dict):
        aug = nac.OcrAug(**AUG_PARAMS)
    else:
        aug = nac.OcrAug(AUG_PARAMS)

    sentence = row[src_col]
    augmented = aug.augment(sentence)
    # Fall back to the clean sentence when the augmenter yields nothing
    # (e.g. for an empty input) instead of failing on the index.
    aug_sents = augmented[0] if augmented else sentence
    return dict(text=sentence, tgt_col=aug_sents)

In [8]:
def with_mapped_encodings(row, src_col: str, aug_col: str, max_length: int = 128) -> dict:
  """Tokenize a batch of clean/noisy sentence pairs into seq2seq features.

  The noisy text becomes the encoder input and the clean text becomes the
  target.

  Args:
    row: Batch of examples (column name -> list of strings), as passed by
      ``Dataset.map(..., batched=True)``.
    src_col: Column holding the clean (target) sentences.
    aug_col: Column holding the augmented (input) sentences.
    max_length: Length every sequence is padded/truncated to. Defaults to
      128, the value that used to be hard-coded.

  Returns:
    Dict with ``input_ids``, ``attention_mask``, ``decoder_attention_mask``
    and ``labels`` tensors.
  """

  def encode(texts):
    # Calling the tokenizer directly is the supported batch API;
    # `batch_encode_plus` is deprecated.
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )

  src = encode(row[src_col])
  aug = encode(row[aug_col])

  # Padding positions in the target must not contribute to the loss; -100 is
  # the label index the model's cross-entropy loss ignores. `masked_fill`
  # returns a new tensor, so `src.input_ids` itself is left untouched.
  labels = src.input_ids.masked_fill(src.attention_mask == 0, -100)

  return dict(
      input_ids=aug.input_ids,
      attention_mask=aug.attention_mask,
      decoder_attention_mask=src.attention_mask,
      labels=labels,
  )

In [9]:
def with_feature_extraction(data: Dataset, num_perms: int=1, test_size: float=0.2, seed=None) -> tuple:
  """Build tokenized train/test splits from OCR-augmented copies of ``data``.

  Args:
    data: Dataset with a ``text`` column of clean lines.
    num_perms: Number of augmented copies of ``data`` to generate and
      concatenate. Must be at least 1.
    test_size: Size of the held-out split, forwarded to
      ``train_test_split``. Defaults to 0.2, the value that used to be
      hard-coded.
    seed: Optional seed forwarded to ``train_test_split`` for a
      reproducible split. ``None`` keeps the previous unseeded behaviour.

  Returns:
    ``(train, test)`` datasets in torch format.

  Raises:
    ValueError: If ``num_perms`` is smaller than 1.
  """
  # Fail early with a clear message instead of an obscure error from
  # concatenating an empty list of datasets.
  if num_perms < 1:
    raise ValueError(f"num_perms must be >= 1, got {num_perms}")

  # generate augmented OCR noise and get input ids for byte tokens
  noisy_copies = [
      data.map(with_noisy_lines, fn_kwargs=dict(src_col="text"))
      for _ in range(num_perms)
  ]
  dataset = concatenate_datasets(noisy_copies)
  dataset = dataset.map(
      with_mapped_encodings,
      fn_kwargs=dict(src_col="text", aug_col="tgt_col"),
      batched=True,
  )
  # reformat for training/inference
  dataset = dataset.remove_columns(["text", "tgt_col"])
  dataset = dataset.with_format(type='torch')
  splits = dataset.train_test_split(test_size=test_size, seed=seed)
  # Return a concrete tuple (not a dict view) so callers can unpack or index it.
  return splits["train"], splits["test"]

In [10]:
# Load the train split and rename its columns to the names used downstream
# ("line_data" -> "text", "label" -> "cls").
dataset = (
    load_dataset("lowem1/training-invoices")["train"]
    .rename_column("line_data", "text")
    .rename_column("label", "cls")
)

Downloading readme: 100%|██████████| 511/511 [00:00<00:00, 2.14MB/s]


Downloading and preparing dataset None/None to /home/vscode/.cache/huggingface/datasets/lowem1___parquet/lowem1--training-invoices-e93c374bc59d64cc/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data: 100%|██████████| 6.49k/6.49k [00:00<00:00, 10.2MB/s]
Downloading data files: 100%|██████████| 1/1 [00:00<00:00,  2.10it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 673.35it/s]
                                                                                 

Dataset parquet downloaded and prepared to /home/vscode/.cache/huggingface/datasets/lowem1___parquet/lowem1--training-invoices-e93c374bc59d64cc/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 28.98it/s]


In [11]:
# train,val = with_feature_extraction(dataset,num_perms=N_AUGS)

In [12]:
# %tensorboard --logdir ./logs  --port 6006

In [13]:
# for name, args in experiment_params.items():
#   training_args = TrainingArguments(**args["training_args"])
#   torch.cuda.empty_cache()
#   trainer =  Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train,
#     eval_dataset=val,
#     callbacks=[TensorBoardCallback(SummaryWriter(log_dir=f"./logs/{name}"))]
#   )
#   trainer.train()