# Load Module

In [1]:
import os

# The relative paths used later in this notebook (./datasets, ./results,
# ./model_artifacts) and the `utils` imports are resolved against the parent of the
# kernel's starting directory, so the working directory is moved up one level.
#
# The starting directory is remembered the first time this cell runs. Re-running the
# cell therefore returns to the same parent directory instead of climbing one level
# higher on every execution.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()

os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [2]:
# Name of the Weights & Biases project that runs from this notebook are logged to.
# It is read from the environment by the W&B integration, so it is set before any
# run is created.
os.environ["WANDB_PROJECT"] = "spell-correction"

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer
import wandb

In [4]:
import time
from datetime import datetime

In [5]:
# Authenticate with Weights & Biases before training starts. The call's return value
# is shown as the cell output (True in the recorded run).
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mmdmmn378[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Global Config

In [6]:
# Identifier of the pretrained checkpoint; used to load both the tokenizer and the model.
MODEL_CHECKPOINT: str = "csebuetnlp/banglat5_small"

In [7]:
# Short experiment name; used as the prefix of the W&B run name.
MODEL_NAME: str = "spell-correction"

In [8]:
# Split name -> JSON-lines file. The keys become the split names of the loaded dataset.
SPLIT_CONFIG = dict(
    train="./datasets/correction_train.jsonl",
    val="./datasets/correction_val.jsonl",
)

In [9]:
# Length limits handed to TokenizerPreprocessor for the input and target texts
# (presumably measured in tokens — confirm in utils/tokenizer.py).
MAX_INPUT_LENGTH: int = 128
MAX_TARGET_LENGTH: int = 128

In [10]:
# NOTE(review): DATASET_NAME is not referenced anywhere else in the visible notebook.
DATASET_NAME: str = "spell_correction_dataset"
BATCH_SIZE: int = 16  # per-device batch size, used for both training and evaluation
EPOCHS: int = 10  # passed as num_train_epochs
ACCUMULATION_STEPS: int = 1  # passed as gradient_accumulation_steps
LR: float = 2e-3  # passed as learning_rate
LOGGING_STEPS: int = 10  # passed as logging_steps (logging_strategy is "steps")

# Load Dataset

In [11]:
from utils.tokenizer import TokenizerPreprocessor

In [12]:
# Load the JSON-lines files listed in SPLIT_CONFIG; the resulting object is indexed
# by the split names "train" and "val" further down.
raw_datasets = load_dataset("json", data_files=SPLIT_CONFIG)

In [13]:
# Tokenizer belonging to the pretrained checkpoint.
# NOTE(review): `legacy=True` is forwarded to the tokenizer class, presumably to keep
# the original T5 tokenization behaviour — confirm this is still intended.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, legacy=True)

In [14]:
# Project-local preprocessor configured with the tokenizer and the input/target
# length limits; it is passed as the mapping function to `raw_datasets.map`.
tp = TokenizerPreprocessor(
    tokenizer=tokenizer,
    max_input_length=MAX_INPUT_LENGTH,
    max_target_length=MAX_TARGET_LENGTH,
)

In [15]:
# Run the preprocessor over every split; batched=True hands `tp` batches of examples
# instead of one example per call.
tokenized_datasets = raw_datasets.map(tp, batched=True)

# Configure Trainer

In [16]:
from transformers import (
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
)

In [17]:
def model_init():
    """Return a seq2seq model freshly loaded from ``MODEL_CHECKPOINT``.

    Handed to the trainer as its ``model_init`` callback, so the trainer obtains the
    model by calling this function rather than receiving an instance.
    """
    return AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

In [18]:
# Batch collator for seq2seq training, built from the checkpoint's tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [19]:
from utils.metrics import CustomTrainer

In [20]:
# Training configuration. Every value is unchanged; arguments are grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # --- run identity and reporting ---
    output_dir=f"./results/hparams_tuner_{time.time()}",
    run_name=f"{MODEL_NAME}-trainer-{datetime.now()}",
    report_to=["wandb"],
    push_to_hub=False,
    # --- optimisation ---
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=ACCUMULATION_STEPS,
    fp16=False,
    dataloader_pin_memory=True,
    # --- evaluation and checkpointing (both once per epoch) ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    predict_with_generate=True,
    # Best checkpoint = lowest eval_loss, hence greater_is_better=False.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # --- logging ---
    logging_strategy="steps",
    logging_steps=LOGGING_STEPS,
)

In [21]:
# Build the project-local trainer. `None` is passed in the first position because the
# model is supplied through `model_init` instead of as an instance.
# NOTE(review): CustomTrainer is defined in utils/metrics.py; the positional order
# (model, args) is assumed to follow the Hugging Face Trainer — confirm.
trainer = CustomTrainer(
    None,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    model_init=model_init,
    # Project-specific flag; see utils/metrics.py for what the extra metrics are.
    show_extra_metrics=False,
)

# Train Model

In [None]:
# Start training with the arguments configured above (evaluation and checkpointing
# once per epoch, metrics reported to W&B).
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


# Export

### PyTorch

In [None]:
%%time
# Write the trained model to ./model_artifacts/exported_model_pt.
# safe_serialization=True asks save_pretrained for the safetensors format.
trainer.model.save_pretrained(
    "./model_artifacts/exported_model_pt", safe_serialization=True
)

In [None]:
# Save the tokenizer files into the same directory as the exported model.
trainer.tokenizer.save_pretrained("./model_artifacts/exported_model_pt")

### ONNX

In [34]:
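# %%time
# !python -m transformers.onnx --model ./model_artifacts/exported_model_pt ./model_artifacts/onnx/
# NOTE: the ONNX export is currently disabled (commented out). Uncomment the two lines
# above to run it; it reads the checkpoint saved to ./model_artifacts/exported_model_pt.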