In [1]:
#!pip uninstall huggingface_hub -y && pip install git+https://github.com/huggingface/huggingface_hub
#!pip install jiwer

In [2]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset, Dataset
from tqdm import tqdm
import datasets

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True,nb_workers=8)
tqdm.pandas()

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
from transformers import (
    T5ForConditionalGeneration, T5Tokenizer, 
    TrainingArguments, Trainer, DataCollatorForSeq2Seq,AutoTokenizer,FlaxT5ForConditionalGeneration,AutoModelForSeq2SeqLM
  )


In [4]:
model_name = 'flax-community/bengali-t5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Downloading:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/944M [00:00<?, ?B/s]

In [5]:
train_data = datasets.Dataset.load_from_disk('/kaggle/input/t5-asr-corrector-masked-third-attempt-dat/train')
test_data  = datasets.Dataset.load_from_disk('/kaggle/input/t5-asr-corrector-masked-third-attempt-dat/test')

In [6]:
#test_data[0]

In [7]:
test_data.save_to_disk('test')
validation = datasets.Dataset.load_from_disk('test')
validation = validation.train_test_split(0.0625)
validation = validation['test']
validation

Dataset({
    features: ['input', 'error_word', 'output', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 12759
})

In [8]:
train_data.cleanup_cache_files()
validation.cleanup_cache_files()

1

In [9]:
train_data

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1837181
})

In [10]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding='longest', return_tensors='pt')

In [11]:
!mkdir -p t5-asr-corrector-bn

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)


In [12]:
from kaggle_secrets import UserSecretsClient
import wandb
import logging

user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb_api")

wandb.login(key=secret_value_0)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [13]:
# defining training related arguments
batch_size = 32
args = TrainingArguments(output_dir="t5-asr-corrector-bn-masked",
                        evaluation_strategy="steps",
                        per_device_train_batch_size=batch_size,
                        per_device_eval_batch_size=batch_size,
                        learning_rate=1e-5,
                        num_train_epochs=2,
                        weight_decay=0.01,
                        save_total_limit=2,
                        fp16 = True,
                        gradient_accumulation_steps = 4,
                        eval_steps = 1000,
                        save_steps = 1000,
                        load_best_model_at_end=True,
                        logging_dir="/kaggle/logs",
                        push_to_hub=False,
                       )
                                         

In [14]:
validation

Dataset({
    features: ['input', 'error_word', 'output', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 12759
})

In [15]:
# defining trainer using 🤗
trainer = Trainer(model=model, 
                args=args, 
                train_dataset= train_data,
                eval_dataset=validation,
                tokenizer=tokenizer,
                data_collator=data_collator,
                )

Using cuda_amp half precision backend


In [16]:
trainer.train()

***** Running training *****
  Num examples = 1837181
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 4
  Total optimization steps = 14352
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mmushrafi88[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss
1000,0.1684,0.15765
2000,0.145,0.133346
3000,0.1272,0.114612
4000,0.1142,0.10121
5000,0.1049,0.091804
6000,0.0974,0.084531
7000,0.0917,0.079071
8000,0.0864,0.074968
9000,0.0835,0.071651
10000,0.0809,0.069227


The following columns in the evaluation set don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: error_word, output, input, __index_level_0__. If error_word, output, input, __index_level_0__ are not expected by `MT5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12759
  Batch size = 64
Saving model checkpoint to t5-asr-corrector-bn-masked/checkpoint-1000
Configuration saved in t5-asr-corrector-bn-masked/checkpoint-1000/config.json
Model weights saved in t5-asr-corrector-bn-masked/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in t5-asr-corrector-bn-masked/checkpoint-1000/tokenizer_config.json
Special tokens file saved in t5-asr-corrector-bn-masked/checkpoint-1000/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: error_word, output, input,

TrainOutput(global_step=14352, training_loss=0.1314307711727776, metrics={'train_runtime': 32853.7953, 'train_samples_per_second': 111.84, 'train_steps_per_second': 0.437, 'total_flos': 3.144753711412347e+17, 'train_loss': 0.1314307711727776, 'epoch': 2.0})

In [17]:
trainer.save_model('t5_asr_bn_corrector_third_attempt')

Saving model checkpoint to t5_asr_bn_corrector_third_attempt
Configuration saved in t5_asr_bn_corrector_third_attempt/config.json
Model weights saved in t5_asr_bn_corrector_third_attempt/pytorch_model.bin
tokenizer config file saved in t5_asr_bn_corrector_third_attempt/tokenizer_config.json
Special tokens file saved in t5_asr_bn_corrector_third_attempt/special_tokens_map.json
