In [1]:
!pip uninstall huggingface_hub -y && pip install git+https://github.com/huggingface/huggingface_hub
!pip install jiwer

Found existing installation: huggingface-hub 0.10.1
Uninstalling huggingface-hub-0.10.1:
  Successfully uninstalled huggingface-hub-0.10.1
Collecting git+https://github.com/huggingface/huggingface_hub
  Cloning https://github.com/huggingface/huggingface_hub to /tmp/pip-req-build-rt3r7cze
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/huggingface_hub /tmp/pip-req-build-rt3r7cze
  Resolved https://github.com/huggingface/huggingface_hub to commit 2f4e35a1e666e3ff816cbc676a2035a4509cb9bc
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Building wheels for collected packages: huggingface-hub
  Building wheel for huggingface-hub (pyproject.toml) ... done
  Created wheel for huggingface-hub: filename=huggingface_hub-0.13.0.dev0-py3-none-any.whl size=189327 sha256=b80b6dcabc61f5

In [2]:
# --- Standard library ---
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

# --- Numerics, PyTorch and Hugging Face datasets ---
import pandas as pd
import numpy as np
import torch
# torch's Dataset is imported under an alias: `datasets.Dataset` is imported as
# `Dataset` on the next line and would otherwise silently shadow it, leaving the
# torch class unreachable. `Dataset` still refers to `datasets.Dataset`.
from torch.utils.data import Dataset as TorchDataset, DataLoader
from datasets import load_dataset, Dataset
from tqdm import tqdm
import datasets

# --- pandas helpers ---
# pandarallel is initialised with 8 workers and a progress bar.
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers=8)
# Hooks tqdm into pandas.
tqdm.pandas()

# --- Hugging Face transformers ---
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup,
)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
# Hugging Face classes for tokenisation, model loading, batching and training.
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    FlaxT5ForConditionalGeneration,
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
)


In [4]:
# Pretrained checkpoint that fine-tuning starts from.
model_name = "flax-community/bengali-t5-base"

# The tokenizer and the seq2seq model are both resolved from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Downloading:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/944M [00:00<?, ?B/s]

In [5]:
# Load the two splits that were saved to disk in the attached Kaggle dataset.
train_data, test_data = (
    datasets.Dataset.load_from_disk(split_dir)
    for split_dir in (
        '/kaggle/input/t5-asr-corrector-raw-first-attempt-data/train',
        '/kaggle/input/t5-asr-corrector-raw-first-attempt-data/test',
    )
)

In [6]:
# Carve a small validation set out of the test split.
#
# The split is first written to the working directory and reloaded from there
# before it is sampled (presumably so that files derived from it live in a
# writable location rather than under /kaggle/input — TODO confirm).
test_data.save_to_disk('test')
test_copy = datasets.Dataset.load_from_disk('test')

# A fixed seed makes the random 6.25% sample identical on every run. Without it
# each kernel restart would evaluate on a different subset, so validation losses
# would not be comparable between runs.
VALIDATION_FRACTION = 0.0625
SPLIT_SEED = 42
validation = test_copy.train_test_split(
    test_size=VALIDATION_FRACTION, seed=SPLIT_SEED
)['test']
validation

Dataset({
    features: ['input', 'output', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 12759
})

In [7]:
# Delete the cache files that `datasets` keeps for each of the two datasets.
# Only the result of the last call (on `validation`) is shown as the cell output.
train_data.cleanup_cache_files()
validation.cleanup_cache_files()

1

In [8]:
# Display the training split to check its columns and row count.
train_data

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1837181
})

In [9]:
# Collator that pads each batch to its longest sequence and returns PyTorch tensors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding="longest",
    return_tensors="pt",
)

In [10]:
# Create the directory used as the Trainer's `output_dir`.
# Done in Python rather than by shelling out, so the cell does not depend on a
# shell being available; `exist_ok=True` keeps it safe to re-run, like `mkdir -p`.
os.makedirs("t5-asr-corrector-bn", exist_ok=True)

In [11]:
from kaggle_secrets import UserSecretsClient
import wandb
import logging

# Fetch the Weights & Biases API key from Kaggle's secret store, so it never
# appears in the notebook itself, and authenticate with it.
user_secrets = UserSecretsClient()
wandb_api_key = user_secrets.get_secret("wandb_api")
wandb.login(key=wandb_api_key)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [12]:
# Training configuration for the Hugging Face Trainer.
batch_size = 32  # per-device batch size, shared by training and evaluation

args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="t5-asr-corrector-bn",
    logging_dir="/kaggle/logs",
    push_to_hub=False,
    # Optimisation settings.
    learning_rate=1e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    fp16=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,  # 4 batches of `batch_size` per device per optimizer step
    # Evaluate and checkpoint on the same 1000-step cadence, keep at most two
    # checkpoints on disk, and reload the best one when training ends.
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    save_total_limit=2,
    load_best_model_at_end=True,
)
                                         

In [13]:
# Display the validation split again. Besides the tokenised fields it still
# carries the `input`, `output` and `__index_level_0__` columns.
validation

Dataset({
    features: ['input', 'output', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 12759
})

In [14]:
from transformers import EarlyStoppingCallback
from datasets import load_metric

# Stop training once the monitored metric has failed to improve for 2
# consecutive evaluations. The arguments are passed by keyword because the second
# positional slot is `early_stopping_threshold`: the minimum change that counts
# as an improvement. The earlier value of 1.0 is larger than any validation loss
# this run produces, so no evaluation could have registered as an improvement.
# 0.0 counts every improvement.
# NOTE(review): `early_stop` only takes effect if it is handed to the Trainer
# via `callbacks=[early_stop]`.
early_stop = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.0,
)

# Word-error-rate and character-error-rate metrics.
# NOTE(review): these are only used if a `compute_metrics` function that calls
# them is given to the Trainer.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package.
wer_metric = load_metric("wer")
cer_metric = load_metric("cer")



Downloading builder script:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [15]:
# Assemble the 🤗 Trainer from the model, arguments, datasets and collator.
# NOTE(review): no `callbacks` or `compute_metrics` are supplied here, so the
# `early_stop`, `wer_metric` and `cer_metric` objects created earlier are unused
# and evaluation reports the loss only.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=validation,
)

Using cuda_amp half precision backend


In [16]:
# Run fine-tuning. The returned TrainOutput is displayed as the cell's result.
trainer.train()

***** Running training *****
  Num examples = 1837181
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 4
  Total optimization steps = 14352
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mmushrafi88[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss
1000,0.6664,0.472163
2000,0.3416,0.251074
3000,0.2389,0.183318
4000,0.191,0.152046
5000,0.1673,0.135286
6000,0.152,0.12475
7000,0.1414,0.116754
8000,0.1332,0.110878
9000,0.1268,0.106517
10000,0.1227,0.10324


The following columns in the evaluation set don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: __index_level_0__, output, input. If __index_level_0__, output, input are not expected by `MT5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12759
  Batch size = 64
Saving model checkpoint to t5-asr-corrector-bn/checkpoint-1000
Configuration saved in t5-asr-corrector-bn/checkpoint-1000/config.json
Model weights saved in t5-asr-corrector-bn/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in t5-asr-corrector-bn/checkpoint-1000/tokenizer_config.json
Special tokens file saved in t5-asr-corrector-bn/checkpoint-1000/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: __index_level_0__, output, input. If __index_level_0__, output, input are not expecte

TrainOutput(global_step=14352, training_loss=0.24435510337684996, metrics={'train_runtime': 33704.3848, 'train_samples_per_second': 109.017, 'train_steps_per_second': 0.426, 'total_flos': 3.144680106337567e+17, 'train_loss': 0.24435510337684996, 'epoch': 2.0})

In [17]:
# Write the final model (config and weights) together with the tokenizer files
# to their own directory, separate from the intermediate checkpoints.
trainer.save_model('t5_asr_bn_corrector_first_attempt')

Saving model checkpoint to t5_asr_bn_corrector_first_attempt
Configuration saved in t5_asr_bn_corrector_first_attempt/config.json
Model weights saved in t5_asr_bn_corrector_first_attempt/pytorch_model.bin
tokenizer config file saved in t5_asr_bn_corrector_first_attempt/tokenizer_config.json
Special tokens file saved in t5_asr_bn_corrector_first_attempt/special_tokens_map.json
