In [3]:
import transformers
from transformers import (
    MT5ForConditionalGeneration,
    Seq2SeqTrainer, MT5Tokenizer, MT5Config
)

import datasets
import pandas as pd
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy as np
from datasets import load_metric
import gc
import datasets
import os
import torch

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"
os.environ["WANDB_DISABLED"] = "true"
!export CUDA_VISIBLE_DEVICES=0
device, use_gpu = ("cuda:0", True) if torch.cuda.is_available() else ("cpu", False)

In [2]:
# !pip install datasets
# !pip install --upgrade accelerate

In [4]:
import json
checkpoint = "VietAI/vit5-base"
model = MT5ForConditionalGeneration.from_pretrained(checkpoint)
print('load model done')
tokenizer = MT5Tokenizer.from_pretrained(checkpoint)
print('load tokenizer done')

You are using a model of type t5 to instantiate a model of type mt5. This is not supported for all configurations of models and can yield errors.


load model done
load tokenizer done


In [24]:
PATHS = ["data/question_full.txt","data/sentence_full.txt"]
data = []
cnt  = 0
for path in PATHS:
    with open(path,'r') as f:
        for line in f:
            line = line.split('\t')
            if len(line) !=6 :
                cnt += 1
                continue
            data.append(
            {
                'src': line[0],
                'tgt': line[1]
            })
            data.append(
            {
                'src': line[1],
                'tgt': line[2]
            })
            data.append(
            {
                'src': line[2],
                'tgt': line[3]
            })
            data.append(
            {
                'src': line[4],
                'tgt': line[5]
            })
print(f'error {cnt}')

print(f'total size of data is {len(data)}')


error 51
total size of data is 1465800


In [26]:
data[1050000]

{'question': 'Thuyết tiến hóa có giải thích được tại sao có nhiều loại thực vật không?',
 'sql': 'Sự đa dạng của các loài thực vật được giải thích bằng thuyết tiến hóa?'}

In [27]:
tdata = pd.DataFrame(data)
tdata = tdata.reset_index()
dataset = datasets.Dataset.from_pandas(tdata)

train = dataset.train_test_split(
    train_size=1463000, test_size=2800, seed=42
)

train_data = train['train']
test_data = train['test']

In [28]:
def format_dataset(example):
     return {'input': example['src'], 'target': example['tgt']}
train_data = train_data.map(format_dataset, remove_columns=train_data.column_names)
test_data = test_data.map(format_dataset, remove_columns=test_data.column_names)

def convert_to_features(example_batch):
    input_encodings = tokenizer.batch_encode_plus(example_batch['input'], pad_to_max_length=True, max_length=128)
    target_encodings = tokenizer.batch_encode_plus(example_batch['target'], pad_to_max_length=True, max_length=128)
    encodings = {
        'input_ids': input_encodings['input_ids'], 
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings
train_data = train_data.map(convert_to_features, batched=True, remove_columns=train_data.column_names)
test_data = test_data.map(convert_to_features, batched=True, remove_columns=test_data.column_names)

Map:   0%|          | 0/1463000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2800 [00:00<?, ? examples/s]

Map:   0%|          | 0/1463000 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Map:   0%|          | 0/2800 [00:00<?, ? examples/s]

In [None]:
# !pip install nltk rouge_score

In [29]:
from datasets import load_metric
rouge = load_metric("rouge")

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

  rouge = load_metric("rouge")


In [30]:
data_collator = DataCollatorForSeq2Seq(tokenizer,model=model)
training_args = Seq2SeqTrainingArguments(
    output_dir="viT5-base-1",
    per_device_train_batch_size=16,
    num_train_epochs=2,
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    evaluation_strategy="steps",
    do_train=True,
    do_eval=True,
    logging_steps=22859,
    save_strategy="steps",
    save_steps=45718,
    eval_steps=22859,
    overwrite_output_dir=True,
    save_total_limit=4,
    load_best_model_at_end=True,
    report_to=None,
     group_by_length=True,
    #fp16=True, 
)
trainer = Seq2SeqTrainer(
    model=model,
    data_collator = data_collator,
    tokenizer = tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
trainer.train()



Step,Training Loss,Validation Loss,Rouge2 Precision,Rouge2 Recall,Rouge2 Fmeasure
22859,0.455,0.234621,0.5088,0.376,0.4155
45718,0.2506,0.217037,0.5235,0.3866,0.428
