In [3]:
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import nltk
import numpy as np

In [6]:
# Checkpoint shared by the tokenizer and the model weights.
MODEL_NAME = "dumitrescustefan/t5-v1_1-base-romanian"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# Bounds on the number of newly generated tokens, stored on the model's
# own generation config.
generation_config = model.generation_config
generation_config.min_new_tokens = 0
generation_config.max_new_tokens = 256

# Collator that is later handed to the trainer.
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


In [8]:
# Resize the model's token embeddings to the tokenizer's vocabulary size.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

Embedding(64101, 768)

In [4]:
!python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('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')"

In [6]:
from datasets import load_dataset

# Two slices of the "train" split: the first 97% and the remaining 3%.
SPLIT_SPECS = ['train[:97%]', 'train[97%:100%]']

dataset = load_dataset("mateiaassAI/MEID3_v2", split=SPLIT_SPECS)
dataset

Downloading data: 100%|███████████████████████████████████████████████████████████| 7.64M/7.64M [00:01<00:00, 3.96MB/s]
Downloading data: 100%|███████████████████████████████████████████████████████████| 11.0M/11.0M [00:01<00:00, 6.51MB/s]
Downloading data: 100%|███████████████████████████████████████████████████████████| 10.6M/10.6M [00:01<00:00, 6.55MB/s]
Downloading data: 100%|███████████████████████████████████████████████████████████| 13.7M/13.7M [00:01<00:00, 7.43MB/s]
Downloading data: 100%|███████████████████████████████████████████████████████████| 10.7M/10.7M [00:01<00:00, 7.11MB/s]
Downloading data: 100%|███████████████████████████████████████████████████████████| 10.2M/10.2M [00:01<00:00, 5.66MB/s]
Downloading data: 100%|███████████████████████████████████████████████████████████| 11.3M/11.3M [00:01<00:00, 7.38MB/s]


Generating train split: 0 examples [00:00, ? examples/s]

[Dataset({
     features: ['wrong', 'right'],
     num_rows: 322306
 }),
 Dataset({
     features: ['wrong', 'right'],
     num_rows: 9968
 })]

In [12]:
# `dataset` was loaded with two split specs, so it holds exactly two datasets:
# the training slice first, the held-out slice second.
ds_train, ds_test = dataset
ds_test

Dataset({
    features: ['wrong', 'right'],
    num_rows: 60857
})

In [13]:
punctuation_marks = ['.', '?', '!', ';', '...']

def filter_sentences(sentences):
    """Decide whether a single example should be kept.

    An example passes when its 'right' text ends with one of the entries
    in `punctuation_marks` and splits on whitespace into at least 10 words.

    Returns:
        True if both conditions hold, False otherwise.
    """
    text = sentences['right']
    ends_with_punctuation = text.endswith(tuple(punctuation_marks))
    return ends_with_punctuation and len(text.split()) >= 10

In [14]:
# Run the same per-example filter over both splits, train first.
# (An earlier experiment capped the test split beforehand: ds_test.take(30000).)
fds_train, fds_test = (
    split.filter(filter_sentences, batched=False)
    for split in (ds_train, ds_test)
)

Filter:   0%|          | 0/1967695 [00:00<?, ? examples/s]

Filter:   0%|          | 0/60857 [00:00<?, ? examples/s]

In [15]:
# Number of examples remaining in each split after filtering, one per line.
print(len(fds_train), len(fds_test), sep="\n")

1007018
31457


In [16]:
prefix = ""

def preprocess_function(examples, max_length=256):
    """Tokenize a batch of sentence pairs for seq2seq training.

    Args:
        examples: batch with a "wrong" column (model inputs) and a "right"
            column (targets). Presumably each is a list of strings, since
            the function is mapped with batched=True.
        max_length: token limit applied, with truncation, to both the
            inputs and the targets.

    Returns:
        The tokenizer output for the inputs, extended with a "labels" entry
        holding the token ids of the targets.
    """
    inputs = [prefix + doc for doc in examples["wrong"]]
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True)

    # `text_target=` already makes the tokenizer encode in target mode, so the
    # deprecated `as_target_tokenizer()` context manager that used to wrap
    # this call was redundant and has been dropped.
    labels = tokenizer(text_target=examples["right"],
                       max_length=max_length,
                       truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [39]:
# Tokenize a fixed-size prefix of each filtered split.
# (Quick smoke-test variant for the train side: fds_train.select(range(100)).)
train_subset = fds_train.select(range(500000))
tokenized_dataset = train_subset.map(preprocess_function, batched=True)

test_subset = fds_test.select(range(20000))
tokenized_dataset_test = test_subset.map(preprocess_function, batched=True)

print(len(tokenized_dataset_test))

Map:   0%|          | 0/500000 [00:00<?, ? examples/s]



Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

10000


In [40]:
# Sanity check on the first tokenized training example: sequence lengths
# first, then the raw label ids, attention mask and input ids.
first_example = tokenized_dataset[0]
print(len(first_example['input_ids']))
print(len(first_example['labels']))
print(first_example['labels'])
print(first_example['attention_mask'])
print(first_example['input_ids'])

102
107
[183, 988, 24, 485, 7568, 8241, 6, 3, 3848, 763, 3, 5, 2048, 306, 7, 2069, 19, 784, 245, 7, 3255, 3, 191, 15328, 234, 37453, 5, 26, 1119, 6, 15117, 945, 4, 12, 8213, 313, 6, 3375, 245, 7, 5856, 19, 3, 262, 674, 4550, 3, 5, 137, 50, 13340, 1214, 72, 23394, 9, 20654, 6, 9474, 11, 4806, 4, 4374, 3231, 4, 19, 9988, 4, 6, 1250, 6, 3235, 2853, 8, 125, 10, 50, 80, 5, 2771, 31135, 4, 51610, 1307, 3, 191, 3, 365, 4, 12, 9759, 245, 7, 3051, 8877, 262, 3, 191, 1199, 169, 3, 191, 3780, 48, 367, 13580, 7, 2]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[183, 988, 24, 5, 32763, 7568, 8241, 6, 3, 3848, 763, 3, 5, 2048, 306, 7, 2069, 19, 8901, 245, 7, 3255, 3, 191, 15328, 234, 37453, 5, 26, 1119, 6, 15117, 945, 12, 8213, 313, 6, 3375, 2

In [43]:
# # !pip install accelerate -U
# import gc
# import torch

# del trainer
# gc.collect()
# torch.cuda.empty_cache()

In [44]:
# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 8
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 1

model.to("cuda")

# Set up training arguments.
# With the 500000-example training subset and a per-device batch of 8, one
# epoch is 62500 optimizer steps (assuming a single device and no gradient
# accumulation), so evaluation runs every tenth of the epoch and a
# checkpoint is written at the halfway point and at the end.
training_args = Seq2SeqTrainingArguments(
    output_dir="./kaggle/working/results",
    evaluation_strategy="steps",
    eval_steps=6250,
    learning_rate=L_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    weight_decay=WEIGHT_DECAY,
    save_total_limit=SAVE_TOTAL_LIM,
    num_train_epochs=NUM_EPOCHS,
    predict_with_generate=True,
    push_to_hub=False,
    save_strategy="steps",
    save_steps=31250,
    # logging_steps=100,  # Log training loss every 100 steps
    # logging_dir=None,  # No logging directory, print to console
    report_to='none',
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics
)

# NOTE: a trailing `dataloader_config = DataLoaderConfiguration(...)` statement
# was removed from this cell. `DataLoaderConfiguration` was never imported, so
# the cell raised NameError on a fresh kernel, and the resulting object was
# not passed to the trainer or the training arguments.


In [45]:
trainer.train()

Step,Training Loss,Validation Loss


KeyboardInterrupt: 