In [1]:
# Name of the pretrained checkpoint that the tokenizer and model are loaded from.
MODEL = "t5-base"
# Directory passed as `cache_dir` when loading the pretrained tokenizer and model.
CACHE_DIR = '/data/ondovbd/.cache/huggingface/transformers/'

In [2]:
from datasets import Dataset
import pandas as pd

# Training split: every row of the CSV is used.
df_train = pd.read_csv("cloth-train.csv")
# Validation split: only the first 1000 rows are kept.
df_val = pd.read_csv("cloth-valid.csv").head(1000)

# Wrap both frames as `datasets.Dataset` objects tagged with their split name.
train_ds = Dataset.from_pandas(df_train, split="train")
val_ds = Dataset.from_pandas(df_val, split="val")

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Pretrained tokenizer for MODEL, configured with a 512-token maximum length.
tokenizer = T5Tokenizer.from_pretrained(
    MODEL,
    cache_dir=CACHE_DIR,
    model_max_length=512,
)
# Pretrained seq2seq model for the same checkpoint, loaded through the same cache directory.
model = T5ForConditionalGeneration.from_pretrained(
    MODEL,
    cache_dir=CACHE_DIR,
)

In [4]:
# Truncation limits (in tokens) for the encoder input and the decoder target.
max_input_length = 512
max_target_length = 30


def preprocess_function(examples):
    """Tokenize a batch of source/target text pairs for seq2seq training.

    Args:
        examples: A batch mapping whose ``"source"`` entry holds the input
            texts and whose ``"target"`` entry holds the texts to generate.

    Returns:
        The tokenizer output for the sources, with an added ``"labels"``
        entry containing the token ids of the targets.

    Sequences are truncated to ``max_input_length`` / ``max_target_length``
    but are deliberately NOT padded here. Padding the targets at this stage
    would store pad-token ids inside ``labels``, and the training loss would
    then be computed on padding. Leaving the sequences unpadded lets
    ``DataCollatorForSeq2Seq`` pad each training batch on the fly and fill
    the label padding with the ignore index (-100), so padding never
    contributes to the loss. It also avoids padding every example to the
    longest sequence of a whole ``map`` batch.
    """
    model_inputs = tokenizer(
        examples["source"],
        max_length=max_input_length,
        truncation=True,
    )
    labels = tokenizer(
        examples["target"],
        max_length=max_target_length,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



In [5]:
import pickle

# Tokenize both splits; `batched=True` hands `preprocess_function` a batch of rows per call.
tokenized_train = train_ds.map(preprocess_function, batched=True)
tokenized_val = val_ds.map(preprocess_function, batched=True)

# Cache the tokenized splits on disk. The `with` blocks guarantee each file
# is flushed and closed once its dump has finished.
with open('cloth-train.pkl', 'wb') as train_file:
    pickle.dump(tokenized_train, train_file)
with open('cloth-valid.pkl', 'wb') as valid_file:
    pickle.dump(tokenized_val, valid_file)

In [6]:
# Reload the cached tokenized splits from disk.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# this notebook produced itself.
with open('cloth-train.pkl', 'rb') as train_file:
    ds_train = pickle.load(train_file)
with open('cloth-valid.pkl', 'rb') as valid_file:
    ds_valid = pickle.load(valid_file)

In [7]:
from transformers import DataCollatorForSeq2Seq

# Batch collator for seq2seq training, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [8]:
from transformers import Seq2SeqTrainingArguments

# Per-device batch size (used for both training and evaluation) and the
# number of passes over the training set.
batch_size = 64
epochs = 30

args = Seq2SeqTrainingArguments(
    output_dir="text2text-t5-base",       # checkpoints are written under this directory
    evaluation_strategy="epoch",          # run evaluation once per epoch
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_total_limit=10,                  # cap on the number of checkpoints kept on disk
    num_train_epochs=epochs,              # driven by `epochs` above instead of a second literal
)

In [9]:
from transformers import Seq2SeqTrainer

# Assemble the trainer from the model, the training arguments, the cached
# tokenized splits and the seq2seq collator. No `compute_metrics` is supplied.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=ds_train,
    eval_dataset=ds_valid,
    tokenizer=tokenizer,
)

In [None]:
# Start fine-tuning; `resume_from_checkpoint=False` asks for a fresh run
# rather than continuing from a saved checkpoint.
trainer.train(resume_from_checkpoint=False)



Epoch,Training Loss,Validation Loss


In [5]:
from transformers import pipeline
# Text-to-text generation pipeline backed by the checkpoint saved at step 72000.
translator = pipeline(
    task="text2text-generation",
    model="text2text-t5-base/checkpoint-72000",
)

In [30]:
# Example query: a sentence containing a `_` blank, then `[SEP]`, then a single word.
# NOTE(review): the word after `[SEP]` is presumably the correct answer for the
# blank, with the model generating alternatives; confirm against the training data format.
translator("Mrs Brown is from London in  the UK  , but she  _  living in China. [SEP] likes")

[{'generated_text': 'is likes liking'}]

In [27]:
# Example query: a short passage, then `[SEP]`, then a single word.
# NOTE(review): unlike the query above, this passage contains no `_` blank; confirm that is intended.
translator("Everyone has their own dreams. They give us courage and confidence to keep us going through difficulties. Here is some advice on how to realize our beautiful dreams. [SEP] pleasant")

[{'generated_text': 'sad bad happy'}]