In [None]:
# Install the training stack quietly into the running kernel's environment.
# `sentencepiece` backs the SentencePiece-based T5Tokenizer loaded below and
# `wandb` is imported by the logging cell, so both are installed explicitly.
%pip install -q transformers datasets accelerate torch tokenizers sentencepiece wandb

In [None]:
# Resume-only step: append "username/model_name" to the URL below to clone the
# model repository that holds the earlier checkpoints. Skip this cell for a
# fresh training run -- as written, the URL has no repository path.
!git clone https://huggingface.co/  #Add username/model_name only if resuming training

In [None]:
# Resume-only check: list the contents of the `last-checkpoint` folder inside
# the `t5-obsrvr` directory (presumably the repository cloned in the previous
# cell -- adjust the directory name to match your model).
!ls t5-obsrvr/last-checkpoint/ # only if resuming training

In [None]:
# List the files in the Kaggle input directory that the data-loading cell reads from.
!ls /kaggle/input/obsrvr-data/

In [None]:
from datasets import load_dataset

BASE = "/kaggle/input/obsrvr-data/"

# Fixed seed so train/validation/test membership is identical on every run.
# Without it a resumed session reshuffles the data, and examples that were
# already trained on can land in the validation or test split.
SPLIT_SEED = 42

# Load the raw JSON records into a single split named "data".
raw_dataset = load_dataset("json", data_files={"data": BASE + "data_file.json"})  # here replace data_file with your file

# Hold out 15% of all records as the test split.
dataset = raw_dataset["data"].train_test_split(test_size=0.15, seed=SPLIT_SEED)

# Carve a further 15% of the remaining training records out as the validation split.
train_val_split = dataset["train"].train_test_split(test_size=0.15, seed=SPLIT_SEED)
dataset["train"] = train_val_split["train"]
dataset["validation"] = train_val_split["test"]

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Checkpoint to start from: "t5-small" or "t5-base" for a new run, or
# "username/model_name" when continuing from a previously pushed model.
model_name = "t5-small"
# Tokenizer checkpoint: "t5-small" or "t5-base", matching the model chosen above.
tokenizer_name = "t5-small"

# The two loads are independent of each other.
model = T5ForConditionalGeneration.from_pretrained(
    model_name,
    cache_dir=None,
)
tokenizer = T5Tokenizer.from_pretrained(
    tokenizer_name,
    cache_dir=None,
)

In [None]:
def preprocess_function(examples, max_input_length=512, max_target_length=256):
    """Tokenize a batch of input/output text pairs for seq2seq training.

    Args:
        examples: Batch mapping with an "input" and an "output" entry, each a
            sequence of strings of the same length.
        max_input_length: Length every source sequence is truncated/padded to.
        max_target_length: Length every target sequence is truncated/padded to.

    Returns:
        The tokenizer output for the inputs, extended with a "labels" entry
        holding the target token ids. Padding positions in the labels are set
        to -100 so the loss ignores them.
    """
    model_inputs = tokenizer(
        list(examples["input"]),
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )
    labels = tokenizer(
        list(examples["output"]),
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )

    # Targets are padded to a fixed length here, so the data collator never gets
    # a chance to pad them itself; mask the pad ids explicitly, otherwise the
    # model is trained to predict padding tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Apply preprocess_function to every split, one batch of examples at a time.
tokenized_data = dataset.map(
    preprocess_function,
    batched=True,
)

In [None]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub from inside the notebook.
# Presumably only required when push_to_hub=True or when a private repository
# is used -- confirm for your setup.
notebook_login()

In [None]:
from transformers import TrainingArguments

# Training configuration. Evaluation, checkpointing and logging all run every
# 500 steps; keep save_steps a multiple of eval_steps, because
# load_best_model_at_end=True needs an evaluation result for each checkpoint.
#
# Resuming is deliberately NOT configured here: Trainer does not read
# TrainingArguments.resume_from_checkpoint, so setting it has no effect.
# Pass the checkpoint path to trainer.train(resume_from_checkpoint=...) instead.
training_args = TrainingArguments(
    output_dir="./out_dir",
    run_name="OBSRVR_run1",
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_steps=500,
    learning_rate=2e-5,  # you can change here
    warmup_steps=500,
    logging_first_step=True,
    per_device_train_batch_size=4,  # you can change here
    per_device_eval_batch_size=4,
    num_train_epochs=5,  # you can change here
    weight_decay=0.01,  # you can change here
    logging_dir="./logs",
    save_total_limit=2,  # keep at most two checkpoints on disk
    load_best_model_at_end=True,
    push_to_hub=False,  # set True to save at your huggingface account
    hub_model_id="username/model_name",  # username/model_name
    hub_strategy="checkpoint",
    metric_for_best_model="loss",  # pick the best checkpoint by evaluation loss
)

In [None]:
from transformers import Trainer, DataCollatorForSeq2Seq

# Collator that assembles the tokenized examples into batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Train on the "train" split and evaluate on the "validation" split;
# the "test" split is not handed to the trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
)

In [None]:
import wandb

# Start the Weights & Biases run for this training session in offline mode.
wandb.init(
    project="OBSRVR",
    name="OBSRVR_run1",
    mode="offline",
    config={},
)

In [None]:
# Run the training loop. To continue a run that was stopped, call
# trainer.train(resume_from_checkpoint="./out_dir/last-checkpoint") instead,
# replacing ./out_dir with the output_dir chosen in the training arguments.
# NOTE(review): confirm the checkpoint folder really exists at that path
# (it may live inside the cloned model repository) before resuming.
trainer.train()