# Prepare Experiment & DeepSpeed config (**MANDATORY**)
***

In [53]:
# DeepSpeed runtime configuration.
# Entries set to "auto" are left for the Hugging Face Trainer's DeepSpeed
# integration to fill in from the training arguments.
ds_config = {
    # Mixed-precision settings.
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,  # 0 selects dynamic loss scaling (see DeepSpeed fp16 options)
        "loss_scale_window": 1000,
        "initial_scale_power": 24,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    # ZeRO stage 2 with both offload targets disabled.
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": True,
        "allgather_bucket_size": 5e8,
        "overlap_comm": True,
        "reduce_scatter": True,
        "reduce_bucket_size": 5e8,
        "contiguous_gradients": True,
        "offload_optimizer": {
            "device": "none",
        },
        # DeepSpeed's key is `offload_param` (singular); the previous
        # `offload_params` spelling is not a recognised option.
        "offload_param": {
            "device": "none"
        },
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 200,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": False
}

# Keyword arguments for `Seq2SeqTrainingArguments`.
# NOTE: the trainer-preparation cell unpacks this dict and then rebinds the
# name `training_args` to the resulting arguments object.
training_args = dict(
    # checkpointing / logging / evaluation cadence
    save_steps=100,
    save_total_limit=2,
    logging_steps=50,
    evaluation_strategy="epoch",
    # batch geometry
    per_device_train_batch_size=8,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=1,
    # optimisation schedule
    num_train_epochs=10,
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    warmup_steps=0,
    weight_decay=0.0,
    # mixed precision
    fp16=True,
)

# --- Run configuration ---
# Checkpoint to fine-tune. Alternatives that were tried or considered:
#   "EleutherAI/gpt-neo-2.7B", "EleutherAI/gpt-j-6B"
# NOTE(review): the model is loaded with AutoModelForSeq2SeqLM further down;
# confirm those alternatives are compatible with that class before switching.
model_name = "t5-small"
num_gpus = 1

# Where checkpoints and logs are written, and whether an existing directory
# may be overwritten.
logdir = "data/models/t5-small/rot-splitter/"
override_logdir = True

# Maximum sequence length used for the tokenizer and for padding/truncation.
block_size = 128
# Separator token placed between the judgment and the action in the targets.
out_token = "[ROT]"

In [77]:
import numpy as np
import pandas as pd
from datasets import load_dataset
import time
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import datasets
from tqdm import tqdm
import nltk

# Show up to 400 characters per cell so long text columns stay readable.
pd.options.display.max_colwidth = 400


# Tokenize the dataset
***

In [55]:
# Load the tokenizer for `model_name`, capped at `block_size` tokens.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    model_max_length=block_size,
)
# Register the separator as a special token; the call's return value is shown
# as the cell output.
tokenizer.add_special_tokens(dict(additional_special_tokens=[out_token]))

1

# Round-trip a sentence containing the separator to inspect how it is
# tokenised. `out_token` is used (rather than a hard-coded "[ROT]") so the
# check keeps exercising the configured separator if it is ever changed.
input_ids = tokenizer([f"Hello {out_token} Again"])["input_ids"]
print(input_ids)
tokenizer.decode(input_ids[0], skip_special_tokens=False)

# Decode a hand-written id sequence with special tokens stripped.
# NOTE(review): 7204 and 1 are hard-coded ids, presumably copied from a
# printed `input_ids` — confirm they still match the current vocabulary.
tokenizer.decode(token_ids=[7204, 1], skip_special_tokens=True)

In [56]:
from sklearn.model_selection import train_test_split

def load_social_chem101(path="data/social-chem-101/social-chem-101.v1.0.tsv"):
    """Read the Social-Chem-101 dataset into a DataFrame.

    Args:
        path: Location of the tab-separated dataset file. Defaults to the
            path used throughout this notebook.

    Returns:
        pandas.DataFrame with one row per line of the TSV file.
    """
    return pd.read_csv(path, sep="\t")

# Build the filtered rule-of-thumb table in one chain. Each step operates on
# the result of the previous one, in the same order as before:
#   1. drop rows missing any field used below,
#   2. keep rows with rot-agree >= 3.0 and rot-bad == 0,
#   3. keep the "morality-ethics" / "social-norms" categories,
#   4. drop judgments containing "{",
#   5. keep rows whose text lengths (in characters) fit within `block_size`,
#   6. keep the first row for every distinct `rot`.
# (Disabled filter: restrict to rows whose "split" column equals "train".)
social_chem = (
    load_social_chem101()
    .dropna(subset=["rot-categorization", "rot-judgment", "action", "rot-agree"])
    .pipe(lambda df: df[df["rot-agree"] >= 3.0])
    .pipe(lambda df: df[df["rot-bad"] == 0])
    .pipe(lambda df: df[df["rot-categorization"].apply(
        lambda x: "morality-ethics" in x or "social-norms" in x)])
    .pipe(lambda df: df[df["rot-judgment"].apply(lambda x: "{" not in x)])
    .pipe(lambda df: df[df.apply(
        lambda x: max(len(x["rot"]), len(x["action"]) + len(x["rot-judgment"])) <= block_size,
        axis=1)])
    .pipe(lambda df: df[["action", "rot-judgment", "rot"]])
    .groupby("rot", as_index=False)
    .nth(0)
)

# Hold out 10% of the rows as the dev split (fixed seed for reproducibility).
train, dev = train_test_split(social_chem, test_size=0.1, random_state=42)

In [57]:
# Wrap the pandas splits as `datasets.Dataset` objects. The shuffles are
# seeded so that a fresh "Restart & Run All" reproduces the same row order.
train_data = datasets.Dataset.from_pandas(train).shuffle(seed=42)
dev_data = datasets.Dataset.from_pandas(dev).shuffle(seed=42)

In [58]:
def tokenize_input(samples):
    """Tokenize a batch of examples for seq2seq training.

    The encoder input is the rule of thumb (`rot`); the target is
    "<rot-judgment> <out_token> <action>". Both are truncated and padded to
    `block_size`.

    Args:
        samples: Batch mapping with the columns "rot", "rot-judgment" and
            "action", each a list of strings.

    Returns:
        The tokenizer output for the inputs with an added "labels" entry.
        Padding positions in the labels are set to -100 so that they are
        ignored by the loss instead of being learned as regular targets.
    """
    inp = tokenizer(samples["rot"], truncation=True, padding="max_length", max_length=block_size)
    targets = [
        judgment + " " + out_token + " " + action
        for judgment, action in zip(samples["rot-judgment"], samples["action"])
    ]
    with tokenizer.as_target_tokenizer():
        out = tokenizer(targets, truncation=True, padding="max_length", max_length=block_size)
    # Labels were padded to a fixed length above; mask the pad ids so the
    # cross-entropy loss skips them.
    pad_id = tokenizer.pad_token_id
    inp["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in out["input_ids"]
    ]
    return inp

# Tokenize both splits (train first, then dev) in batches of 10000 rows.
tokenized_train, tokenized_dev = (
    split.map(tokenize_input, batched=True, batch_size=10000)
    for split in (train_data, dev_data)
)

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

# Load the model

In [8]:
# Load the pretrained seq2seq checkpoint named by `model_name`.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# The tokenizer gained `out_token`, so resize the embedding matrix to the
# tokenizer's current vocabulary size.
model.resize_token_embeddings(new_num_tokens=len(tokenizer))

# Prepare Trainer
***

In [9]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

import torch

# Batch collator for seq2seq examples, bound to the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Build the trainer arguments from the configuration cell.
# NOTE: at this point `training_args` is still the plain dict of keyword
# arguments; this statement rebinds the name to the arguments object, so the
# cell cannot be re-run without re-running the configuration cell first.
training_args = Seq2SeqTrainingArguments(
    output_dir=logdir,
    overwrite_output_dir=override_logdir,
    logging_dir=logdir,
    # Hand over the in-memory DeepSpeed config directly. The previous value,
    # logdir + "/ds_config.json", produced a double slash (logdir already ends
    # with "/") and pointed at a file that no cell in this notebook writes.
    deepspeed=ds_config,
    report_to="tensorboard",
    predict_with_generate=True,
    **training_args
)

In [None]:
# Assemble the seq2seq trainer from the model, arguments, tokenized splits and
# collator prepared above, then start fine-tuning.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    # compute_metrics=compute_metrics,  # disabled; no such function is defined in the visible cells
)
trainer.train()

In [59]:
# Replace `model` with fine-tuned weights from a saved checkpoint and move it
# to the GPU.
# NOTE(review): this path is a sub-directory ("bs128_lr_0_0001") that does not
# match the `logdir` configured at the top of the notebook — confirm that it
# is the intended run.
model = AutoModelForSeq2SeqLM.from_pretrained(
    "data/models/t5-small/rot-splitter/bs128_lr_0_0001/checkpoint-1100/"
)
model = model.cuda()

In [91]:
# Qualitative check: generate splits for a few hand-picked dev examples.
# `k` holds index labels of `dev`; swap in `dev.sample(10)` for a random probe.
k = [66880, 87715, 267810, 92868]
# Explicit copy so that adding the "split" column below never touches `dev`.
sample = dev.loc[k].copy()
# Same padding/truncation policy as the training preprocessing.
x = tokenizer(sample["rot"].to_list(), padding="max_length", truncation=True, return_tensors="pt")
# Follow the model's device instead of assuming CUDA.
x = {name: tensor.to(model.device) for name, tensor in x.items()}
# Beam search constrained to emit the separator token.
# NOTE(review): top_p / top_k / temperature are sampling knobs; confirm whether
# they have any effect here, since `do_sample` is not enabled.
y = model.generate(**x, min_length=1, max_length=block_size, top_p=0.95, top_k=50,
                   num_beams=10, temperature=1, force_words_ids=[tokenizer.additional_special_tokens_ids])
# Keep special tokens in the decoded text so the separator stays visible.
sample["split"] = tokenizer.batch_decode(y, skip_special_tokens=False)
sample[["action", "rot-judgment", "split"]]

Unnamed: 0,action,rot-judgment,split
66880,not inviting all of their family to an event can appear as petty or disloyal to the rest of their family.,refrain from,<pad> can appear as petty or disloyal to the rest of their family.[ROT] People who don't invite all of their family to an event</s>
87715,appreciating the life you have,You should,<pad> you should[ROT] appréciing the life you have</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
267810,being grateful when someone helps to save the company you work for.,It's understandable,<pad> It's understoodable[ROT] being grateful when someone helps to save the company you work for.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
92868,being angry when your ex best friend ruins your life.,It's understandable,<pad> It's understoodable[ROT] being angry when your ex best friend ruins your life.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
