# Prepare Experiment & DeepSpeed config (**MANDATORY**)
***

In [None]:
# DeepSpeed configuration used when `deepspeed` is enabled further down.
# Entries set to "auto" are placeholders that the Hugging Face Trainer's
# DeepSpeed integration fills in from the TrainingArguments.
ds_config = {
    # Mixed-precision settings; a loss_scale of 0 selects dynamic loss scaling.
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1,
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto",
        },
    },
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": True,
        "allgather_bucket_size": 5e8,
        "overlap_comm": True,
        "reduce_scatter": True,
        "reduce_bucket_size": 5e8,
        "contiguous_gradients": True,
        "offload_optimizer": {
            "device": "none",
        },
        # The DeepSpeed key is "offload_param" (singular); the previous
        # "offload_params" spelling is not a recognised option.
        # NOTE(review): parameter offloading is documented for ZeRO stage 3;
        # with "device": "none" it stays disabled here either way.
        "offload_param": {
            "device": "none",
        },
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 200,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": False,
}

# Plain keyword settings that are later unpacked into `TrainingArguments(...)`.
training_args = dict(
    # Which phases to run.
    do_train=True,
    do_eval=True,
    # Optimisation schedule.
    num_train_epochs=4,
    gradient_accumulation_steps=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=256,
    fp16=True,
    weight_decay=0.0,
    warmup_steps=0,
    learning_rate=1e-5,
    # Log, evaluate and checkpoint once per epoch; keep a single checkpoint.
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # Best-model selection is configured but switched off.
    load_best_model_at_end=False,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
)

# Experiment defaults; usually overridden by external config.

# Model and input handling.
model_name = "bert-base-uncased"
block_size = 128        # max_length used for padding/truncation when tokenizing
action_only = False     # True: tokenize only the action, without the judgment text
load_pretrained_weights = True  # not referenced in the visible cells

# Data and output locations (paths are concatenated as-is, hence the trailing "/").
dataset_folder = "data/contrastive_moral_stories/anti_ms_splits_only/action+norm/norm_distance/"
logdir = "data/models/bert-base-uncased/contra_ms/"
override_logdir = True

# Which weights to start from: None (use `model_name`), "first"/"last"
# (a checkpoint inside `logdir`), "no-pretraining", or an explicit path.
from_checkpoint = None

# Hardware / launcher options.
num_gpus = 1            # not referenced in the visible cells
deepspeed = False

In [None]:
import os

# Process-wide environment switches, set here ahead of the library imports below.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    "NCCL_P2P_DISABLE": "1",
})
# Debugging aid, left disabled:
#os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Without DeepSpeed, pin the process to the GPU with index 1.
if not deepspeed:
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import numpy as np
import pandas as pd
from datasets import load_dataset
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import datasets
from transformers import Trainer, TrainingArguments

# Show up to 400 characters per cell when DataFrames are displayed.
pd.options.display.max_colwidth = 400

# Tokenize the dataset
***

In [None]:
# Build the Hugging Face TrainingArguments from the settings dict defined earlier.
# NOTE: this rebinds `training_args` from a dict to a TrainingArguments object, so
# the configuration cell has to be re-run before this cell can be executed again.

# Prefer a ds_config.json that already exists in `logdir`; otherwise fall back to
# the in-memory `ds_config` dict (TrainingArguments accepts either a path or a
# dict), since this notebook itself never writes that file.
_ds_config_file = os.path.join(logdir, "ds_config.json")
_ds_setting = None
if deepspeed:
    _ds_setting = _ds_config_file if os.path.isfile(_ds_config_file) else ds_config

training_args = TrainingArguments(
    output_dir=logdir,
    overwrite_output_dir=override_logdir,
    logging_dir=logdir,
    deepspeed=_ds_setting,
    report_to="tensorboard",
    **training_args,
)

In [None]:
# Model names containing "Eleuther" get explicit BOS/EOS/PAD tokens;
# every other model uses the tokenizer's own defaults.
special_tokens = {}
if "Eleuther" in model_name:
    special_tokens = dict(
        bos_token='<|startoftext|>',
        eos_token='<|endoftext|>',
        pad_token='<|pad|>',
    )

tokenizer = AutoTokenizer.from_pretrained(model_name, **special_tokens)

In [None]:
# Quick look at the first record of the raw training split.
t = pd.read_json(f"{dataset_folder}train.jsonl", lines=True)
t.head(1)

In [None]:
def load_action_norm_split(path):
    """Load the train/dev/test JSON-lines files found in directory *path*.

    Each split is read from ``<path>/<split>.jsonl``. The path is joined with
    ``os.path.join``, so *path* works with or without a trailing separator.
    For every row an ``action`` column is derived: the ``moral_action`` when
    ``label`` equals 1, otherwise the ``immoral_action``.

    Returns:
        A ``(train, dev, test)`` tuple of DataFrames restricted to the columns
        ``flipped-rot-judgment``, ``action`` and ``label``.
    """
    columns = ["flipped-rot-judgment", "action", "label"]

    def _select_action(row):
        # Pick the action that matches the row's label.
        return row["moral_action"] if row["label"] == 1 else row["immoral_action"]

    def _load(split):
        # Read one split, add the derived column, and keep only what is needed.
        frame = pd.read_json(os.path.join(path, f"{split}.jsonl"), lines=True)
        frame["action"] = frame.apply(_select_action, axis=1)
        return frame[columns]

    train, dev, test = (_load(split) for split in ("train", "dev", "test"))
    return train, dev, test

# Load the three splits and wrap each DataFrame as a `datasets.Dataset`.
train, dev, test = load_action_norm_split(dataset_folder)

dataset = datasets.DatasetDict({
    "train": datasets.Dataset.from_pandas(train),
    "dev": datasets.Dataset.from_pandas(dev),
    "test": datasets.Dataset.from_pandas(test),
})

In [None]:
def tokenize(samples):
    """Tokenize a batch, padding and truncating every example to `block_size`.

    The action text is always encoded. Unless `action_only` is set, the
    "flipped-rot-judgment" text is passed as the second sequence of the pair.
    """
    texts = [samples["action"]]
    if not action_only:
        texts.append(samples["flipped-rot-judgment"])
    return tokenizer(*texts, truncation=True, padding="max_length", max_length=block_size)

# Tokenize all splits in batches of 10000 examples, then shuffle each split.
tokenized_data = dataset.map(tokenize, batched=True, batch_size=10000)
tokenized_data = tokenized_data.shuffle()
#tokenized_data.save_to_disk("data/tokenized_data/")

# Load the model

In [None]:
import re

# Resolve where the weights come from:
#   None             -> the pretrained `model_name`
#   "first" / "last" -> the lowest / highest numbered checkpoint in `logdir`
#   "no-pretraining" -> not supported yet
#   anything else    -> used verbatim as a model path or identifier
if from_checkpoint is None:
    path = model_name
elif from_checkpoint in {"first", "last"}:
    # Only consider entries named "checkpoint-<step>", so unrelated entries that
    # merely start with "checkpoint" cannot break the numeric sort.
    ckpt_steps = {}
    for entry in os.listdir(logdir):
        match = re.match(r"checkpoint-(\d+)", entry)
        if match:
            ckpt_steps[entry] = int(match.group(1))
    if not ckpt_steps:
        raise FileNotFoundError(
            f"No 'checkpoint-<step>' entries found in {logdir!r}; "
            f"cannot load the {from_checkpoint} checkpoint."
        )
    ckpts = sorted(ckpt_steps, key=ckpt_steps.get)
    ckpt = ckpts[0 if from_checkpoint == "first" else -1]
    print("ATTEMPTING TO LOAD CHECKPOINT", os.path.join(logdir, ckpt))
    path = os.path.join(logdir, ckpt)
elif from_checkpoint == "no-pretraining":
    # TODO
    raise NotImplementedError()
else:
    path = from_checkpoint

model = AutoModelForSequenceClassification.from_pretrained(path, num_labels=2)

# The tokenizer may have been given extra special tokens (see the "Eleuther"
# branch where it is created). Grow the embedding matrix when the tokenizer is
# larger than it, so the new token ids are valid, and make sure the model knows
# the padding id. Both are no-ops when the model already matches the tokenizer.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))
if model.config.pad_token_id is None and tokenizer.pad_token_id is not None:
    model.config.pad_token_id = tokenizer.pad_token_id


# Prepare Trainer
***

In [None]:
def compute_metrics(eval_pred):
    """Return classification accuracy for a ``(predictions, labels)`` pair.

    ``predictions`` holds per-class scores; the predicted class is the argmax
    along axis 1. Accuracy is computed directly with NumPy instead of
    ``datasets.load_metric``, which is deprecated and absent from recent
    ``datasets`` releases.

    Returns:
        ``{"accuracy": <fraction of predictions equal to the labels>}``
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

In [None]:
# Assemble the Trainer. Each dataset is attached only when its phase is enabled.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"] if training_args.do_train else None,
    # Gate the dev split on `do_eval` (it was previously gated on `do_train`),
    # so an evaluation-only run still has an eval dataset.
    eval_dataset=tokenized_data["dev"] if training_args.do_eval else None,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning only when training is enabled in the training arguments.
if training_args.do_train:
    trainer.train()

In [None]:
# When evaluation is enabled, evaluate every split (train, dev and test); the
# metric keys are prefixed with "test_<split>". `r` ends up holding the result
# of the last split evaluated.
if training_args.do_eval:
    for split in tokenized_data:
        r = trainer.evaluate(tokenized_data[split], metric_key_prefix="test_" + split)