# Prompting masked LMs
***

In [None]:
# Run configuration for this notebook.
batch_size = 64
num_gpus = 1
model_name = "gpt2"
logdir = "data/models/tests/"
prompt_dir = "data/prompts/topics/"
from_checkpoint = None #"data/models/masked_classification/contrastive-moral-stories/gpt2/bs32_lr_1e-05/"
# Whether `from_checkpoint` points to a directory of multiple checkpoints for the same architecture.
# If True, this script will load the weights consecutively without creating the model again for each of the state_dicts.
# This saves a lot of time.
# Note: `from_checkpoint` is expected to point to a dir of dirs, each of which is a valid argument for a singular run.
# The flag must be defined because the checkpoint-discovery cell reads it; leaving it
# commented out made that cell fail with a NameError.
multi_checkpoints = False
override_logdir = True

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

#os.environ["CUDA_VISIBLE_DEVICES"] = "8"

import numpy as np
import torch
import pandas as pd
from datasets import load_dataset
import time
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import datasets
from social_chem import load_ms_soc_joined
import fastmodellib as fml
from torch.utils.tensorboard import SummaryWriter

pd.set_option('display.max_colwidth', 400)

In [None]:
# Trainer configuration for an evaluation-only run (training is switched off).
training_args = TrainingArguments(
    # run modes
    do_train=False,
    do_eval=True,
    do_predict=True,
    # output and logging locations
    output_dir=logdir,
    overwrite_output_dir=override_logdir,
    logging_dir=logdir,
    report_to="tensorboard",
    # evaluation settings
    per_device_eval_batch_size=batch_size,
    fp16=True,
    # currently disabled options:
    #include_inputs_for_metrics=True,
    #eval_accumulation_steps=32,
)

## Preparing args
***

In [None]:
# Resolve `from_checkpoint` into `checkpoints`: the list of checkpoint paths
# (or [None]) from which the model-loading cell takes its first entry.
import pathlib


def _checkpoint_sort_key(path):
    """Order `checkpoint-<step>` directories by numeric step; other names come last."""
    suffix = path.name.rsplit("-", 1)[-1]
    if suffix.isdigit():
        return (0, int(suffix), path.name)
    return (1, 0, path.name)


# The flag may be missing from the parameter cell; treat "not defined" as False
# instead of failing with a NameError.
_multi_checkpoints = bool(globals().get("multi_checkpoints", False))

if from_checkpoint is None:
    if _multi_checkpoints:
        raise ValueError("Need a valid directory for parameter `from_checkpoint`")
    checkpoints = [None]
elif isinstance(from_checkpoint, list):
    # an explicit list of checkpoints is used as given
    checkpoints = list(from_checkpoint)
elif _multi_checkpoints and isinstance(from_checkpoint, str):
    # extract paths
    checkpoints = fml.persistence.find_checkpoints(from_checkpoint)
else:
    print("Checkpoint given:", from_checkpoint)
    if fml.persistence.is_checkpoint_dir(from_checkpoint):
        checkpoints = [from_checkpoint]
        print("Checkpoint was found", checkpoints)
    else:
        # glob order depends on the file system; sort so that the checkpoint
        # picked via `checkpoints[0]` is the same on every run
        candidates = sorted(pathlib.Path(from_checkpoint).glob("checkpoint-*"), key=_checkpoint_sort_key)
        checkpoints = [str(x) for x in candidates if fml.persistence.is_checkpoint_dir(x)]
        print("Found checkpoints in subdirectories:", checkpoints)
        if len(checkpoints) == 0:
            raise ValueError(f"Found no checkpoint in dir '{from_checkpoint}'")

## Loading model + tokenizer
***

In [None]:
# Load the tokenizer that belongs to `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Force right padding regardless of the tokenizer's own default.
# (the original author singles out llama as the tokenizer this is needed for)
tokenizer.padding_side = "right"

In [None]:
# Build the model once, initialised from the first resolved checkpoint
# (`checkpoints[0]` is None when no checkpoint was requested).
model = fml.load_model(
    model_name=model_name,
    from_checkpoint=checkpoints[0],
    load_pretrained_weights=True,
    model_class=AutoModelForCausalLM,
)

# Loading data
***

In [None]:
def _read_word_list(path):
    """Return the non-empty, non-comment lines of one lexicon file.

    Lines starting with ";" are treated as comments; surrounding whitespace is
    stripped from the remaining lines and blank ones are dropped.
    """
    with open(path, encoding="latin1") as f:
        stripped = (line.strip() for line in f if not line.startswith(";"))
        return [word for word in stripped if word]


def load_opinion_lexicon(lexicon_dir="data/opinion-lexicon-English"):
    """Load the positive and negative word lists of the opinion lexicon.

    Args:
        lexicon_dir: directory containing `positive-words.txt` and
            `negative-words.txt` (latin1 encoded, ";" marks comment lines).

    Returns:
        Tuple `(positive, negative)` of word lists in file order.

    Raises:
        OSError: if one of the two files cannot be opened.
    """
    # the negative list is read first, as before, so a missing file surfaces in the same order
    negative = _read_word_list(os.path.join(lexicon_dir, "negative-words.txt"))
    positive = _read_word_list(os.path.join(lexicon_dir, "positive-words.txt"))
    return positive, negative

In [None]:
positive, negative = load_opinion_lexicon()

# We expect gpt-like models to generate whole words with a whitespace in front.
# Empirically, gpt-like models have very few non-whitespace words.
# llama, however, behaves much more like bert-like models, i.e. it has no words
# with whitespaces in front.
# The check is case-insensitive so that names such as ".../Llama-2-7b-hf" are
# recognised as llama models too.
if "llama" not in model_name.lower():
    positive = [" " + x for x in positive]
    negative = [" " + x for x in negative]

In [None]:
# Keep only the lexicon words that the tokenizer encodes as exactly one token,
# mapping each such word to its (one-element) list of token ids.
pos_enc = {
    word: ids
    for word, ids in zip(positive, tokenizer(positive, add_special_tokens=False)["input_ids"])
    if len(ids) == 1
}
neg_enc = {
    word: ids
    for word, ids in zip(negative, tokenizer(negative, add_special_tokens=False)["input_ids"])
    if len(ids) == 1
}

# Flatten the one-element id lists into plain lists of token ids.
pos_ids = [token_id for ids in pos_enc.values() for token_id in ids]
neg_ids = [token_id for ids in neg_enc.values() for token_id in ids]

all_ids = pos_ids + neg_ids

print("Positive words:", len(pos_ids))
print("Negative words:", len(neg_ids))

### Loading prompts
***

In [None]:
# Build one dataset per prompt file; only prompts that end with a mask are kept.
# Sorted so that the task order does not depend on the file system's listing order.
prompt_files = sorted(x for x in os.listdir(prompt_dir) if x.endswith(".jsonl"))
dataset = datasets.DatasetDict()
# the first single-token word of each polarity serves as the artificial target word
pos_label_word = next(iter(pos_enc.keys()))
neg_label_word = next(iter(neg_enc.keys()))

for pf in prompt_files:
    # os.path.join works whether or not `prompt_dir` ends with a separator
    d = pd.read_json(os.path.join(prompt_dir, pf), orient="records", lines=True)

    # 1: norm has positive moral judgment, 0 negative
    d["original_label"] = (d["action-moral-judgment"] > 0).astype("int32")

    # find all prompts that end with a mask
    mask_end = d["prompt"].map(lambda x: x.endswith("[MASK].\"") or x.endswith("[MASK]."))

    # copy, so that the column assignments below modify an independent frame
    # and not a slice of the original one (avoids SettingWithCopyWarning)
    d = d[mask_end].copy()

    if d.empty:
        continue

    print(pf)
    d["prompt"] = d["prompt"].apply(lambda x: x.removesuffix("[MASK].\"").removesuffix("[MASK]."))
    # We create artificial text targets by appending a fixed positive or negative word.
    # This way, we can infer whether an input should have been a positive or a negative norm during metric computation.
    d["prompt"] += d["original_label"].map(lambda x: pos_label_word.strip() if x == 1 else neg_label_word.strip())

    dataset[os.path.splitext(pf)[0]] = datasets.Dataset.from_pandas(d)

print(f"Loaded {len(dataset)} prompt tasks")

In [None]:
def tokenize(samples):
    """Tokenize the "prompt" entries of a batch; no padding is applied here."""
    prompts = samples["prompt"]
    return tokenizer(prompts, padding=False)


# Tokenize every prompt task in batches of 1000 samples.
tokenized_data = dataset.map(tokenize, batch_size=1000, batched=True)

In [None]:
from datasets import load_metric
import torch

def compute_metrics(eval_pred, positive_ids=None):
    """Score the positive/negative decision made for the final word of each prompt.

    The left-to-right model is fed the full prompt, i.e. the desired answer is
    part of the input ids.  That is safe because such a model cannot look at
    future tokens when predicting the current one.  Two pieces of information
    are needed per sample: which token would have been correct, and what the
    model predicted for it.  The correct token is the last label that is not
    the ignore index (-100); the model's prediction for that token sits one
    position to the left of it.

    The last non-ignored label is located directly, so the result does not
    depend on which side the padding was added.

    Args:
        eval_pred: object with `predictions` (per-position boolean decisions,
            True meaning "positive") and `label_ids` (token ids, -100 at
            ignored positions), both indexed as [sample, position].
        positive_ids: token ids that count as positive targets.  Defaults to
            the module-level `pos_ids`.

    Returns:
        Dict with "accuracy" (float) and "y_pred" (numpy array holding the
        decision for each sample's final word).
    """
    if positive_ids is None:
        positive_ids = pos_ids

    preds = torch.as_tensor(eval_pred.predictions)
    labels = torch.as_tensor(eval_pred.label_ids)

    # index of the last label that is not the ignore index: flip the sequence
    # axis, take the first real position and convert it back to an index
    is_real = (labels != -100).int()
    last_real = labels.shape[-1] - 1 - is_real.flip(-1).argmax(-1)

    rows = torch.arange(labels.shape[0])
    # the prediction made at position i is the one for the token at position i + 1
    y_pred = preds[rows, last_real - 1]
    correct_ids = labels[rows, last_real]

    # if it is not a positive word id, then it is negative
    # here we assume that the input was generated correctly
    y_true = torch.isin(correct_ids, torch.as_tensor(positive_ids))

    # plain float, so the value can be logged and serialized as a scalar
    acc = (y_true == y_pred).type(torch.float32).mean().item()
    return {"accuracy": acc, "y_pred": y_pred.numpy()}

In [None]:
# Index tensors holding the token ids of the single-token positive / negative words.
# preprocess_logits_for_metrics uses them to reduce the vocabulary-sized model output
# to one decision per position, which saves a lot of VRAM.
pos_tensor = torch.tensor(pos_ids, device=model.device)
neg_tensor = torch.tensor(neg_ids, device=model.device)

def preprocess_logits_for_metrics(logits, labels, positive_ids=None, negative_ids=None):
    """Reduce vocabulary-sized logits to one positive-vs-negative decision per position.

    A position is decided "positive" when the total probability of the positive
    word ids exceeds that of the negative word ids.  Both totals share the same
    softmax normaliser, so comparing the log-sum-exp of the selected logits
    gives the same decision.  This avoids materialising a second
    vocabulary-sized tensor and cannot underflow to zero in half precision.

    Args:
        logits: model output with the vocabulary on the last axis, or a tuple
            whose first element is that tensor.
        labels: unused; part of the callback signature.
        positive_ids: token ids of positive words.  Defaults to the
            module-level `pos_tensor`.
        negative_ids: token ids of negative words.  Defaults to the
            module-level `neg_tensor`.

    Returns:
        Boolean tensor with the shape of `logits` minus its last axis; True
        where the positive words are more likely than the negative ones.
    """
    if isinstance(logits, tuple):
        logits = logits[0]

    pos_index = pos_tensor if positive_ids is None else torch.as_tensor(positive_ids)
    neg_index = neg_tensor if negative_ids is None else torch.as_tensor(negative_ids)
    # the index tensors may have been created before the model was moved to its final device
    pos_index = pos_index.to(logits.device)
    neg_index = neg_index.to(logits.device)

    # only the selected columns are upcast, so the extra memory stays small
    pos_score = torch.logsumexp(logits[..., pos_index].float(), dim=-1)
    neg_score = torch.logsumexp(logits[..., neg_index].float(), dim=-1)
    return pos_score > neg_score

In [None]:
from transformers import DataCollatorForLanguageModeling

# set padding token if necessary
# (the end-of-sequence token is reused as padding token when the tokenizer has none)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

# Collator for the evaluation batches (mlm=False, padded to a multiple of 8).
# compute_metrics assumes that the collator marks padded label positions with -100.
dc = DataCollatorForLanguageModeling(tokenizer, mlm=False, pad_to_multiple_of=8, return_tensors="pt")

In [None]:
# Trainer used purely as an evaluation harness: logits are reduced by
# `preprocess_logits_for_metrics` before `compute_metrics` scores them.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=dc,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate every prompt task on its own; the task name is used as metric prefix.
results = {}
for split, data in tokenized_data.items():
    results[split] = trainer.evaluate(data, metric_key_prefix=f"{split}")

In [None]:
from functools import reduce
from collections import OrderedDict

# One single-column frame of predictions per prompt task, keyed by task name.
preds = OrderedDict((k, pd.DataFrame(v[f"{k}_y_pred"])) for k, v in results.items())

# Put all tasks side by side (one column per task) with a single concat
# instead of re-concatenating the growing frame once per task.
if preds:
    all_preds = pd.concat(list(preds.values()), axis=1)
    all_preds.columns = list(preds.keys())
else:
    # nothing was evaluated: keep an empty table instead of failing
    all_preds = pd.DataFrame()

In [None]:
# Persist the per-task predictions as JSON lines inside the log directory.
# os.path.join works whether or not `logdir` ends with a separator, and the
# directory is created first in case nothing has written to it yet.
os.makedirs(logdir, exist_ok=True)
with open(os.path.join(logdir, "prompt_results.jsonl"), "w") as f:
    f.write(all_preds.to_json(orient="records", lines=True))