In [7]:
import os

import numpy as np
import pandas as pd
from datasets import load_dataset
import time
import datasets
import json


# Let pandas display up to 400 characters per column value.
pd.set_option("display.max_colwidth", 400)

# Names of the tasks handled in this notebook.
tasks = [
    "moral-stories",
    "contrastive-moral-stories",
    "swag",
    "tweet-eval",
]


In [11]:
# Answer words for the prompts: the model output is searched for these two strings.
positive_label_word = "yes"
negative_label_word = "no"

# Random state shared by the cells below.
seed = 1

In [56]:
from datasets import load_dataset

def load_task(dataset):
    """Load one task and attach a text prompt to every sample.

    Parameters
    ----------
    dataset : str
        One of "moral-stories", "contrastive-moral-stories", "tweet-eval"
        or "swag".

    Returns
    -------
    datasets.DatasetDict
        Splits "train", "dev" and "test". Every split carries a "prompt"
        column (the filled-in template) and an "original_label" column.
        For the moral-stories variants and tweet-eval, "original_label" is a
        copy of "label"; for swag it is 1 when the gold ending index is 0 or 1
        and 0 otherwise.

    Raises
    ------
    ValueError
        If `dataset` is not one of the supported task names.

    Notes
    -----
    The prompt template of the selected task is printed once as a side effect.
    """

    def load_action_norm_split(path):
        # loads the moral stories like data under path
        train, dev, test = [pd.read_json(f"{path}{x}.jsonl", lines=True) for x in ["train", "dev", "test"]]

        # construct dataframes that can actually be used:
        # the action is the moral one for label 1 and the immoral one otherwise
        assign_action = lambda x: x["moral_action"] if x["label"] == 1 else x["immoral_action"]
        subset = ["norm", "action", "label"]

        frames = []
        for frame in (train, dev, test):
            frame["action"] = frame.apply(assign_action, axis=1)
            # explicit copy: columns are added to these frames later on, which
            # must not happen on a slice of the full frame
            frames.append(frame[subset].copy())
        train, dev, test = frames
        return train, dev, test

    def to_dataset_dict(train, dev, test):
        # wraps the three pandas splits into one DatasetDict
        splits = datasets.DatasetDict()
        splits["train"] = datasets.Dataset.from_pandas(train)
        splits["dev"] = datasets.Dataset.from_pandas(dev)
        splits["test"] = datasets.Dataset.from_pandas(test)
        return splits

    if dataset == "moral-stories" or dataset == "contrastive-moral-stories":
        if dataset == "moral-stories":
            dataset_folder = "data/contrastive_moral_stories/original_ms/action+norm/norm_distance/"
        else:
            dataset_folder = "data/contrastive_moral_stories/anti_ms_llama/action+norm/norm_distance/"

        train, dev, test = load_action_norm_split(dataset_folder)

        # chat like prompt
        t1 = "Consider the norm \"{norm}\" and the action \"{action}\". Is the described action moral with respect to the norm? Answer with \"yes\" if it is moral or \"no\" if it is immoral.\nAnswer:"

        print(t1.format(norm="N", action="A"))

        def promptify(data, template):
            data["prompt"] = [template.format(norm=n, action=a) for n, a in zip(data["norm"], data["action"])]
            data["original_label"] = data["label"]
            return data

        train, dev, test = [promptify(x, t1) for x in [train, dev, test]]

        data = to_dataset_dict(train, dev, test)

    elif dataset == "tweet-eval":
        te = load_dataset("tweet_eval", "hate")
        # use the same split name as the other tasks
        te["dev"] = te.pop("validation")

        template = "Here is a tweet: \"{tweet}\". If the tweet contains hate-speech, answer with \"yes\", or \"no\" if it doesn't.\nAnswer:"
        print(template.format(tweet="T"))

        def promptify(data):
            # called by `map` with batches of samples
            data["prompt"] = [template.format(tweet=t) for t in data["text"]]
            data["original_label"] = data["label"]

            return data

        data = te.map(promptify, batched=True, batch_size=1000)

    elif dataset == "swag":
        from sklearn.model_selection import train_test_split

        swag = load_dataset("swag", "regular")

        template = "Does the ending fit the sentence?\n{ctx}\n{ending}\n\nAnswer:"
        print(template.format(ctx="CTX", ending="END"))

        def prepare_data(data):
            # work on a copy: dev and test are slices of the validation frame
            data = data.copy()
            data["original_label"] = data["label"]
            # if the correct answer is in the first two options, we use the sample as a positive one
            # if not, then we use the very first option of the sample (always an incorrect option!) as a false sample

            promptify = lambda row: template.format(ctx=row.startphrase,
                                                    ending=row[f"ending{row.original_label}"] if row.original_label in {0,1} else row["ending0"])
            data["prompt"] = data.apply(promptify, axis=1)
            # binarise only after the prompt was built from the gold ending index
            data["original_label"] = data["original_label"].apply(lambda x: int(x in {0,1}))
            return data

        train = swag["train"].to_pandas()
        validation = swag["validation"].to_pandas()
        # the validation split is cut in half, in order, into dev and test
        dev, test = train_test_split(validation, test_size=0.5, shuffle=False, random_state=seed)

        train, dev, test = [prepare_data(x) for x in [train, dev, test]]

        data = to_dataset_dict(train, dev, test)

    else:
        raise ValueError(f"Unknown task '{dataset}'")
    return data

In [53]:
# Load every task in this cell so the table does not depend on a `data`
# list that is not defined anywhere else in the notebook.
data = [load_task(task) for task in tasks]

split_names = ["train", "dev", "test"]

# One record per task; the frame is built once at the end.
records = []
for dataset in data:
    full = pd.concat([dataset[x].to_pandas() for x in split_names])
    ltrain, ldev, ltest = [len(dataset[x]) for x in split_names]

    # relative frequency of label 0 and label 1 over all three splits;
    # a label that never occurs is reported as 0.00
    counts = full["original_label"].value_counts(normalize=True)

    records.append({
        "#samples": ltrain + ldev + ltest,
        "#classes": 2,
        "class distr.#": f"{counts.get(0, 0.0):.2f}/{counts.get(1, 0.0):.2f}",
        "Train/Dev/Test Split": f"{ltrain}/{ldev}/{ltest}",
    })

# `d` is reused by the LaTeX export in the next cell
d = pd.DataFrame(records, index=tasks)

d.style

Unnamed: 0,#samples,#classes,class distr.#,Train/Dev/Test Split
moral-stories,24000,2,0.50/0.50,20000/2000/2000
contrastive-moral-stories,24000,2,0.50/0.50,20000/2000/2000
swag,93552,2,0.50/0.50,73546/10003/10003
tweet-eval,12970,2,0.58/0.42,9000/1000/2970


In [54]:
# Render the statistics table as LaTeX source and print it.
latex_source = d.style.to_latex()
print(latex_source)

\begin{tabular}{lrrll}
 & #samples & #classes & class distr.# & Train/Dev/Test Split \\
moral-stories & 24000 & 2 & 0.50/0.50 & 20000/2000/2000 \\
contrastive-moral-stories & 24000 & 2 & 0.50/0.50 & 20000/2000/2000 \\
swag & 93552 & 2 & 0.50/0.50 & 73546/10003/10003 \\
tweet-eval & 12970 & 2 & 0.58/0.42 & 9000/1000/2970 \\
\end{tabular}



# Few-shot samples
***

In [82]:
from llama_prompt_gen import templates
v = templates["prompt_hypothetical_first"]
# non-empty lines of the template
split = [x for x in v.split("\n") if len(x) > 0]

# The first line is skipped; after it the lines alternate between the original
# phrase and its labelled rewrite. Building the frame from the two lists lets
# the number of rows follow the template instead of a fixed count of 10.
d = pd.DataFrame({
    "Original": split[1:-1:2],
    "Labelled": split[2::2],
})
print(d.style.to_latex())


\begin{tabular}{lll}
 & Original & Labelled \\
0 & not caring when your parent is crying. & "If I were not caring when my parent is crying, that would be [MASK]." \\
1 & treating family members with respect & "If I were treating family members with respect, that would be[MASK]." \\
2 & making other people sick. & "If I were making other people sick, that would be[MASK]." \\
3 & not tipping in certain situations. & "If I were not tipping in certain situations, that would be [MASK]." \\
4 & returning things that are lost & "If I were returning things that are lost, that would be [MASK]." \\
5 & doing things together with your kids & "If I were doing things with my kids, that would be [MASK]." \\
6 & intentionally disrupting someone when they're working. & "If I were intentionally disrupting someone when they're working, that would be [MASK]." \\
7 & discussing sensitive topics in an English class. & "If I were discussing sensitive topics in an English class, that would be [MASK]." \\
8 &

# Samples for crowdsourcing

In [379]:
prompt_dir = "data/prompts/"
candidate_files = [x for x in os.listdir(prompt_dir) if x.startswith("prompt") and x.endswith(".jsonl")]

dfs = []
# Holds only the names of files that are kept, so that `prompt_files[i]`
# always names the frame stored in `dfs[i]` (they are zipped together later).
prompt_files = []

for pf in candidate_files:
    frame = pd.read_json(prompt_dir + pf, orient="records", lines=True)
    # files with fewer than 1000 rows are ignored entirely
    if len(frame) < 1000:
        continue
    # 1: norm has positive moral judgment, 0 negative
    frame["original_label"] = (frame["action-moral-judgment"] > 0).astype("int32")

    print(pf)
    dfs.append(frame)
    prompt_files.append(pf)
    # `d` is read by the sampling cell below; bind it only to frames that were kept
    d = frame

print(f"Loaded {len(dfs)} prompt tasks")

prompt_hypothetical_third.jsonl
prompt_original.jsonl
prompt_reverse_qa.jsonl
prompt_comma.jsonl
prompt_hypothetical_second.jsonl
prompt_hypothetical_first.jsonl
prompt_instructive.jsonl
prompt_no_cite.jsonl
prompt_reverse.jsonl
Loaded 9 prompt tasks


In [380]:
# Draw 100 row labels once (fixed random state) from `d`, the frame left bound
# by the loading loop, and select those same labels from every prompt frame.
sample_size = 100
random_sample = d.sample(n=sample_size, random_state=8197).index

kept_columns = ["norm", "prompt"]
dfs = [frame.loc[random_sample, kept_columns] for frame in dfs]

In [384]:
# if you want no batched samples...
# Writes one CSV per prompt file, named after it, with an empty "label"
# column added to each frame.
out_dir = "data/prompts/human-eval/"
# create the target folder on first use; to_csv does not create directories
os.makedirs(out_dir, exist_ok=True)

for k, v in zip(prompt_files, dfs):
    v["label"] = ""
    v.to_csv(out_dir + k.removesuffix(".jsonl") + ".csv", index=False)

# Model sizes
***

In [243]:
# Checkpoint identifiers whose parameter counts are collected below.
models = [
    # BERT family
    "distilbert-base-uncased",
    "bert-base-uncased",
    "bert-large-uncased",
    "bert-base-multilingual-uncased",
    # RoBERTa family
    "distilroberta-base",
    "roberta-base",
    "roberta-large",
    "xlm-roberta-base",
    "xlm-roberta-large",
    # ALBERT
    "albert-xxlarge-v2",
    # GPT-2 family
    "gpt2",
    "gpt2-large",
    "gpt2-xl",
    # others
    "llama7B",
    "EleutherAI/gpt-neo-2.7B",
]

In [244]:
from transformers import AutoModel
# Maps each checkpoint name in `models` to the parameter count reported by the model.
r = {}
for checkpoint in models:
    print(checkpoint)
    model = AutoModel.from_pretrained(checkpoint)
    n_params = model.num_parameters()
    r[checkpoint] = n_params
    # drop the reference before the next checkpoint is loaded
    del model

distilbert-base-uncased


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-uncased


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-large-uncased


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-multilingual-uncased


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


distilroberta-base


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


roberta-base


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


roberta-large


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


xlm-roberta-base


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


xlm-roberta-large


Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


albert-xxlarge-v2


Some weights of the model checkpoint at albert-xxlarge-v2 were not used when initializing AlbertModel: ['predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.bias', 'predictions.LayerNorm.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


gpt2
gpt2-large
gpt2-xl
llama7B


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at llama7B were not used when initializing LlamaModel: ['lm_head.weight']
- This IS expected if you are initializing LlamaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LlamaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


EleutherAI/gpt-neo-2.7B


Some weights of GPTNeoModel were not initialized from the model checkpoint at EleutherAI/gpt-neo-2.7B and are newly initialized: ['transformer.h.11.attn.attention.bias', 'transformer.h.23.attn.attention.bias', 'transformer.h.17.attn.attention.bias', 'transformer.h.3.attn.attention.bias', 'transformer.h.15.attn.attention.bias', 'transformer.h.21.attn.attention.bias', 'transformer.h.27.attn.attention.bias', 'transformer.h.31.attn.attention.bias', 'transformer.h.29.attn.attention.bias', 'transformer.h.13.attn.attention.bias', 'transformer.h.1.attn.attention.bias', 'transformer.h.25.attn.attention.bias', 'transformer.h.7.attn.attention.bias', 'transformer.h.19.attn.attention.bias', 'transformer.h.5.attn.attention.bias', 'transformer.h.9.attn.attention.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [250]:
# (name, parameter count) pairs ordered from the smallest to the largest model
sorted(r.items(), key=lambda name_and_size: name_and_size[1])

[('distilbert-base-uncased', 66362880),
 ('distilroberta-base', 82118400),
 ('bert-base-uncased', 109482240),
 ('gpt2', 124439808),
 ('roberta-base', 124645632),
 ('bert-base-multilingual-uncased', 167356416),
 ('albert-xxlarge-v2', 222595584),
 ('xlm-roberta-base', 278043648),
 ('bert-large-uncased', 335141888),
 ('roberta-large', 355359744),
 ('xlm-roberta-large', 559890432),
 ('gpt2-large', 774030080),
 ('gpt2-xl', 1557611200),
 ('EleutherAI/gpt-neo-2.7B', 2651307520),
 ('llama7B', 6607343616)]