# Loading the `Moral-Stories` dataset
***
The dataset and code can be found <a href="https://github.com/demelin/moral_stories">here</a>.\
The authors provide 12k unique norms and, for some reason, an additional 700k variations of the same norms that differ only by NaN fields appearing every now and then. These seem to add no information, but maybe I am overlooking something here?
* They might be meant for different tasks? But then the authors only provide a single label, which is always 1 for rows containing NaN...

# Sample task: Action classification
***
For starters, let's reproduce a task from the paper:
* Given an action, predict whether it is moral or immoral.
* For simplicity, we do not use the splits introduced in the paper, but a random train/test split instead.

We start by loading the data as a `pandas.DataFrame`:

In [None]:
from ailignment.datasets.moral_stories import get_moral_stories, make_action_classification_dataframe
from ailignment.datasets import get_accuracy_metric, join_sentences, tokenize_and_split
from transformers import TrainingArguments
import pandas as pd

# Load the Moral Stories data as a pandas DataFrame via the project helper.
dataframe = get_moral_stories()

## Task 1: Action only
***
We'll only give single sentences to the model for now. Let's start by feeding the actions.

In [None]:
import datasets
# Fraction of the rows held out for evaluation.
test_split = 0.2
# NOTE(review): not referenced in any of the visible cells; the
# TrainingArguments in this notebook hard-code their own batch sizes.
batch_size = 8
# Checkpoint name, later handed to `sequence_classification`.
model = "distilbert-base-uncased"
#model = "albert-base-v2"
action_dataframe = make_action_classification_dataframe(dataframe)
# NOTE(review): the section heading says "Action only", but the consequence
# column is part of the model input here as well -- confirm the intent.
input_columns = ["action", "consequence"]
# Presumably concatenates the selected columns into one text per row,
# separated by a space -- verify against `join_sentences`.
action_dataframe["task_input"] = join_sentences(action_dataframe, input_columns, " ")
dataset = datasets.Dataset.from_pandas(action_dataframe)
# Random train/test split; no seed is passed here.
dataset = dataset.train_test_split(test_size=test_split)

In [None]:
def data_all(tokenizer):
    '''
    Runs `tokenize_and_split` on the module-level `dataset`, using
    its "task_input" column and the given tokenizer, and returns
    the result unchanged.
    '''
    splits = tokenize_and_split(dataset, tokenizer, "task_input")
    return splits
def data_small(tokenizer):
    '''
    Returns a reduced train/test pair for quick experiments: each
    split from `data_all` is shuffled with seed 42 and cut down to
    its first 500 rows.
    '''
    subset = range(500)
    full_train, full_test = data_all(tokenizer)
    small_train = full_train.shuffle(seed=42).select(subset)
    small_test = full_test.shuffle(seed=42).select(subset)
    return small_train, small_test

In [None]:
# Training configuration for the action classification run.
training_args = TrainingArguments(
    output_dir="results/",
    num_train_epochs=3,               # total number of training epochs
    per_device_train_batch_size=12,   # batch size per device during training
    per_device_eval_batch_size=12,    # batch size for evaluation
    warmup_steps=500,                 # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                # strength of weight decay
    logging_dir='./logs',             # directory for storing logs
    logging_steps=5,                  # how often to log
    save_steps=1000,
    save_total_limit=2,
    evaluation_strategy="epoch",      # when to run evaluation
)

In [None]:
import transformers
# Set the transformers logger to the "warning" verbosity level.
transformers.logging.set_verbosity_warning()

In [None]:
from ailignment import sequence_classification
# `data_small` is passed as a callable (it still needs a tokenizer) and the
# metric is obtained by calling `get_accuracy_metric()`.
# What `sequence_classification` returns is not visible from this notebook.
r = sequence_classification(data_small, model, get_accuracy_metric(), training_args)
print(r)

# WIP: Get score output from LM
***
Question: Is there a better way to sample from generated LM outputs?

In [None]:
from transformers import (
    AutoModelForSequenceClassification, DistilBertTokenizerFast,
     Trainer, TrainingArguments, AutoModelWithLMHead, AutoTokenizer,
    AutoModelForCausalLM,
)
import torch

# `model` first holds the checkpoint name and is then rebound to the loaded
# model object, following the convention used in the other cells.
model = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model)
# AutoModelWithLMHead is deprecated; AutoModelForCausalLM is its replacement
# for causal language models such as GPT-2.
model = AutoModelForCausalLM.from_pretrained(model)

prompt = "Today the weather is really nice and I am planning on "
inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")

# Character length of the decoded prompt; only the commented-out slicing
# below makes use of it.
prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
# NOTE(review): with do_sample=False decoding is not sampled, so top_p and
# top_k presumably have no effect -- set do_sample=True if sampling is wanted.
outputs = model.generate(inputs, max_length=250, do_sample=False, top_p=0.95, top_k=60,
                        return_dict_in_generate=True, output_attentions=False,
                        output_hidden_states=True, output_scores=True)
#generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]

# Softmax over dim 1 of the first entry in `scores`; presumably these are the
# scores of the first generated token over the vocabulary -- confirm.
p = torch.softmax(outputs.scores[0], dim=1)

# Largest probability in that distribution.
print(p.max())

# WIP: Data augmentation with NER
***
Idea: Use named entity recognition (NER) to find and replace persons and similar entities.

In [None]:
from ailignment.datasets.moral_stories import get_moral_stories, make_action_classification_dataframe
from ailignment import join_sentences, tokenize_and_split, get_accuracy_metric
dataframe = get_moral_stories()
# Skip the first column and run NER on all remaining ones.
columns = dataframe.columns[1:]
print("Running NER on columns", columns.to_list())

In [None]:
# Presumably one text per row with the selected columns separated by "\n";
# later cells split the texts on "\n" again to recover the columns.
texts = join_sentences(dataframe ,columns, "\n")

In [None]:
import spacy
from spacy import displacy
from tqdm import tqdm
import pandas as pd

nlp = spacy.load("en_core_web_sm")

# The listed pipeline components are switched off while the texts are
# processed; a progress bar wraps the input.
ner_pipe = nlp.pipe(
    tqdm(texts),
    disable=['tagger', 'parser', 'attribute_ruler', 'lemmatizer'],
)
# Materialise the pipe so the docs can be indexed and reused.
docs = list(ner_pipe)

# Show the recognised entities of the first document.
displacy.render(docs[0], style="ent")

In [None]:
from collections import Counter

def get_frequent_entity(doc, entity="PERSON", n=1):
    '''
    Collects the mentions of the `n` most frequent entities with
    label `entity` in the NER doc, one list of mentions per entity
    (entities are told apart by their text).
    For `n=1` the single list of mentions is returned directly; an
    empty list is returned if no entity carries the label.
    '''
    labelled = [span for span in doc.ents if span.label_ == entity]
    tally = Counter((span.text, span.label_) for span in labelled)
    grouped = [
        [span for span in labelled if (span.text, span.label_) == key]
        for key, _ in tally.most_common(n)
    ]
    # a single requested entity is handed back without the outer list
    if n == 1 and grouped:
        return grouped[0]
    return grouped

In [None]:
# All mentions of the most frequent PERSON entity, per document.
persons = [get_frequent_entity(doc, "PERSON", n=1) for doc in docs]
# Restrict to the simplest case: documents in which the NER found
# exactly 6 mentions of that person.
matches = [mentions for mentions in persons if len(mentions) == 6]
print(f"Found {len(matches)} matches")

In [None]:
# Inspect the document behind the first match with its entities highlighted.
m = matches[0]
displacy.render(m[0].doc, "ent")

In [None]:
def replace_entity(ents, s):
    '''
    Returns the text of the doc behind `ents` with every entity
    span in `ents` substituted by `s`.
    `ents` is a list of entities as returned by `doc.ents` from an
    NER pipeline; all of them have to belong to the same doc and
    are processed in the order given.
    '''
    source = ents[0].doc.text
    pieces = []
    cursor = 0
    for span in ents:
        # untouched text between the previous entity and this one
        pieces.append(source[cursor:span.start_char])
        pieces.append(s)
        cursor = span.end_char
    # remainder after the last entity
    pieces.append(source[cursor:])
    return "".join(pieces)

In [None]:
# Substitute the person's name in every matching document, then split each
# text on "\n" into its parts again.
n_docs = [replace_entity(m, "Niklas").split("\n") for m in matches]

In [None]:
# NOTE(review): this rebinds `n_docs` to the *unmodified* texts. If it runs
# after the cell that calls `replace_entity`, the "Niklas" substitution is
# discarded. Presumably kept as a baseline for comparison -- confirm.
n_docs = [m[0].doc.text.split("\n") for m in matches]

In [None]:
# Rebuild a dataframe from the split texts and give it the column names
# that were selected for the NER run.
dataframe_replaced = pd.DataFrame(n_docs)
dataframe_replaced.columns = columns
dataframe_replaced.head()

In [None]:
import datasets
# Fraction of the rows held out for evaluation.
test_split = 0.2
# NOTE(review): not referenced in any of the visible cells; the
# TrainingArguments in this notebook hard-code their own batch sizes.
batch_size = 8

# Same preparation as for the first task, but on the dataframe rebuilt from
# the NER matches.
action_dataframe = make_action_classification_dataframe(dataframe_replaced)

# Only the action column is used as model input here.
input_columns = ["action"]
action_dataframe["task_input"] = join_sentences(action_dataframe, input_columns, " ")
dataset = datasets.Dataset.from_pandas(action_dataframe)
# Random train/test split; no seed is passed here.
dataset = dataset.train_test_split(test_size=test_split)


In [None]:

from transformers import (
    AutoModelForSequenceClassification, DistilBertTokenizerFast,
     Trainer, TrainingArguments, AutoModelWithLMHead, AutoTokenizer,
)
import torch

model = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model)
# `model` is rebound from the checkpoint name to the loaded classifier.
model = AutoModelForSequenceClassification.from_pretrained(model)

train_data, test_data = tokenize_and_split(dataset, tokenizer, "task_input")

# for prototyping, optional
# The dataset is built from the filtered NER matches and may hold fewer than
# 1000 rows per split, so the subset size is clamped to the split size to
# keep `select` from failing on out-of-range indices.
small_train_data = train_data.shuffle(seed=42).select(range(min(1000, len(train_data))))
small_test_data = test_data.shuffle(seed=42).select(range(min(1000, len(test_data))))

# Training configuration for the run on the NER-derived dataset.
training_args = TrainingArguments(
    output_dir="results/",
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=50,                # how often to log
    save_steps=1000,
    # NOTE(review): the other TrainingArguments in this notebook use 2 here;
    # check whether 0 disables the checkpoint limit or saving -- confirm.
    save_total_limit=0,
    evaluation_strategy="epoch",     # when to run evaluation
)

In [None]:
# Train on the reduced prototyping subsets.
# `get_accuracy_metric` has to be *called* to obtain the metric function,
# as is done for the `sequence_classification` call in this notebook;
# passing the factory itself would make the Trainer invoke it with the
# evaluation predictions.
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=small_train_data,   # training dataset
    eval_dataset=small_test_data,     # evaluation dataset
    compute_metrics=get_accuracy_metric(),   # function that computes the accuracy metric
)
trainer.train()

In [None]:
from gender_guesser.detector import Detector

In [None]:
# Gender detector instance with default settings.
d = Detector()

In [None]:
# Try the detector on a sample first name.
d.get_gender("Jamie")

In [None]:
m

In [None]:
import torch
# NOTE(review): creates a 40000 x 20000 random tensor (about 3.2 GB if the
# default dtype is float32), moves it to the GPU and then updates it in an
# endless loop. The cell never finishes on its own and has to be
# interrupted; presumably it only exists to keep the GPU busy -- confirm
# before running.
a = torch.rand(40000,20000).cuda()
while True:
    a += 1
    a -= 1