# Storifier-Transformer
***
For the `Moral Reasoning - Moral = Reasoning` experiment we need to exemplify the norm as an action:
* Given a norm: "It is bad to hurt someone"
* And an actor: "Kevin"
* Using the `Learn2Split` model we get (`bad`,`hurting someone`)
* We want: "Kevin hurts someone"

In this notebook, I want to test whether `seq2seq` transformers can generate such sentences given the input pair of `norm_action` and `name`.

Experiment outline:
1. Take the handcrafted `norm_stories` from the prototype
    1. We take the labeled subset as training data and the unlabeled are test
2. Train a `T5-small` model on the aforementioned data
3. Manually evaluate the outputs on test data
4. Test the model in a complete pipeline

300k: ("it is bad to pee"->("bad", "to pee")


In [1]:
from ailignment.datasets.moral_stories import get_moral_stories, make_action_classification_dataframe
from ailignment.datasets import get_accuracy_metric, join_sentences, tokenize_and_split
import pandas as pd
import datasets
import transformers
import numpy as np
from ailignment import sequence_classification

pd.set_option('display.max_colwidth', 400)

#transformers.logging.set_verbosity_warning()

from collections import Counter

import spacy
from spacy import displacy
from tqdm import tqdm
import pandas as pd

tqdm.pandas()

In [3]:
l2s_data = pd.read_pickle("../data/moral_stories_proto_l2s.dat")
proto_data = pd.read_pickle("../data/moral_stories_proto_light.dat")

dataframe = proto_data.join(l2s_data[["norm_value","norm_action"]], how="inner")
test_frame = l2s_data.drop(proto_data.index).drop("__index_level_0__",axis=1)
# add random names to the test frame
test_frame["actor_name"] = dataframe["actor_name"].sample(len(test_frame)).to_list()

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForSeq2SeqLM.from_pretrained(name)

In [5]:
# fix up huggingface dataset
from datasets import Dataset
from sklearn.model_selection import train_test_split

def tok_inp(x):
    return tokenizer(x["norm_action"],x["actor_name"], padding="max_length", truncation=True)

def tok_out(x):
    with tokenizer.as_target_tokenizer():
        y = tokenizer(x["norm_story"], padding="max_length", truncation=True)
    return y

def preprocess(dataframe, test=False):
    dataset = Dataset.from_pandas(dataframe)
    dataset =  dataset.map(tok_inp, batched=True)
    if not test:
        labels = dataset.map(tok_out, batched=True)
        dataset = dataset.add_column("labels", labels["input_ids"])
    return dataset

train, val = train_test_split(dataframe, test_size=0.1)

train_data = preprocess(train)
eval_data = preprocess(val)
test_data = preprocess(test_frame, test=True)

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [6]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

import torch

data_collator = DataCollatorForSeq2Seq(tokenizer, model)

training_args = Seq2SeqTrainingArguments(
    output_dir="results/",
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    evaluation_strategy="epoch",
    save_steps=1000,
)

In [7]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

The following columns in the training set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: actor_gender, value, norm_value, moral_consequence, intention, situation, __index_level_0__, moral_action, ID, norm_sentiment, actor_name, norm_action, norm_story, immoral_consequence, immoral_action, norm, norm_devalued.
***** Running training *****
  Num examples = 7600
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 19000


Epoch,Training Loss,Validation Loss
1,0.0025,0.00155
2,0.0013,0.000767
3,0.0007,0.000559


Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000\config.json
Model weights saved in results/checkpoint-1000\pytorch_model.bin
tokenizer config file saved in results/checkpoint-1000\tokenizer_config.json
Special tokens file saved in results/checkpoint-1000\special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: actor_gender, value, norm_value, moral_consequence, intention, situation, __index_level_0__, moral_action, ID, norm_sentiment, actor_name, norm_action, norm_story, immoral_consequence, immoral_action, norm, norm_devalued.
***** Running Evaluation *****
  Num examples = 845
  Batch size = 4
Saving model checkpoint to results/checkpoint-2000
Configuration saved in results/checkpoint-2000\config.json
Model weights saved in results/checkpoint-2000\pytorch_model.bin
tokenizer config file saved in results/checkpoint-2000\tokeni

KeyboardInterrupt: 

In [8]:
path = "../data/models/storifier"
model.save_pretrained(path)
tokenizer.save_pretrained(path)

Configuration saved in ../data/models/storifier\config.json
Model weights saved in ../data/models/storifier\pytorch_model.bin
tokenizer config file saved in ../data/models/storifier\tokenizer_config.json
Special tokens file saved in ../data/models/storifier\special_tokens_map.json


('../data/models/storifier\\tokenizer_config.json',
 '../data/models/storifier\\special_tokens_map.json',
 '../data/models/storifier\\tokenizer.json')

In [15]:
model = AutoModelForSeq2SeqLM.from_pretrained("results/checkpoint-4000/").cuda()

loading configuration file results/checkpoint-4000/config.json
Model config T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "gradient_checkpointing": false,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams

In [11]:
# run our test set
hard_samples = [7207, 9070, 736, 3080, 2766, 4928]
sample = test_frame.sample(10)
sample = test_frame.loc[hard_samples]
x = tokenizer(sample["norm_action"].to_list(),sample["actor_name"].to_list(), padding="max_length", return_tensors="pt")
x = {k:v.cuda() for k,v in x.items()}
y = model.generate(**x, do_sample=True, min_length=5, max_length=100, top_p=0.95, top_k=50, num_beams=10, temperature=1.0)
sample["story"] = tokenizer.batch_decode(y, skip_special_tokens=True)
sample[["actor_name","story", "norm_action"]]

Unnamed: 0,actor_name,story,norm_action
7207,Adam,Adam cleans his arms only,only cleaning your arms.
9070,Tom,Tom does not eat food his parent has prepared,not eating food your parent has prepared.
736,Ron,Ron betrays his friends confidence in he to others,betraying your friends confidence in you to others.
3080,Nancy,Nancy only thinks about herself when talking,only thinking about yourself when talking.
2766,Josh,Josh hurts animals if he love them,hurting animals if you love them.
4928,Heather,Heather not cancels plans,not canceling plans.
