# Preparing Moral Stories for NLI with Learn2Split
***
Three steps to be done:
1. Split the norms in the Moral Stories dataset into values and actions
2. Extract the names of the actors
3. Make stories from actions and actor names

In [1]:
from ailignment.datasets.moral_stories import get_moral_stories, make_action_classification_dataframe
from ailignment.datasets import get_accuracy_metric, join_sentences, tokenize_and_split
import pandas as pd
import datasets
import transformers
import numpy as np
from ailignment import sequence_classification

pd.set_option('display.max_colwidth', 400)

#transformers.logging.set_verbosity_warning()

from collections import Counter

import spacy
from spacy import displacy
from tqdm import tqdm
import pandas as pd

tqdm.pandas()

## Applying Learn2Split
***

In [None]:
# Load the Moral Stories data through the project helper imported above.
dataframe = get_moral_stories()

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Relative path to the directory holding the Learn2Split model files.
name = "../data/models/learn_to_split"
tokenizer = AutoTokenizer.from_pretrained(name)
# .cuda() places the model on the GPU, so this cell needs a CUDA device.
model = AutoModelForSeq2SeqLM.from_pretrained(name).cuda()

In [None]:
from datasets import Dataset

def parse_split(x):
    """Split one Learn2Split output string into a value part and an action part.

    The text is cut at the first "." if it contains one, otherwise at the
    first space. The part before the separator is the value (returned
    unchanged); the part after it is the action (whitespace-stripped).

    Output that contains neither separator (a single word or an empty
    string) yields an empty action instead of raising, so such rows can be
    counted and filtered out later.

    Args:
        x: decoded model output for a single norm.

    Returns:
        dict with the keys "action" and "value".
    """
    separator = "." if "." in x else " "
    # partition() always returns three parts, so a missing separator simply
    # leaves `action` empty rather than breaking a two-way unpack.
    value, _, action = x.partition(separator)
    return {"action": action.strip(), "value": value}

def split(x):
    """Run the Learn2Split model on a batch of norms and attach the results.

    ``x["norm"]`` is tokenized, moved to the GPU and passed to
    ``model.generate``. The decoded texts are stored under "l2s_output";
    each of them is parsed with ``parse_split`` and the resulting parts are
    stored under "norm_action" and "norm_value". The same batch object is
    returned.
    """
    encoded = tokenizer(x["norm"], padding="max_length", truncation=True, return_tensors="pt")
    encoded = {key: tensor.cuda() for key, tensor in encoded.items()}

    generation_kwargs = dict(
        do_sample=True,
        min_length=1,
        max_length=100,
        top_p=0.95,
        top_k=50,
        num_beams=7,
        temperature=1.0,
    )
    generated = model.generate(**encoded, **generation_kwargs)
    x["l2s_output"] = tokenizer.batch_decode(generated, skip_special_tokens=True)

    # Collect the per-row dicts in a frame so each key can be read as a column.
    parsed = pd.DataFrame.from_records([parse_split(text) for text in x["l2s_output"]])
    x["norm_action"] = parsed["action"].to_list()
    x["norm_value"] = parsed["value"].to_list()
    return x

In [None]:
# convert to huggingface dataset to make use of their batch processing
# (I really just wanted the progress bar...)
data = Dataset.from_pandas(dataframe)
# `split` is called on batches of 32 rows; the mapped result replaces `dataframe`.
dataframe = data.map(split, batch_size=32, batched=True).to_pandas()

In [None]:
# Report, for each parsed column, how many rows hold an empty entry.
failure_reports = (
    ("norm_action", "The model failed to predict an action for"),
    ("norm_value", "The model failed to predict a norm value for"),
)
for column, message in failure_reports:
    num_empty = dataframe[column].apply(lambda entry: len(entry) == 0).sum()
    print(message, num_empty, "rows")

## Estimating the sentiment
***


In [4]:
from transformers import pipeline
# No model is named here, so the pipeline uses the library's default for this task.
classifier = pipeline('sentiment-analysis')
# Classify every distinct norm value once and reuse its label for all rows sharing that value.
value_map = {v:classifier(v)[0]["label"] for v in dataframe["norm_value"].unique()}
dataframe["norm_sentiment"] = dataframe["norm_value"].apply(value_map.get)

## Extracting the actor names
***
Our simple assumption: The name that comes up most in the row is likely to be the central person in the situation, a.k.a. the actor. Therefore, we stitch together all parts of each moral story, apply POS tagging and find the most frequent name.

In [None]:
# strip one pair of surrounding quote characters
def unquote(s):
    """Remove the first and last character of ``s`` when both are quotes.

    Non-string values and empty strings are returned unchanged. A string
    qualifies when it starts with ``"`` or ``'`` and ends with ``"`` or
    ``'``; the two characters do not have to be the same kind of quote.
    """
    quote_chars = ("\"", "'")
    if not isinstance(s, str) or not s:
        return s
    if s.startswith(quote_chars) and s.endswith(quote_chars):
        return s[1:-1]
    return s
# Apply unquote to every cell of the frame; the progress_* methods come from tqdm.pandas().
dataframe = dataframe.progress_applymap(unquote)

In [None]:
from spacy.tokens import Token, Span
from spacy.language import Language
from gender_guesser.detector import Detector

# add a token extension that looks itself up in a name dict

# Case-sensitive detector; its `names` dict is extended with first names that
# would otherwise not be recognised in this dataset.
# Every value must be a dict like {"male": "1"}: a set literal such as
# {"male", "1"} would leave that one entry in a different shape from the rest.
name_det = Detector(case_sensitive=True)
name_det.names.update({"Benard":{"male":"1"},
                       "Haru":{"male":"1"},
                       "Carlow":{"male":"1"},
                       "Bently":{"male":"1"},
                       "Doro":{"male":"1"},
                       "Thiago":{"male":"1"}
                      })

def is_name(token):
    """Return True when the token's exact text is a key of ``name_det.names``."""
    return token.text in name_det.names

# Expose the lookup as `token._.is_name`; force=True replaces an existing
# registration, which lets this cell be re-run.
Token.set_extension("is_name", getter=is_name, force=True)

# pipeline component that reduces the entity ruler matches to a single name span
@Language.component("name_filter")
def name_filter(doc):
    """Keep at most one one-token entity on ``doc``.

    Multi-token NAME entities are cut down to their first token; all other
    entities are kept as they are. If more than one entity is left, only
    those whose first token has ``_.is_name`` set survive, and of those only
    the first is kept. The result is written back to ``doc.ents``.
    """
    trimmed = [
        Span(doc, ent.start, ent.start + 1, label=ent.label)
        if ent.label_ == "NAME" and len(ent) > 1
        else ent
        for ent in doc.ents
    ]
    # With several candidates, discard the ones that are not known names.
    if len(trimmed) > 1:
        trimmed = [span for span in trimmed if span[0]._.is_name]
    # Slicing leaves a list of zero or one spans untouched and cuts longer ones.
    doc.ents = trimmed[:1]
    return doc

In [None]:
# pos tag the moral stories with custom patterns to find names
nlp = spacy.load("en_core_web_sm", disable=["ner", "textcat"])

# The ruler proposes NAME spans; name_filter runs right after it and rewrites doc.ents.
ruler = nlp.add_pipe("entity_ruler", validate=True)
nlp.add_pipe("name_filter", after="entity_ruler")

# Optional tokens ("OP": "?") that may sit between the subject and the verb.
do = {"OP":"?", "LEMMA":"do"}
nt = {"OP":"?", "LEMMA":"n't"}
adv = {"OP":"?", "POS":"ADV"}
# The final, mandatory token is matched by lemma against the listed verbs.
action_verbs = [do, nt, adv, {"LEMMA":{"IN":["wants","want","need","have"]}}]

# A subject token (proper noun / noun, or a token found in the name dict)
# followed by the verb sequence defined above.
patterns = [
    {"label":"NAME", "pattern":[{"POS": {"IN":["PROPN","NOUN"]}}] + action_verbs},
    {"label":"NAME", "pattern":[{"_":{"is_name": True}}] + action_verbs},
]
ruler.add_patterns(patterns)

In [None]:
# Run the pipeline on every "intention" text; `docs` is a Series of parsed documents.
docs = dataframe["intention"].progress_apply(nlp)

In [None]:
# find all docs with ties, should be empty
# Per document, count how often each entity's first-token text occurs.
counts = docs.apply(lambda doc: Counter([x[0].text for x in doc.ents]))

def f(c):
    """Tell whether the two most common entries of Counter ``c`` are tied.

    Counters with fewer than two distinct entries can never be tied, so
    they yield False.
    """
    if len(c) < 2:
        return False
    (_, top_count), (_, runner_up_count) = c.most_common(2)
    return top_count == runner_up_count

# Rows whose two most frequent names are tied; the assert stops the notebook if any exist.
d = dataframe[counts.apply(f)]
assert 0 == len(d[["intention"]])

#### Special rules
***
We apply some special rules for the rest of the bunch\
**Note:** The NLP pipeline should not be used again after this, since the additional rules are extremely sensitive and only suited for the remaining special cases!

1. Interpret all PROPN or NOUN at the start of a sentence as the name
2. The remaining names are taken from the "situation" column, which was manually checked to be okay

In [None]:
# Fallback rule: a noun or proper noun that opens a sentence is labelled NAME.
weak_patterns = [
    {"label":"NAME", "pattern":[
        {"IS_SENT_START":True,"POS": {"IN":["PROPN","NOUN"]}}
    ]}
]
# Added to the same ruler, so every later nlp(...) call uses this rule as well.
ruler.add_patterns(weak_patterns)

In [None]:
# find all docs without any names and apply the weaker patterns
# Only the intentions whose parsed doc has no entity are selected and re-parsed.
remaining = dataframe["intention"][docs.apply(lambda x: len(x.ents)==0)]
docs_remaining = remaining.apply(nlp)
# update docs series
docs[docs_remaining.index] = docs_remaining

In [None]:
# Last resort: for rows still without an entity, parse the "situation" text instead
# and store that doc in place of the intention doc.
remaining = dataframe["situation"][docs.apply(lambda x: len(x.ents)==0)].apply(nlp)
docs[remaining.index] = remaining

# finally get the names
# NOTE: x.ents[0] raises IndexError if any doc still has no entity at this point.
names = docs.apply(lambda x: x.ents[0].text)

# make a quick spot check
# Extracted names that are not keys of the name dict, kept for manual inspection.
potential_non_names = names[names.apply(lambda x: x not in name_det.names)].to_list()
#potential_non_names

In [None]:
# Attach the extracted name to each row.
dataframe["actor_name"] = names

# save the dataframe for later use
dataframe.to_pickle("../data/moral_stories_proto_l2s.dat")

# Create Norm-Stories
***
Now that we have actor names and the normative action extracted, we want to create the norm stories:
* "hurting someone else" + "Kevin" = "Kevin hurts someone else"

**NOTE:** We have to get rid of empty norm_actions or norm_values, which ideally should be only a few samples

We will apply our `storify-transformer` model to obtain the norm stories.

In [7]:
# Reload the intermediate result saved above.
# errors="ignore": the "__index_level_0__" helper column is not guaranteed to
# be in the pickle (presumably it only appears after a Dataset round-trip of a
# filtered frame), so its absence must not abort the cell.
dataframe = pd.read_pickle("../data/moral_stories_proto_l2s.dat").drop(
    "__index_level_0__", axis=1, errors="ignore"
)

# Keep only the rows where BOTH the action and the value are non-empty.
has_action = dataframe["norm_action"].apply(len) > 0
has_value = dataframe["norm_value"].apply(len) > 0
dataframe = dataframe[has_action & has_value]
len(dataframe)

11996

In [8]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the storifier model. The earlier assignment of "results/checkpoint-1000/"
# was overwritten on the very next line and never used, so it has been dropped.
# NOTE: this rebinds the global `tokenizer` and `model`; any function defined
# earlier that reads these globals (e.g. `split`) uses the storifier from here on.
name = "../data/models/storifier/"
tokenizer = AutoTokenizer.from_pretrained(name)
# .cuda() places the model on the GPU, so this cell needs a CUDA device.
model = AutoModelForSeq2SeqLM.from_pretrained(name).cuda()

In [9]:
from datasets import Dataset

def storify(x):
    """Generate a norm story for every row of a batch.

    ``x["norm_action"]`` and ``x["actor_name"]`` are tokenized together as
    text pairs, moved to the GPU and passed to ``model.generate``. The
    decoded texts are stored under "norm_storyfied" and the same batch
    object is returned.
    """
    encoded = tokenizer(x["norm_action"], x["actor_name"], padding="max_length", truncation=True, return_tensors="pt")
    encoded = {key: tensor.cuda() for key, tensor in encoded.items()}

    generation_kwargs = dict(
        do_sample=True,
        min_length=5,
        max_length=100,
        top_p=0.95,
        top_k=50,
        num_beams=5,
        temperature=1.0,
    )
    generated = model.generate(**encoded, **generation_kwargs)
    x["norm_storyfied"] = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return x

In [10]:
# convert to huggingface dataset to make use of their batch processing
# (I really just wanted the progress bar...)
data = Dataset.from_pandas(dataframe)
# `storify` is called on batches of 32 rows; the mapped result replaces `dataframe`.
dataframe = data.map(storify, batch_size=32, batched=True).to_pandas()

  0%|          | 0/375 [00:00<?, ?ba/s]

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ..\aten\src\ATen\native\BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


In [12]:
# Writes to the same path as the earlier save, replacing that file with the
# frame that now also carries the "norm_storyfied" column.
dataframe.to_pickle("../data/moral_stories_proto_l2s.dat")

In [7]:
# Number of rows whose generated story has length zero.
dataframe["norm_storyfied"].apply(lambda x: len(x) == 0).sum()

1