# Preparing Moral Stories for NLI with Learn2Split
***
Three steps to be done:
1. Split the norms in the Moral Stories dataset into values and actions
2. Extract the names of the actors
3. Make stories from actions and actor names

In [5]:
from ailignment.datasets.moral_stories import get_moral_stories, make_action_classification_dataframe
from ailignment.datasets import get_accuracy_metric, join_sentences, tokenize_and_split
import pandas as pd
import datasets
import transformers
import numpy as np
from ailignment import sequence_classification

pd.set_option('display.max_colwidth', 400)

#transformers.logging.set_verbosity_warning()

from collections import Counter

import spacy
from spacy import displacy
from tqdm import tqdm
import pandas as pd

tqdm.pandas()

## Applying Learn2Split
***

In [None]:
# load the Moral Stories data via the project helper; later cells treat it as a pandas DataFrame
dataframe = get_moral_stories()

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# path to the Learn2Split model directory; tokenizer and seq2seq model are both loaded from it
name = "../data/models/learn_to_split"
tokenizer = AutoTokenizer.from_pretrained(name)
# .cuda() moves the model to the GPU, so this cell needs a CUDA device
model = AutoModelForSeq2SeqLM.from_pretrained(name).cuda()

In [None]:
from datasets import Dataset

def parse_split(x):
    """Split one Learn2Split output string into its value and action part.

    The text is cut at the first "." if there is one, otherwise at the first
    space. Everything before the separator is the value, everything after it
    (whitespace-stripped) is the action.

    Parameters
    ----------
    x : str
        A single decoded model output.

    Returns
    -------
    dict
        ``{"action": ..., "value": ...}``. If ``x`` contains neither "." nor
        " " (e.g. an empty or one-word generation), the whole string becomes
        the value and the action is the empty string.
    """
    separator = "." if "." in x else " "
    # partition never fails: without a separator it yields (x, "", ""),
    # whereas unpacking x.split(sep, 1) would raise ValueError and abort the batch
    value, _, action = x.partition(separator)
    return {"action": action.strip(), "value": value}

def split(x):
    """Run Learn2Split on a batch of norms and add the parsed value/action columns.

    Written as a batched ``Dataset.map`` callback. It relies on the
    module-level ``tokenizer``, ``model`` and ``parse_split``.

    Parameters
    ----------
    x : dict
        Batch whose ``"norm"`` entry is a list of strings.

    Returns
    -------
    dict
        The same batch, extended with ``"l2s_output"`` (the decoded
        generations), ``"norm_action"`` and ``"norm_value"``.

    Notes
    -----
    Generation samples (``do_sample=True``), so the outputs differ between
    runs unless the random number generator is seeded beforehand.
    """
    # pad only up to the longest norm in this batch instead of the tokenizer's maximum length
    inputs = tokenizer(x["norm"], padding=True, truncation=True, return_tensors="pt")
    # follow whatever device the model lives on instead of hard-coding CUDA
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    out = model.generate(**inputs, do_sample=True, min_length=1, max_length=100, top_p=0.95, top_k=50,
                         num_beams=1, temperature=1.0)
    x["l2s_output"] = tokenizer.batch_decode(out, skip_special_tokens=True)

    # plain lists are enough here; no intermediate DataFrame is needed
    pairs = [parse_split(y) for y in x["l2s_output"]]
    x["norm_action"] = [pair["action"] for pair in pairs]
    x["norm_value"] = [pair["value"] for pair in pairs]

    return x

In [None]:
# convert to huggingface dataset to make use of their batch processing
# (I really just wanted the progress bar...)
data = Dataset.from_pandas(dataframe)
# apply `split` to batches of 32 rows, then convert the result back to pandas
dataframe = data.map(split, batch_size=32, batched=True).to_pandas()

In [None]:
# save the dataframe for later use
# (the name-extraction section below reloads this file instead of re-running the model)
dataframe.to_pickle("../data/moral_stories_proto_l2s.dat")

## Extracting the actor names
***
Our simple assumption: The name that comes up most in the row is likely to be the central person in the situation, a.k.a. the actor. Therefore, we stitch together all parts of each moral story, apply POS tagging and find the most frequent name.

In [106]:
# reload the Learn2Split output written by the previous section
# (pickle can execute code on load: only read files you produced yourself)
dataframe = pd.read_pickle("../data/moral_stories_proto_l2s.dat")
# remove ""
def unquote(s):
    """Strip one pair of surrounding quote characters from a string.

    A value is only changed when it is a string that both starts and ends
    with a single or double quote character (the two do not have to be the
    same kind). Non-string values, empty strings and unquoted strings are
    returned untouched.
    """
    quote_chars = ('"', "'")
    if isinstance(s, str) and s.startswith(quote_chars) and s.endswith(quote_chars):
        return s[1:-1]
    return s
# apply unquote to every cell of the frame (progress_applymap is the tqdm-wrapped applymap)
dataframe = dataframe.progress_applymap(unquote)

100%|██████████████████████████████████████████████████████████████████████| 143988/143988 [00:00<00:00, 754023.00it/s]


In [219]:
from spacy.tokens import Token, Span
from spacy.language import Language
from gender_guesser.detector import Detector

# add a token extension that looks itself up in a name dict

# case-sensitive detector from gender_guesser; its `names` mapping serves as the name lexicon for is_name below
name_det = Detector(case_sensitive=True)
# extra names that the lookup should also recognise
# NOTE(review): the {"male": "1"} value presumably mirrors gender_guesser's internal entry format — confirm;
# only the keys matter for the membership test in is_name
name_det.names.update({"Benard":{"male":"1"},
                       "Haru":{"male":"1"}, 
                       "Carlow":{"male":"1"},
                       "Bently":{"male":"1"},
                       "Doro":{"male":"1"}
                      })

def is_name(token):
    """Return True if the token's exact text is a key of ``name_det.names``."""
    known_names = name_det.names
    text = token.text
    return text in known_names

# expose the lookup as `token._.is_name`; force=True overwrites an already registered
# extension of the same name, so the cell can be re-run
Token.set_extension("is_name", getter=is_name, force=True)

# add a custom component that filters out the names from our entity ruler matches
@Language.component("name_filter")
def name_filter(doc):
    """Reduce the entities of ``doc`` to at most one single-token name.

    Steps:
    1. Every multi-token entity labelled "NAME" is shortened to its first
       token; all other entities are kept as they are.
    2. If more than one entity remains, only those whose first token has
       ``_.is_name`` set are kept (this may leave none).
    3. Of whatever is left, only the first entity is written back to
       ``doc.ents``.
    """
    candidates = [
        Span(doc, ent.start, ent.start + 1, label=ent.label)
        if ent.label_ == "NAME" and len(ent) > 1
        else ent
        for ent in doc.ents
    ]
    # with several candidates, drop the ones that are not in the name list
    if len(candidates) > 1:
        candidates = [span for span in candidates if span[0]._.is_name]
    # keep the first candidate only; a list of zero or one entries is unchanged by the slice
    doc.ents = candidates[:1]
    return doc

In [220]:
# pos tag the moral stories with custom patterns to find names
# the statistical "ner" component is disabled; the entity ruler added below supplies the entities instead
nlp = spacy.load("en_core_web_sm", disable=["ner", "textcat"])

ruler = nlp.add_pipe("entity_ruler", validate=True)
# name_filter runs right after the ruler and keeps at most one single-token entity per doc
nlp.add_pipe("name_filter", after="entity_ruler")

# optional tokens that may sit between the subject and the verb, e.g. "does n't really want"
do = {"OP":"?", "LEMMA":"do"}
# NOTE(review): confirm that the lemmatizer really yields "n't" here (it may normalise to "not")
nt = {"OP":"?", "LEMMA":"n't"}
adv = {"OP":"?", "POS":"ADV"}
# NOTE(review): "wants" is an inflected form whose lemma is "want", so that entry is probably redundant
action_verbs = [do, nt, adv, {"LEMMA":{"IN":["wants","want","need","have"]}}]

# a NAME match is a noun/proper noun, or a token found in the name list, followed by the verb pattern
patterns = [
    {"label":"NAME", "pattern":[{"POS": {"IN":["PROPN","NOUN"]}}] + action_verbs},
    {"label":"NAME", "pattern":[{"_":{"is_name": True}}] + action_verbs},
]
ruler.add_patterns(patterns)

In [222]:
# run the pipeline on every "intention" sentence; the result holds one spaCy Doc per row
docs = dataframe["intention"].progress_apply(nlp)

100%|███████████████████████████████████████████████████████████████████████████| 11999/11999 [00:43<00:00, 275.44it/s]


In [223]:
# per doc: count how often the first-token text of each entity occurs
counts = docs.apply(lambda doc: Counter([x[0].text for x in doc.ents]))

In [229]:
# find all docs with ties, should be empty
# per doc: count how often the first-token text of each entity occurs
counts = docs.apply(lambda doc: Counter([x[0].text for x in doc.ents]))

def f(c):
    """Return True if the two most common entries of Counter ``c`` have the same count.

    Counters with fewer than two distinct entries cannot be tied and give False.
    """
    if len(c) < 2:
        return False
    (_, top_count), (_, runner_up_count) = c.most_common(2)
    return top_count == runner_up_count

# rows whose two most frequent names are tied; the assert fails if any such row exists
# (name_filter keeps at most one entity per doc, so a tie cannot occur with the current pipeline)
d = dataframe[counts.apply(f)]
assert 0 == len(d[["intention"]])

#### Special rules
***
We apply some special rules for the rest of the bunch\
**Note:** The NLP pipeline should not be used again after this, since the additional rules are extremely sensitive and only suited for the remaining special cases!

In [255]:
# fallback rule: a noun or proper noun at the start of a sentence counts as a NAME
weak_patterns = [
    {"label":"NAME", "pattern":[
        {"IS_SENT_START":True,"POS": {"IN":["PROPN","NOUN"]}}
    ]}
]
# this mutates the ruler that is already part of `nlp`, so every later nlp(...) call uses the weak rule too
ruler.add_patterns(weak_patterns)

In [256]:
# find all docs without any names
remaining = dataframe["intention"][docs.apply(lambda x: len(x.ents)==0)]
# re-run the pipeline on just those sentences
docs_remaining = remaining.apply(nlp)

# sentences that still have no entity after the second pass
d  = docs_remaining[docs_remaining.apply(lambda x: len(x.ents)==0)]
d

255                             (The, alcohol, made, Connie, horny, ,, and, she, wanted, to, have, sex, .)
1296                        (She, wants, to, ask, her, friend, April, to, give, her, a, ride, to, work, .)
1979                                                                (She, wants, to, spend, the, money, .)
2356                                                            (She, wants, to, hire, a, new, manager, .)
2360                                          (She, wants, to, prepare, for, trick, -, or, -, treaters, .)
2446                                      (He, sees, a, toy, he, likes, and, wants, to, take, it, home, .)
2706                                                   (She, wants, to, give, them, something, to, eat, .)
4021                                          (She, wants, to, decide, what, to, do, with, the, wallet, .)
4291                                                     (She, wants, to, let, him, do, something, fun, .)
4562                                 

In [248]:
# inspect text, lemma and POS tag of every token in one of the re-processed docs
# (645 selects by index label, which docs_remaining inherits from the dataframe)
doc = docs_remaining[645]
[(x.text,x.lemma_, x.pos_) for x in doc]

[('Mark', 'Mark', 'PROPN'),
 ('is', 'be', 'AUX'),
 ('nervous', 'nervous', 'ADJ'),
 ('about', 'about', 'ADP'),
 ('being', 'be', 'VERB'),
 ('away', 'away', 'ADV'),
 ('and', 'and', 'CCONJ'),
 ('wants', 'want', 'VERB'),
 ('to', 'to', 'PART'),
 ('calm', 'calm', 'VERB'),
 ('his', 'his', 'PRON'),
 ('mind', 'mind', 'NOUN'),
 ('.', '.', 'PUNCT')]

In [235]:
# show the patterns currently registered on the entity ruler
ruler.patterns

[{'label': 'NAME',
  'pattern': [{'POS': {'IN': ['PROPN', 'NOUN']}},
   {'OP': '?', 'LEMMA': 'do'},
   {'OP': '?', 'LEMMA': "n't"},
   {'OP': '?', 'POS': 'ADV'},
   {'LEMMA': {'IN': ['wants', 'want', 'need', 'have']}}]},
 {'label': 'NAME',
  'pattern': [{'_': {'is_name': True}},
   {'OP': '?', 'LEMMA': 'do'},
   {'OP': '?', 'LEMMA': "n't"},
   {'OP': '?', 'POS': 'ADV'},
   {'LEMMA': {'IN': ['wants', 'want', 'need', 'have']}}]}]