In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sys
sys.path.append('../')
from FullParser.ClauseParser import ClauseParser

## Parsing tools and data directories

In [2]:
parser = ClauseParser()
import benepar, spacy
nlp = spacy.load('en_core_web_md')
if spacy.__version__.startswith('2'):
    nlp.add_pipe(benepar.BeneparComponent("benepar_en3"))
else:
    nlp.add_pipe("benepar", config={"model": "benepar_en3"})

# Function for quick sentence processing
def nlp_sents(string):
    return list(nlp(string).sents)

# Golden Data file paths
dec_path_golden = "../Annotation/declarative_golden_set.json"
pol_path_golden = ".../Annotation/polar_golden_set.json"
alt_path_golden = "../Annotation/alternative_golden_set.json"
const_path_golden = "../Annotation/constituent_golden_set.json"
adv_path_golden = "../Annotation/adversarials_golden_set.json"
flat_path_golden = "../Annotation/golden_sets_flattened.json"

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
adv_test = pd.read_json(adv_path_golden, orient='index')

## Parse Golden sentences 

In [4]:
def parse_flat_golden(filename:str):
    golden_df = pd.read_json(filename, orient = 'index')
    golden_parses = []
    parser_parses = []
    for sent in golden_df.sentence.value_counts().to_dict().keys() :
        parses = [dict(row) for i,row in (golden_df[golden_df.sentence == sent]).iterrows()]
        golden_parses.append(parses)
        sent_doc = nlp(sent)
        parsed_sent = list(sent_doc.sents)[0]
        parser_parses.append(parser.parse_clauses(parsed_sent))
    return (parser_parses, golden_parses)

flat_parsed, flat_golden = parse_flat_golden(flat_path_golden)
adv_parsed, adv_golden = parse_flat_golden(adv_path_golden)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


## Evaluation functions

In [32]:
adv_parses = sum([len(p) for p in adv_parsed])

In [33]:
def get_predicate_string(predicate):
    if len(predicate) ==0:
        return ''
    pred_string = ''
    for item in predicate:
        pred_string += str(item['lemma']) + ' '
    return pred_string[:-1]


def filter_sentences_idx(filt):
    return [idx for idx in [i for i, e in enumerate(flat_golden) if filt(e)] ]

single_idx = filter_sentences_idx(lambda x: len(x) == 1)
multiple_idx = filter_sentences_idx(lambda x: len(x) > 1)



def compare_data(parsed,golden,feature):
    if feature=='predicate':
        return [[any([(get_predicate_string(gp['predicate']) == get_predicate_string(e['predicate'])) for e in parsed[i]])
                 for gp in gold]  
                for i,gold in enumerate(golden)]
    return [[any([gp[feature] == e[feature] for e in parsed[i]]) 
             for gp in gold]  
            for i,gold in enumerate(golden)]


### F1-scores for clause detection

In [40]:
def get_f1(tp, fp, fn):
  precision = tp/(tp+fp)
  recall = tp/(tp+fn)
  f1 = (2*precision*recall)/(precision + recall)
  return [precision, recall, f1]

def get_stats(parser,golden,subset='overall'):
    
    tp = 0
    tn = len(adv_golden) - adv_parses if subset == 'overall' else 0
    fp = adv_parses if subset == 'overall' else 0
    fn = 0 
    
    for i in range(len(golden)):

        
            
        gp = flat_golden[i]
        bp = flat_parsed[i]
        
        if subset == 'single':
            if len(gp) != 1:
                continue
        elif subset == 'multiple':
            if len(gp) == 1:
                continue
        
        if len(gp) == len(bp):
            tp += len(gp)
        elif len(gp) > len(bp):
            fn += len(gp) - len(bp)
            tp += len(bp)
        elif len(gp) < len(bp):
            fp += len(bp) - len(gp)
            tp += len(gp)
    
    print(f"---------Embedded Clause Detection, {subset} Clause F1---------")
    print("True Positives: ", tp, "\nFalse Positives: ", fp, "\nFalse Negatives: ", fn)
    print("Precision, Recall, F1: ", get_f1(tp, fp, fn))
    print("Accuracy: ", (tp+tn)/(tp+fp+fn+tn))
    print()

In [41]:
get_stats(flat_parsed,flat_parsed,'single')
get_stats(flat_parsed,flat_parsed,'multiple')
get_stats(flat_parsed,flat_parsed,'overall')

---------Embedded Clause Detection, single Clause F1---------
True Positives:  357 
False Positives:  40 
False Negatives:  21
Precision, Recall, F1:  [0.8992443324937027, 0.9444444444444444, 0.9212903225806451]
Accuracy:  0.854066985645933

---------Embedded Clause Detection, multiple Clause F1---------
True Positives:  123 
False Positives:  7 
False Negatives:  26
Precision, Recall, F1:  [0.9461538461538461, 0.825503355704698, 0.881720430107527]
Accuracy:  0.7884615384615384

---------Embedded Clause Detection, overall Clause F1---------
True Positives:  480 
False Positives:  53 
False Negatives:  47
Precision, Recall, F1:  [0.900562851782364, 0.9108159392789373, 0.9056603773584905]
Accuracy:  0.8540145985401459



## Overall Feature Detection Acuraccy

- Clause
- Clause type
- Predicate

In [63]:
# Failed sentence reproduction 
same_sentences = compare_data(flat_parsed,flat_golden,'sentence')
print('Overall sentence accuracy', np.mean(list(map(any,same_sentences))))

Overall sentence accuracy 0.9351230425055929


In [64]:
# Clause detection
detected_clauses = [len(x)>0 for x in flat_parsed]
print('Overall detection accuracy', np.mean(detected_clauses))
# clause

correct_clauses = compare_data(flat_parsed,flat_golden,'clause')
print('Overall clause accuracy', np.mean([np.mean(clause) for clause in correct_clauses]))

# Predicate detection

correct_predicates = compare_data(flat_parsed,flat_golden,'predicate')
print('Overall predicate accuracy', np.mean([np.mean(clause) for clause in correct_predicates]))

# Type
correct_types = compare_data(flat_parsed,flat_golden,'type')
print('Overall type accuracy', np.mean([np.mean(clause) for clause in correct_types]))


Overall detection accuracy 0.9440715883668904
Overall clause accuracy 0.8113348247576435
Overall predicate accuracy 0.9002609992542877
Overall type accuracy 0.9093959731543624


### Single Clause Evaluation

In [44]:
parsed_single = [flat_parsed[i] for i in single_idx]
golden_single = [flat_golden[i] for i in single_idx]


# clause and type

correct_clauses = compare_data(parsed_single,golden_single,'clause')
print('Clause accuracy', np.mean(correct_clauses))

correct_types = compare_data(parsed_single,golden_single,'type')
print('Type accuracy', np.mean(correct_types))

# Predicate detection

correct_predicates = compare_data(parsed_single,golden_single,'predicate')
print('Precicate accuracy',np.mean(correct_predicates))

failed_single_predicates = [golden_single[i][0]['sentence'] for i,e in enumerate(correct_predicates) if (not e[0] and correct_clauses[i])]



Clause accuracy 0.8306878306878307
Type accuracy 0.9153439153439153
Precicate accuracy 0.9126984126984127


### Multiple Clause Evaluation

In [45]:
parsed_multiple = [flat_parsed[i] for i in multiple_idx]
golden_multiple = [flat_golden[i] for i in multiple_idx]

# clause and type

correct_clauses = compare_data(parsed_multiple,golden_multiple,'clause')
# print(len(correct_clauses))
print('Multiple-clauses clause accuracy ',np.mean(list(map(np.mean,correct_clauses))))


# Predicate detection

correct_predicates = compare_data(parsed_multiple,golden_multiple,'predicate')
print('Multiple-clauses predicate accuracy',np.mean(list(map(np.mean,correct_predicates))))


correct_types = compare_data(parsed_multiple,golden_multiple,'type')
print('Multiple-type accuracy', np.mean(list(map(all,correct_types))))

Multiple-clauses clause accuracy  0.7053140096618357
Multiple-clauses predicate accuracy 0.8321256038647342
Multiple-type accuracy 0.8260869565217391


### Single predicate sentences

In [34]:
single_preds_idx = filter_sentences_idx(lambda x: any([len(c['predicate']) == 1 for c in x]))

parsed_single_pred = [flat_parsed[i] for i in single_preds_idx]
golden_single_pred = [flat_golden[i] for i in single_preds_idx]

# Detection

detected_single_pred = [len(gold) == len(parsed_single_pred[i]) for i,gold in enumerate(golden_single_pred)]
print('detection accuracy', np.mean(detected_single_pred))


# clause and type

correct_clauses_single_pred = compare_data(parsed_single_pred,golden_single_pred,'type')
print('clause accuracy ',np.mean(list(map(np.mean,correct_clauses_single_pred))))

# Predicate detection

correct_predicates_single_pred = compare_data(parsed_single_pred,golden_single_pred,'predicate')
print('predicate accuracy',np.mean(list(map(all,correct_predicates_single_pred))))


failed_single_preds = [gp[0]['sentence'] for i,gp in enumerate(golden_single_pred) if (detected_single_pred[i] and  not any(correct_predicates_single_pred[i]))]

detection accuracy 0.8260869565217391
clause accuracy  0.9211956521739131
predicate accuracy 0.9211956521739131


## Adversarial sentences

In [52]:
false_positives = [parse for parse in adv_parsed if len(parse)>0]
false_positive_sentences = [parse[0]['sentence'] for parse in false_positives]

In [53]:
false_positive_sentences

['The phone call gets passed around a number of confused staff members, which adds to the hilarity of the video',
 'And if you do not want to purchase clothing, I do have a few clothing options if you’d like to come to the studio to take a look.',
 'March 27, 2019, Christian County, US 6: A 67 year-old Reeds Spring, MO man was killed when a Chevy Impala crossed the median and struck his Harley Davidson motorcycle, two other motorcyclist were seriously injured as one was also hit head-on and another motorcyclist was struck by debris',
 'March 25, 2019; Stone County, MO 143: An 82 year-old Crane, MO woman was killed when her Chevy HHR was struck head-on by a Ford Ranger driven by a 68 year-old Marionville, MO man',
 'February 17, 2019, Adair County, Hwy 6: A 34 year-old Brashear, MO man was seriously injured along with his 10 year-old son when a 28 year-old Canton, MO man crossed the center line on icy highway and struck their Jeep head-on',
 'Louis, MO woman and an 18 year-old Hannibal,

In [54]:
precision = sum(detected_clauses)/(sum(detected_clauses)+len(false_positive_sentences))
recall = sum(detected_clauses)/(sum(detected_clauses)+447-sum(detected_clauses))

In [55]:
(precision,recall, 2*precision*recall/(precision+recall))

(0.985981308411215, 0.9440715883668904, 0.9645714285714286)

# Failure Analyses

In [51]:
failed_detects = [gp[0]['sentence'] for i,gp in enumerate(flat_golden) if not detected_clauses[i]]
failed_clauses = [gp[0]['sentence'] for i,gp in enumerate(flat_golden) if (detected_clauses[i] and  not any(correct_clauses[i]) and all(correct_predicates[i]))] 
failed_preds = [gp[0]['sentence'] for i,gp in enumerate(flat_golden) if (all(same_sentences[i]) and detected_clauses[i] and  not any(correct_predicates[i]))]
failed_detects

['It focuses mainly on Megan and how she’s thinking and feeling, and what she’s going to do, and less so on what happens to her physically, but it was good!',
 'Through those conversations, everything was explained bout what Megan’s options were; keeping the baby or having it adopted, and what either choice would mean for her as she is 15.',
 'Jessica: And so it is this idea of just accepting who you are as a person and what you’re like as a person.',
 "Whether Vengeance will include Sam Loeb's #26, and how DC will collect Mark Verheiden's issues after Jeph Loeb departs, remains to be seen.",
 'The economy may be improving slightly but, perhaps more importantly, the other guys are talking about restricting the accessibility of birth control or whether or not women should be in the workplace or the extent to which the devil has infiltrated American institutions among other crazy things.',
 'Therefore, those who have not received the overtime pay they are entitled to may want to learn mo

### Some useful fonctions for probing

In [13]:
def replace_brackets(parsed_string):
    # Replace round brackets with square brackets
    replaced_string = parsed_string.replace('(', '[').replace(')', ']')
    # Wrap the string with \begin{forest} and \end{forest}
    final_string = '\\begin{adjustbox}{width=0.8\\linewidth}' + '\\begin{forest} ' + replaced_string + ' \\end{forest}' + '\\end{adjustbox}\\\\'
    return final_string

def copy_latex_parse(sentence):
    ps = list(nlp(sentence).sents)[0]
    return replace_brackets(ps._.parse_string)

# Find parse of sentences matching keywords
def find_parse(string):
    return [parse for parse in flat_parsed if (lambda x: (string in x[0]['sentence']) if len(x) > 0 else False)(parse)]
# Find golden parse of sentences matching keywords
def find_golden_parse(string):
    return [gp for gp in flat_golden if (string in gp[0]['sentence'])]

import nltk
def nlp_parse(sent):
    return nltk.Tree.fromstring(list(nlp(sent).sents)[0]._.parse_string).pretty_print()

In [133]:
nlp_parse(find_parse('are distinguished')[0][0]['sentence'])

                                                                                          S                                                                         
              ____________________________________________________________________________|_______________________________________________________________________   
             |                                  |         S                                                                                                       | 
             |                                  |     ____|_________________                                                                                      |  
             |                                  |    |                      VP                                                                                    | 
             |                                  |    |     _________________|_____________                                                                        |  
       



In [16]:
failed_preds

['At the post-secondary level, students fall into four categories, depending on whether their work placement is paid or unpaid, and whether their work placement is optional or a mandatory requirement for graduation.',
 "Doctors and medical malpractice insurance companies currently must live with this 'bad outcome' ruling, although it is often unknown whether a better outcome was possible.",
 "I still get confused as to whether I'm being selfish and in typing this comment I can actually feel myself looking for affirmation from you and other posters.",
 'Multiplayer will include objectives that will differ depending on whether a player has chosen to fight for the Allies or Axis.',
 'That said, it matters little whether I am over-eating or under-eating.',
 'It is unclear to us whether the word "new" in this measure applies only to MOUs or whether it also applies to other pension contracts or agreements, such as those delineated in statute or those applicable to managers and supervisors.',

In [22]:
# View the golden parses matching query
fail_idx = 7
fail_sentences = failed_preds
print('sentence', fail_sentences[fail_idx],'\n')

print('Golden',[(gp['predicate'], gp['clause']) for gp in find_golden_parse(fail_sentences[fail_idx])[0]])
print('\n')
print('Parsed',[(p['predicate'],p['clause']) for p in find_parse(fail_sentences[fail_idx])[0]])

# parser.parse_clauses(nlp_sents(failed_preds[fail_idx])[0])
nlp_parse(fail_sentences[fail_idx])

sentence The other two cats stood poised nearby, as if undecided on whether to back up their fellow or choose the better part of valor. 

Golden [([{'str': 'undecided', 'lemma': 'undecided', 'POS': 'ADJ'}, {'str': 'on', 'lemma': 'on', 'POS': 'ADP'}], 'whether to back up their fellow or choose the better part of valor')]


Parsed [([{'str': 'stood', 'lemma': 'stand', 'POS': 'VERB'}, {'str': 'poised', 'lemma': 'poise', 'POS': 'VERB'}, {'str': 'undecided', 'lemma': 'undecided', 'POS': 'ADJ'}, {'str': 'on', 'lemma': 'on', 'POS': 'ADP'}], 'whether to back up their fellow or choose the better part of valor')]
                                                                      S                                                                               
       _______________________________________________________________|_____________________________________________________________________________   
      |                           VP                                                  

In [17]:
import pyperclip 

trees = ''
for sentence in failed_preds:
    trees += replace_brackets(list(nlp(sentence).sents)[0]._.parse_string) + '\n' 
pyperclip.copy(trees)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [31]:
gold_clauses = []
for gold in flat_golden:
    for gp in gold:
        gold_clauses.append(gp['clause'])

# Quick reflection of the golden single predicates
gold_predicates = []
for gold in flat_golden:
    for gp in gold:
        gold_predicates.append(gp['predicate'])
gold_single_preds = pd.Series([pred[0]['lemma'] for pred in  filter(lambda x: len(x)==1,gold_predicates)])
gold_single_preds.value_counts()[0:10]


[pred for pred in gold_predicates if  len(pred)>2]

# Quick reflection of the parsed single predicates
parsed_predicates = []
for sent_parse in flat_parsed:
    for parse in sent_parse:
        parsed_predicates.append(parse['predicate'])
parsed_single_preds = pd.Series([pred[0]['lemma'] for pred in  filter(lambda x: len(x)==1,parsed_predicates)])
parsed_single_preds.value_counts()[0:10]

know          58
decide        30
say           26
ask           21
think         15
determine     14
tell          14
wonder        13
see           12
understand    10
Name: count, dtype: int64

In [43]:
set([' '.join(p['str'] for p in pred) for pred in gold_predicates])


{"'m aware of",
 "'m sure",
 "'re curious",
 "'re sure",
 "'s clear",
 "'s telling",
 'Claims',
 'Deciding',
 'Find out',
 'Measuring',
 'Query',
 'Wonder',
 'accepting',
 'addressing',
 'admit',
 'admits',
 'admitting',
 'agreed',
 'analyzes',
 'analyzing',
 'announced',
 'apologize for',
 'appreciate',
 'are sure',
 'are unclear as to',
 'argue',
 'argues',
 'ask',
 'asked',
 'asked about',
 'asking',
 'assess',
 'based on',
 'be irrelevant',
 'be random',
 'believe',
 'believes',
 'capture',
 'care',
 'caring about',
 'check',
 'choosing',
 'claims',
 'clarify',
 'comment on',
 'concerns',
 'conclude',
 'configure',
 'confirm',
 'confused as to',
 'consider',
 'considered',
 'considering',
 'considers',
 'control',
 'debating over',
 'decide',
 'decided',
 'decides',
 'deciding',
 'declaring',
 'defined',
 'demonstrated',
 'demonstrates',
 'denies',
 'deny',
 'depend on',
 'depending on',
 'depends on',
 'determine',
 'determining',
 'dictate',
 'discern',
 'discover',
 'discuss',
 

In [49]:
# Parsed predicates with more than 4 items (many more than the golden set)
list(pred for pred in parsed_predicates if len(pred)>4)

[[{'str': 'says', 'lemma': 'say', 'POS': 'VERB'},
  {'str': 'much', 'lemma': 'much', 'POS': 'ADJ'},
  {'str': 'has', 'lemma': 'have', 'POS': 'AUX'},
  {'str': 'enjoyed', 'lemma': 'enjoy', 'POS': 'VERB'},
  {'str': 'working', 'lemma': 'work', 'POS': 'VERB'},
  {'str': 'for', 'lemma': 'for', 'POS': 'ADP'}],
 [{'str': "'s", 'lemma': 'be', 'POS': 'AUX'},
  {'str': 'impossible', 'lemma': 'impossible', 'POS': 'ADJ'},
  {'str': 'find', 'lemma': 'find', 'POS': 'VERB'},
  {'str': 'out', 'lemma': 'out', 'POS': 'ADP'},
  {'str': 'half', 'lemma': 'half', 'POS': 'ADJ'},
  {'str': 'is', 'lemma': 'be', 'POS': 'AUX'}],
 [{'str': 'provided', 'lemma': 'provide', 'POS': 'VERB'},
  {'str': 'by', 'lemma': 'by', 'POS': 'ADP'},
  {'str': 'on', 'lemma': 'on', 'POS': 'ADP'},
  {'str': 'much', 'lemma': 'much', 'POS': 'ADJ'},
  {'str': 'public', 'lemma': 'public', 'POS': 'ADJ'},
  {'str': 'should', 'lemma': 'should', 'POS': 'AUX'},
  {'str': 'charge', 'lemma': 'charge', 'POS': 'VERB'}],
 [{'str': 'care', 'lemma'

In [48]:
# Parsed predicates with more then one verb per embedding predicate (more than in the golden set)
list(pred for pred in parsed_predicates if (lambda x: len([pr for pr in pred if pr['POS'] == 'VERB'])>1)(pred))

[[{'str': 'says', 'lemma': 'say', 'POS': 'VERB'},
  {'str': 'much', 'lemma': 'much', 'POS': 'ADJ'},
  {'str': 'has', 'lemma': 'have', 'POS': 'AUX'},
  {'str': 'enjoyed', 'lemma': 'enjoy', 'POS': 'VERB'},
  {'str': 'working', 'lemma': 'work', 'POS': 'VERB'},
  {'str': 'for', 'lemma': 'for', 'POS': 'ADP'}],
 [{'str': 'comment', 'lemma': 'comment', 'POS': 'VERB'},
  {'str': 'on', 'lemma': 'on', 'POS': 'ADP'},
  {'str': 'planning', 'lemma': 'plan', 'POS': 'VERB'}],
 [{'str': 'fall', 'lemma': 'fall', 'POS': 'VERB'},
  {'str': 'depending', 'lemma': 'depend', 'POS': 'VERB'},
  {'str': 'on', 'lemma': 'on', 'POS': 'ADP'}],
 [{'str': 'fall', 'lemma': 'fall', 'POS': 'VERB'},
  {'str': 'depending', 'lemma': 'depend', 'POS': 'VERB'},
  {'str': 'on', 'lemma': 'on', 'POS': 'ADP'}],
 [{'str': 'fall', 'lemma': 'fall', 'POS': 'VERB'},
  {'str': 'depending', 'lemma': 'depend', 'POS': 'VERB'},
  {'str': 'on', 'lemma': 'on', 'POS': 'ADP'}],
 [{'str': 'provided', 'lemma': 'provide', 'POS': 'VERB'},
  {'str'