In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sys
sys.path.append('../')
from src.ClauseParser import ClauseParser

## Parsing tools and data directories

In [2]:
# Clause parser under evaluation (imported from src.ClauseParser in the first cell).
parser = ClauseParser()
import benepar, spacy
# spaCy pipeline used to turn raw sentence strings into parsed sentence spans.
nlp = spacy.load('en_core_web_md')
# The benepar component is registered differently depending on the installed
# spaCy major version: 2.x takes a component object, later versions take the
# registered component name plus a config dict.
if spacy.__version__.startswith('2'):
    nlp.add_pipe(benepar.BeneparComponent("benepar_en3"))
else:
    nlp.add_pipe("benepar", config={"model": "benepar_en3"})

# Convenience helper: raw text in, list of sentence spans out
def nlp_sents(string):
    """Run the module-level ``nlp`` pipeline on ``string`` and return its sentences.

    Parameters
    ----------
    string : str
        Raw text to process.

    Returns
    -------
    list
        The items of ``doc.sents`` for the processed text, materialised as a list.
    """
    doc = nlp(string)
    sentences = list(doc.sents)
    return sentences

# Golden Data file paths (relative to this notebook's working directory).
# Both files are loaded further down with pd.read_json(..., orient='index').
adv_path_golden = "../Annotation/adversarials_golden_set.json"
flat_path_golden = "../Annotation/golden_sets_flattened.json"

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
# Load the adversarial golden file into a DataFrame for inspection.
# NOTE(review): adv_test is not referenced by any other cell visible in this notebook.
adv_test = pd.read_json(adv_path_golden, orient='index')

## Parse Golden sentences 

In [4]:
def parse_flat_golden(filename: str):
    """Parse every distinct sentence of a golden file and collect its annotations.

    The file is read with ``pd.read_json(filename, orient='index')`` and must
    expose a ``sentence`` column. Each distinct sentence is processed once, in
    the order produced by ``value_counts()``.

    Relies on the module-level ``nlp`` pipeline and ``parser`` object.

    Parameters
    ----------
    filename : str
        Path to the golden JSON file.

    Returns
    -------
    tuple
        ``(parser_parses, golden_parses)``: two lists aligned by position, one
        entry per distinct sentence. ``golden_parses[k]`` is the list of
        annotation rows (as dicts) sharing that sentence; ``parser_parses[k]``
        is the result of ``parser.parse_clauses`` on the first sentence span
        that the pipeline finds in the text.
    """
    annotations = pd.read_json(filename, orient='index')
    gold_by_sentence = []
    parsed_by_sentence = []

    # Iterating the value_counts dict visits each distinct sentence exactly once.
    for text in annotations.sentence.value_counts().to_dict():
        matching_rows = annotations[annotations.sentence == text]
        gold_by_sentence.append(
            [dict(record) for _, record in matching_rows.iterrows()]
        )

        # Only the first sentence span of the processed text is handed to the parser.
        first_span = list(nlp(text).sents)[0]
        parsed_by_sentence.append(parser.parse_clauses(first_span))

    return (parsed_by_sentence, gold_by_sentence)

# Run the parser over both golden files. Each call returns
# (parser output, golden annotations) as two lists aligned per sentence.
flat_parsed, flat_golden = parse_flat_golden(flat_path_golden)
adv_parsed, adv_golden = parse_flat_golden(adv_path_golden)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


## Evaluation functions

In [5]:
# Total number of clauses the parser produced across all adversarial sentences.
adv_parses = sum(len(sentence_parses) for sentence_parses in adv_parsed)

In [6]:
def get_predicate_string(predicate):
    """Render a predicate as its lemmas separated by single spaces.

    Parameters
    ----------
    predicate : sized iterable of mappings
        Each item must provide a ``'lemma'`` entry; the value is converted
        with ``str``.

    Returns
    -------
    str
        The space-joined lemmas, or ``''`` when ``predicate`` is empty.
    """
    if len(predicate) == 0:
        return ''
    return ' '.join(str(token['lemma']) for token in predicate)


def filter_sentences_idx(filt, golden=None):
    """Return the positions of the sentences whose annotations satisfy ``filt``.

    Parameters
    ----------
    filt : callable
        Predicate applied to each sentence's entry; truthy results are kept.
    golden : sequence, optional
        Per-sentence entries to scan. When omitted (``None``) the module-level
        ``flat_golden`` list is used, which is the original behaviour.

    Returns
    -------
    list of int
        Indices, in ascending order, of the entries for which ``filt`` is truthy.
    """
    # `is None` (not truthiness) so an explicitly passed empty list is respected.
    if golden is None:
        golden = flat_golden
    return [position for position, entry in enumerate(golden) if filt(entry)]

# Positions (into flat_golden / flat_parsed) of sentences annotated with exactly
# one golden clause, and of sentences annotated with more than one.
single_idx = filter_sentences_idx(lambda x: len(x) == 1)
multiple_idx = filter_sentences_idx(lambda x: len(x) > 1)



### F1-scores for clause detection

In [7]:
def get_f1(tp, fp, fn):
    """Compute precision, recall and F1 from raw detection counts.

    A metric whose denominator is zero is reported as ``0.0`` instead of
    raising ``ZeroDivisionError`` (e.g. when a subset contains no sentences or
    the parser produced no detections).

    Parameters
    ----------
    tp, fp, fn : int
        True-positive, false-positive and false-negative counts.

    Returns
    -------
    list of float
        ``[precision, recall, f1]``.
    """
    predicted = tp + fp
    actual = tp + fn
    precision = tp / predicted if predicted else 0.0
    recall = tp / actual if actual else 0.0

    denominator = precision + recall
    # F1 is the harmonic mean; it is undefined when both components are zero.
    f1 = (2 * precision * recall) / denominator if denominator else 0.0
    return [precision, recall, f1]

def get_stats(parser, golden, subset='overall'):
    """Print clause-detection counts, precision/recall/F1 and accuracy.

    Detection is scored purely on clause counts per sentence: for each
    sentence, the smaller of (golden clauses, parsed clauses) counts as true
    positives, any surplus golden clauses as false negatives and any surplus
    parsed clauses as false positives.

    Parameters
    ----------
    parser : sequence
        Per-sentence parser output, aligned by position with ``golden``.
    golden : sequence
        Per-sentence golden annotations.
    subset : str, default 'overall'
        ``'single'`` keeps only sentences with exactly one golden clause,
        ``'multiple'`` keeps only sentences whose golden clause count is not
        one. ``'overall'`` keeps every sentence and additionally folds in the
        adversarial set via the module-level ``adv_golden`` and ``adv_parses``.

    Returns
    -------
    None
        Results are printed; the ratios are computed with ``get_f1``.
    """
    if subset == 'overall':
        # NOTE(review): this subtracts a clause count from a sentence count;
        # the two only line up if no adversarial sentence yields more than one
        # parse -- confirm that is the intended definition of a true negative.
        true_neg = len(adv_golden) - adv_parses
        false_pos = adv_parses
    else:
        true_neg = 0
        false_pos = 0
    true_pos = 0
    false_neg = 0

    for idx in range(len(golden)):
        gold_clauses = golden[idx]
        found_clauses = parser[idx]

        # Skip sentences that do not belong to the requested subset.
        has_one_gold = len(gold_clauses) == 1
        if subset == 'single' and not has_one_gold:
            continue
        if subset == 'multiple' and has_one_gold:
            continue

        n_gold = len(gold_clauses)
        n_found = len(found_clauses)
        true_pos += min(n_gold, n_found)
        false_neg += max(n_gold - n_found, 0)
        false_pos += max(n_found - n_gold, 0)

    print(f"---------Embedded Clause Detection, {subset} Clause F1---------")
    print("True Positives: ", true_pos, "\nFalse Positives: ", false_pos, "\nFalse Negatives: ", false_neg)
    print("Precision, Recall, F1: ", get_f1(true_pos, false_pos, false_neg))
    print("Accuracy: ", (true_pos + true_neg) / (true_pos + false_pos + false_neg + true_neg))
    print()



In [8]:
# Report detection statistics for single-clause sentences, multi-clause
# sentences, and the whole set (the 'overall' run also counts the adversarial data).
get_stats(flat_parsed,flat_golden,'single')
get_stats(flat_parsed,flat_golden,'multiple')
get_stats(flat_parsed,flat_golden,'overall')

---------Embedded Clause Detection, single Clause F1---------
True Positives:  357 
False Positives:  40 
False Negatives:  21
Precision, Recall, F1:  [0.8992443324937027, 0.9444444444444444, 0.9212903225806451]
Accuracy:  0.854066985645933

---------Embedded Clause Detection, multiple Clause F1---------
True Positives:  123 
False Positives:  7 
False Negatives:  26
Precision, Recall, F1:  [0.9461538461538461, 0.825503355704698, 0.881720430107527]
Accuracy:  0.7884615384615384

---------Embedded Clause Detection, overall Clause F1---------
True Positives:  480 
False Positives:  53 
False Negatives:  47
Precision, Recall, F1:  [0.900562851782364, 0.9108159392789373, 0.9056603773584905]
Accuracy:  0.8540145985401459



## Overall Feature Identification Accuracy

Identification means that the parser is able to extract the same features as the golden set:

- Clause
- Clause type
- Predicate

In [9]:
def count_true_detections(parser, golden, subset='overall'):
    """Count the clauses that were detected, scored on per-sentence clause counts.

    For every sentence, the number of true detections is the smaller of the
    golden clause count and the parsed clause count.

    Parameters
    ----------
    parser : sequence
        Per-sentence parser output, aligned by position with ``golden``.
    golden : sequence
        Per-sentence golden annotations.
    subset : str, default 'overall'
        ``'single'`` counts only sentences with exactly one golden clause,
        ``'multiple'`` counts only sentences whose golden clause count is not
        one (the same rule ``get_stats`` applies). Any other value, including
        the default, counts every sentence. Previously this argument was
        accepted but ignored.

    Returns
    -------
    int
        Total number of true detections over the selected sentences.
    """
    detected = 0
    for idx in range(len(golden)):
        gold_clauses = golden[idx]
        found_clauses = parser[idx]

        if subset == 'single' and len(gold_clauses) != 1:
            continue
        if subset == 'multiple' and len(gold_clauses) == 1:
            continue

        detected += min(len(gold_clauses), len(found_clauses))
    return detected

def compare_data(parsed, golden, feature):
    """Count golden feature values that the parser reproduced, sentence by sentence.

    Within each sentence the comparison is a multiset match: every golden
    value may consume at most one equal parsed value, so a parsed value is
    never credited twice.

    Parameters
    ----------
    parsed : iterable
        Per-sentence lists of parsed clause mappings.
    golden : iterable
        Per-sentence lists of golden clause mappings, aligned with ``parsed``.
    feature : str
        Key to compare. For ``'predicate'`` the values are first rendered with
        ``get_predicate_string``; any other key is compared as stored.

    Returns
    -------
    int
        Number of golden values that found a matching parsed value.
    """
    if feature == 'predicate':
        def extract(clause):
            return get_predicate_string(clause['predicate'])
    else:
        def extract(clause):
            return clause[feature]

    matches = 0
    for sentence_parses, sentence_gold in zip(parsed, golden):
        wanted = [extract(gold_clause) for gold_clause in sentence_gold]
        available = [extract(parsed_clause) for parsed_clause in sentence_parses]

        for value in wanted:
            if value in available:
                matches += 1
                # Consume the match so it cannot satisfy another golden value.
                available.remove(value)
    return matches


In [10]:
def show_accuracy(parsed, golden):
    """Print clause, predicate and type identification accuracy.

    Each accuracy is the number of matching feature values reported by
    ``compare_data`` divided by the number of true detections reported by
    ``count_true_detections`` for the same data.

    Parameters
    ----------
    parsed : sequence
        Per-sentence parser output.
    golden : sequence
        Per-sentence golden annotations, aligned with ``parsed``.

    Returns
    -------
    None
        The three accuracies are printed, one per line.
    """
    total_detects = count_true_detections(parsed, golden)

    # (printed label, feature key) in the order the report lists them.
    report = (
        ('clause accuracy', 'clause'),
        ('predicate accuracy', 'predicate'),
        ('type accuracy', 'type'),
    )
    for label, feature in report:
        correct = compare_data(parsed, golden, feature)
        print(label, correct / total_detects)


### Overall Identification Accuracy

In [11]:
# Identification accuracy over every sentence of the flattened golden set.
show_accuracy(flat_parsed,flat_golden)

clause accuracy 0.86875
predicate accuracy 0.9104166666666667
type accuracy 0.9604166666666667


### Single Clause Identification

In [12]:
# Restrict both aligned lists to the sentences with exactly one golden clause.
parsed_single = [flat_parsed[i] for i in single_idx]
golden_single = [flat_golden[i] for i in single_idx]

show_accuracy(parsed_single,golden_single)

clause accuracy 0.8795518207282913
predicate accuracy 0.9663865546218487
type accuracy 0.969187675070028


### Multiple Clause Identification

In [13]:
# Restrict both aligned lists to the sentences with more than one golden clause.
parsed_multiple = [flat_parsed[i] for i in multiple_idx]
golden_multiple = [flat_golden[i] for i in multiple_idx]

show_accuracy(parsed_multiple,golden_multiple)

clause accuracy 0.8373983739837398
predicate accuracy 0.7479674796747967
type accuracy 0.9349593495934959
