In [None]:
# Mount Google Drive in this Colab runtime; the cells below read and write
# files under '/content/drive/My Drive/samples/'.
from google.colab import drive
# force_remount=True is passed so the mount is requested again even if one
# already exists (see the google.colab documentation for the exact semantics).
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Install benepar and spaCy, then download the large English spaCy model
# ('en_core_web_lg') that is loaded further down.
# NOTE(review): no versions are pinned; the recorded output of this cell shows
# benepar 0.2.0 being installed.
!pip install benepar
!pip install spacy
!python -m spacy download en_core_web_lg

Collecting benepar
  Downloading benepar-0.2.0.tar.gz (33 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch-struct>=0.5 (from benepar)
  Downloading torch_struct-0.5-py3-none-any.whl (34 kB)
Collecting sentencepiece>=0.1.91 (from benepar)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.20.3 (from transformers[tokenizers,torch]>=4.2.2->benepar)
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: benepar
  Building wheel for benepar (setup.py) ... [?25l[?25hdone
  Created wheel for benepar: filename=benepar-0.2.0-py3-none-any.whl size=37624 sha256=33e1b5f4da967f38f69fa7fa60171a0e42e6ec83f2b179d4e796054c

In [None]:
import benepar, spacy
import os
import re
from typing import List, Set, Dict, Tuple
from spacy import displacy
from spacy.matcher import Matcher
from spacy.tokenizer import Tokenizer
import warnings
import csv
import pandas as pd
import math

warnings.filterwarnings('ignore')

In [None]:
# Download the benepar model and load the large English spaCy pipeline.
benepar.download('benepar_en3')
nlp = spacy.load('en_core_web_lg')

# The benepar component is added through a different call on spaCy 2.x
# than on later spaCy versions.
running_spacy_v2 = spacy.__version__.startswith('2')
if not running_spacy_v2:
    nlp.add_pipe("benepar", config={"model": "benepar_en3"})
else:
    nlp.add_pipe(benepar.BeneparComponent("benepar_en3"))

[nltk_data] Downloading package benepar_en3 to /root/nltk_data...
[nltk_data]   Unzipping models/benepar_en3.zip.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
def _read_sample(path):
  """Return the text of the sample file at `path` with newlines turned into spaces.

  The file is opened in a `with` block so the handle is closed as soon as the
  text has been read. It is decoded as 'utf-8-sig' so that a leading
  byte-order mark is dropped instead of becoming part of the first sentence
  (the recorded print-out of the adversarial sample starts with such a mark).
  """
  with open(path, encoding='utf-8-sig') as sample_file:
    return sample_file.read().replace('\n', ' ')


# One text per clause type, plus the adversarial examples.
decl_samples = _read_sample('/content/drive/My Drive/samples/finite declarative clauses.txt')
pol_samples = _read_sample('/content/drive/My Drive/samples/finite polar interrogative clauses.txt')
const_samples = _read_sample('/content/drive/My Drive/samples/finite constituent interrogative clauses.txt')
alt_samples = _read_sample('/content/drive/My Drive/samples/finite alternative interrogative clauses.txt')
adv_samples = _read_sample('/content/drive/My Drive/samples/adversarial.txt')
#common_crawl_sample = _read_sample('/content/drive/My Drive/samples/cc_en_head-0000_sample.txt')

In [None]:
# Display the adversarial sample text that was loaded above.
print(adv_samples)

﻿I have every confidence that you’ll do well. It’s true whether you like it or not. I’ll be happy whether or not it’s true. Regardless of whether or not you’ve won, I’m proud. John is happy whichever student won the race. John told Mary, who is a good listener. John spilt what Mary was cooking. Al took what Bo left behind. Al is happy which is the best. I ate when you were out.


In [None]:
# Join the five sample texts with single spaces and parse them as one document.
all_examples = " ".join([decl_samples, pol_samples, const_samples, alt_samples, adv_samples])
doc5 = nlp(all_examples)
# Disabled debugging aid: print every sentence with its part-of-speech tags.
#for sent in doc5.sents:
#  pos = [token.pos_ for token in sent]
#  print(sent, pos)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
#verb matching - the baseline heuristics
verb_matcher = Matcher(nlp.vocab)

# SINGLES: lemmas matched as a single VERB token.
# DOUBLES: adjective lemmas matched after a form of "be".
SINGLES = ["believe", "think", "know", "wonder", "hope", "fear", "like", "regret", "see", "ask", "say", "tell", "check", "decide", "determine", "consider"]
DOUBLES = ["certain", "happy", "curious"]

# First one single-token pattern per verb lemma ...
verb_patterns = [[{"LEMMA": lemma, "POS": "VERB"}] for lemma in SINGLES]
# ... then, per adjective lemma, "be" + up to three optional tokens + the adjective.
verb_patterns += [
  [{"LEMMA": "be", "POS": "AUX"}, {"OP": "?"}, {"OP": "?"}, {"OP": "?"}, {"LEMMA": lemma}]
  for lemma in DOUBLES
]

verb_matcher.add("embedding_verb", verb_patterns)

#clause matching
clause_matcher = Matcher(nlp.vocab)

# Clause markers: "that" and "whether" must be tagged SCONJ, the wh-words match on text alone.
pattern_d = [{"TEXT": "that", "POS": "SCONJ"}]
pattern_i1 = [{"TEXT": "whether", "POS": "SCONJ"}]
pattern_c1, pattern_c2, pattern_c3, pattern_c4, pattern_c5, pattern_c6, pattern_c7 = [
  [{"TEXT": wh_word}]
  for wh_word in ("who", "what", "when", "where", "why", "how", "which")
]

clause_matcher.add("declarative", [pattern_d])
clause_matcher.add("polar", [pattern_i1])
clause_matcher.add("constituent", [pattern_c1, pattern_c2, pattern_c3, pattern_c4, pattern_c5, pattern_c6, pattern_c7])


def get_baseline(sentences, print_negs = False):
  """Annotate sentences with the embedded clauses found by the baseline heuristics.

  For every `verb_matcher` match in a sentence, the tokens after the matched
  predicate (and after a directly following preposition, if any) are treated
  as a candidate clause, which is then typed with `clause_matcher`. The
  annotations of each reported sentence are printed as a side effect.

  Args:
    sentences: iterable of spaCy sentence spans, e.g. `nlp(text).sents`.
    print_negs: when true, a sentence without any typed clause is also
      printed and returned, as `[{'sentence': <sentence text>}]`.

  Returns:
    A list with one entry per reported sentence. Each entry is a list of
    dicts with the keys 'sentence', 'predicate' (a dict with 'token', 'pos'
    and 'preposition'), 'clause' and 'type'. 'type' is the name of the first
    `clause_matcher` match ('polar' is relabelled 'alternative' when the
    clause contains " or " but no "or not"), or 'declarative' for an unmarked
    clause with exactly one VERB or exactly one AUX.
  """
  embeddings = []

  for sent in sentences:
    embedded_clauses = []
    sent_len = len(sent)

    # The candidate clause runs to the end of the sentence, minus a final
    # punctuation token. Dropping the last token unconditionally would cut a
    # real word off sentences that carry no final punctuation.
    clause_stop = sent_len
    if sent_len and sent[-1].is_punct:
      clause_stop -= 1

    #get embedding predicates
    for _, pred_start, pred_end in verb_matcher(sent):
      #default values for predicate annotation
      predicate = {"token": sent[pred_start].text, "pos": "VERB", "preposition": None}

      #adjective predicates ("be ... happy") are reported by their adjective
      last_matched = sent[pred_end - 1].text
      if last_matched in DOUBLES:
        predicate["pos"] = "ADJ"
        predicate["token"] = last_matched

      #a preposition directly after the predicate belongs to the predicate
      # (bounds-checked: the predicate may be the last token of the sentence)
      clause_start = pred_end
      if clause_start < sent_len and sent[clause_start].pos_ == "ADP":
        predicate["preposition"] = sent[clause_start].text
        clause_start += 1

      # NOTE(review): an earlier version also tried to skip one ADV/PRON/PROPN
      # token standing between the predicate and the clause, but its bounds
      # test was inverted, so that skip could never run. It is left out to keep
      # the annotations unchanged; as written it would presumably also skip a
      # clause-initial wh-pronoun ("know what he did"). Revisit together with
      # a re-evaluation of the baseline.

      #nothing follows the predicate: no clause to annotate
      if clause_start >= clause_stop:
        continue

      #get the potential clause
      clause = sent[clause_start:clause_stop]
      emb = {"sentence": sent.text, "predicate": predicate, "clause": clause.text, "type": None}

      #find clause type (only the first marker match is used)
      clause_matches = clause_matcher(clause)
      if clause_matches:
        clause_type = nlp.vocab.strings[clause_matches[0][0]]

        #separate polars from alternatives
        if clause_type == "polar" and " or " in clause.text:
          # "or not" keeps the clause polar. It may sit inside the clause, at
          # its very end, or right before a sentence-final full stop that is
          # followed by further punctuation (checked on the sentence tail).
          has_or_not = (
            " or not " in clause.text
            or clause.text.endswith(" or not")
            or " or not." in sent[clause_start:].text
          )
          if not has_or_not:
            clause_type = "alternative"

        emb["type"] = clause_type

      else:
        #type declaratives without marks
        clause_pos = [token.pos_ for token in clause]
        if clause_pos.count("VERB") == 1 or clause_pos.count("AUX") == 1:
          emb["type"] = "declarative"

      #if it was a genuine embedded clause add to the sentence's set of annotations
      if emb["type"] is not None:
        embedded_clauses.append(emb)

    #tps and fps
    if embedded_clauses:
      print(embedded_clauses)
      embeddings.append(embedded_clauses)

    #tns and fns; the sentence is stored as text, like in the positive entries
    elif print_negs:
      negative = [{'sentence': sent.text}]
      embeddings.append(negative)
      print(negative)

  return embeddings

In [None]:
#get output file for samples
ex = get_baseline(doc5.sents, print_negs=True)

with open('/content/drive/My Drive/samples/baseline_samples.csv', 'w', newline='') as csvfile:
  bwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
  bwriter.writerow(["line_number", "sents", "has_clause", "type", "predicate"])

  # Rows are numbered from 1 in the order get_baseline returned the sentences.
  for line_number, annotations in enumerate(ex, start=1):
    # Only the first annotation of a sentence is exported.
    first = annotations[0]
    # NOTE(review): the sentence is wrapped in literal double quotes before
    # csv.writer applies its own quoting on top.
    row = [line_number, '"' + str(first['sentence']) + '"']
    if len(first) > 1:
      # A full annotation: has_clause = 1 plus the clause type and predicate token.
      row += [1, first['type'], first['predicate']['token']]
    else:
      # A sentence-only entry: has_clause = 0, no type/predicate columns.
      row.append(0)
    bwriter.writerow(row)

[{'sentence': 'John believes that the movie was unpleasant.', 'predicate': {'token': 'believes', 'pos': 'VERB', 'preposition': None}, 'clause': 'that the movie was unpleasant', 'type': 'declarative'}]
[{'sentence': 'John believes the movie was unpleasant.', 'predicate': {'token': 'believes', 'pos': 'VERB', 'preposition': None}, 'clause': 'the movie was unpleasant', 'type': 'declarative'}]
[{'sentence': 'John saw that Mary didn’t like the movie.', 'predicate': {'token': 'saw', 'pos': 'VERB', 'preposition': None}, 'clause': 'that Mary didn’t like the movie', 'type': 'declarative'}]
[{'sentence': 'I hoped you’d like the movie.', 'predicate': {'token': 'hoped', 'pos': 'VERB', 'preposition': None}, 'clause': 'you’d like the movie', 'type': 'declarative'}]
[{'sentence': 'John was happy that the movie was good.', 'predicate': {'token': 'happy', 'pos': 'ADJ', 'preposition': None}, 'clause': 'that the movie was good', 'type': 'declarative'}]
[{'sentence': 'John was happy the movie was good.', '

In [None]:
#sets - natural

# Gold-standard files (the '*_golden_set_checked.csv' files), loaded in the
# order declarative, polar, constituent, alternative, adversarial.
golds = [
  pd.read_csv(gold_path)
  for gold_path in (
    '/content/drive/My Drive/samples/declarative_golden_set_checked.csv',
    '/content/drive/My Drive/samples/polar_golden_set_checked.csv',
    '/content/drive/My Drive/samples/constituent_golden_set_checked.csv',
    '/content/drive/My Drive/samples/alternative_golden_set_checked.csv',
    '/content/drive/My Drive/samples/adversarials_golden_set_checked.csv',
  )
]

# Individual names for the same five frames.
decl_gold, pol_gold, const_gold, alt_gold, adv_gold = golds


In [None]:
#sets - samples
# Baseline predictions written by the sample export cell.
out_samp = pd.read_csv('/content/drive/My Drive/samples/baseline_samples.csv')

# Gold annotations for the samples, in the order declarative, polar,
# constituent, alternative, adversarial.
samples = [
  pd.read_csv(sample_path)
  for sample_path in (
    '/content/drive/My Drive/samples/golden_sample_declarative.csv',
    '/content/drive/My Drive/samples/golden_sample_polar.csv',
    '/content/drive/My Drive/samples/golden_sample_constituent.csv',
    '/content/drive/My Drive/samples/golden_sample_alternative.csv',
    '/content/drive/My Drive/samples/golden_sample_adversarial.csv',
  )
]

# Individual names for the same five frames.
dec_samp, pol_samp, con_samp, alt_samp, adv_samp = samples

In [None]:
#get output files for golden sets

# One output file per gold set; `golds` holds the gold frames in this same order.
gold_types = ['declarative', 'polar', 'constituent', 'alternative', 'adversarial']

for typ, gold in zip(gold_types, golds):
  filepath = '/content/drive/My Drive/samples/' + 'baseline_' + typ + '_golden.csv'

  with open(filepath, 'w', newline='') as csvfile:
    bwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    bwriter.writerow(["line_number", "sents", "has_clause", "type", "predicate"])

    for line_number, sen in zip(list(gold['line_number']), list(gold['sents'])):
      sen_doc = nlp(sen)
      out = get_baseline(sen_doc.sents, print_negs=True)

      # spaCy may split one gold line into several sentences. Look at the
      # annotations of all of them, so a clause found in a later sentence is
      # not lost; a full annotation has more keys than just 'sentence'.
      annotations = [ann for per_sentence in out for ann in per_sentence]
      hit = next((ann for ann in annotations if len(ann.keys()) > 1), None)

      if hit is not None:
        bwriter.writerow([line_number, '"' + str(hit['sentence']) + '"', 1, hit['type'], hit['predicate']['token']])

      else:
        # No clause anywhere in the line: report the first parsed sentence, or
        # the raw gold line when the parse produced no sentence at all.
        shown = annotations[0]['sentence'] if annotations else sen
        bwriter.writerow([line_number, '"' + str(shown) + '"', 0])

[{'sentence': Luminosity Gaming may be about to recruit a new CS:GO player, as VPEsports reports that Ricardo 'boltz'}]
[{'sentence': Prass is set to rejoin the side to replace Gustavo 'yeL' Knittel.}]
[{'sentence': 'I have always loved fashion and believe that it plays an integral role in self expression', 'predicate': {'token': 'believe', 'pos': 'VERB', 'preposition': None}, 'clause': 'that it plays an integral role in self', 'type': 'declarative'}]
[{'sentence': Being a Conroe and Woodlands Family Photographer, I find that emotions are the most noticeable thing in my photographs (especially since I shoot with a candid approach to my sessions)}]
[{'sentence': 'I can honestly say that that is precisely what happened on this day', 'predicate': {'token': 'say', 'pos': 'VERB', 'preposition': None}, 'clause': 'that that is precisely what happened on this', 'type': 'declarative'}]
[{'sentence': 'I have decided that I want her to be my pen pal and take it back old school', 'predicate': {'to

In [None]:
#predicted sets - natural
# Baseline predictions for the gold sets, as written by the golden-set export
# cell, in the order declarative, polar, constituent, alternative, adversarial.
out_dec, out_pol, out_con, out_alt, out_adv = [
  pd.read_csv(prediction_path)
  for prediction_path in (
    '/content/drive/My Drive/samples/baseline_declarative_golden.csv',
    '/content/drive/My Drive/samples/baseline_polar_golden.csv',
    '/content/drive/My Drive/samples/baseline_constituent_golden.csv',
    '/content/drive/My Drive/samples/baseline_alternative_golden.csv',
    '/content/drive/My Drive/samples/baseline_adversarial_golden.csv',
  )
]

In [None]:
#stats for evaluation

#contains clause, overall - precision and recall
def contains_clause_f1(pred_list, true_list):
  """Print precision, recall and F1 of the binary "contains a clause" labels.

  Args:
    pred_list: predicted labels (1 = clause, 0 = no clause), indexable by position.
    true_list: gold labels, aligned with `pred_list` by position.

  Positions whose labels are not 0 or 1 are not counted. A score whose
  denominator is zero (no predicted positives, no gold positives, or
  precision + recall == 0) is reported as 0.0 instead of raising
  ZeroDivisionError. Nothing is returned.
  """
  tp = 0
  fp = 0
  fn = 0

  # True negatives are not needed for precision, recall or F1.
  for s in range(len(pred_list)):
    predicted = pred_list[s]
    actual = true_list[s]
    if predicted == 1 and actual == 1:
      tp += 1
    elif predicted == 1 and actual == 0:
      fp += 1
    elif predicted == 0 and actual == 1:
      fn += 1

  precision = tp/(tp+fp) if (tp+fp) else 0.0
  recall = tp/(tp+fn) if (tp+fn) else 0.0
  f1 = (2*precision*recall)/(precision + recall) if (precision + recall) else 0.0

  print("Overall Clause Prediction Precision: ", precision)
  print("Overall Clause Prediction Recall: ", recall)
  print("Overall Clause Prediction F1: ", f1)



#contains clause, per type - accuracy
def contains_clause_acc(typ, pred_list, true_list):
  """Print the accuracy of the binary "contains a clause" labels for one set.

  Args:
    typ: label printed in front of the result (e.g. "Polar").
    pred_list: predicted labels (1 = clause, 0 = no clause), indexable by position.
    true_list: gold labels, aligned with `pred_list` by position.

  A position counts as correct when prediction and gold are both 0 or both 1.
  The denominator is the number of predictions; with no predictions the
  accuracy is undefined and reported as nan instead of raising
  ZeroDivisionError. Nothing is returned.
  """
  correct = 0
  total = len(pred_list)

  for s in range(total):
    if pred_list[s] == 0 and true_list[s] == 0:
      correct += 1
    elif pred_list[s] == 1 and true_list[s] == 1:
      correct += 1

  accuracy = correct/total if total else float('nan')

  print(typ, "Clause Prediction Accuracy: ", accuracy)



#type - accuracy
def type_acc(typ, pred_list, true_list):
  """Print the clause-type accuracy for one set.

  Args:
    typ: label printed in front of the result (e.g. "Polar").
    pred_list: predicted clause types, indexable by position; only positions
      holding a string are scored.
    true_list: gold clause types, aligned with `pred_list` by position.

  When no prediction is a string there is nothing to score; the accuracy is
  then reported as nan instead of raising ZeroDivisionError. Nothing is returned.
  """
  correct = 0
  total = 0

  for s in range(len(pred_list)):
    # Positions without a string prediction are left out of the denominator.
    if type(pred_list[s]) == str:
      total += 1
      if true_list[s] == pred_list[s]:
        correct += 1

  accuracy = correct/total if total else float('nan')

  print(typ, "Type Prediction Accuracy: ", accuracy)



#predicate - accuracy
def predicate_acc(typ, pred_list, true_list):
  """Print the embedding-predicate accuracy for one set.

  Args:
    typ: label printed in front of the result (e.g. "Polar").
    pred_list: predicted predicate tokens, indexable by position; only
      positions holding a string are scored.
    true_list: gold predicate tokens, aligned with `pred_list` by position.

  When no prediction is a string there is nothing to score; the accuracy is
  then reported as nan instead of raising ZeroDivisionError. Nothing is returned.
  """
  correct = 0
  total = 0

  for s in range(len(pred_list)):
    # Positions without a string prediction are left out of the denominator.
    if type(pred_list[s]) == str:
      total += 1
      if true_list[s] == pred_list[s]:
        correct += 1

  accuracy = correct/total if total else float('nan')

  print(typ, "Predicate Prediction Accuracy: ", accuracy)


In [None]:
#get sample stats
sample_labels = ["Declarative", "Polar", "Constituent", "Alternative", "Adversarial"]

full_pred = list(out_samp['has_clause'])
full_true = []

for i in samples:
  full_true = full_true + list(i['has_clause'])

# The predictions are compared with the gold labels by position, so both
# sides must describe the same number of sentences.
assert len(full_pred) == len(full_true), (
  "baseline_samples.csv has %d rows but the gold sample sets have %d rows in total"
  % (len(full_pred), len(full_true))
)

contains_clause_f1(full_pred, full_true)

print()

# Row range of each sample set inside the concatenated baseline output,
# derived from the size of the matching gold set instead of assuming ten
# sentences per set.
set_bounds = []
first_row = 0
for i in samples:
  set_bounds.append((first_row, first_row + len(i)))
  first_row += len(i)

#contains clause, per set (adversarial included)
clause_col = list(out_samp['has_clause'])
dec_pred, pol_pred, con_pred, alt_pred, adv_pred = [clause_col[lo:hi] for lo, hi in set_bounds]

for label, pred, samp in zip(sample_labels, [dec_pred, pol_pred, con_pred, alt_pred, adv_pred], samples):
  contains_clause_acc(label, pred, list(samp['has_clause']))

print()

#clause type, for the first four sets only (zip stops before the adversarial set)
type_col = list(out_samp['type'])
dec_pred, pol_pred, con_pred, alt_pred = [type_col[lo:hi] for lo, hi in set_bounds[:4]]

for label, pred, samp in zip(sample_labels, [dec_pred, pol_pred, con_pred, alt_pred], samples):
  type_acc(label, pred, list(samp['type']))

print()

#embedding predicate, for the first four sets only
predicate_col = list(out_samp['predicate'])
dec_pred, pol_pred, con_pred, alt_pred = [predicate_col[lo:hi] for lo, hi in set_bounds[:4]]

for label, pred, samp in zip(sample_labels, [dec_pred, pol_pred, con_pred, alt_pred], samples):
  predicate_acc(label, pred, list(samp['predicate']))


Overall Clause Prediction Precision:  0.9047619047619048
Overall Clause Prediction Recall:  0.95
Overall Clause Prediction F1:  0.9268292682926829

Declarative Clause Prediction Accuracy:  0.9
Polar Clause Prediction Accuracy:  1.0
Constituent Clause Prediction Accuracy:  1.0
Alternative Clause Prediction Accuracy:  0.9
Adversarial Clause Prediction Accuracy:  0.6

Declarative Type Prediction Accuracy:  1.0
Polar Type Prediction Accuracy:  1.0
Constituent Type Prediction Accuracy:  1.0
Alternative Type Prediction Accuracy:  1.0

Declarative Predicate Prediction Accuracy:  1.0
Polar Predicate Prediction Accuracy:  1.0
Constituent Predicate Prediction Accuracy:  1.0
Alternative Predicate Prediction Accuracy:  1.0


In [None]:
#get golden stats
set_labels = ["Declarative", "Polar", "Constituent", "Alternative", "Adversarial"]
predicted_sets = [out_dec, out_pol, out_con, out_alt, out_adv]
gold_sets = [decl_gold, pol_gold, const_gold, alt_gold, adv_gold]

# Overall scores: predictions and gold labels of all five sets, concatenated in the same order.
full_pred = []
for frame in predicted_sets:
  full_pred = full_pred + list(frame['has_clause'])

full_true = []
for i in golds:
  full_true = full_true + list(i['has_clause'])

contains_clause_f1(full_pred, full_true)

print()

# Per-set "contains a clause" accuracy (adversarial set included).
dec_pred, pol_pred, con_pred, alt_pred, adv_pred = [list(frame['has_clause']) for frame in predicted_sets]

for label, pred, gold in zip(set_labels, [dec_pred, pol_pred, con_pred, alt_pred, adv_pred], gold_sets):
  contains_clause_acc(label, pred, list(gold['has_clause']))

print()

# Clause-type accuracy for the first four sets; zip stops before the adversarial set.
dec_pred, pol_pred, con_pred, alt_pred = [list(frame['type']) for frame in predicted_sets[:4]]

for label, pred, gold in zip(set_labels, [dec_pred, pol_pred, con_pred, alt_pred], gold_sets):
  type_acc(label, pred, list(gold['type']))

print()

# Predicate accuracy for the first four sets.
dec_pred, pol_pred, con_pred, alt_pred = [list(frame['predicate']) for frame in predicted_sets[:4]]

for label, pred, gold in zip(set_labels, [dec_pred, pol_pred, con_pred, alt_pred], gold_sets):
  predicate_acc(label, pred, list(gold['predicate']))


Overall Clause Prediction Precision:  0.9746835443037974
Overall Clause Prediction Recall:  0.5043668122270742
Overall Clause Prediction F1:  0.6647482014388489

Declarative Clause Prediction Accuracy:  0.4690265486725664
Polar Clause Prediction Accuracy:  0.5
Constituent Clause Prediction Accuracy:  0.5983606557377049
Alternative Clause Prediction Accuracy:  0.42696629213483145
Adversarial Clause Prediction Accuracy:  0.9459459459459459

Declarative Type Prediction Accuracy:  0.9433962264150944
Polar Type Prediction Accuracy:  0.8507462686567164
Constituent Type Prediction Accuracy:  0.958904109589041
Alternative Type Prediction Accuracy:  0.9736842105263158

Declarative Predicate Prediction Accuracy:  0.8867924528301887
Polar Predicate Prediction Accuracy:  0.8208955223880597
Constituent Predicate Prediction Accuracy:  0.7534246575342466
Alternative Predicate Prediction Accuracy:  0.8157894736842105
