In [2]:
import benepar, spacy

In [3]:
# Load the transformer-based English pipeline, then attach the benepar
# constituency parser. spaCy 2.x expects a component instance, while later
# versions register the component by name and take the model via `config`.
nlp = spacy.load('en_core_web_trf')
running_spacy_v2 = spacy.__version__.startswith('2')
if not running_spacy_v2:
    nlp.add_pipe("benepar", config={"model": "benepar_en3_large"})
else:
    nlp.add_pipe(benepar.BeneparComponent("benepar_en3_large"))

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [46]:
def replace_brackets(parsed_string):
    """Turn a bracketed parse string into a LaTeX ``forest`` snippet.

    Every round bracket is swapped for the matching square bracket and the
    result is wrapped between the ``forest`` begin/end markers, separated
    from them by a single space.

    :parsed_string: str
        Parse string using ``(`` and ``)`` to delimit constituents.
    :returns: str
        The converted string wrapped in a ``forest`` environment.
    """
    round_to_square = str.maketrans('()', '[]')
    tree_body = parsed_string.translate(round_to_square)
    return ' '.join(['\\begin{forest}', tree_body, '\\end{forest}'])

In [96]:
def get_pred_info(span):
    """Collect predicate tokens from the start of ``span`` up to the first clause.

    Tokens are read left to right; reading stops at the first token whose own
    labels, or whose parent's labels, contain ``'S'`` or ``'SBAR'``. Of the
    tokens read, only those tagged VERB, ADP, ADJ or AUX are returned.

    An empty list is returned when ``span`` itself is labelled ``'SBAR'`` or
    has no children.

    NOTE: a later cell defines another ``get_pred_info`` that takes a string;
    once that cell has run, it replaces this definition.

    :span: constituent exposing ``._.labels`` / ``._.children`` and iterable
        over tokens that expose ``text``, ``lemma_``, ``pos_``, ``._.labels``
        and ``._.parent``.
    :returns: list of dicts with keys ``'str'``, ``'lemma'`` and ``'POS'``.
    """
    if 'SBAR' in span._.labels:
        return []
    if not list(span._.children):
        return []

    clause_labels = ('SBAR', 'S')
    kept_pos = ('VERB', 'ADP', 'ADJ', 'AUX')
    collected = []
    for token in span:
        parent_labels = token._.parent._.labels
        own_labels = token._.labels
        # Stop as soon as we run into (or are inside) a clause-level constituent.
        if any(lbl in parent_labels or lbl in own_labels for lbl in clause_labels):
            break
        collected.append({'str': token.text, 'lemma': token.lemma_, 'POS': token.pos_})
    return [entry for entry in collected if entry['POS'] in kept_pos]

def get_sentence(span):
    """Return the topmost constituent that contains ``span``.

    Follows ``._.parent`` links upward until a node with no parent is found
    and returns that node. If ``span`` has no parent, ``span`` itself is
    returned.

    :span: object exposing ``._.parent`` (``None`` at the top of the tree).
    :returns: the ancestor (or ``span``) whose ``._.parent`` is ``None``.
    """
    node = span
    while True:
        parent = node._.parent
        # Identity check: `== None` would defer to the parent's own __eq__.
        if parent is None:
            return node
        node = parent

def get_predicate(span):
    """Pick out the predicate-relevant tokens of ``span``.

    Verbs, adpositions and adjectives are always kept. Auxiliaries are kept
    only when the span also contains at least one adjective.

    :span: iterable of tokens exposing ``text``, ``lemma_`` and ``pos_``.
    :returns: list of dicts with keys ``'str'``, ``'lemma'`` and ``'POS'``,
        in the original token order.
    """
    records = [
        {'str': token.text, 'lemma': token.lemma_, 'POS': token.pos_}
        for token in span
    ]
    tags_present = {record['POS'] for record in records}

    wanted = {'VERB', 'ADP', 'ADJ'}
    if "ADJ" in tags_present:
        # An auxiliary only counts as part of an adjectival predicate.
        wanted.add("AUX")

    return [record for record in records if record['POS'] in wanted]
    

def VP_parent(span):
    """Find the closest VP ancestor of ``span`` and extract its predicate.

    Walks up the constituency tree from ``span``:

    - no parent: returns ``(False, None)``;
    - parent labelled ``VP``: the predicate is taken from the tokens running
      from the first token of that VP up to and including the first token of
      the constituent currently being examined (``span`` or the ancestor
      reached by recursion), filtered through ``get_predicate``. A predicate
      consisting solely of the token ``'is'`` is rejected with
      ``(False, None)``; otherwise ``(True, predicate)`` is returned;
    - parent labelled ``NP``: returns ``(False, None)``;
    - anything else: the search continues from the parent.

    :span: constituent exposing ``._.parent`` and token indexing.
    :returns: tuple ``(found, predicate)`` where ``predicate`` is the list
        produced by ``get_predicate`` or ``None``.
    """
    first_token_idx = span[0].i  # document-level index of the first token
    parent = span._.parent
    if parent is None:
        return (False, None)

    parent_labels = parent._.labels
    if "VP" in parent_labels:
        sentence = get_sentence(span)
        # `Token.i` is an index into the whole document, whereas slicing the
        # sentence is relative to the sentence start. Shift by the sentence's
        # first token so the window is also right for later sentences.
        offset = sentence[0].i
        window = sentence[parent[0].i - offset:first_token_idx + 1 - offset]
        predicate = get_predicate(window)
        if len(predicate) == 1 and predicate[0]['str'] == 'is':
            return (False, None)
        return (True, predicate)
    if "NP" in parent_labels:
        return (False, None)
    return VP_parent(parent)

def get_SBAR_spans(span):
    """List the outermost ``SBAR`` constituents strictly below ``span``.

    Children are visited in order. A child labelled ``'SBAR'`` is collected
    and not searched further; any other child is searched recursively.
    ``span`` itself is never part of the result, even if it is an SBAR.

    :span: constituent exposing ``._.children`` whose items expose ``._.labels``.
    :returns: list of SBAR constituents (empty when there are none).
    """
    found = []
    for child in list(span._.children):
        if 'SBAR' in child._.labels:
            found.append(child)
        else:
            found.extend(get_SBAR_spans(child))
    return found

def get_S_parent(span):
    """Return the nearest ancestor of ``span`` labelled ``'S'``.

    Follows ``._.parent`` links upward and returns the first ancestor whose
    labels contain ``'S'``. If the top of the tree is reached first, the
    topmost node visited is returned instead (``span`` itself when it has no
    parent).

    :span: object exposing ``._.parent``; ancestors expose ``._.labels``.
    :returns: the closest ``'S'`` ancestor, or the topmost node reached.
    """
    node = span
    while True:
        parent = node._.parent
        # Identity check: `== None` would defer to the parent's own __eq__.
        if parent is None:
            return node
        if "S" in parent._.labels:
            return parent
        node = parent

def get_SBAR_clause(span):
    """Collect the outermost ``'S'`` constituents at or below ``span``.

    When ``span`` itself is labelled ``'S'`` it is returned alone. Otherwise
    each child is searched in order, and the search does not continue below
    an ``'S'`` constituent once one is found.

    :span: constituent exposing ``._.labels`` and ``._.children``.
    :returns: list of ``'S'`` constituents (empty when there are none).
    """
    if 'S' in span._.labels:
        return [span]
    # Recursing into a child labelled 'S' yields exactly [child], so one
    # flattening comprehension covers both the match and the descent.
    return [
        clause
        for child in list(span._.children)
        for clause in get_SBAR_clause(child)
    ]

def get_clause_type(span):
    """Classify an embedded clause by how it opens.

    The decision is made on lower-cased text, using the clause's first child
    constituent as the "opener":

    - opener contains ``whether``: ``'alternative'`` if the clause contains
      ``' or '`` but not ``'or not'``, otherwise ``'polar'``;
    - opener contains one of who / what / when / where / why / how / which:
      ``'constituent'``;
    - anything else: ``'declarative'``.

    Matching is by substring, so a multi-word opener is matched anywhere in
    its text.

    :span: constituent whose ``str()`` is the clause text and which exposes
        ``._.children`` (must have at least one child).
    :returns: str
        One of ``'polar'``, ``'alternative'``, ``'constituent'``,
        ``'declarative'``.
    """
    # Work on lower-cased strings so the heuristics are case-insensitive.
    clause_text = str(span).lower()
    opener = str(list(span._.children)[0]).lower()

    if 'whether' in opener:
        # "whether ... or not" is still a polar question; any other " or "
        # signals a choice between alternatives.
        if 'or not' not in clause_text and ' or ' in clause_text:
            return 'alternative'
        return 'polar'

    wh_words = ('who', 'what', 'when', 'where', 'why', 'how', 'which')
    for wh_word in wh_words:
        if wh_word in opener:
            return 'constituent'

    return 'declarative'

In [97]:
#extract the string, lemma, and pos tags for the relevant tokens in the predicate
def get_pred_info(pred_str):
    """Parse a predicate string and return its predicate-relevant tokens.

    The input is converted to ``str`` if needed and run through the
    module-level ``nlp`` pipeline. Verbs, adpositions and adjectives are
    always kept; auxiliaries are kept only when the parsed text also contains
    an adjective.

    NOTE: this reuses the name of the earlier ``get_pred_info(span)``; once
    this cell has run, the earlier definition is no longer reachable.

    :pred_str: str, or any object whose ``str()`` is the predicate text.
    :returns: list of dicts with keys ``'str'``, ``'lemma'`` and ``'POS'``.
    """
    text = pred_str if type(pred_str) is str else str(pred_str)
    doc = nlp(text)

    tags_present = [token.pos_ for token in doc]
    keep_aux = "ADJ" in tags_present

    selected = []
    for token in doc:
        tag = token.pos_
        if tag in ('VERB', 'ADP', 'ADJ') or (keep_aux and tag == "AUX"):
            selected.append({'str': token.text, 'lemma': token.lemma_, 'POS': tag})
    return selected

In [98]:
def parse_SBAR_flat(span):
    """Flatten all embedded clauses below ``span`` into one list.

    For every outermost SBAR under ``span``, any SBARs nested inside it are
    processed first (so they appear earlier in the result). The SBAR itself
    is then recorded, unless ``VP_parent`` reports no embedding predicate or
    the text of its first child is one of ``because`` / ``since`` / ``while``.

    :span: constituent to search.
    :returns: list of dicts with keys ``'predicate'`` (from ``VP_parent``),
        ``'type'`` (from ``get_clause_type``) and ``'clause'`` (the SBAR
        constituent itself).
    """
    skipped_openers = ['because', 'since', 'while']
    results = []
    for sbar in get_SBAR_spans(span):
        is_embedded, predicate = VP_parent(sbar)
        opener = str(list(sbar._.children)[0]).lower()

        # Descend first so nested clauses precede the clause containing them.
        if len(get_SBAR_spans(sbar)) > 0:
            results.extend(parse_SBAR_flat(sbar))

        if is_embedded and opener not in skipped_openers:
            results.append({
                'predicate': predicate,
                'type': get_clause_type(sbar),
                'clause': sbar,
            })
    return results

def parse_SBAR_clause(span):
    """Flatten all embedded clauses below ``span``, with clauses as strings.

    For every outermost SBAR under ``span``, any SBARs nested inside it are
    processed first. The SBAR itself is then recorded, unless ``VP_parent``
    reports no embedding predicate or the text of its first child is one of
    ``because`` / ``since`` / ``while`` / ``as``.

    This is a module-level function, so the sibling helpers are called
    directly (the previous ``self.`` prefixes raised ``NameError``), and the
    collected list is now returned (previously the function returned
    ``None``).

    :span: constituent to search.
    :returns: list of dicts with keys ``'predicate'`` (from ``VP_parent``),
        ``'type'`` (from ``get_clause_type``) and ``'clause'`` (the SBAR
        text as ``str``).
    """
    clauses = []
    for sbar in get_SBAR_spans(span):
        parent = VP_parent(sbar)
        first_word = str(list(sbar._.children)[0]).lower()

        # Descend first so nested clauses precede the clause containing them.
        if len(get_SBAR_spans(sbar)) > 0:
            clauses += parse_SBAR_clause(sbar)

        if not parent[0] or first_word in ['because', 'since', 'while', 'as']:
            continue
        clauses += [{'predicate': parent[1],
                     'type': get_clause_type(sbar),
                     'clause': str(sbar)
                     }]
    return clauses
        
def parse_clauses(span):
    """Bundle a sentence with the embedded clauses found in it.

    :span: sentence constituent to analyse.
    :returns: dict with ``"sentence"`` (the input ``span``) and
        ``"embedded clauses"`` (the list from ``parse_SBAR_flat``).
    """
    embedded = parse_SBAR_flat(span)
    result = {"sentence": span}
    result["embedded clauses"] = embedded
    return result

In [99]:
# Smoke test: parse one sentence and show the embedded clauses found in it.
# Alternative input kept for reference:
# test_doc = nlp("I think this is something very difficult while Bill thinks it is very simple")
test_doc = nlp("He says how he much enjoyed working for the council")
test_span = list(test_doc.sents)[0]  # first sentence of the parsed document
parse_clauses(test_span)
# Debug helpers (disabled):
# print(replace_brackets(list(test_doc.sents)[0]._.parse_string))
# [(token._.parent._.labels,token) for token in get_SBAR_spans(test_span)[0]._.parent]
# NOTE(review): the recorded output of this cell starts with "parse range ..." /
# "Getting predicate in: ..." lines that no visible function prints; they
# presumably come from an earlier version of the helpers - re-run to refresh.

parse range (1, 3)
Getting predicate in: says how


{'sentence': He says how he much enjoyed working for the council,
 'embedded clauses': [{'predicate': [{'str': 'says',
     'lemma': 'say',
     'POS': 'VERB'}],
   'type': 'constituent',
   'clause': how he much enjoyed working for the council}]}

# Test Parser

In [16]:
# Longer example with several nested clauses.
test_doc = nlp("Therefore, those who have not received the overtime pay they are entitled to may want to learn more about whether they are able to recover that compensation through a wage and hour claim.")
test_span = list(test_doc.sents)[0]  # first sentence of the parsed document
# NOTE(review): this call is not the last expression of the cell, so by default
# Jupyter does not display its return value; only the print below emits output.
# The recorded output shows a dict instead, so it is presumably stale.
parse_clauses(test_span)
# [(token,token._.labels) for token in test_span]
print(replace_brackets(list(test_doc.sents)[0]._.parse_string))
# [get_pred_info(VP_parent(s)) for s in get_SBAR_spans(test_span)]
# [VP_parent(s) for s in get_SBAR_spans(test_span)]

{'sentence': Therefore, those who have not received the overtime pay they are entitled to may want to learn more about whether they are able to recover that compensation through a wage and hour claim.,
 'embedded clauses': []}

In [30]:
# Example with a polar ("whether") complement clause.
# NOTE(review): this cell only assigns names, so it produces no output itself;
# the recorded "Not many people" output presumably comes from an earlier
# version of the cell - confirm and re-run.
test_doc = nlp("Not many people know whether Mary is telling the truth")
test_span = list(test_doc.sents)[0]  # first sentence of the parsed document
# [parse_clauses(sent) for sent in test_doc.sents]
# print(replace_brackets(list(test_doc.sents)[0]._.parse_string))

Not many people

In [12]:
# Labels of the constituents directly under the first child of `test_span`.
# Relies on `test_span` being set by an earlier cell.
[x._.labels for x in list(list(test_span._.children)[0]._.children)]

[(), ('SBAR',)]

In [321]:
# NOTE(review): `x` is not defined in any cell visible here; presumably a
# DataFrame with an 'embedded clauses' column built elsewhere - confirm it is
# defined before this cell on a fresh kernel.
x.iloc[2]['embedded clauses']

[{'predicate': [{'str': 'know', 'lemma': 'know', 'POS': 'VERB'}],
  'type': 'declarative',
  'clause': it is embedded within many other clauses,
  'embedded clauses': []}]

In [160]:
# Predicate lookup for the first SBAR found in `test_span`.
# NOTE(review): the recorded output uses keys ('string', 'parse index') that
# the visible get_predicate does not produce, so it is presumably stale.
VP_parent(get_SBAR_spans(test_span)[0])

(True,
 [{'string': decide, 'lemma': 'decide', 'POS': 'VERB', 'parse index': 3},
  {'string': in, 'lemma': 'in', 'POS': 'ADP', 'parse index': 4},
  {'string': the, 'lemma': 'the', 'POS': 'DET', 'parse index': 5}])