Constituency parsing is a natural language processing task that breaks each sentence of a text into its constituent phrases ("constituents"), following the formalism of context-free grammars. Stanza implements it with a shift-reduce parser, a type of parser that iteratively shifts input tokens onto a stack and reduces them to build the parse tree (source: stanfordnlp.github.io).

The ConstituencyProcessor in Stanza adds a constituency parse tree to each Sentence in the text. The bracket types depend on the treebank used, but custom models can support any set of labels as long as there is training data for them (source: stanfordnlp.github.io).

In [3]:
import stanza

# config = {
#     # Comma-separated list of processors to use
# 	'processors': 'tokenize,mwt,pos',
#     # Language code for the language to build the Pipeline in
#     'lang': 'fr',
#     # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
#     # You only need model paths if you have a specific model outside of stanza_resources
# 	# 'tokenize_model_path': './fr_gsd_models/fr_gsd_tokenizer.pt',
# 	# 'mwt_model_path': './fr_gsd_models/fr_gsd_mwt_expander.pt',
# 	# 'pos_model_path': './fr_gsd_models/fr_gsd_tagger.pt',
# 	# 'pos_pretrain_path': './fr_gsd_models/fr_gsd.pretrain.pt',
#     # Use pretokenized text as input and disable tokenization
# 	# 'tokenize_pretokenized': True
# }

  from .autonotebook import tqdm as notebook_tqdm


In [80]:
class sentenceStruct:
    """Plain record holding the named parts extracted from one sentence.

    Every field starts out as the empty string.
    """

    def __init__(self):
        # Strings are immutable, so a chained assignment is a safe way to give
        # every field the same empty starting value.
        self.condition = self.system = self.event = ""
        self.transition = self.action = self.state = ""

class SentenceSeg():
    """One sentence segment: a preposition, a verb and a noun phrase.

    All three slots start out as the empty string.
    """

    def __init__(self):
        self.preposition, self.verb, self.NounPhrase = "", "", ""

    def isEmpty(self):
        """Return True only while all three slots still equal the empty string."""
        slots = (self.preposition, self.verb, self.NounPhrase)
        return all(slot == "" for slot in slots)

In [90]:
class SentenceBuild():
    """Accumulates the words of a sentence and cuts them into text segments.

    Words handed to ``process`` are collected in ``tempData``.  Whenever a
    "VP" label or the "End" marker is seen, the collected words are stripped
    of surrounding whitespace and appended to ``data`` as one segment.
    """

    def __init__(self):
        self.segment = SentenceSeg()
        self.sentence = sentenceStruct()
        self.segments = []
        self.sentences = []

        # The two values ``state`` takes once ``process`` has acted.
        self.states = ["store", "process"]
        self.state = ""

        # Words collected since the last flush, each followed by one space.
        self.tempData = ""
        # Finished segments, in the order they were flushed.
        self.data = []

    def _flush(self):
        """Move the pending words into ``data`` and start a new segment."""
        self.state = "store"
        # An empty buffer still yields an (empty) segment.
        self.data.append(self.tempData.strip())
        self.tempData = ""

    def process(self, depRel = None, val = None):
        """Consume one tree node.

        Args:
            depRel: label of the node.  "VP" and "End" act as segment
                boundaries; any other label is ignored.
            val: the word under the node, or None when the node carries no word.

        A "VP" label takes precedence over ``val``: the pending words are
        flushed and ``val`` is not collected.  Otherwise a non-None ``val`` is
        appended to the pending words.  "End" without a word flushes whatever
        is pending.  Any other combination leaves the object untouched.
        """
        if depRel == "VP":
            self._flush()
        elif val is not None:
            self.state = "process"
            self.tempData += val + " "
        elif depRel == "End":
            # val is necessarily None in this branch.
            self._flush()



In [47]:
"""
Tree datastructure
"""

from collections import deque, Counter
from enum import Enum
from io import StringIO
import itertools
import re
import warnings

# Bracket and separator characters used when writing trees as text.
# useful more for the "is" functionality than the time savings
# (pretty_print pushes CLOSE_PAREN onto its work stack as a sentinel and
# recognises it by identity rather than equality)
CLOSE_PAREN = ')'
SPACE_SEPARATOR = ' '
OPEN_PAREN = '('

# Empty tuple constant.
# NOTE(review): presumably the shared "no children" value for leaf nodes;
# no use is visible in this part of the file - confirm.
EMPTY_CHILDREN = ()

# Matches a single '-', '=' or '#' character.
# NOTE(review): presumably used to split annotations off constituent labels;
# no use is visible in this part of the file - confirm.
CONSTITUENT_SPLIT = re.compile("[-=#]")

# These words occur in the VLSP dataset.
# The documentation claims there might be *O*, although those don't
# seem to exist in practice
WORDS_TO_PRUNE = ('*E*', '*T*', '*O*')

In [92]:
def pretty_print(tree, normalize=None):
    """
    Render a constituency tree as indented bracketed text and segment its words.

    Layout: preterminals, and nodes whose children are all preterminals, are
    written on a single line.  Every other node opens its bracket on a line
    of its own and its children follow one indent level (two spaces) deeper.

    While the tree is walked, each word and each phrase label is also handed
    to a SentenceBuild, which cuts the sentence into segments at every "VP"
    label and once more at the end of the tree.  The SentenceBuild receives
    the raw labels and words, not the normalized ones.

    Args:
        tree: root node of the tree.  Nodes must provide ``label``,
            ``children`` and ``is_preterminal()``.
        normalize: optional callable applied to every label and word before
            it is written.  By default "(" and ")" become "-LRB-" and
            "-RRB-".  If you pass your own function, make sure it rewrites
            the parens to something other than () or the brackets of the
            output will be broken.

    Returns:
        A tuple ``(text, sent)``: the pretty-printed tree and the
        SentenceBuild that collected the segments.
    """
    if normalize is None:
        def normalize(text):
            return text.replace("(", "-LRB-").replace(")", "-RRB-")

    sent = SentenceBuild()

    def bracket_preterminal(preterminal):
        # Format "(TAG word)" and register the word with the segment builder.
        word = preterminal.children[0].label
        sent.process(preterminal.label, word)
        return "%s%s %s%s" % (OPEN_PAREN, normalize(preterminal.label), normalize(word), CLOSE_PAREN)

    indent = 0
    with StringIO() as buf:
        # Explicit work stack instead of recursion; CLOSE_PAREN entries mark
        # where a bracket opened earlier has to be closed.
        stack = deque([tree])
        while stack:
            node = stack.pop()

            if node is CLOSE_PAREN:
                # if we're trying to pretty print trees, pop all off close parens
                # then write a newline
                while node is CLOSE_PAREN:
                    indent -= 1
                    buf.write(CLOSE_PAREN)
                    if not stack:
                        node = None
                        break
                    node = stack.pop()
                buf.write("\n")
                if node is None:
                    break
                # The first non-paren entry still has to be rendered.
                stack.append(node)
            elif node.is_preterminal():
                buf.write("  " * indent)
                buf.write(bracket_preterminal(node))
                # Closing brackets that follow stay on this line.
                if not stack or stack[-1] is not CLOSE_PAREN:
                    buf.write("\n")
            elif all(child.is_preterminal() for child in node.children):
                buf.write("  " * indent)
                buf.write("%s%s" % (OPEN_PAREN, normalize(node.label)))
                # Report the phrase label as well, exactly like the general
                # branch below does; otherwise a VP made only of preterminals
                # would never start a new segment.
                sent.process(node.label, None)
                for child in node.children:
                    buf.write(" " + bracket_preterminal(child))
                buf.write(CLOSE_PAREN)
                if not stack or stack[-1] is not CLOSE_PAREN:
                    buf.write("\n")
            else:
                buf.write("  " * indent)
                buf.write("%s%s\n" % (OPEN_PAREN, normalize(node.label)))
                sent.process(node.label, None)
                stack.append(CLOSE_PAREN)
                # Reversed so that the leftmost child is popped first.
                for child in reversed(node.children):
                    stack.append(child)
                indent += 1

        # Flush whatever words are still pending as the final segment.
        sent.process("End", None)
        return buf.getvalue(), sent

In [49]:
# Maps each Stanza processor name to the model package to load for it.
# NOTE(review): the second 'ner' entry, "aeroBERT-NER ", ends with a space and
# the load log captured below lists only conll03 for ner - confirm that this
# package name is intended and actually resolves.
processor_config = {
    'tokenize': 'default',
    'mwt': 'default',
    'pos' : 'default',
    'lemma' : 'default',
    'depparse' : 'default',
    'constituency' : 'wsj_bert',
    'ner' : ["CoNLL03", "aeroBERT-NER "]
}

# https://www.researchgate.net/publication/371428620_SafeAeroBERT_Towards_a_Safety-Informed_Aerospace-Specific_Language_Model?_tp=eyJjb250ZXh0Ijp7ImZpcnN0UGFnZSI6Il9kaXJlY3QiLCJwYWdlIjoiX2RpcmVjdCJ9fQ

# sentence tokenizers
# English pipeline built from the processors configured above; it is created
# once here and reused by sentTockenize().
sentence_tokenize = stanza.Pipeline(lang='en', processors=processor_config)
# sentence_tokenize_no_split = stanza.Pipeline(lang='en', processors={'tokenize': 'spacy'}, tokenize_no_ssplit=True)
# sentence_preTokenised = stanza.Pipeline(lang='en', processors={'tokenize': 'spacy'}, tokenize_pretokenized=True)




2023-09-06 15:12:08 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 4.42MB/s]                    
2023-09-06 15:12:10 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| lemma        | combined |
| constituency | wsj_bert |
| depparse     | combined |
| sentiment    | sstplus  |
| ner          | conll03  |

2023-09-06 15:12:10 INFO: Using device: cpu
2023-09-06 15:12:10 INFO: Loading: tokenize
2023-09-06 15:12:10 INFO: Loading: pos
2023-09-06 15:12:10 INFO: Loading: lemma
2023-09-06 15:12:10 INFO: Loading: constituency
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaMode

In [50]:
def sentTockenize(sentence):
    """Run the module-level ``sentence_tokenize`` pipeline on ``sentence`` and return its result."""
    return sentence_tokenize(sentence)

In [97]:
# Demo: parse one requirement sentence and print its constituency tree.
sentence = "The transition from CSWS OFF to CSWS ON can be performed by the driver or automatically"

doc = sentTockenize(sentence)
# Only the constituency tree of the first sentence in the document is used.
tree = doc.sentences[0].constituency

# pretty_print returns the rendered text and the SentenceBuild it filled;
# despite its name, `array` is that SentenceBuild object, not a list.
out, array = pretty_print(tree)
print(out)
pass

(ROOT
  (S
    (NP
      (NP (DT The) (NN transition))
      (PP
        (IN from)
        (NP (NNP CSWS) (RB OFF)))
      (PP
        (IN to)
        (NP (NNP CSWS) (IN ON))))
    (VP
      (MD can)
      (VP
        (VB be)
        (VP
          (VBN performed)
          (PP
            (PP
              (IN by)
              (NP (DT the) (NN driver)))
            (CC or)
            (ADVP (RB automatically))))))))

