# Parsing and Tree Expansion Utilities

This notebook contains two sets of utilities:

- Tree expansion/trigrams (from `expand_trees.py`): helpers for generating local structural trigrams from a Penn-style parse tree.
- Benepar parsing (from `parse_wmt.py`): helpers for building a SpaCy+Benepar pipeline and parsing sentences/files.

Quick usage:
- For trigrams: create an `nltk.Tree` from a bracketed string and iterate `generate_trigrams(tree)`.
- For parsing: `nlp = load_benepar()` then parse sentences and read `_.parse_string` from the first sentence of the Doc.

Dependencies: `nltk` for the trigram utilities; `spacy`, `benepar`, and the models `en_core_web_sm` and `benepar_en3` for the parsing demo.


In [1]:
!pip3 install nltk spacy benepar

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://nexus.corp.indeed.com/repository/pypi/simple
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
"""
Generate trigrams of the form:
ParentLabel -> Child₁ Child₂ … Childₙ , CurrentLabel , CurrentChildrenExpansion
from a bracketed constituency parse loaded into an `nltk.Tree`.
"""

from nltk import Tree


def child_repr(node: Tree) -> str:
    """
    Return a leaf-stripped representation of a node suitable for
    the 'children' part of the trigram, e.g.  (DT)  or  (NP (JJ) (NN)).

    Args:
        node: A subtree, or a leaf given as a plain string (the surface word).

    Returns:
        ``""`` for a leaf; ``(LABEL)`` for a node whose children contribute
        nothing (e.g. a pre-terminal whose only child is a word); otherwise
        ``(LABEL child child ...)`` built recursively from the children.
    """
    # Leaf: omit the surface word entirely
    if isinstance(node, str):
        return ""
    inner = expansion(node)
    # Nothing but words (or nothing at all) underneath: emit the bare label
    # instead of "(LABEL )" with dangling whitespace.
    if not inner:
        return f"({node.label()})"
    # Internal node: show its label plus its own children's representations
    return f"({node.label()} {inner})"


def expansion(node: Tree) -> str:
    """
    Return space-separated representations of the immediate children.

    Word leaves render as empty strings and are skipped, so a node that only
    dominates words yields ``""`` rather than a whitespace-only string.
    """
    rendered = (child_repr(child) for child in node)
    return " ".join(rep for rep in rendered if rep)


def generate_trigrams(tree: Tree, sent_id=None):
    """
    Yield one trigram string per (parent, child) pair in ``tree``.

    Each string has the form ``Parent -> Siblings, Child, ChildExpansion``.
    When ``sent_id`` is given, the string is prefixed with ``SENT_<id>`` and
    a tab.

    Args:
        tree: Parse tree whose subtrees are visited via ``tree.subtrees()``.
        sent_id: Optional sentence identifier used for the prefix.
    """
    for parent in tree.subtrees():
        # Only subtree children take part; word leaves are plain strings.
        subtree_children = [kid for kid in parent if isinstance(kid, Tree)]
        sibling_labels = " ".join(kid.label() for kid in subtree_children)
        for current in subtree_children:
            rendered = expansion(current)
            # An empty expansion (e.g. a pre-terminal over a single word)
            # produces no trigram.
            if not rendered:
                continue
            trigram = f"{parent.label()} -> {sibling_labels}, {current.label()}, {rendered}"
            yield trigram if sent_id is None else f"SENT_{sent_id}\t{trigram}"

In [3]:
"""
Read one-sentence-per-line text, parse each sentence with Benepar,
and write the corresponding Penn-style constituency trees.
"""

from pathlib import Path

# ---- choose a parser implementation ----------------------------------------
# Here we use SpaCy + Benepar.  Feel free to swap in CoreNLP, Berkeley, etc.
import spacy, benepar

def load_benepar():
    """
    Build a spaCy pipeline with a Benepar constituency-parsing component.

    The rest of this notebook reads the bracketed parse from a sentence span
    of the processed Doc, i.e. ``next(doc.sents)._.parse_string``.

    Returns:
        The spaCy pipeline with the ``benepar`` component added.

    Raises:
        RuntimeError: If the ``benepar`` component cannot be added even after
            attempting to download the ``benepar_en3`` model.
    """

    def _attach_parser(pipeline):
        # Register the Benepar component backed by the benepar_en3 model.
        pipeline.add_pipe("benepar", config={"model": "benepar_en3"})

    # Prefer the small English model; if loading fails for any reason, fall
    # back to a blank English pipeline that only splits sentences.
    try:
        nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])
    except Exception:
        nlp = spacy.blank("en")
        if "sentencizer" not in nlp.pipe_names:
            nlp.add_pipe("sentencizer")

    # First attempt assumes the Benepar model is already available; on
    # failure, try downloading it once and attach again.
    try:
        _attach_parser(nlp)
    except Exception:
        try:
            if hasattr(benepar, "download"):
                benepar.download("benepar_en3")
            _attach_parser(nlp)
        except Exception as download_error:
            raise RuntimeError(
                "Benepar model 'benepar_en3' is required and could not be installed."
            ) from download_error
    return nlp

def parse_file(in_path: Path, out_path: Path, nlp):
    """
    Parse a one-sentence-per-line text file and write one parse per line.

    Args:
        in_path: UTF-8 text file to read, one sentence per line.
        out_path: UTF-8 file to (over)write with the bracketed parses.
        nlp: Callable pipeline; ``nlp(text).sents`` must yield spans that
            expose ``_.parse_string``.

    Blank input lines produce blank output lines, so input and output stay
    line-aligned. Only the first sentence detected on a line is written.
    """
    with in_path.open(encoding="utf8") as source, out_path.open("w", encoding="utf8") as sink:
        for raw_line in source:
            text = raw_line.strip()
            if text:
                # Take the parse of the first sentence span only.
                first_sentence = next(nlp(text).sents)
                sink.write(first_sentence._.parse_string + "\n")
            else:
                sink.write("\n")




In [4]:
# Example: generate trigrams from a simple parse
from nltk import Tree

# A small hand-written Penn-style parse used as demo input.
example = "(S (NP (DT The) (NN boy)) (VP (VBZ is) (VP (VBG playing) (PP (IN with) (NP (DT the) (NN ball))))))"
tree = Tree.fromstring(example)
# Print every trigram of the demo tree on its own line, tagged as sentence 1.
print(*generate_trigrams(tree, sent_id=1), sep="\n")

SENT_1	S -> NP VP, NP, (DT) (NN)
SENT_1	S -> NP VP, VP, (VBZ) (VP (VBG) (PP (IN) (NP (DT) (NN))))
SENT_1	VP -> VBZ VP, VP, (VBG) (PP (IN) (NP (DT) (NN)))
SENT_1	VP -> VBG PP, PP, (IN) (NP (DT) (NN))
SENT_1	PP -> IN NP, NP, (DT) (NN)


In [6]:
# Example: parse a sentence with SpaCy + Benepar
# Requires: pip install spacy benepar && python -m spacy download en_core_web_sm
# If benepar model is missing: in Python, run: benepar.download('benepar_en3')

try:
    nlp = load_benepar()
    doc = nlp("The boy is playing with the ball")
    # The bracketed parse is read from the first sentence span of the Doc.
    first_sentence = next(doc.sents)
    parse_str = first_sentence._.parse_string
    print(parse_str)
except Exception as err:
    # Demo cell: report the failure instead of stopping the notebook.
    print("Parsing demo failed. Ensure spacy, benepar, and models are installed.")
    print("Error:", err)


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


(S (NP (DT The) (NN boy)) (VP (VBZ is) (VP (VBG playing) (PP (IN with) (NP (DT the) (NN ball))))))


In [None]:
# Parse the dataset file row-wise with progress and write bracketed parses
from pathlib import Path
import time, string

try:
    from tqdm.auto import tqdm
except Exception:
    tqdm = None

# Input file: one sentence per line
in_path = Path("/Users/vvalluri/research/indic_comp/iwslt_2014_train/train.en.for.features")
# Output file: append .parse to the input filename
out_path = Path(str(in_path) + ".parse")

# Count total lines so the progress bar can show a percentage. The file is
# opened in a `with` block so the handle is closed as soon as counting ends.
# Best-effort: if counting fails, the run continues without a progress bar.
try:
    with in_path.open(encoding="utf8") as count_handle:
        total_lines = sum(1 for _ in count_handle)
except Exception:
    total_lines = None

start = time.time()
nlp = load_benepar()
print("Pipeline ready. Parsing...")

# Only create a bar when tqdm imported successfully and the line count is
# known and non-zero.
if tqdm and total_lines:
    pbar = tqdm(total=total_lines, desc="Parsing", unit="sent")
else:
    pbar = None

# Stream line-by-line, updating progress. try/finally guarantees the bar is
# closed even if parsing one of the lines raises.
try:
    with in_path.open(encoding="utf8") as fin, out_path.open("w", encoding="utf8") as fout:
        for line in fin:
            # Trim surrounding whitespace, then drop all trailing ASCII
            # punctuation characters before parsing.
            sent = line.strip().rstrip(string.punctuation)

            if not sent:
                # Keep the output line-aligned with the input.
                fout.write("\n")
            else:
                doc = nlp(sent)
                # Only the first sentence span of the Doc is written.
                tree = next(doc.sents)._.parse_string
                fout.write(tree + "\n")
            if pbar is not None:
                pbar.update(1)
finally:
    if pbar is not None:
        pbar.close()

elapsed = time.time() - start
print(f"Done. Wrote parses to: {out_path}")
print(f"Elapsed: {elapsed/60:.1f} minutes (approx)")


Pipeline ready. Parsing...


Parsing:   0%|          | 0/160239 [00:00<?, ?sent/s]

In [9]:
# Generate trigrams from the parsed output file with progress
from pathlib import Path
from nltk import Tree

try:
    from tqdm.auto import tqdm
except Exception:
    tqdm = None

# Input: one bracketed parse per line. Output: the same path with ".trigrams" appended.
parsed_path = Path("/Users/vvalluri/research/indic_comp/parsed_trees")
out_trigrams = Path(str(parsed_path) + ".trigrams")

# Count lines for progress bar. The file is opened in a `with` block so the
# handle is closed as soon as counting ends. Best-effort: if counting fails,
# the run continues without a progress bar.
try:
    with parsed_path.open(encoding="utf8") as count_handle:
        total_lines = sum(1 for _ in count_handle)
except Exception:
    total_lines = None

# Only create a bar when tqdm imported successfully and the line count is
# known and non-zero.
if tqdm and total_lines:
    pbar = tqdm(total=total_lines, desc="Trigrams", unit="tree")
else:
    pbar = None

# try/finally guarantees the bar is closed even if a line fails to load as a tree.
try:
    with parsed_path.open(encoding="utf8") as fin, out_trigrams.open("w", encoding="utf8") as fout:
        # Sentence ids are 1-based line numbers of the parse file.
        for sent_id, line in enumerate(fin, 1):
            tree_str = line.strip()
            if not tree_str:
                # Blank parse line: emit a placeholder row carrying only the sentence id.
                fout.write(f"SENT_{sent_id}\t\n")
            else:
                tree = Tree.fromstring(tree_str)
                for tri in generate_trigrams(tree, sent_id=sent_id):
                    fout.write(tri + "\n")
            if pbar is not None:
                pbar.update(1)
finally:
    if pbar is not None:
        pbar.close()

print(f"Done. Wrote trigrams to: {out_trigrams}")

Parsing:   0%|          | 0/160239 [00:19<?, ?sent/s]
Trigrams: 100%|██████████| 160239/160239 [00:26<00:00, 6077.11tree/s]

Done. Wrote trigrams to: /Users/vvalluri/research/indic_comp/parsed_trees.trigrams



