Skip to content

Commit

Permalink
cgel.py conversion to PTB bracketing: include punctuation by default
Browse files Browse the repository at this point in the history
  • Loading branch information
nschneid committed Jul 3, 2024
1 parent 73f97be commit a197905
Showing 1 changed file with 23 additions and 12 deletions.
35 changes: 23 additions & 12 deletions cgel.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,14 +120,14 @@ def __init__(self, deprel: str, constituent: str, head: int, text: Optional[str]
else:
self.constituent = constituent
self.label = None
self.text = text
self.head = head
self.prepunct = []
self.postpunct = []
self.correct = None
self.substrings = None
self.note = None
self.xpos = None
self.text: Optional[str] = text
self.head: int = head
self.prepunct: list[str] = []
self.postpunct: list[str] = []
self.correct: Optional[str] = None
self.substrings: Optional[list[str]] = None
self.note: Optional[str] = None
self.xpos: Optional[str] = None
self._lemma = None # UD lemma

# coindexation nodes (i.e. gaps) should only hold a label
Expand Down Expand Up @@ -255,7 +255,7 @@ def __repr__(self):

class Tree:
def __init__(self):
self.tokens = {}
self.tokens: dict[int,Node] = {}
self.children = defaultdict(list)
self.labels = {}
self.heads = {}
Expand Down Expand Up @@ -359,12 +359,23 @@ def draw(self, include_metadata: bool=False):
result += f'# {k} = {v}\n'
return result + self.draw_rec(self.get_root(), 0)

def ptb_rec(self, head: int, depth: int):
result = self.tokens[head].ptb()
if self.tokens[head].constituent != 'GAP':
def ptb_rec(self, head: int, depth: int, punct=True):
    """Render the subtree rooted at token ``head`` as a bracketed PTB-style string.

    Args:
        head: Key of the subtree root in ``self.tokens`` / ``self.children``.
        depth: Nesting level of ``head``. It is only forwarded (incremented)
            to the recursive calls and does not affect the output.
        punct: When true, every entry of a node's ``prepunct`` / ``postpunct``
            is emitted as its own ``(p p)`` constituent before / after that
            node, with ``(`` and ``)`` rewritten to ``-LRB-`` / ``-RRB-``.
            The flag applies to the whole subtree, not just to ``head``.

    Returns:
        The bracketed string for the subtree.
    """
    node = self.tokens[head]
    result = ''
    if punct:
        for p in node.prepunct:
            # literal parentheses would clash with the bracketing syntax
            p = p.replace('(', '-LRB-').replace(')', '-RRB-')
            result += f'({p} {p}) '  # add constit for punctuation
    result += node.ptb()  # main contents of this node
    if node.constituent != 'GAP':
        # Recursion to child nodes. `punct` must be forwarded explicitly:
        # otherwise the children fall back to the default (True) and
        # punct=False would only suppress punctuation on the root node.
        for i in self.children[head]:
            result += ' ' + self.ptb_rec(i, depth + 1, punct=punct)
        # NOTE(review): presumably closes a bracket left open by node.ptb()
        # for non-GAP nodes -- confirm against Node.ptb().
        result += ')'
    if punct:
        for p in node.postpunct:
            p = p.replace('(', '-LRB-').replace(')', '-RRB-')
            result += f' ({p} {p})'  # add constit for punctuation
    return result

def ptb(self, include_metadata=False):
Expand Down

0 comments on commit a197905

Please sign in to comment.