From 9f6a71749eaf84d7e54c1226eaa20827ebd13eec Mon Sep 17 00:00:00 2001 From: Perceval Wajsburt Date: Tue, 23 Nov 2021 22:07:59 +0100 Subject: [PATCH] Resolves undesirable fusions of adjacent entities The current script merges adjacent entities (i.e. no word in between) if they share the same label. So `B-label I-label B-label O` becomes `B-label I-label I-label O`. Instead of comparing the labels, the script now compares the annotation objects directly --- tools/anntoconll.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/anntoconll.py b/tools/anntoconll.py index adcd334c3..318ef25be 100644 --- a/tools/anntoconll.py +++ b/tools/anntoconll.py @@ -170,6 +170,7 @@ def relabel(lines, annotations): offset_label[i] = tb prev_label = None + prev_tb = None for i, l in enumerate(lines): if not l: prev_label = None @@ -178,20 +179,23 @@ def relabel(lines, annotations): # TODO: warn for multiple, detailed info for non-initial label = None + tb = None for o in range(start, end): if o in offset_label: if o != start: print('Warning: annotation-token boundary mismatch: "%s" --- "%s"' % ( token, offset_label[o].text), file=sys.stderr) label = offset_label[o].type + tb = offset_label[o] break if label is not None: - if label == prev_label: + if tb == prev_tb: tag = 'I-' + label else: tag = 'B-' + label prev_label = label + prev_tb = tb lines[i] = [tag, start, end, token]