# **Simple Yet Powerful: An Overlooked Architecture for Nested Named Entity Recognition - Statistics**

In [None]:
import numpy as np
from collections import defaultdict

**Function definitions**

In [None]:
# Function used to read a file in CoNLL format.
def read_file(path):
  """Read a CoNLL-style file and split it into sentence chunks.

  Args:
    path: path of the file to read; it is decoded as UTF-8.

  Returns:
    list of str: the file content split on blank lines (two consecutive
    newline characters). Each item holds the lines of one sentence. When the
    content ends with a blank line, the last item is an empty string.
  """
  # The context manager guarantees the handle is closed, even if reading fails.
  with open(path, 'r', encoding='utf-8') as handle:
    content = handle.read()
  return content.split('\n\n')

# Function used to obtain the tokens from the dataset.
def get_tokens(sents):
  """Collect the token (first whitespace-separated field) of every line.

  Args:
    sents: iterable of sentence strings, one token line per row.

  Returns:
    list of str: the first field of each non-blank line, in reading order.
  """
  tokens = []
  for sent in sents:
    for line in sent.splitlines():
      fields = line.split()
      # Blank or whitespace-only lines have no fields; indexing them would
      # raise IndexError, so they are skipped.
      if fields:
        tokens.append(fields[0])
  return tokens

# Function used to calculate the average length of entities.
def avg_entity_len(entities_per_sentence):
  """Return the mean entity length, measured in tokens.

  Args:
    entities_per_sentence: iterable of per-sentence entity lists; each entity
      is indexed as (type, start, end) with an inclusive end offset.

  Returns:
    The result of np.mean over all entity lengths (end - start + 1). With no
    entities at all, np.mean is called on an empty list.
  """
  # Named `lengths` so the builtin `len` is not shadowed inside the function.
  lengths = [
      entity[2] - entity[1] + 1
      for entities in entities_per_sentence
      for entity in entities
  ]
  return np.mean(lengths)

# Function used to calculate the average length of sentences.
def avg_sent_len(sents):
  """Return the mean number of lines (one token per line) per sentence."""
  line_counts = []
  for sentence in sents:
    rows = sentence.splitlines()
    line_counts.append(len(rows))
  return np.mean(line_counts)

In [None]:
# Functions taken from the seqeval library, used to extract the dataset's entities as tuples from label sequences in IOB2 format.
def get_entities(seq, suffix=False):
    """Gets entities from sequence.

    Args:
        seq (list): sequence of labels, or a list of such sequences. Nested
            lists are flattened, with an 'O' label inserted after each
            sublist.
        suffix (bool): when True the positional tag is read from the end of
            each label (e.g. 'PER-B'); otherwise from the start ('B-PER').

    Returns:
        list: list of (chunk_type, chunk_start, chunk_end), where both
        offsets are indices into the (flattened) sequence and chunk_end is
        inclusive.

    Example:
        >>> seq = ['B-PER', 'I-PER', 'O', 'B-LOC']
        >>> get_entities(seq)
        [('PER', 0, 1), ('LOC', 3, 3)]
    """
    # Imported locally: `warnings` is used below but was never imported, so a
    # malformed label raised NameError instead of emitting a warning.
    import warnings

    def _validate_chunk(chunk, suffix):
        # Bare positional tags carry no type and are accepted as they are.
        if chunk in ['O', 'B', 'I', 'E', 'S']:
            return

        if suffix:
            if not chunk.endswith(('-B', '-I', '-E', '-S')):
                warnings.warn('{} seems not to be NE tag.'.format(chunk))

        else:
            if not chunk.startswith(('B-', 'I-', 'E-', 'S-')):
                warnings.warn('{} seems not to be NE tag.'.format(chunk))

    # for nested list
    if any(isinstance(s, list) for s in seq):
        seq = [item for sublist in seq for item in sublist + ['O']]

    prev_tag = 'O'
    prev_type = ''
    begin_offset = 0
    chunks = []
    # The extra trailing 'O' closes a chunk that runs to the end of the input.
    for i, chunk in enumerate(seq + ['O']):
        _validate_chunk(chunk, suffix)

        if suffix:
            tag = chunk[-1]
            type_ = chunk[:-1].rsplit('-', maxsplit=1)[0] or '_'
        else:
            tag = chunk[0]
            type_ = chunk[1:].split('-', maxsplit=1)[-1] or '_'

        if end_of_chunk(prev_tag, tag, prev_type, type_):
            chunks.append((prev_type, begin_offset, i - 1))
        if start_of_chunk(prev_tag, tag, prev_type, type_):
            begin_offset = i
        prev_tag = tag
        prev_type = type_

    return chunks
def end_of_chunk(prev_tag, tag, prev_type, type_):
    """Tell whether a chunk closes between the previous and the current word.

    Args:
        prev_tag: previous chunk tag.
        tag: current chunk tag.
        prev_type: previous type.
        type_: current type.

    Returns:
        bool: True when the previous word was the last one of a chunk.
    """
    # An end ('E') or single ('S') tag always closes its chunk.
    if prev_tag in ('E', 'S'):
        return True

    # A begin/inside tag followed by a new chunk or by 'O' closes the chunk.
    if prev_tag in ('B', 'I') and tag in ('B', 'S', 'O'):
        return True

    # Otherwise the chunk ends only if the type changes while inside a chunk.
    return prev_tag != 'O' and prev_tag != '.' and prev_type != type_

def start_of_chunk(prev_tag, tag, prev_type, type_):
    """Tell whether a chunk opens between the previous and the current word.

    Args:
        prev_tag: previous chunk tag.
        tag: current chunk tag.
        prev_type: previous type.
        type_: current type.

    Returns:
        bool: True when the current word is the first one of a chunk.
    """
    # A begin ('B') or single ('S') tag always opens a chunk.
    if tag in ('B', 'S'):
        return True

    # An end/inside tag opens a chunk when no chunk was left open before it.
    if prev_tag in ('E', 'S', 'O') and tag in ('E', 'I'):
        return True

    # Otherwise a chunk starts only if the type changes on a chunk tag.
    return tag != 'O' and tag != '.' and prev_type != type_

In [None]:
# Function used to obtain the entities from the lists of sentences.
def get_entities_from_multiconll(sents):
  """Extract the entities of every sentence from multi-column label lines.

  Each line is split on whitespace; the first field is skipped and every
  remaining field is treated as the label of one annotation level. The label
  column of each level is passed to get_entities and the results of all
  levels are concatenated per sentence.

  Args:
    sents: iterable of sentence strings, one token line per row.

  Returns:
    list of list: one list of entities per input sentence.
  """
  all_entities = []
  for sentence in sents:
    # Maps a level index (column position after the token) to its labels.
    columns = defaultdict(list)
    for row in sentence.splitlines():
      labels = row.split()[1:]
      for level, label in enumerate(labels):
        columns[level].append(label)

    found = []
    for level_labels in columns.values():
      found.extend(get_entities(level_labels))
    all_entities.append(found)
  return all_entities

In [None]:
# Function used to obtain the nested entities from a list of entities.
def get_nested_entities(entities_per_sentence):
  """Collect, per sentence, the entities that take part in a nesting.

  Two different entities form a nesting when the span of one lies within the
  span of the other (boundaries included). Both members are kept, without
  repetitions, in order of first appearance.

  Args:
    entities_per_sentence: iterable of per-sentence entity lists; each entity
      is indexed as (type, start, end).

  Returns:
    list of list: for every sentence, the entities involved in a nesting.
  """
  result = []
  for sentence_entities in entities_per_sentence:
    involved = []
    for outer in sentence_entities:
      for inner in sentence_entities:
        if outer == inner:
          continue
        # Keep the pair only when `inner` lies within the span of `outer`.
        if not (outer[1] <= inner[1] and inner[2] <= outer[2]):
          continue
        for entity in (outer, inner):
          if entity not in involved:
            involved.append(entity)
    result.append(involved)
  return result

In [None]:
# Function used to obtain complete nestings (internal and external entities).
def get_nestings(entities):
  """Group the entities of one sentence into complete nestings.

  A nesting is built around every entity that contains at least one other
  entity and is not strictly contained in another one. It lists that entity
  together with every entity lying within its span, ordered by decreasing
  (span width, type).

  Args:
    entities: list of entities, each indexed as (type, start, end).

  Returns:
    tuple: (nestings, total), where nestings is the list of distinct nestings
    and total lists, without repetitions, every entity that contains or is
    contained in another one.
  """
  def _width_then_type(entity):
    return (entity[2] - entity[1], entity[0])

  nestings = []
  total = []

  for candidate in entities:
    group = [candidate]
    enclosed_by_other = False

    for other in entities:
      if candidate == other:
        continue
      c_start, c_end = candidate[1], candidate[2]
      o_start, o_end = other[1], other[2]

      # `candidate` lies within `other` and the two spans are not identical.
      if o_start <= c_start and c_end <= o_end and (o_start != c_start or o_end != c_end):
        enclosed_by_other = True

      # `other` lies within `candidate` (identical spans included).
      if c_start <= o_start and o_end <= c_end:
        for entity in (candidate, other):
          if entity not in total:
            total.append(entity)
        group.append(other)

    # Entities that contain nothing, or sit inside another one, are skipped.
    if enclosed_by_other or len(group) == 1:
      continue

    ranked = sorted(group, key=_width_then_type, reverse=True)
    if ranked not in nestings:
      nestings.append(ranked)
  return nestings, total

In [None]:
# Functions used to obtain nestings of different types.
def is_multilabel_entity(nesting):
  """Return True when every entity of the nesting covers the same span.

  The span of the first entity is the reference; an empty nesting yields True.
  """
  return all(
      entity[1] == nesting[0][1] and entity[2] == nesting[0][2]
      for entity in nesting
  )

def different_nesting_type(true_labels):
  """Count nestings whose inner entities differ in type from the outer one.

  For every sentence the nestings are obtained with get_nestings; those made
  only of same-span entities are ignored. From each remaining nesting the
  outer (first) entity is kept together with the inner entities of a
  different type, and the group counts when it has more than one member.

  Args:
    true_labels: iterable of per-sentence entity lists.

  Returns:
    tuple: (number of such nestings, number of distinct entities in them,
    counted per sentence).
  """
  nesting_total = 0
  entity_total = 0
  for sentence_entities in true_labels:
    nestings, _ = get_nestings(sentence_entities)

    kept = []
    for nesting in nestings:
      if is_multilabel_entity(nesting):
        continue
      head = nesting[0]
      group = [head] + [inner for inner in nesting[1:] if inner[0] != head[0]]
      if len(group) > 1:
        kept.append(group)
    nesting_total += len(kept)

    # Each entity is counted once per sentence, even if shared by nestings.
    seen = []
    for group in kept:
      for entity in group:
        if entity not in seen:
          seen.append(entity)
    entity_total += len(seen)
  return nesting_total, entity_total

In [None]:
# Function used to obtain nestings of the same type.
def same_nesting_type(labels):
  """Count nestings whose inner entities share the type of the outer one.

  For every sentence the nestings are obtained with get_nestings. From each
  nesting the outer (first) entity is kept together with the inner entities
  of the same type, and the group counts when it has more than one member.

  Args:
    labels: iterable of per-sentence entity lists.

  Returns:
    tuple: (number of such nestings, number of distinct entities in them,
    counted per sentence).
  """
  nesting_total = 0
  entity_total = 0

  for sentence_entities in labels:
    nestings, _ = get_nestings(sentence_entities)

    kept = []
    for nesting in nestings:
      head = nesting[0]
      group = [head] + [inner for inner in nesting[1:] if inner[0] == head[0]]
      if len(group) > 1:
        kept.append(group)

    # Each entity is counted once per sentence, even if shared by nestings.
    seen = []
    for group in kept:
      for entity in group:
        if entity not in seen:
          seen.append(entity)

    entity_total += len(seen)
    nesting_total += len(kept)

  return nesting_total, entity_total

In [None]:
# Function used to obtain multilabel entities.
def multilabel(labels_per_sentence):
  """Count multilabel entities: spans annotated with more than one type.

  For every sentence the nestings are obtained with get_nestings and the
  types of their entities are grouped by (start, end) span. A span counts as
  multilabel only when it carries more than one distinct type.

  Args:
    labels_per_sentence: iterable of per-sentence entity lists; each entity
      is indexed as (type, start, end).

  Returns:
    tuple: (number of multilabel spans, number of entities on those spans).
  """
  nesting_support = 0
  entities_support = 0
  for true_labels in labels_per_sentence:
    test_nestings, _ = get_nestings(true_labels)

    labels_by_span = defaultdict(list)
    for nesting in test_nestings:
      for entity in nesting:
        span_labels = labels_by_span[(entity[1], entity[2])]
        # The same entity can show up in several nestings (e.g. when two
        # crossing outer entities both contain it). Recording its type once
        # keeps a single-type span from being counted as multilabel.
        if entity[0] not in span_labels:
          span_labels.append(entity[0])

    for span_labels in labels_by_span.values():
      if len(span_labels) > 1:
        nesting_support += 1
        entities_support += len(span_labels)

  return nesting_support, entities_support

In [None]:
# Function used to count the entities.
def count_entities(entities_per_sentence):
  """Return the total number of entities over all sentences."""
  return sum(len(sentence_entities) for sentence_entities in entities_per_sentence)

# **Statistics.**

**First, remember to upload the files contained in the preprocessed-files zip archive.**

**GENIA Statistics.**

In [None]:
# GENIA train split: read the file, derive tokens and entities, and print the statistics.
# read_file splits the content on blank lines; [:-1] discards the last chunk
# (presumably the empty string left by a trailing blank line — TODO confirm).
genia_train_sentences = read_file('genia.train.iob2')[:-1]
genia_train_sent_avg = avg_sent_len(genia_train_sentences)
genia_train_tokens = get_tokens(genia_train_sentences)
genia_train_entities = get_entities_from_multiconll(genia_train_sentences)
genia_train_avg = avg_entity_len(genia_train_entities)
genia_train_nested_entities = count_entities(get_nested_entities(genia_train_entities))
print(f'GENIA train sentences: {len(genia_train_sentences)}')
print(f'GENIA train tokens: {len(genia_train_tokens)}')
print(f'GENIA train entities: {count_entities((genia_train_entities))}')
print(f'GENIA train sentences avg len: {genia_train_sent_avg}')
print(f'GENIA train entities avg len: {genia_train_avg}')
print(f'GENIA train nested entities: {genia_train_nested_entities}')
# Index [1] picks the second value each helper returns: the entity count, not the nesting count.
print(f'GENIA train nesting different type: {different_nesting_type(genia_train_entities)[1]}')
print(f'GENIA train nesting same type: {same_nesting_type(genia_train_entities)[1]}')
print(f'GENIA train multilabel entities: {multilabel(genia_train_entities)[1]}')

In [None]:
# GENIA test split: read the file, derive tokens and entities, and print the statistics.
# read_file splits the content on blank lines; [:-1] discards the last chunk
# (presumably the empty string left by a trailing blank line — TODO confirm).
genia_test_sentences = read_file('genia.test.iob2')[:-1]
genia_test_sent_avg = avg_sent_len(genia_test_sentences)
genia_test_tokens = get_tokens(genia_test_sentences)
genia_test_entities = get_entities_from_multiconll(genia_test_sentences)
genia_test_avg = avg_entity_len(genia_test_entities)
genia_test_nested_entities = count_entities(get_nested_entities(genia_test_entities))
print(f'GENIA test sentences: {len(genia_test_sentences)}')
print(f'GENIA test tokens: {len(genia_test_tokens)}')
print(f'GENIA test entities: {count_entities((genia_test_entities))}')
print(f'GENIA test sentences avg len: {genia_test_sent_avg}')
print(f'GENIA test entities avg len: {genia_test_avg}')
print(f'GENIA test nested entities: {genia_test_nested_entities}')
# Index [1] picks the second value each helper returns: the entity count, not the nesting count.
print(f'GENIA test nesting different type: {different_nesting_type(genia_test_entities)[1]}')
print(f'GENIA test nesting same type: {same_nesting_type(genia_test_entities)[1]}')
print(f'GENIA test multilabel entities: {multilabel(genia_test_entities)[1]}')

In [None]:
# GENIA dev split: read the file, derive tokens and entities, and print the statistics.
# read_file splits the content on blank lines; [:-1] discards the last chunk
# (presumably the empty string left by a trailing blank line — TODO confirm).
genia_dev_sentences = read_file('genia.dev.iob2')[:-1]
genia_dev_sent_avg = avg_sent_len(genia_dev_sentences)
genia_dev_tokens = get_tokens(genia_dev_sentences)
genia_dev_entities = get_entities_from_multiconll(genia_dev_sentences)
genia_dev_avg = avg_entity_len(genia_dev_entities)
genia_dev_nested_entities = count_entities(get_nested_entities(genia_dev_entities))
print(f'GENIA dev sentences: {len(genia_dev_sentences)}')
print(f'GENIA dev tokens: {len(genia_dev_tokens)}')
print(f'GENIA dev entities: {count_entities((genia_dev_entities))}')
print(f'GENIA dev sentences avg len: {genia_dev_sent_avg}')
print(f'GENIA dev entities avg len: {genia_dev_avg}')
print(f'GENIA dev nested entities: {genia_dev_nested_entities}')
# Index [1] picks the second value each helper returns: the entity count, not the nesting count.
print(f'GENIA dev nesting different type: {different_nesting_type(genia_dev_entities)[1]}')
print(f'GENIA dev nesting same type: {same_nesting_type(genia_dev_entities)[1]}')
print(f'GENIA dev multilabel entities: {multilabel(genia_dev_entities)[1]}')

**GERMEVAL Statistics.**

In [None]:
# GERMEVAL train split: read the file, derive tokens and entities, and print the statistics.
# read_file splits the content on blank lines; [:-1] discards the last chunk
# (presumably the empty string left by a trailing blank line — TODO confirm).
germ_train_sentences = read_file('germ.train.iob2')[:-1]
germ_train_sent_avg = avg_sent_len(germ_train_sentences)
germ_train_tokens = get_tokens(germ_train_sentences)
germ_train_entities = get_entities_from_multiconll(germ_train_sentences)
germ_train_avg = avg_entity_len(germ_train_entities)
germ_train_nested_entities = count_entities(get_nested_entities(germ_train_entities))
print(f'GERM train sentences: {len(germ_train_sentences)}')
print(f'GERM train tokens: {len(germ_train_tokens)}')
print(f'GERM train entities: {count_entities(germ_train_entities)}')
print(f'GERM train sentences avg len: {germ_train_sent_avg}')
print(f'GERM train entities avg len: {germ_train_avg}')
print(f'GERM train nested entities: {germ_train_nested_entities}')
# Index [1] picks the second value each helper returns: the entity count, not the nesting count.
print(f'GERM train nesting different type: {different_nesting_type(germ_train_entities)[1]}')
print(f'GERM train nesting same type: {same_nesting_type(germ_train_entities)[1]}')
print(f'GERM train multilabel entities: {multilabel(germ_train_entities)[1]}')

In [None]:
# GERMEVAL test split: read the file, derive tokens and entities, and print the statistics.
# read_file splits the content on blank lines; [:-1] discards the last chunk
# (presumably the empty string left by a trailing blank line — TODO confirm).
germ_test_sentences = read_file('germ.test.iob2')[:-1]
germ_test_sent_avg = avg_sent_len(germ_test_sentences)
germ_test_tokens = get_tokens(germ_test_sentences)
germ_test_entities = get_entities_from_multiconll(germ_test_sentences)
germ_test_avg = avg_entity_len(germ_test_entities)
germ_test_nested_entities = count_entities(get_nested_entities(germ_test_entities))
print(f'GERM test sentences: {len(germ_test_sentences)}')
print(f'GERM test tokens: {len(germ_test_tokens)}')
print(f'GERM test entities: {count_entities(germ_test_entities)}')
print(f'GERM test sentences avg len: {germ_test_sent_avg}')
print(f'GERM test entities avg len: {germ_test_avg}')
print(f'GERM test nested entities: {germ_test_nested_entities}')
# Index [1] picks the second value each helper returns: the entity count, not the nesting count.
print(f'GERM test nesting different type: {different_nesting_type(germ_test_entities)[1]}')
print(f'GERM test nesting same type: {same_nesting_type(germ_test_entities)[1]}')
print(f'GERM test multilabel entities: {multilabel(germ_test_entities)[1]}')

In [None]:
# GERMEVAL dev split: read the file, derive tokens and entities, and print the statistics.
# read_file splits the content on blank lines; [:-1] discards the last chunk
# (presumably the empty string left by a trailing blank line — TODO confirm).
germ_dev_sentences = read_file('germ.dev.iob2')[:-1]
germ_dev_sent_avg = avg_sent_len(germ_dev_sentences)
germ_dev_tokens = get_tokens(germ_dev_sentences)
germ_dev_entities = get_entities_from_multiconll(germ_dev_sentences)
germ_dev_avg = avg_entity_len(germ_dev_entities)
germ_dev_nested_entities = count_entities(get_nested_entities(germ_dev_entities))
print(f'GERM dev sentences: {len(germ_dev_sentences)}')
print(f'GERM dev tokens: {len(germ_dev_tokens)}')
print(f'GERM dev entities: {count_entities(germ_dev_entities)}')
print(f'GERM dev sentences avg len: {germ_dev_sent_avg}')
print(f'GERM dev entities avg len: {germ_dev_avg}')
print(f'GERM dev nested entities: {germ_dev_nested_entities}')
# Index [1] picks the second value each helper returns: the entity count, not the nesting count.
print(f'GERM dev nesting different type: {different_nesting_type(germ_dev_entities)[1]}')
print(f'GERM dev nesting same type: {same_nesting_type(germ_dev_entities)[1]}')
print(f'GERM dev multilabel entities: {multilabel(germ_dev_entities)[1]}')

**Chilean Waiting List Statistics.**

In [None]:
# Chilean Waiting List train split: read the file, derive tokens and entities, and print the statistics.
# NOTE(review): unlike the GENIA and GERMEVAL cells, no [:-1] is applied here, so
# every chunk returned by read_file is kept — confirm the WL files do not end
# with a blank line, otherwise an empty sentence is included in the counts.
wl_train_sentences = read_file('wl.train.iob2')
wl_train_sent_avg = avg_sent_len(wl_train_sentences)
wl_train_tokens = get_tokens(wl_train_sentences)
wl_train_entities = get_entities_from_multiconll(wl_train_sentences)
wl_train_avg = avg_entity_len(wl_train_entities)
wl_train_nested_entities = count_entities(get_nested_entities(wl_train_entities))
print(f'WL train sentences: {len(wl_train_sentences)}')
print(f'WL train tokens: {len(wl_train_tokens)}')
print(f'WL train entities: {count_entities(wl_train_entities)}')
print(f'WL train sentences avg len: {wl_train_sent_avg}')
print(f'WL train entities avg len: {wl_train_avg}')
print(f'WL train nested entities: {wl_train_nested_entities}')
# Index [1] picks the second value each helper returns: the entity count, not the nesting count.
print(f'WL train nesting different type: {different_nesting_type(wl_train_entities)[1]}')
print(f'WL train nesting same type: {same_nesting_type(wl_train_entities)[1]}')
print(f'WL train multilabel entities: {multilabel(wl_train_entities)[1]}')

In [None]:
# Chilean Waiting List test split: read the file, derive tokens and entities, and print the statistics.
# NOTE(review): unlike the GENIA and GERMEVAL cells, no [:-1] is applied here, so
# every chunk returned by read_file is kept — confirm the WL files do not end
# with a blank line, otherwise an empty sentence is included in the counts.
wl_test_sentences = read_file('wl.test.iob2')
wl_test_sent_avg = avg_sent_len(wl_test_sentences)
wl_test_tokens = get_tokens(wl_test_sentences)
wl_test_entities = get_entities_from_multiconll(wl_test_sentences)
wl_test_avg = avg_entity_len(wl_test_entities)
wl_test_nested_entities = count_entities(get_nested_entities(wl_test_entities))
print(f'WL test sentences: {len(wl_test_sentences)}')
print(f'WL test tokens: {len(wl_test_tokens)}')
print(f'WL test entities: {count_entities(wl_test_entities)}')
print(f'WL test sentences avg len: {wl_test_sent_avg}')
print(f'WL test entities avg len: {wl_test_avg}')
print(f'WL test nested entities: {wl_test_nested_entities}')
# Index [1] picks the second value each helper returns: the entity count, not the nesting count.
print(f'WL test nesting different type: {different_nesting_type(wl_test_entities)[1]}')
print(f'WL test nesting same type: {same_nesting_type(wl_test_entities)[1]}')
print(f'WL test multilabel entities: {multilabel(wl_test_entities)[1]}')

In [None]:
# Chilean Waiting List dev split: read the file, derive tokens and entities, and print the statistics.
# NOTE(review): unlike the GENIA and GERMEVAL cells, no [:-1] is applied here, so
# every chunk returned by read_file is kept — confirm the WL files do not end
# with a blank line, otherwise an empty sentence is included in the counts.
wl_dev_sentences = read_file('wl.dev.iob2')
wl_dev_sent_avg = avg_sent_len(wl_dev_sentences)
wl_dev_tokens = get_tokens(wl_dev_sentences)
wl_dev_entities = get_entities_from_multiconll(wl_dev_sentences)
wl_dev_avg = avg_entity_len(wl_dev_entities)
wl_dev_nested_entities = count_entities(get_nested_entities(wl_dev_entities)) 
print(f'WL dev sentences: {len(wl_dev_sentences)}')
print(f'WL dev tokens: {len(wl_dev_tokens)}')
print(f'WL dev entities: {count_entities(wl_dev_entities)}')
print(f'WL dev sentences avg len: {wl_dev_sent_avg}')
print(f'WL dev entities avg len: {wl_dev_avg}')
print(f'WL dev nested entities: {wl_dev_nested_entities}')
# Index [1] picks the second value each helper returns: the entity count, not the nesting count.
print(f'WL dev nesting different type: {different_nesting_type(wl_dev_entities)[1]}')
print(f'WL dev nesting same type: {same_nesting_type(wl_dev_entities)[1]}')
print(f'WL dev multilabel entities: {multilabel(wl_dev_entities)[1]}')