In [1]:
from transformers import RobertaForTokenClassification, AutoTokenizer, pipeline
import re

# Matches either a run of digits / ASCII letters / Latin-1 accented letters,
# or one single character of anything else (punctuation, whitespace, symbols).
# The capturing group makes re.split() return the matched pieces themselves.
TOKENIZATION_REGEX = re.compile(
    r'([0-9A-Za-zÀ-ÖØ-öø-ÿ]+|[^0-9A-Za-zÀ-ÖØ-öø-ÿ])')

# Relative path of the directory the model (and, further down, the tokenizer
# and the pipeline) is loaded from.
MODEL_PATH = "../best-l2kx7y5e/"
model = RobertaForTokenClassification.from_pretrained(MODEL_PATH)

In [2]:
# The tokenizer is read from the same directory as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

In [3]:
# Example input: punctuation is glued to the neighbouring words (parentheses,
# decimal point, commas, colon), so pre-tokenization will insert spaces.
sentence = "El paciente tiene fiebre (39.5), además de: tos, mocos y malestar general"

In [4]:
# The pattern matches every character of the input, so findall() yields the
# full token sequence; only the whitespace-only tokens are discarded.
tokens = [piece for piece in TOKENIZATION_REGEX.findall(sentence) if not piece.isspace()]

In [5]:
# Build the NER pipeline from the `model` and `tokenizer` objects already
# loaded in the cells above, instead of reading the checkpoint from
# MODEL_PATH a second time.
# aggregation_strategy='first': per the transformers documentation, sub-word
# pieces are merged into words and each word takes the label of its first piece.
pipe = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy='first',
)

In [6]:
# Re-join the tokens with single spaces, which separates punctuation from the
# adjacent words; the bare name on the last line displays the result.
sentence_pretokenized = ' '.join(tokens)
sentence_pretokenized

'El paciente tiene fiebre ( 39 . 5 ) , además de : tos , mocos y malestar general'

In [7]:
def get_added_spaces(sentence, sentence_pretokenized):
    """Find the spaces that pre-tokenization inserted into a sentence.

    Both strings are walked in parallel. Whenever the pre-tokenized text has a
    space where the original continues directly with the next character, that
    space is recorded as "added".

    Args:
        sentence: The original text.
        sentence_pretokenized: The same text with extra single spaces inserted
            between tokens. It must otherwise be character-for-character equal
            to ``sentence``; whitespace of the original that was dropped or
            collapsed (leading or repeated spaces, tabs, newlines) is not
            supported.

    Returns:
        A list of offsets into ``sentence_pretokenized``, in increasing order,
        one for each inserted space.

    Raises:
        AssertionError: If the two strings cannot be aligned by only skipping
            inserted spaces. The message reports the offsets where they diverge.

    Note:
        Iteration stops once ``sentence_pretokenized`` is exhausted, so any
        remainder of ``sentence`` (e.g. trailing whitespace) is not checked.
    """
    i = j = 0
    added_spaces = []
    len_original = len(sentence)
    len_pretokenized = len(sentence_pretokenized)
    while j < len_pretokenized:
        if i < len_original and sentence[i] == sentence_pretokenized[j]:
            # Same character on both sides: advance together.
            i += 1
            j += 1
        elif (
            sentence_pretokenized[j] == ' '
            and i < len_original
            and j + 1 < len_pretokenized  # guard the look-ahead below
            and sentence[i] == sentence_pretokenized[j + 1]
        ):
            # A space exists only in the pre-tokenized text: record and skip it.
            added_spaces.append(j)
            j += 1
        else:
            raise AssertionError(
                "Cannot align sentences at original offset %d / pre-tokenized "
                "offset %d: the pre-tokenized text must equal the original "
                "plus inserted single spaces." % (i, j)
            )
    return added_spaces

In [8]:
# Offsets (in sentence_pretokenized) of the spaces that were not in `sentence`.
added_spaces = get_added_spaces(sentence, sentence_pretokenized)

In [9]:
# Run the pipeline on the pre-tokenized text. The 'start'/'end' offsets in the
# output index into sentence_pretokenized, not into the original sentence.
results_pre = pipe(sentence_pretokenized)
results_pre



[{'entity_group': 'SINTOMA',
  'score': 0.9999448,
  'word': ' fiebre',
  'start': 18,
  'end': 24},
 {'entity_group': 'SINTOMA',
  'score': 0.9999219,
  'word': ' tos',
  'start': 50,
  'end': 53},
 {'entity_group': 'SINTOMA',
  'score': 0.9998416,
  'word': ' mocos',
  'start': 56,
  'end': 61},
 {'entity_group': 'SINTOMA',
  'score': 0.99991643,
  'word': ' malestar general',
  'start': 64,
  'end': 80}]

In [10]:
def align_results(results_pre, added_spaces):
    """Map entity offsets from the pre-tokenized sentence back to the original.

    Args:
        results_pre: Iterable of entity dicts obtained on the pre-tokenized
            sentence. Each dict must provide the keys 'word', 'start' and
            'end'; any other keys are carried over unchanged.
        added_spaces: Offsets, in the pre-tokenized sentence, of the spaces
            inserted by pre-tokenization (see get_added_spaces).

    Returns:
        A new list with a shallow copy of every entity in which 'start' and
        'end' refer to the original sentence and 'word' has its surrounding
        whitespace stripped. The input dicts are not modified.

    Note:
        'word' is still the text reported for the pre-tokenized sentence, so
        for an entity spanning inserted spaces it keeps those inner spaces.
    """
    def count_added_before(position):
        # Each inserted space located before `position` shifts that position
        # one character to the right relative to the original sentence.
        return sum(1 for offset in added_spaces if offset < position)

    aligned_results = []
    for entity in results_pre:
        aligned_entity = entity.copy()
        aligned_entity['word'] = entity['word'].strip()
        aligned_entity['start'] = entity['start'] - count_added_before(entity['start'])
        # The end needs its own shift: spaces inserted inside the entity
        # (e.g. "39 . 5" for "39.5") lie after its start but before its end.
        aligned_entity['end'] = entity['end'] - count_added_before(entity['end'])
        aligned_results.append(aligned_entity)
    return aligned_results

In [11]:
# Shift the entity offsets so that they index into the original `sentence`.
aligned_results = align_results(results_pre, added_spaces)
aligned_results

[{'entity_group': 'SINTOMA',
  'score': 0.9999448,
  'word': 'fiebre',
  'start': 18,
  'end': 24},
 {'entity_group': 'SINTOMA',
  'score': 0.9999219,
  'word': 'tos',
  'start': 44,
  'end': 47},
 {'entity_group': 'SINTOMA',
  'score': 0.9998416,
  'word': 'mocos',
  'start': 49,
  'end': 54},
 {'entity_group': 'SINTOMA',
  'score': 0.99991643,
  'word': 'malestar general',
  'start': 57,
  'end': 73}]