# **Rule-Based Algorithm**
### Group 1 - Detection of Negation and Uncertainty

- Marino Oliveros Blanco NIU:1668563
- Pere Mayol Carbonell NIU:1669503
- 
- 

**Data loading**

In [47]:
# Libraries
import json
import re
from langdetect import detect # Library for language detection
from spellchecker import SpellChecker
import spacy
import string
from unidecode import unidecode

### **Pre-processing**
- Removing redacted entries
- Removing language mixes
- Solving misspelled words
- Removing patient information
- Tokenizing 

In [48]:
# Load language models for Spanish and Catalan
nlp_es = spacy.load("es_core_news_sm")
nlp_ca = spacy.load("ca_core_news_sm")

def remove_patient_info(tokens):
    # Define patterns to match patient information identifiers
    patterns = [
        r'nº\s?historia\s?clinica:',
        r'nºepisodi:',
        r'sexe:',
        r'data\s?de\s?naixement:',
        r'edat:',
        r'procedencia',
        r'servei\s?obstetricia',
        r'data\sd\'ingres',
        r'data\sd\'alta',
        r'ates\s?per',
        r'informe\sd\'alta\sd\'hospitalitzacio',
        r'motiu\sd\'ingres',
        r'nhc',
        r'lopd'
    ]
    # Join tokens into text for efficient pattern matching
    text = ' '.join([token[0] for token in tokens])
    # Remove tokens containing patient information identifiers
    for pattern in patterns:
        text = re.sub(pattern + '.*?(?=\\s|$)', '', text, flags=re.IGNORECASE)
    # Tokenize the modified text
    return tokenize_with_coordinates(text)



# 2 Remove Punctuation (Able to be turned ON/OFF)
def remove_punctuation(tokens):
    # Remove punctuation from token text
    tokens_without_punct = [(token[0].translate(str.maketrans('', '', string.punctuation)), token[1], token[2]) for token in tokens]
    return tokens_without_punct

# 3 Spell checking with language detection (Able to be turned ON/OFF)
def spell_check_and_lemmatize(tokens):
    # Join tokens into text
    text = ' '.join([token[0] for token in tokens])
    # Detect the language of the text
    language = detect(text)
    # Initialize spell checker
    spell = SpellChecker(language='es')  # As most of the text is in Spanish

    # Tokenize the text using the appropriate language model
    if language == 'ca':
        doc = nlp_ca(text)
    else:
        doc = nlp_es(text)

    # Correct misspelled words and lemmatize tokens
    corrected_tokens = []
    for token in doc:
        # Check if the token is a punctuation or whitespace
        if not token.is_punct and not token.is_space:
            # Get the corrected version of the token
            corrected_token = spell.correction(token.text)
            # Lemmatize the corrected token
            corrected_token_lemma = token.lemma_ if token.lemma_ != '-PRON-' else corrected_token
            corrected_tokens.append((corrected_token_lemma, token.idx, token.idx + len(corrected_token_lemma)))
        else:
            corrected_tokens.append((token.text, token.idx, token.idx + len(token.text)))

    return corrected_tokens


def tokenize_with_coordinates(text):
    # Tokenize the text using spaCy
    doc = nlp_es(text)

    # Extract tokens with their start and end positions
    tokens_with_coordinates = [(token.text, token.idx, token.idx + len(token.text)) for token in doc if not token.is_space]

    return tokens_with_coordinates


# Remove empty tokens   
def remove_empty_tokens(tokens):
    # Filter out tokens with empty text
    non_empty_tokens = [(token[0], token[1], token[2]) for token in tokens if token[0]]
    return non_empty_tokens


# Main function to process the text
def pre_process_text(text, remove_punctuation_call=True, spell_check_call=True):
    # Tokenize the text with coordinates
    tokens_with_coordinates = tokenize_with_coordinates(text)

    # Apply each processing step to the tokens
    if remove_punctuation_call:
        tokens_with_coordinates = remove_punctuation(tokens_with_coordinates)

    if spell_check_call:
        tokens_with_coordinates = spell_check_and_lemmatize(tokens_with_coordinates)

    # Remove empty tokens
    tokens_with_coordinates = remove_empty_tokens(tokens_with_coordinates)

    # Remove extra patient information
    tokens_with_coordinates = tokens_with_coordinates[32:]  # nº-motiu
    tokens_with_coordinates = tokens_with_coordinates[:-5]  # nhc-lopd

    # Remove accent marks from tokens
    tokens_with_coordinates = [(unidecode(token[0]), token[1], token[2]) for token in tokens_with_coordinates]

    # Return the modified tokens
    return tokens_with_coordinates

Real Usage

In [49]:
# Load the JSON file
with open("negacio_test_v2024.json", "r", encoding="utf-8") as file:
    training_data = json.load(file)

# List to store processed texts
processed_texts = []

# Iterate over each entry in the training data
for entry in training_data:
    text = entry["data"]["text"]  # Extract the text from the JSON object
    processed_text = pre_process_text(text, remove_punctuation_call=True, spell_check_call=False)
    processed_texts.append(processed_text)

# Printing (not necessary)
for processed_text in processed_texts:
    for token, start, end in processed_text:
        print(f"Token: {token}, Start: {start}, End: {end}")

Token: al, Start: 329, End: 331
Token: parto, Start: 332, End: 337
Token: por, Start: 338, End: 341
Token: pequeno, Start: 342, End: 349
Token: para, Start: 350, End: 354
Token: la, Start: 355, End: 357
Token: edad, Start: 358, End: 362
Token: gestacional, Start: 363, End: 374
Token: peg, Start: 377, End: 380
Token: antecedents, Start: 383, End: 394
Token: no, Start: 395, End: 397
Token: alergias, Start: 398, End: 406
Token: medicamentosas, Start: 407, End: 421
Token: conocidas, Start: 422, End: 431
Token: antcededentes, Start: 432, End: 445
Token: medicoquirurgicos, Start: 446, End: 464
Token: protesis, Start: 466, End: 474
Token: mamaria, Start: 475, End: 482
Token: adenoidectomia, Start: 484, End: 498
Token: niega, Start: 499, End: 504
Token: habitos, Start: 505, End: 512
Token: toxicos, Start: 513, End: 520
Token: medicacio, Start: 521, End: 530
Token: habitual, Start: 531, End: 539
Token: anafranil25, Start: 540, End: 551
Token: mg, Start: 552, End: 555
Token: diario, Start: 556, 

**Pre-processing json object**

- Reading the json object to obtain a list of the Negation and Uncertainty proc words as well as their scopes

- The scopes and annotations will be used for the supervised learning as the 'target'

In [50]:
# Loading the json object
loading = open("negacio_test_v2024.json", encoding="utf-8")
for_object = loading.read()
object = json.loads(for_object)

In [51]:
# Prepare lists to store the results
negations = []
negation_scopes = []
uncertainties = []
uncertainty_scopes = []

# Process the documents and obtain negation and uncertainty annotations
for item in object:
    text_data = item['data']['text']
    for prediction in item['predictions']:
        for result in prediction['result']:
            labels = result['value']['labels']
            start_index = result['value']['start']
            end_index = result['value']['end']
            text_segment = item['data']['text'][start_index:end_index]

            if "NEG" in labels:
                negations.append(text_segment)

                # Find the scope of the negation
                scope_start = -1
                scope_end = -1

                # Search for the scope starting just after the negation
                current_index = prediction['result'].index(result)
                if current_index < len(prediction['result']) - 1:
                    next_result = prediction['result'][current_index + 1]
                    next_start = next_result['value']['start']
                    next_end = next_result['value']['end']
                    next_labels = next_result['value']['labels']
                    if "NSCO" in next_labels and next_start == end_index:
                        scope_start = next_start
                        scope_end = next_end

                current_index = prediction['result'].index(result)
                if current_index > 0:
                    prev_result = prediction['result'][current_index - 1]
                    prev_start = prev_result['value']['start']
                    prev_end = prev_result['value']['end']
                    prev_labels = prev_result['value']['labels']
                    if "NSCO" in prev_labels and prev_end == start_index:
                        scope_start = prev_start
                        scope_end = prev_end

                if scope_start != -1 and scope_end != -1:
                    scope_text = item['data']['text'][scope_start:scope_end]
                    negation_scopes.append((start_index, scope_end))

            if "UNC" in labels:
                uncertainties.append(text_segment)

                # Find the scope of the uncertainty
                scope_start = -1
                scope_end = -1

                # Search for the scope starting just after the uncertainty
                current_index = prediction['result'].index(result)
                if current_index < len(prediction['result']) - 1:
                    next_result = prediction['result'][current_index + 1]
                    next_start = next_result['value']['start']
                    next_end = next_result['value']['end']
                    next_labels = next_result['value']['labels']
                    if "USCO" in next_labels and next_start == end_index:
                        scope_start = next_start
                        scope_end = next_end

                current_index = prediction['result'].index(result)
                if current_index > 0:
                    prev_result = prediction['result'][current_index - 1]
                    prev_start = prev_result['value']['start']
                    prev_end = prev_result['value']['end']
                    prev_labels = prev_result['value']['labels']
                    if "USCO" in prev_labels and prev_end == start_index:
                        scope_start = prev_start
                        scope_end = prev_end

                if scope_start != -1 and scope_end != -1:
                    scope_text = item['data']['text'][scope_start:scope_end]
                    uncertainty_scopes.append((start_index, scope_end))

# Remove whitespaces from negations and uncertainties if they are at the end
negations = [negation.rstrip() for negation in negations]
uncertainties = [uncertainty.rstrip() for uncertainty in uncertainties]
# Remove duplicates from the annotation lists
negations = list(set(negations))
negation_scopes = list(set(negation_scopes))
uncertainties = list(set(uncertainties))
uncertainty_scopes = list(set(uncertainty_scopes))



# Print the negation/uncertainty annotations and their scopes
print("Negations\n", negations)
print("Negation Scopes\n", negation_scopes)
print("Uncertainties\n", uncertainties)
print("Uncertainty Scopes\n", uncertainty_scopes)

Negations
 ['negativos', 'negativa', 'ex-fumador', 'nega', ' afebril', 'negatiu', 'afebril.', 'afebril,', 'descartan', 'asintomatica', 'insuficiencia', 'asintomatico', 'desorientada', 'ni', 'ausente,', 'desorientada,', 'neg', 'falta de', 'incapacidad', 'no', 'retirar', 'nula', 'sense', 'negativo', 'exfumador', 'asintomatico.', 'ausencia de', 'sin', 'tampoco', 'niega', 'imposibilidad de', 'negativas', 'inespecifico,', 'descarta', 'neg.', 'retirado.', 'niegan', 'inestabilidad', 'ex', 'inespecifica.', 'afebril', 'negativa.', 'desconocen', 'negativos.', 'inespecifico.', 'cede', 'retiro']
Negation Scopes
 [(926, 952), (2207, 2230), (1218, 1218), (11293, 11360), (1798, 1811), (2559, 2574), (1731, 1748), (3840, 3852), (2211, 2223), (432, 439), (874, 882), (3997, 4028), (453, 489), (3393, 3406), (4654, 4697), (3555, 3575), (5537, 5566), (2054, 2132), (3684, 3707), (2498, 2517), (5275, 5309), (377, 377), (829, 861), (1689, 1737), (401, 438), (762, 807), (981, 1022), (2712, 2738), (3643, 3656), 

**Implementation**
____________________________________________________________

Medical Words

In [52]:
#MEDICAL WORDS FROM TRAINING SET + MANUALY ADDED
med_words = ['secuelas', 'positivo','positiva','cuadro','patron', 'terapeutico', 'mejora', 'terapia', 'alteracion', 'reaccion', 'farmacoterapias', 'malestares', 'anormalidades', 'indicacion', 'desordenes', 'lesiones', 'farmacoterapia', 'respuesta', 'sindromes', 'desviaciones', 'sensibilidad', 'diagnostico', 'tratamiento', 'dolor', 'examenes', 'molestias', 'farmacoterapeutico', 'deterioros', 'enfermedad', 'exposicion', 'resultado', 'riesgo', 'sintomas', 'deficiencia', 'efecto', 'toxicos', 'eficacia', 'toxicidad', 'presencia', 'inmunidad', 'tratamientos', 'discapacidades', 'danos', 'intolerancia', 'deformidades', 'prevencion', 'afecciones', 'rechazo', 'lesion', 'examen', 'farmacoterapeuticos', 'farmacos', 'patologias', 'dolencias', 'diagnosticos', 'prueba', 'analisis', 'complicaciones', 'patologia', 'anomalias', 'infecciones', 'disfunciones', 'padecimientos', 'progresion', 'agravamientos', 'infeccion', 'efectos', 'nivel', 'sintoma', 'condicion', 'trastornos', 'pruebas', 'farmacologico', 'sindrome', 'concentracion', 'capacidad', 'hallazgo', 'secuela', 'afectaciones', 'inflamacion', 'manifestaciones', 'deteccion', 'enfermedades', 'dolores', 'signo', 'funcion', 'complicacion', 'adversidades', 'resistencia', 'problemas', 'absorcion']

Negation Words / Phrases

In [53]:
#EXTRACTED FROM THE TRAINING SET
neg_pre = ['impide', 'exfumador', 'negativo', 'desaparecen', 'asintomatica', 'afebril', 'se desestimo', 'negativos', 'ninguno', 'asintomatico', 'desorientado', 'inestabilidad', 'atipicos', 'ausencia de', 'ceden', 'negativa', 'negativas', 'excepto', 'desorientacion', 'inespecificos', 'se suspende','inespecifico', 'ex', 'arritmicos', 'cede', 'se retira', 'ex fumador', 'niegan', 'negatiu', 'negaitvo', 'indetectable',' negativo', 'suspendido']
neg_pos = ['retirar','ni' 'desaparicion de', 'descarta', 'ausencia', 'descartada', 'niega', 'nega', 'rechaza', 'desaparicion del', 'imposibilidad', 'retiro', 'irregulares', 'negatividad', 'tampoco', 'sin', 'imposibilidad de', 'en ninguna', 'incapacidad para', ' no', 'neg', ' afebril', 'sense', 'falta de', 'negatividad de', 'negatividad del', 'no']

In [54]:
#EXTRACTED FROM THE GITHUB
neg_pre_filtered= ['gobierna al paciente', 'ninguna otra evidencia', 'la van descartar per', 'no aparece', 'descartaron al paciente por', 'adecuado para descartarla', 'excluir', 'excloure', 'descartar al pacient per', 'sin ninguna evidencia de', 'descartarlo por', 'no tenía', 'lo descartó', 'sin signo de', 'sense indicació de', 'pot descartar', 'excluye', 'ho descarta', 'ninguna evidencia radiográfica de', 'descartarlo', 'negando', 'sense troballes de', 'adequat per a descartar-lo', 'cap senyal de', 'no em queixo de', 'ninguna señal de', 'nunca tuve', 'ho va descartar', 'descartar', 'lo descartaron por', 'la va descartar contra', 'pot descartar-ho', 'lo descartaron en contra', 'libre de', 'pot descartar al pacient', 'no tinc', 'absència de', 'sin quejas de', 'van descartar contra', 'la va descartar', 'no tenia', 'descartado contra', 'ho va descartar contra', 'no me quejo de', 'puede descartar', 'no significativo', 'resuelto', 'puede descartar al paciente', 'no sospitós', 'gens especial per a', 'pot descartar-ho per', 'exclou', 'r / o', 'resolt', 'van descartar per a', 'descartó', 'va descartar', 'puede descartarla contra', 'sin evidencia', 'descartar per a', 'no apareix', 'pot descartar-la per', 'la descartó', 'va descartar al pacient', 'descartar-ho', 'cap altra evidència', 'no saber de', 'ninguna nueva evidencia', 'sin indicación de', 'mai desenvolupat', 'sense queixes de', "l'exclou", 'lo descarta', 'cap suggeriment de', 'cap evidència radiogràfica de', 'sense cap evidència de', 'puede descartarlo', 'pacient no era', 'no apreciar', 'con ningún', 'cap causa de', 'adecuado para descartarlo', 'no associat amb', 'descartarla', 'nunca desarrollado', 'adecuado para descartarla por', 'no poden veure', 'ro', 'ninguna evidencia para sugerir', 'sin hallazgos de', 'no pueden ver', 'pot descartar-la', 'la excluye', 'governa al pacient', 'amb cap', 'descartar al paciente por', 'expulsó al paciente por', 'no anormal', 'no sospechoso', 'pot descartar-la contra', 'nada nuevo', 'paciente no era', 'descartaron para', 'no poder', 'evaluar por', 'suficiente para descartarlo por', 'no exhibir', 'més aviat que', 'fer una prova per', 'puede descartarla por', 'ninguna causa de', 'lliure de', 'mai vaig tenir', 'no tengo', 'descartaron contra', 'puede descartarla', 'adecuado para descartarlo por', 'descartat contra', 'descartar-la', 'adequat per a descartar-la', 'no significatiu', 'negatiu per a', 'sense evidència', 'ho van descartar per', 'descartar al pacient', 'cap nova evidència', 'descartarla por', 'nada especial para', 'ninguna sugerencia de', 'sense signe de', 'no sentir', 'descartar al paciente', 'descartar-ho per', 'puede descartarlo en contra', 'puede descartarlo por', 'más bien que', 'res nou', 'descartó al paciente', 'negant', 'avaluar per', 'no demostrar', 'descartar para', 'no revela', 'no revelar', 'descartó al paciente contra', 'puede descartar contra', 'descartar-la per', 'suficiente para descartar', 'negativo para', 'la descartó contra', 'revisado para', 'suficient per a descartar', 'revisat per a', 'hacer una prueba por', 'pot descartar-ho en contra', 'pot descartar contra', 'la descartaron por', 'descartaron al paciente contra', 'no asociado con', 'suficiente para descartarla por', 'lo descartó contra']
neg_pos_filtered= ['libre', 'podría ser descartado', 'fue descartado', 'rechazado', 'ha de ser descartat', 'lliure', 'rebutjat', 'puede ser descartado', 'ser descartat', 'adecuado para descartar', 'están descartadas', 'podria ser descartat', 'improbable', 'podría ser descartado por', 'debe ser descartado por', 'suficient per a descartar-ho', 'serà descartat per', 'podria descartar-se', 'suficient per a descartar-la', 'va ser descartat', 'han estat descartades', 'ser descartado por', 'declina', 'adequat per a descartar', 'siendo descartado', 'suficiente para descartarla', 'no ver', 'sent descartat', 'està descartat', 'se puede descartar por', 'pot ser descartat', 'ha estat descartat', 'puede ser descartado para', 'ha de descartar-se', 'suficiente para descartarlo', 'es descarta', 'ha sido descartado', 'será descartado', 'ser descartado', 'no veure', 'ser descartat per', 'debe ser descartado', 'serà descartat', 'debe descartarse para', 'debe ser descartado para', 'es pot descartar per', 'negado', 'se descarta', 'lo descartaron', 'no ser', 'se puede descartar', 'está descartado', 'negat', 'podría descartarse', 'es pot descartar', 'ho van descartar', 'podria ser descartat per', 'han sido descartadas', 'será descartado por', 'estan descartades', 'debe descartarse']
neg_pre.extend(neg_pre_filtered)
neg_pos.extend(neg_pos_filtered)

Uncertainty Words / Phrases

In [55]:
#NOSE COM EMFOCAR: SIN, NO, DESCARTAR
unc_pre = [ 'al parecer' , 'vs', 'dudosamente', 'indeterminado', 'sospecha', 'pudieran', 'aparentes', 'dubtos','permite descartar', 'parece', 'atribuida', 'clara', 'no clara', 'desconocido']
unc_pos = ['compatible amb', 'desconoce', 'indiquen', 'sin aparente', 'sugieren', 'ssospechosas de', 'probablemente', 'posible', 'sugestivo de', 'falsa', 'sospechan de', 'posibilidad de', 'sugiriendo', 'orienta', 'sospechosos de', 'sugestivos de', 'se orienta', 'plantea', 'podria', 'puede', 'podrian', 'probables', 'no', 'sugiere', 'parecen', 'sin', 'sospechosa de', 'sugestivas de', 'orientan como', 'dudosa', 'interpreta', 'compatible con', 'valorar', 'dudosos', 'probable', 'poco porque', 'sugieran', 'sin clara', 'no permite descartar', 'se desconoce', 'impresiona de', 'sugestiva de', 'orienta como', 'orientan', 'sin poder descartar', 'no parece', 'sospitosa de', 'sugestivos con', 'impresiona', 'aparentemente', 'sospecha de', 'no es posible descartar', 'compatibles con', 'compatible', 'aparente', 'sugestiva como', 'posiblemente', 'posibles', 'sugiera de', 'descartar', 'dudoso', 'se orientan', 'sospechosas de', 'sin aparentes', 'sin claras']   

In [56]:
#DEMA HO AFEGEIXO
unc_pre_filtered = ['como una causa secundaria para', 'como el origen secundario de', 'secundario a', 'como la causa de', 'como la causa secundaria de', 'como una razón de', 'como una causa secundaria de', 'como una etiología secundaria para', 'excepto', 'aunque', 'como la fuente secundaria para', 'como una etilogía para', 'como la fuente de', 'como el origen secundario para', 'com la font secundària de', 'como una razón secundaria para', 'com una raó secundària per a', 'com la font secundària per a', 'com una etiologia de', 'como una etiología secundaria de', 'secundari a', 'encara que', 'como una razón secundaria de', 'como la razón secundaria de', 'como la fuente secundaria de', 'com la causa secundària de', 'a pesar que', 'como una razón para', 'como la etilogía de', "com l'etiologia de", 'como la razón secundaria para', 'como un origen secundario para', 'com una font secundària per a', 'como una etilogía de', 'como fuente de', "com l'etiologia secundària per a", "com l'origen secundari per a", "com l'origen secundari de", 'però', 'com a causa de', 'com una etiologia secundària per a', 'como la razón de', 'com un origen secundari per a', 'com una raó secundària de', 'no obstant això', 'com una font secundària de', 'com una raó per a', 'pero', 'com la raó secundària de', 'como la etilogía secundaria para', 'a pesar de que', 'sin embargo', "com l'etiologia secundària de", 'com la causa de', 'com una causa secundària per a', 'com la raó de', 'a part de', 'como una fuente secundaria de', 'com una causa secundària de', 'aparte de', 'com a font de', 'como una fuente secundaria para', "com l'origen de", 'com una etiologia per a', 'com la raó secundària per a', 'como el origen de', 'com un origen secundari de', 'com una raó de', 'como un origen secundario de', 'excepte', 'como causa de', 'encara', 'todavía', 'com una etiologia secundària de', 'como la etilogía secundaria de']
unc_post_filtered =['origens de', 'raons de', 'altres possibilitats de', 'font per a', 'desencadenar evento para', 'origen para', 'raons per a', 'fuente para', 'causes de', 'fuentes de', 'causa de', 'etilogia de', 'etilogía para', 'origen per a', 'motivo de', 'fuentes para', 'otras posibilidades de', 'razones de', 'causas de', 'font de', 'razones para', 'etilogía de', 'fonts per a', 'fonts de', 'raó per a', 'razón de', 'orígenes para', 'razón para', 'motiu de', 'desencadenar esdeveniment per a', 'fuente de', 'orígenes de', 'origen de', 'raó de', 'etilogia per a', 'origens per a']
unc_pre.extend(unc_pre_filtered)
unc_pos.extend(unc_post_filtered)

Detect a Phrase

In [57]:
#FUNCTION TO DETECT A WORD / PHRASE MATCH
def phrase_matching(tokenized_text, word_or_phrases):
    text_length = len(tokenized_text)
    # Sort negation phrases by length in descending order
    word_or_phrases = sorted(word_or_phrases, key=len, reverse=True)
    for i in range(text_length):
        for phrase in word_or_phrases:
            phrase_tokens = phrase.split()
            phrase_length = len(phrase_tokens)
            if i + phrase_length <= text_length:  # Check if remaining tokens are enough to match the phrase
                joined_tokens = ' '.join([token[0] for token in tokenized_text[i:i+phrase_length]])
                if ' '.join(joined_tokens.split()) == phrase:  # Remove extra spaces and then check if consecutive tokens match the phrase
                        yield True, phrase, tokenized_text[i][1], tokenized_text[i+phrase_length-1][2]
                
    return False, None, None, None

Negation Detection

In [58]:
def negation_detection(processed_texts, neg_pre, neg_pos, medical_words, i=0):
    NEG = []
    NSCO = []
    for processed_text in processed_texts:
        for result in phrase_matching(processed_text, neg_pre+neg_pos):
            phrase_found, phrase, start_idx, end_idx = result
            if phrase_found:
                # Calculate the start token index
                if phrase in neg_pre:
                    start_token_index = max(next((i for i, token in enumerate(processed_text) if token[1] > start_idx), len(processed_text)) - 1, 0)
                    # Select the previous 5 tokens
                    previous_tokens = [token[0] for token in processed_text[max(start_token_index - 5, 0):start_token_index]]
                    for token in previous_tokens:
                        if token in medical_words:
                            # Append to NEG
                            NEG.append((token, phrase, start_idx, end_idx))
                            # Calculate start and end indices for NSCO
                            start_nsco = processed_text[max(start_token_index - 5, 0)][1]  # Start index of first token in previous_tokens
                            end_nsco = processed_text[start_token_index - 1][2]  # End index of last token in previous_tokens
                            # Append to NSCO
                            NSCO.append((start_nsco, end_nsco))
                    
                if phrase in neg_pos:
                    start_token_index = max(next((i for i, token in enumerate(processed_text) if token[1] > start_idx), len(processed_text)) - 1, 0)
                    # Select the next 5 tokens
                    next_tokens = [token[0] for token in processed_text[start_token_index + 1:min(start_token_index + 6, len(processed_text))]]
                    for token in next_tokens:
                        if token in medical_words:
                            # Append to NEG
                            NEG.append((phrase, start_idx, end_idx))
                            # Calculate start and end indices for NSCO
                            start_nsco = processed_text[start_token_index + 1][1] if start_token_index + 1 < len(processed_text) else processed_text[start_token_index][1]  # Start index of first token in next_tokens
                            end_nsco = processed_text[min(start_token_index + 5, len(processed_text) - 1)][2]  # End index of last token in next_tokens
                            # Append to NSCO
                            NSCO.append((start_nsco, end_nsco))
    return NEG, NSCO
            

neg_detect, neg_scope_detect = negation_detection(processed_texts, neg_pre, neg_pos, med_words)
print('Negations detected: \n',neg_detect)
print('Negations scopes detected: \n',neg_scope_detect)
#que solo sean del training set


Negations detected: 
 [('niega', 499, 504), ('riesgo', 'negativo', 1313, 1321), ('niega', 446, 451), ('niega', 446, 451), ('niega', 2001, 2006), ('niega', 2206, 2211), ('no', 2474, 2476), ('no', 2521, 2523), ('positivo', 'negativo', 5271, 5279), ('positivo', 'negativo', 5289, 5297), ('positivo', 'negativo', 5320, 5328), ('niega', 621, 626), ('sin', 2464, 2467), ('sin', 2821, 2824), ('signo', 'negativo', 4850, 4858), ('no', 5978, 5980), ('no', 6034, 6036), ('no', 6034, 6036), ('no', 6056, 6058), ('sin', 9787, 9790), ('no', 9843, 9845), ('sin', 481, 485), ('sin', 2352, 2355), ('sin', 4581, 4584), ('funcion', 'ausencia de', 5834, 5845), ('niega', 504, 510), ('no', 2010, 2012), ('ausencia', 2942, 2950), ('sin', 8429, 8432), ('sin', 9078, 9081), ('niega', 355, 360), ('niega', 380, 385), ('niega', 402, 407), ('positivo', 'negativo', 564, 572), ('sin', 1709, 1712), ('no', 1428, 1430), ('no', 1121, 1123), ('no', 2720, 2722), ('niega', 534, 540), ('sin', 690, 693), ('sin', 690, 693), ('sin', 66

Uncertainity Detection

In [59]:
def uncertainity_detection(processed_texts, unc_pre, unc_pos, medical_words, i=0):
    UNC = []
    USCO = []
    for processed_text in processed_texts:
        for result in phrase_matching(processed_text, unc_pre+unc_pos):
            phrase_found, phrase, start_idx, end_idx = result
            if phrase_found:
                # Calculate the start token index
                if phrase in unc_pre:
                    start_token_index = max(next((i for i, token in enumerate(processed_text) if token[1] > start_idx), len(processed_text)) - 1, 0)
                    # Select the previous 5 tokens
                    previous_tokens = [token[0] for token in processed_text[max(start_token_index - 5, 0):start_token_index]]
                    for token in previous_tokens:
                        if token in medical_words:
                            # Append to NEG
                            UNC.append((token, phrase, start_idx, end_idx))
                            # Calculate start and end indices for NSCO
                            start_usco = processed_text[max(start_token_index - 5, 0)][1]  # Start index of first token in previous_tokens
                            end_usco = processed_text[start_token_index - 1][2]  # End index of last token in previous_tokens
                            # Append to NSCO
                            USCO.append((start_usco, end_usco))
                    
                if phrase in unc_pos:
                    start_token_index = max(next((i for i, token in enumerate(processed_text) if token[1] > start_idx), len(processed_text)) - 1, 0)
                    # Select the next 5 tokens
                    next_tokens = [token[0] for token in processed_text[start_token_index + 1:min(start_token_index + 6, len(processed_text))]]
                    for token in next_tokens:
                        if token in medical_words:
                            # Append to NEG
                            UNC.append((phrase, start_idx, end_idx))
                            # Calculate start and end indices for NSCO
                            start_usco = processed_text[start_token_index + 1][1] if start_token_index + 1 < len(processed_text) else processed_text[start_token_index][1]  # Start index of first token in next_tokens
                            end_usco = processed_text[min(start_token_index + 5, len(processed_text) - 1)][2]  # End index of last token in next_tokens
                            # Append to NSCO
                            USCO.append((start_usco, end_usco))
    print(UNC)
    print(USCO)
    print(len(UNC))
    return UNC, USCO
            

unc_detect, unc_detect_scopes = uncertainity_detection(processed_texts, unc_pre, unc_pos, med_words)


[('no', 2474, 2476), ('no', 2521, 2523), ('descartar', 5653, 5662), ('sin', 2464, 2467), ('sin', 2821, 2824), ('no', 5978, 5980), ('puede', 5984, 5989), ('no', 6034, 6036), ('no', 6034, 6036), ('no', 6056, 6058), ('sin', 9787, 9790), ('no', 9843, 9845), ('se orienta', 9931, 9941), ('orienta como', 9934, 9946), ('orienta', 9934, 9941), ('sin', 481, 485), ('sin', 2352, 2355), ('sin', 4581, 4584), ('riesgo', 'aunque', 8138, 8144), ('diagnostico', 'sospecha', 764, 772), ('no', 2010, 2012), ('sin', 8429, 8432), ('sin', 9078, 9081), ('sin', 1709, 1712), ('no', 1428, 1430), ('no', 1121, 1123), ('no', 2720, 2722), ('sin', 690, 693), ('sin', 690, 693), ('sin', 6694, 6697), ('no', 1135, 1137), ('sin', 1332, 1335), ('no', 1430, 1432), ('sin', 2210, 2213), ('sin', 2475, 2478), ('sin', 2840, 2843), ('sin', 2889, 2892), ('sin', 1343, 1346), ('no', 1908, 1910), ('nivel', 'parece', 2350, 2356), ('no', 326, 328), ('no', 355, 357), ('sin', 2616, 2619), ('no', 2637, 2639), ('sin', 4677, 4680), ('sin', 49

**Results**

In [60]:
def remove_punctuation_and_spaces(text):
    # Define translation table to remove punctuation and spaces
    translation_table = str.maketrans("", "", string.punctuation + " ")

    # Remove punctuation and spaces from the text using translate method
    cleaned_text = text.translate(translation_table)
    
    return cleaned_text

# Remove punctuation and spaces from each element in the list
negations = [remove_punctuation_and_spaces(item) for item in negations]
# Remove punctuation and spaces from each element in the list
uncertainties = [remove_punctuation_and_spaces(item) for item in uncertainties]

In [61]:
#Convert the negations tuple into a list with just the negation words detected.
neg_detections_text = []
for detection in neg_detect:
    for x in detection:
        if type(x)== str:
            neg_detections_text.append(x)

#Convert the uncertainty tuple into a list with just the negation words detected.
unc_detections_text = []
for detection in unc_detect:
    for x in detection:
        if type(x)== str:
            unc_detections_text.append(x)


def precision_neg_unc(true, pred):
    right = 0
    total_pred = len(pred)

    for token in pred:
        if token in true:
            right += 1

    precision = right / total_pred
    return precision

def calcular_recall(y_true, y_pred):
    # Initialize a counter to track the number of correct predictions.
    num_correctos = 0
    # Get the total number of true words.
    total_palabras = len(y_true)

    # Loop through each true token.
    for token in y_true:
        # If the text of the true token is in the texts of the predicted tokens, increment the count of correct predictions.
        if any(token == pred_token for pred_token in y_pred):
            num_correctos += 1

    # Calculate recall as the proportion of correct predictions out of total actual positives.
    recall = num_correctos / total_palabras if total_palabras > 0 else 0
    # Return the calculated recall.
    return recall

def calcular_f1_score(precision, recall):
    if precision + recall == 0:
        return 0
    else:
        f1_score = 2 * (precision * recall) / (precision + recall)
        return f1_score

precision_neg= precision_neg_unc(negations, neg_detections_text)
recall_neg = calcular_recall(negations, neg_detections_text)
f1_neg = calcular_f1_score(precision_neg, recall_neg)


In [62]:
precision_unc= precision_neg_unc(uncertainties, unc_detections_text)
recall_unc = calcular_recall(uncertainties, unc_detections_text)
f1_unc = calcular_f1_score(precision_unc, recall_unc)

In [63]:
def precision_neg_scopes(true_scopes, pred_scopes):
    right = 0
    total_pred = len(pred_scopes) #Total amount of predictions.

    for prediction in pred_scopes:
        pred_start, pred_end = prediction[0], prediction[1]

        for true in true_scopes:
            true_start, true_end = true[0], true[1]
            if (pred_start == true_start or pred_end == true_end): 
                right += 1
            elif abs(pred_start - true_start) <= 10 and abs(pred_end - true_end) <= 10: #using abs to get the absolute number and see if the pred matches true.
                right += 0.5
            

    precision = right / total_pred
    return precision

def calcular_recall_scopes(y_true_scopes, y_pred_scopes):
    # Initialize a counter to track the number of correct scope predictions.
    num_aciertos = 0
    # Get the total number of true scopes.
    total_predicciones = len(y_true_scopes)

    # Loop through each true scope.
    for true_scope in y_true_scopes:
        # Then loop through each predicted scope.
        for pred_scope in y_pred_scopes:
            # Extract the start and end indices from the true and predicted scopes.
            true_num1, true_num2 = true_scope
            pred_num1, pred_num2 = pred_scope
            # If both the start and end of the true scope are within 10 units of the predicted scope's start and end, 
            # increment the count of correct predictions.
            if abs(true_num1 - pred_num1) <= 10 and abs(true_num2 - pred_num2) <= 10:
                num_aciertos += 1
                # Break out of the inner loop if we find a match.
                break

    # Calculate recall as the proportion of correct predictions out of total actual positives.
    recall = num_aciertos / total_predicciones if total_predicciones > 0 else 0
    # Return the calculated recall.
    return recall

precision_neg_scope= precision_neg_scopes(negation_scopes, neg_scope_detect)
recall_neg_scope = calcular_recall_scopes(negation_scopes, neg_scope_detect)
f1_neg_scope = calcular_f1_score(precision_neg_scope, recall_neg_scope)

In [68]:
def precision_unc_scopes(true_scopes, pred_scopes):
    right = 0
    total_pred = len(pred_scopes)

    for prediction in pred_scopes:
        pred_start, pred_end = prediction[0], prediction[1]

        for true in true_scopes:
            true_start, true_end = true[0], true[1]
            if (pred_start == true_start or pred_end == true_end):
                right += 2
            elif abs(pred_start - true_start) <= 12 and abs(pred_end - true_end) <= 12:
                right += 1

    precision = right / total_pred

    return precision

precision_unc_scope = precision_unc_scopes(uncertainty_scopes, unc_detect_scopes)
recall_unc_scope = calcular_recall_scopes(uncertainty_scopes, unc_detect_scopes)
f1_score_unc_scope = calcular_f1_score(precision_unc_scope, recall_unc_scope)

In [69]:
print('PRECISIONS:  ')
print('Precision for Negations: ', precision_neg)
print('-------------------------------------------')
print('Precision for Negation scopes: ', precision_neg_scope)
print('-------------------------------------------')
print('Precision for Uncertanties: ', precision_unc)
print('-------------------------------------------')
print('Precision for Uncertanties scopes: ', precision_unc_scope)

print('\nRECALL:  ')
print('Recall for Negations: ', recall_neg)
print('-------------------------------------------')
print('Recall for Negation scopes: ', recall_neg_scope)
print('-------------------------------------------')
print('Reall for Uncertanties: ', recall_unc)
print('-------------------------------------------')
print('Recall for Uncertanties scopes: ', recall_unc_scope)

print('\nF1 SCORE:  ')
print('F1 score for Negations: ', f1_neg)
print('-------------------------------------------')
print('F1 score for Negation scopes: ', f1_neg_scope)
print('-------------------------------------------')
print('F1 score for Uncertanties: ', f1_unc)
print('-------------------------------------------')
print('F1 score for Uncertanties scopes: ', f1_score_unc_scope)


PRECISIONS:  
Precision for Negations:  0.8564356435643564
-------------------------------------------
Precision for Negation scopes:  0.40710382513661203
-------------------------------------------
Precision for Uncertanties:  0.8662420382165605
-------------------------------------------
Precision for Uncertanties scopes:  0.2922077922077922

RECALL:  
Recall for Negations:  0.3191489361702128
-------------------------------------------
Recall for Negation scopes:  0.17551963048498845
-------------------------------------------
Reall for Uncertanties:  0.26666666666666666
-------------------------------------------
Recall for Uncertanties scopes:  0.18181818181818182

F1 SCORE:  
F1 score for Negations:  0.46501209569035035
-------------------------------------------
F1 score for Negation scopes:  0.24528608406528543
-------------------------------------------
F1 score for Uncertanties:  0.4077961019490255
-------------------------------------------
F1 score for Uncertanties scopes: 