# **Rule-Based Algorithm**
### Group 1 - Detection of Negation and Uncertainty

- Marino Oliveros Blanco NIU:1668563
- 
- 
- 

**Data loading**

In [47]:
# Libraries
import json
import re
from langdetect import detect # Library for language detection
from spellchecker import SpellChecker
import spacy
import string
from unidecode import unidecode

In [48]:
'''
# Loading the json file
loading = open("negacio_train_v2024.json")
training_data = json.load(loading)
'''

'\n# Loading the json file\nloading = open("negacio_train_v2024.json")\ntraining_data = json.load(loading)\n'

### **Pre-processing**
- Removing redacted entries
- Removing language mixes
- Solving misspelled words
- Removing patient information
- Tokenizing 

In [49]:
'''
# 1 Remove pacient information and redacted entries
def remove_pacient_info(text):
    # Remove lines starting with "nº historia clinica:" and ending with "motiu d'ingres"
    text = re.sub(r'nº historia clinica:.*?motiu d\'ingres', '', text, flags=re.DOTALL)
    # Remove lines starting with "nhc" and ending with "lopd"
    text = re.sub(r'nhc.*?lopd', '', text, flags=re.DOTALL)
    # Remove all asterisks '*'
    text = text.replace('*', '')
    return text


# 2 Remove Punctuation (Able to be turned ON/OFF)
def remove_punctuation(text):
    # Define a translation table to map punctuation to None
    translator = str.maketrans('', '', string.punctuation)
    # Remove punctuation using the translation table
    text = text.translate(translator)
    return text


# 3 Spell cheking with language detection (Able to be turned ON/OFF)
# Load language models for Spanish and Catalan
nlp_es = spacy.load("es_core_news_sm")
nlp_ca = spacy.load("ca_core_news_sm")

def spell_check_and_lemmatize(text):
    # Detect the language of the text
    language = detect(text)
    # Initialize spell checker
    spell = SpellChecker(language='es')  # As most of the text is in Spanish

    # Tokenize the text using the appropriate language model
    if language == 'ca':
        doc = nlp_ca(text)
    else:
        doc = nlp_es(text)

    # Correct misspelled words and lemmatize tokens
    corrected_tokens = []
    for token in doc:
        # Check if the token is a punctuation or whitespace
        if not token.is_punct and not token.is_space:
            # Get the corrected version of the token
            corrected_token = spell.correction(token.text)
            # Lemmatize the corrected token
            corrected_token_lemma = token.lemma_ if token.lemma_ != '-PRON-' else corrected_token
            corrected_tokens.append(corrected_token_lemma)
        else:
            corrected_tokens.append(token.text)

    # Join the tokens back into text
    corrected_text = ' '.join(corrected_tokens)
    return corrected_text


# 4 Tokenization with coordinates of the original text for (evaluation)
def tokenize_with_coordinates(text):
    # Tokenize the text while preserving the coordinates
    tokens_with_coordinates = []
    token_start = 0
    for token in re.finditer(r'\S+', text):
        token_text = token.group(0)
        token_end = token_start + len(token_text)
        tokens_with_coordinates.append((token_text, token_start, token_end))
        # Update token start position for the next token
        token_start = token_end
    return tokens_with_coordinates


# Main function to process the text
def pre_process_text(text, remove_punctuation_call=True, spell_check_call=True):
    
    # 1 Remove pacient information and redacted entries
    preprocessed_text = remove_pacient_info(text)

    # 2 Remove punctuation if specified
    if remove_punctuation_call:
        preprocessed_text = remove_punctuation(preprocessed_text)

    # 3 Spell check and lemmatize if specified
    if spell_check_call:
        preprocessed_text = spell_check_and_lemmatize(preprocessed_text)

    # 4 Tokenize the text with coordinates
    tokens_with_coordinates = tokenize_with_coordinates(preprocessed_text)

    return tokens_with_coordinates
'''

'\n# 1 Remove pacient information and redacted entries\ndef remove_pacient_info(text):\n    # Remove lines starting with "nº historia clinica:" and ending with "motiu d\'ingres"\n    text = re.sub(r\'nº historia clinica:.*?motiu d\'ingres\', \'\', text, flags=re.DOTALL)\n    # Remove lines starting with "nhc" and ending with "lopd"\n    text = re.sub(r\'nhc.*?lopd\', \'\', text, flags=re.DOTALL)\n    # Remove all asterisks \'*\'\n    text = text.replace(\'*\', \'\')\n    return text\n\n\n# 2 Remove Punctuation (Able to be turned ON/OFF)\ndef remove_punctuation(text):\n    # Define a translation table to map punctuation to None\n    translator = str.maketrans(\'\', \'\', string.punctuation)\n    # Remove punctuation using the translation table\n    text = text.translate(translator)\n    return text\n\n\n# 3 Spell cheking with language detection (Able to be turned ON/OFF)\n# Load language models for Spanish and Catalan\nnlp_es = spacy.load("es_core_news_sm")\nnlp_ca = spacy.load("ca_core

In [50]:
# Load language models for Spanish and Catalan
nlp_es = spacy.load("es_core_news_sm")
nlp_ca = spacy.load("ca_core_news_sm")

def remove_patient_info(tokens):
    # Define patterns to match patient information identifiers
    patterns = [
        r'nº\s?historia\s?clinica:',
        r'nºepisodi:',
        r'sexe:',
        r'data\s?de\s?naixement:',
        r'edat:',
        r'procedencia',
        r'servei\s?obstetricia',
        r'data\sd\'ingres',
        r'data\sd\'alta',
        r'ates\s?per',
        r'informe\sd\'alta\sd\'hospitalitzacio',
        r'motiu\sd\'ingres',
        r'nhc',
        r'lopd'
    ]
    # Join tokens into text for efficient pattern matching
    text = ' '.join([token[0] for token in tokens])
    # Remove tokens containing patient information identifiers
    for pattern in patterns:
        text = re.sub(pattern + '.*?(?=\\s|$)', '', text, flags=re.IGNORECASE)
    # Tokenize the modified text
    return tokenize_with_coordinates(text)



# 2 Remove Punctuation (Able to be turned ON/OFF)
def remove_punctuation(tokens):
    # Remove punctuation from token text
    tokens_without_punct = [(token[0].translate(str.maketrans('', '', string.punctuation)), token[1], token[2]) for token in tokens]
    return tokens_without_punct

# 3 Spell checking with language detection (Able to be turned ON/OFF)
def spell_check_and_lemmatize(tokens):
    # Join tokens into text
    text = ' '.join([token[0] for token in tokens])
    # Detect the language of the text
    language = detect(text)
    # Initialize spell checker
    spell = SpellChecker(language='es')  # As most of the text is in Spanish

    # Tokenize the text using the appropriate language model
    if language == 'ca':
        doc = nlp_ca(text)
    else:
        doc = nlp_es(text)

    # Correct misspelled words and lemmatize tokens
    corrected_tokens = []
    for token in doc:
        # Check if the token is a punctuation or whitespace
        if not token.is_punct and not token.is_space:
            # Get the corrected version of the token
            corrected_token = spell.correction(token.text)
            # Lemmatize the corrected token
            corrected_token_lemma = token.lemma_ if token.lemma_ != '-PRON-' else corrected_token
            corrected_tokens.append((corrected_token_lemma, token.idx, token.idx + len(corrected_token_lemma)))
        else:
            corrected_tokens.append((token.text, token.idx, token.idx + len(token.text)))

    return corrected_tokens


def tokenize_with_coordinates(text):
    # Tokenize the text using spaCy
    doc = nlp_es(text)

    # Extract tokens with their start and end positions
    tokens_with_coordinates = [(token.text, token.idx, token.idx + len(token.text)) for token in doc if not token.is_space]

    return tokens_with_coordinates


# Remove empty tokens   
def remove_empty_tokens(tokens):
    # Filter out tokens with empty text
    non_empty_tokens = [(token[0], token[1], token[2]) for token in tokens if token[0]]
    return non_empty_tokens


# Main function to process the text
def pre_process_text(text, remove_punctuation_call=True, spell_check_call=True):
    # Tokenize the text with coordinates
    tokens_with_coordinates = tokenize_with_coordinates(text)

    # Apply each processing step to the tokens
    if remove_punctuation_call:
        tokens_with_coordinates = remove_punctuation(tokens_with_coordinates)

    if spell_check_call:
        tokens_with_coordinates = spell_check_and_lemmatize(tokens_with_coordinates)

    # Remove empty tokens
    tokens_with_coordinates = remove_empty_tokens(tokens_with_coordinates)

    # Remove extra patient information
    tokens_with_coordinates = tokens_with_coordinates[32:]  # nº-motiu
    tokens_with_coordinates = tokens_with_coordinates[:-5]  # nhc-lopd

    # Remove accent marks from tokens
    tokens_with_coordinates = [(unidecode(token[0]), token[1], token[2]) for token in tokens_with_coordinates]

    # Return the modified tokens
    return tokens_with_coordinates

Real Usage

In [51]:
# Load the JSON file
with open("negacio_train_v2024.json", "r", encoding="utf-8") as file:
    training_data = json.load(file)

# List to store processed texts
processed_texts = []

# Iterate over each entry in the training data
for entry in training_data:
    text = entry["data"]["text"]  # Extract the text from the JSON object
    processed_text = pre_process_text(text, remove_punctuation_call=True, spell_check_call=False)
    processed_texts.append(processed_text)

# Printing (not necessary)
for processed_text in processed_texts:
    for token, start, end in processed_text:
        print(f"Token: {token}, Start: {start}, End: {end}")

Token: dingres, Start: 306, End: 314
Token: paciente, Start: 315, End: 323
Token: que, Start: 324, End: 327
Token: ingresa, Start: 328, End: 335
Token: de, Start: 336, End: 338
Token: forma, Start: 339, End: 344
Token: programada, Start: 345, End: 355
Token: para, Start: 356, End: 360
Token: realizacion, Start: 361, End: 372
Token: de, Start: 373, End: 375
Token: uretrotomia, Start: 376, End: 387
Token: interna, Start: 388, End: 395
Token: antecedents, Start: 398, End: 409
Token: alergia, Start: 410, End: 417
Token: a, Start: 418, End: 419
Token: penicilina, Start: 420, End: 430
Token: y, Start: 431, End: 432
Token: cloramfenicol, Start: 433, End: 446
Token: no, Start: 449, End: 451
Token: habitos, Start: 452, End: 459
Token: toxicos, Start: 460, End: 467
Token: antecedentes, Start: 469, End: 481
Token: medicos, Start: 482, End: 489
Token: bloqueo, Start: 491, End: 498
Token: auriculoventricular, Start: 499, End: 518
Token: de, Start: 519, End: 521
Token: primer, Start: 522, End: 528
T

In [52]:
print(processed_text)
print(processed_texts[100])
print(processed_texts[120])
print(processed_texts[20])

[('motiu', 282, 287), ('dingres', 288, 296), ('paciente', 297, 305), ('de', 306, 308), ('8', 309, 310), ('anos', 311, 315), ('que', 316, 319), ('ingresa', 320, 327), ('el', 328, 330), ('dia', 331, 334), ('08102019', 335, 345), ('en', 346, 348), ('planta', 349, 355), ('de', 356, 358), ('hospitalizacion', 359, 374), ('de', 375, 377), ('pediatria', 378, 387), ('por', 388, 391), ('sindrome', 392, 400), ('febril', 401, 407), ('antecedents', 409, 420), ('no', 421, 423), ('alergias', 424, 432), ('conocidas', 433, 442), ('calendario', 444, 454), ('vacunal', 455, 462), ('al', 463, 465), ('dia', 466, 469), ('antecedentes', 471, 483), ('patologicos', 484, 495), ('en', 499, 501), ('seguimiento', 502, 513), ('por', 514, 517), ('cardiologia', 518, 529), ('por', 530, 533), ('ligera', 534, 540), ('hipertrofia', 541, 552), ('ventricular', 553, 564), ('en', 565, 567), ('los', 568, 571), ('primeros', 572, 580), ('anos', 581, 585), ('de', 586, 588), ('vida', 589, 593), ('estudio', 595, 602), ('cardiologic

**Pre-processing json object**

- Reading the json object to obtain a list of the Negation and Uncertainty proc words as well as their scopes

- The scopes and annotations will be used for the supervised learning as the 'target'

In [53]:
# Loading the json object
loading = open("negacio_train_v2024.json", encoding="utf-8")
for_object = loading.read()
object = json.loads(for_object)

In [54]:
# Prepare lists to store the results
negations = []
negation_scopes = []
uncertainties = []
uncertainty_scopes = []

# Process the documents and obtain negation and uncertainty annotations
for item in object:
    text_data = item['data']['text']
    for prediction in item['predictions']:
        for result in prediction['result']:
            labels = result['value']['labels']
            start_index = result['value']['start']
            end_index = result['value']['end']
            text_segment = item['data']['text'][start_index:end_index]

            if "NEG" in labels:
                negations.append(text_segment)

                # Find the scope of the negation
                scope_start = -1
                scope_end = -1

                # Search for the scope starting just after the negation
                current_index = prediction['result'].index(result)
                if current_index < len(prediction['result']) - 1:
                    next_result = prediction['result'][current_index + 1]
                    next_start = next_result['value']['start']
                    next_end = next_result['value']['end']
                    next_labels = next_result['value']['labels']
                    if "NSCO" in next_labels and next_start == end_index:
                        scope_start = next_start
                        scope_end = next_end

                current_index = prediction['result'].index(result)
                if current_index > 0:
                    prev_result = prediction['result'][current_index - 1]
                    prev_start = prev_result['value']['start']
                    prev_end = prev_result['value']['end']
                    prev_labels = prev_result['value']['labels']
                    if "NSCO" in prev_labels and prev_end == start_index:
                        scope_start = prev_start
                        scope_end = prev_end

                if scope_start != -1 and scope_end != -1:
                    scope_text = item['data']['text'][scope_start:scope_end]
                    negation_scopes.append((start_index, scope_end))

            if "UNC" in labels:
                uncertainties.append(text_segment)

                # Find the scope of the uncertainty
                scope_start = -1
                scope_end = -1

                # Search for the scope starting just after the uncertainty
                current_index = prediction['result'].index(result)
                if current_index < len(prediction['result']) - 1:
                    next_result = prediction['result'][current_index + 1]
                    next_start = next_result['value']['start']
                    next_end = next_result['value']['end']
                    next_labels = next_result['value']['labels']
                    if "USCO" in next_labels and next_start == end_index:
                        scope_start = next_start
                        scope_end = next_end

                current_index = prediction['result'].index(result)
                if current_index > 0:
                    prev_result = prediction['result'][current_index - 1]
                    prev_start = prev_result['value']['start']
                    prev_end = prev_result['value']['end']
                    prev_labels = prev_result['value']['labels']
                    if "USCO" in prev_labels and prev_end == start_index:
                        scope_start = prev_start
                        scope_end = prev_end

                if scope_start != -1 and scope_end != -1:
                    scope_text = item['data']['text'][scope_start:scope_end]
                    uncertainty_scopes.append((start_index, scope_end))

# Remove whitespaces from negations and uncertainties if they are at the end
negations = [negation.rstrip() for negation in negations]
uncertainties = [uncertainty.rstrip() for uncertainty in uncertainties]
# Remove duplicates from the annotation lists
negations = list(set(negations))
negation_scopes = list(set(negation_scopes))
uncertainties = list(set(uncertainties))
uncertainty_scopes = list(set(uncertainty_scopes))



# Print the negation/uncertainty annotations and their scopes
print("Negations\n", negations)
print("Negation Scopes\n", negation_scopes)
print("Uncertainties\n", uncertainties)
print("Uncertainty Scopes\n", uncertainty_scopes)

Negations
 ['negativo)', 'sense', 'negativas.', 'desaparicion del', 'afebril', 'se retira', 'impide', 'cede', 'afebril.', 'indetectable.', 'niega', 'negativos,', 'asintomatica,', 'sin', 'inespecificos', 'en ninguna', 'descartada', 'desorientado', 'asintomatico', 'desaparecen', 'negatividad', 'no', 'negativa.', 'irregulares', 'negaitvo', 'excepto', 'asintomatico,', 'negativos', 'neg;', 'ceden', 'afebril,', 'desorientacion.', 'exfumador', 'ninguno', 'neg.', 'negatividad del', 'asintomatica', 'ausencia de', 'ex', 'negativas', 'descarta', 'retirar', 'ex fumador', 'incapacidad para', 'negativa', 'imposibilidad', 'nega', 'negativos.', 'negativo', ' afebril', 'inestabilidad', 'asintomatico.', 'desaparicion de', ' no', 'neg', 'ausencia', 'negativa,', 'tampoco', 'neg,', 'inespecifico.', 'rechaza', 'atipicos', 'inespecifico:', 'inespecifico', 'negativa)', 'imposibilidad de', 'negativo.', 'falta de', 'arritmicos', 'negatiu.', 'negatiu', 'niegan', 'negativos;', 'retiro', 'negativo,', 'negatividad 

**Implementation**
____________________________________________________________

Medical Words

In [55]:
#MEDICAL WORDS FROM TRAINING SET + MANUALY ADDED
med_words = ['secuelas', 'positivo','positiva','cuadro','patron', 'terapeutico', 'mejora', 'terapia', 'alteracion', 'reaccion', 'farmacoterapias', 'malestares', 'anormalidades', 'indicacion', 'desordenes', 'lesiones', 'farmacoterapia', 'respuesta', 'sindromes', 'desviaciones', 'sensibilidad', 'diagnostico', 'tratamiento', 'dolor', 'examenes', 'molestias', 'farmacoterapeutico', 'deterioros', 'enfermedad', 'exposicion', 'resultado', 'riesgo', 'sintomas', 'deficiencia', 'efecto', 'toxicos', 'eficacia', 'toxicidad', 'presencia', 'inmunidad', 'tratamientos', 'discapacidades', 'danos', 'intolerancia', 'deformidades', 'prevencion', 'afecciones', 'rechazo', 'lesion', 'examen', 'farmacoterapeuticos', 'farmacos', 'patologias', 'dolencias', 'diagnosticos', 'prueba', 'analisis', 'complicaciones', 'patologia', 'anomalias', 'infecciones', 'disfunciones', 'padecimientos', 'progresion', 'agravamientos', 'infeccion', 'efectos', 'nivel', 'sintoma', 'condicion', 'trastornos', 'pruebas', 'farmacologico', 'sindrome', 'concentracion', 'capacidad', 'hallazgo', 'secuela', 'afectaciones', 'inflamacion', 'manifestaciones', 'deteccion', 'enfermedades', 'dolores', 'signo', 'funcion', 'complicacion', 'adversidades', 'resistencia', 'problemas', 'absorcion']

Negation Words / Phrases

In [56]:
#EXTRACTED FROM THE TRAINING SET
neg_pre = ['impide', 'exfumador', 'negativo', 'desaparecen', 'asintomatica', 'afebril', 'se desestimo', 'negativos', 'ninguno', 'asintomatico', 'desorientado', 'inestabilidad', 'atipicos', 'ausencia de', 'ceden', 'negativa', 'negativas', 'excepto', 'desorientacion', 'inespecificos', 'se suspende','inespecifico', 'ex', 'arritmicos', 'cede', 'se retira', 'ex fumador', 'niegan', 'negatiu', 'negaitvo', 'indetectable',' negativo', 'suspendido']
neg_pos = ['retirar','ni' 'desaparicion de', 'descarta', 'ausencia', 'descartada', 'niega', 'nega', 'rechaza', 'desaparicion del', 'imposibilidad', 'retiro', 'irregulares', 'negatividad', 'tampoco', 'sin', 'imposibilidad de', 'en ninguna', 'incapacidad para', ' no', 'neg', ' afebril', 'sense', 'falta de', 'negatividad de', 'negatividad del', 'no']

In [57]:
#EXTRACTED FROM THE GITHUB
neg_pre_filtered= ['gobierna al paciente', 'ninguna otra evidencia', 'la van descartar per', 'no aparece', 'descartaron al paciente por', 'adecuado para descartarla', 'excluir', 'excloure', 'descartar al pacient per', 'sin ninguna evidencia de', 'descartarlo por', 'no tenía', 'lo descartó', 'sin signo de', 'sense indicació de', 'pot descartar', 'excluye', 'ho descarta', 'ninguna evidencia radiográfica de', 'descartarlo', 'negando', 'sense troballes de', 'adequat per a descartar-lo', 'cap senyal de', 'no em queixo de', 'ninguna señal de', 'nunca tuve', 'ho va descartar', 'descartar', 'lo descartaron por', 'la va descartar contra', 'pot descartar-ho', 'lo descartaron en contra', 'libre de', 'pot descartar al pacient', 'no tinc', 'absència de', 'sin quejas de', 'van descartar contra', 'la va descartar', 'no tenia', 'descartado contra', 'ho va descartar contra', 'no me quejo de', 'puede descartar', 'no significativo', 'resuelto', 'puede descartar al paciente', 'no sospitós', 'gens especial per a', 'pot descartar-ho per', 'exclou', 'r / o', 'resolt', 'van descartar per a', 'descartó', 'va descartar', 'puede descartarla contra', 'sin evidencia', 'descartar per a', 'no apareix', 'pot descartar-la per', 'la descartó', 'va descartar al pacient', 'descartar-ho', 'cap altra evidència', 'no saber de', 'ninguna nueva evidencia', 'sin indicación de', 'mai desenvolupat', 'sense queixes de', "l'exclou", 'lo descarta', 'cap suggeriment de', 'cap evidència radiogràfica de', 'sense cap evidència de', 'puede descartarlo', 'pacient no era', 'no apreciar', 'con ningún', 'cap causa de', 'adecuado para descartarlo', 'no associat amb', 'descartarla', 'nunca desarrollado', 'adecuado para descartarla por', 'no poden veure', 'ro', 'ninguna evidencia para sugerir', 'sin hallazgos de', 'no pueden ver', 'pot descartar-la', 'la excluye', 'governa al pacient', 'amb cap', 'descartar al paciente por', 'expulsó al paciente por', 'no anormal', 'no sospechoso', 'pot descartar-la contra', 'nada nuevo', 'paciente no era', 'descartaron para', 'no poder', 'evaluar por', 'suficiente para descartarlo por', 'no exhibir', 'més aviat que', 'fer una prova per', 'puede descartarla por', 'ninguna causa de', 'lliure de', 'mai vaig tenir', 'no tengo', 'descartaron contra', 'puede descartarla', 'adecuado para descartarlo por', 'descartat contra', 'descartar-la', 'adequat per a descartar-la', 'no significatiu', 'negatiu per a', 'sense evidència', 'ho van descartar per', 'descartar al pacient', 'cap nova evidència', 'descartarla por', 'nada especial para', 'ninguna sugerencia de', 'sense signe de', 'no sentir', 'descartar al paciente', 'descartar-ho per', 'puede descartarlo en contra', 'puede descartarlo por', 'más bien que', 'res nou', 'descartó al paciente', 'negant', 'avaluar per', 'no demostrar', 'descartar para', 'no revela', 'no revelar', 'descartó al paciente contra', 'puede descartar contra', 'descartar-la per', 'suficiente para descartar', 'negativo para', 'la descartó contra', 'revisado para', 'suficient per a descartar', 'revisat per a', 'hacer una prueba por', 'pot descartar-ho en contra', 'pot descartar contra', 'la descartaron por', 'descartaron al paciente contra', 'no asociado con', 'suficiente para descartarla por', 'lo descartó contra']
neg_pos_filtered= ['libre', 'podría ser descartado', 'fue descartado', 'rechazado', 'ha de ser descartat', 'lliure', 'rebutjat', 'puede ser descartado', 'ser descartat', 'adecuado para descartar', 'están descartadas', 'podria ser descartat', 'improbable', 'podría ser descartado por', 'debe ser descartado por', 'suficient per a descartar-ho', 'serà descartat per', 'podria descartar-se', 'suficient per a descartar-la', 'va ser descartat', 'han estat descartades', 'ser descartado por', 'declina', 'adequat per a descartar', 'siendo descartado', 'suficiente para descartarla', 'no ver', 'sent descartat', 'està descartat', 'se puede descartar por', 'pot ser descartat', 'ha estat descartat', 'puede ser descartado para', 'ha de descartar-se', 'suficiente para descartarlo', 'es descarta', 'ha sido descartado', 'será descartado', 'ser descartado', 'no veure', 'ser descartat per', 'debe ser descartado', 'serà descartat', 'debe descartarse para', 'debe ser descartado para', 'es pot descartar per', 'negado', 'se descarta', 'lo descartaron', 'no ser', 'se puede descartar', 'está descartado', 'negat', 'podría descartarse', 'es pot descartar', 'ho van descartar', 'podria ser descartat per', 'han sido descartadas', 'será descartado por', 'estan descartades', 'debe descartarse']
neg_pre.extend(neg_pre_filtered)
neg_pos.extend(neg_pos_filtered)

Uncertainty Words / Phrases

In [58]:
#NOSE COM EMFOCAR: SIN, NO, DESCARTAR
unc_pre = [ 'al parecer' , 'vs', 'dudosamente', 'indeterminado', 'sospecha', 'pudieran', 'aparentes', 'dubtos','permite descartar', 'parece', 'atribuida', 'clara', 'no clara', 'desconocido']
unc_pos = ['compatible amb', 'desconoce', 'indiquen', 'sin aparente', 'sugieren', 'ssospechosas de', 'probablemente', 'posible', 'sugestivo de', 'falsa', 'sospechan de', 'posibilidad de', 'sugiriendo', 'orienta', 'sospechosos de', 'sugestivos de', 'se orienta', 'plantea', 'podria', 'puede', 'podrian', 'probables', 'no', 'sugiere', 'parecen', 'sin', 'sospechosa de', 'sugestivas de', 'orientan como', 'dudosa', 'interpreta', 'compatible con', 'valorar', 'dudosos', 'probable', 'poco porque', 'sugieran', 'sin clara', 'no permite descartar', 'se desconoce', 'impresiona de', 'sugestiva de', 'orienta como', 'orientan', 'sin poder descartar', 'no parece', 'sospitosa de', 'sugestivos con', 'impresiona', 'aparentemente', 'sospecha de', 'no es posible descartar', 'compatibles con', 'compatible', 'aparente', 'sugestiva como', 'posiblemente', 'posibles', 'sugiera de', 'descartar', 'dudoso', 'se orientan', 'sospechosas de', 'sin aparentes', 'sin claras']   

In [59]:
#DEMA HO AFEGEIXO
unc_pre_filtered = ['como una causa secundaria para', 'como el origen secundario de', 'secundario a', 'como la causa de', 'como la causa secundaria de', 'como una razón de', 'como una causa secundaria de', 'como una etiología secundaria para', 'excepto', 'aunque', 'como la fuente secundaria para', 'como una etilogía para', 'como la fuente de', 'como el origen secundario para', 'com la font secundària de', 'como una razón secundaria para', 'com una raó secundària per a', 'com la font secundària per a', 'com una etiologia de', 'como una etiología secundaria de', 'secundari a', 'encara que', 'como una razón secundaria de', 'como la razón secundaria de', 'como la fuente secundaria de', 'com la causa secundària de', 'a pesar que', 'como una razón para', 'como la etilogía de', "com l'etiologia de", 'como la razón secundaria para', 'como un origen secundario para', 'com una font secundària per a', 'como una etilogía de', 'como fuente de', "com l'etiologia secundària per a", "com l'origen secundari per a", "com l'origen secundari de", 'però', 'com a causa de', 'com una etiologia secundària per a', 'como la razón de', 'com un origen secundari per a', 'com una raó secundària de', 'no obstant això', 'com una font secundària de', 'com una raó per a', 'pero', 'com la raó secundària de', 'como la etilogía secundaria para', 'a pesar de que', 'sin embargo', "com l'etiologia secundària de", 'com la causa de', 'com una causa secundària per a', 'com la raó de', 'a part de', 'como una fuente secundaria de', 'com una causa secundària de', 'aparte de', 'com a font de', 'como una fuente secundaria para', "com l'origen de", 'com una etiologia per a', 'com la raó secundària per a', 'como el origen de', 'com un origen secundari de', 'com una raó de', 'como un origen secundario de', 'excepte', 'como causa de', 'encara', 'todavía', 'com una etiologia secundària de', 'como la etilogía secundaria de']
unc_post_filtered =['origens de', 'raons de', 'altres possibilitats de', 'font per a', 'desencadenar evento para', 'origen para', 'raons per a', 'fuente para', 'causes de', 'fuentes de', 'causa de', 'etilogia de', 'etilogía para', 'origen per a', 'motivo de', 'fuentes para', 'otras posibilidades de', 'razones de', 'causas de', 'font de', 'razones para', 'etilogía de', 'fonts per a', 'fonts de', 'raó per a', 'razón de', 'orígenes para', 'razón para', 'motiu de', 'desencadenar esdeveniment per a', 'fuente de', 'orígenes de', 'origen de', 'raó de', 'etilogia per a', 'origens per a']
unc_pre.extend(unc_pre_filtered)
unc_pos.extend(unc_post_filtered)

Detect a Phrase

In [60]:
#FUNCTION TO DETECT A WORD / PHRASE MATCH
def phrase_matching(tokenized_text, word_or_phrases):
    text_length = len(tokenized_text)
    # Sort negation phrases by length in descending order
    word_or_phrases = sorted(word_or_phrases, key=len, reverse=True)
    for i in range(text_length):
        for phrase in word_or_phrases:
            phrase_tokens = phrase.split()
            phrase_length = len(phrase_tokens)
            if i + phrase_length <= text_length:  # Check if remaining tokens are enough to match the phrase
                joined_tokens = ' '.join([token[0] for token in tokenized_text[i:i+phrase_length]])
                if ' '.join(joined_tokens.split()) == phrase:  # Remove extra spaces and then check if consecutive tokens match the phrase
                        yield True, phrase, tokenized_text[i][1], tokenized_text[i+phrase_length-1][2]
                
    return False, None, None, None

Negation Detection

In [61]:
def negation_detection(processed_texts, neg_pre, neg_pos, medical_words, i=0):
    NEG = []
    NSCO = []
    for processed_text in processed_texts:
        for result in phrase_matching(processed_text, neg_pre+neg_pos):
            phrase_found, phrase, start_idx, end_idx = result
            if phrase_found:
                # Calculate the start token index
                if phrase in neg_pre:
                    start_token_index = max(next((i for i, token in enumerate(processed_text) if token[1] > start_idx), len(processed_text)) - 1, 0)
                    # Select the previous 5 tokens
                    previous_tokens = [token[0] for token in processed_text[max(start_token_index - 5, 0):start_token_index]]
                    for token in previous_tokens:
                        if token in medical_words:
                            # Append to NEG
                            NEG.append((token, phrase, start_idx, end_idx))
                            # Calculate start and end indices for NSCO
                            start_nsco = processed_text[max(start_token_index - 5, 0)][1]  # Start index of first token in previous_tokens
                            end_nsco = processed_text[start_token_index - 1][2]  # End index of last token in previous_tokens
                            # Append to NSCO
                            NSCO.append((start_nsco, end_nsco))
                    
                if phrase in neg_pos:
                    start_token_index = max(next((i for i, token in enumerate(processed_text) if token[1] > start_idx), len(processed_text)) - 1, 0)
                    # Select the next 5 tokens
                    next_tokens = [token[0] for token in processed_text[start_token_index + 1:min(start_token_index + 6, len(processed_text))]]
                    for token in next_tokens:
                        if token in medical_words:
                            # Append to NEG
                            NEG.append((phrase, start_idx, end_idx))
                            # Calculate start and end indices for NSCO
                            start_nsco = processed_text[start_token_index + 1][1] if start_token_index + 1 < len(processed_text) else processed_text[start_token_index][1]  # Start index of first token in next_tokens
                            end_nsco = processed_text[min(start_token_index + 5, len(processed_text) - 1)][2]  # End index of last token in next_tokens
                            # Append to NSCO
                            NSCO.append((start_nsco, end_nsco))
    return NEG, NSCO
            

neg_detect, neg_scope_detect = negation_detection(processed_texts, neg_pre, neg_pos, med_words)
print('Negations detected: \n',neg_detect)
print('Negations scopes detected: \n',neg_scope_detect)
#que solo sean del training set


Negations detected: 
 [('no', 449, 451), ('no', 2097, 2099), ('no', 3399, 3401), ('sin', 3736, 3739), ('no', 3854, 3856), ('sin', 549, 552), ('no', 1241, 1243), ('no', 1276, 1278), ('no', 400, 402), ('no', 407, 409), ('sin', 1973, 1976), ('positivo', 'negativo', 2674, 2682), ('positivo', 'negativos', 1191, 1200), ('positivo', 'negativos', 1191, 1200), ('prueba', 'negativa', 1231, 1239), ('niega', 444, 449), ('niega', 460, 465), ('sin', 685, 688), ('sin', 1642, 1645), ('cuadro', 'inestabilidad', 974, 987), ('sin', 1196, 1199), ('sin', 2480, 2483), ('sin', 5981, 5984), ('niega', 452, 457), ('sin', 1642, 1645), ('no', 557, 559), ('sin', 2057, 2060), ('no', 517, 520), ('no', 1811, 1813), ('no', 4664, 4666), ('sin', 2660, 2663), ('sin', 3553, 3556), ('no', 1119, 1121), ('sin', 1622, 1625), ('no', 450, 452), ('no', 1316, 1318), ('positivo', 'negativo', 2542, 2550), ('no', 650, 652), ('no', 347, 349), ('sin', 643, 646), ('sin', 643, 646), ('sin', 4554, 4557), ('no', 5247, 5249), ('sin', 1119,

Uncertainity Detection

In [62]:
def uncertainity_detection(processed_texts, unc_pre, unc_pos, medical_words, i=0):
    UNC = []
    USCO = []
    for processed_text in processed_texts:
        for result in phrase_matching(processed_text, unc_pre+unc_pos):
            phrase_found, phrase, start_idx, end_idx = result
            if phrase_found:
                # Calculate the start token index
                if phrase in unc_pre:
                    start_token_index = max(next((i for i, token in enumerate(processed_text) if token[1] > start_idx), len(processed_text)) - 1, 0)
                    # Select the previous 5 tokens
                    previous_tokens = [token[0] for token in processed_text[max(start_token_index - 5, 0):start_token_index]]
                    for token in previous_tokens:
                        if token in medical_words:
                            # Append to NEG
                            UNC.append((token, phrase, start_idx, end_idx))
                            # Calculate start and end indices for NSCO
                            start_usco = processed_text[max(start_token_index - 5, 0)][1]  # Start index of first token in previous_tokens
                            end_usco = processed_text[start_token_index - 1][2]  # End index of last token in previous_tokens
                            # Append to NSCO
                            USCO.append((start_usco, end_usco))
                    
                if phrase in unc_pos:
                    start_token_index = max(next((i for i, token in enumerate(processed_text) if token[1] > start_idx), len(processed_text)) - 1, 0)
                    # Select the next 5 tokens
                    next_tokens = [token[0] for token in processed_text[start_token_index + 1:min(start_token_index + 6, len(processed_text))]]
                    for token in next_tokens:
                        if token in medical_words:
                            # Append to NEG
                            UNC.append((phrase, start_idx, end_idx))
                            # Calculate start and end indices for NSCO
                            start_usco = processed_text[start_token_index + 1][1] if start_token_index + 1 < len(processed_text) else processed_text[start_token_index][1]  # Start index of first token in next_tokens
                            end_usco = processed_text[min(start_token_index + 5, len(processed_text) - 1)][2]  # End index of last token in next_tokens
                            # Append to NSCO
                            USCO.append((start_usco, end_usco))
    print(UNC)
    print(USCO)
    print(len(UNC))
    return UNC, USCO
            

unc_detect, unc_detect_scopes = uncertainity_detection(processed_texts, unc_pre, unc_pos, med_words)
#que solo sean del training set


[('no', 449, 451), ('lesiones', 'pero', 895, 899), ('falsa', 1902, 1907), ('no', 2097, 2099), ('no', 3399, 3401), ('sin', 3736, 3739), ('no', 3854, 3856), ('aparente', 6848, 6856), ('dudosa', 6880, 6886), ('posible', 7750, 7757), ('posible', 7750, 7757), ('dudosa', 7894, 7900), ('dudosa', 7894, 7900), ('sin', 549, 552), ('no', 1241, 1243), ('aparente', 1266, 1274), ('no', 1276, 1278), ('compatible con', 4187, 4201), ('compatible', 4187, 4197), ('compatible con', 4822, 4836), ('compatible', 4822, 4832), ('cuadro', 'aunque', 4852, 4858), ('no', 400, 402), ('no', 407, 409), ('sin', 1973, 1976), ('dolor', 'sospecha', 2812, 2820), ('sin', 685, 688), ('sin', 1642, 1645), ('sin', 1196, 1199), ('sin', 2480, 2483), ('probables', 3331, 3340), ('sin', 5981, 5984), ('probable', 6010, 6018), ('sin', 1642, 1645), ('no', 557, 559), ('sin', 2057, 2060), ('no', 517, 520), ('no', 1811, 1813), ('no', 4664, 4666), ('compatible con', 4716, 4730), ('compatible', 4716, 4726), ('sin', 2660, 2663), ('sin', 355

**Results**

In [63]:
def remove_punctuation_and_spaces(text):
    # Define translation table to remove punctuation and spaces
    translation_table = str.maketrans("", "", string.punctuation + " ")

    # Remove punctuation and spaces from the text using translate method
    cleaned_text = text.translate(translation_table)
    
    return cleaned_text

# Remove punctuation and spaces from each element in the list
negations = [remove_punctuation_and_spaces(item) for item in negations]

In [64]:


#Convert the negations tuple into a list with just the negation words detected.
neg_detections_text = []
for detection in neg_detect:
    for x in detection:
        if type(x)== str:
            neg_detections_text.append(x)


def precision_neg_unc(true, pred):
    num_correctos = 0
    total_palabras = len(pred)

    for token in pred:
        if token in true:
            num_correctos += 1

    precision = num_correctos / total_palabras
    return precision

precision_neg = precision_neg_unc(negations, neg_detections_text)


In [65]:
print(neg_scope_detect)
print(negation_scopes)

[(452, 498), (2100, 2129), (3402, 3444), (3740, 3773), (3857, 3888), (553, 579), (1244, 1287), (1279, 1320), (403, 437), (410, 449), (1977, 2012), (2638, 2673), (1147, 1190), (1147, 1190), (1191, 1230), (450, 495), (466, 523), (689, 747), (1646, 1690), (938, 973), (1200, 1242), (2484, 2511), (5985, 6037), (458, 497), (1646, 1681), (560, 599), (2061, 2096), (521, 577), (1814, 1855), (4667, 4726), (2664, 2715), (3557, 3599), (1122, 1158), (1626, 1657), (453, 485), (1319, 1347), (2508, 2541), (653, 684), (350, 386), (647, 682), (647, 682), (4558, 4590), (5250, 5293), (1123, 1154), (1180, 1216), (2217, 2267), (2488, 2535), (2657, 2708), (3620, 3659), (7233, 7279), (8783, 8810), (8957, 8989), (8957, 8989), (426, 483), (618, 666), (1383, 1426), (2521, 2563), (2873, 2924), (4073, 4111), (4073, 4111), (4087, 4123), (4087, 4123), (7343, 7389), (7343, 7389), (8707, 8753), (421, 472), (421, 472), (530, 568), (2003, 2042), (3270, 3310), (3305, 3343), (443, 494), (2388, 2431), (2388, 2431), (1181, 

In [66]:
def precision_neg_scopes(true_scopes, pred_scopes):
    right = 0 #Variable to store correct predictions.
    total_pred = len(pred_scopes) #Total amount of predictions.

    for prediction in pred_scopes:
        pred_start, pred_end = prediction[0], prediction[1]

        for true in true_scopes:
            true_start, true_end = true[0], true[1]
            if (pred_start == true_start or pred_end == true_end): #using abs to get the absolute number and see if the pred matches true.
                right += 1
                break #Found in gorund truth, we can go to the next iteration for prediction.
    precision = right / total_pred
    return precision

precision_neg_scope = precision_neg_scopes(negation_scopes, neg_scope_detect)

In [77]:
#Convert the negations tuple into a list with just the negation words detected.
unc_detections_text = []
for detection in unc_detect:
    for x in detection:
        if type(x)== str:
            unc_detections_text.append(x)

print(uncertainties)
print(unc_detections_text)

precision_unc = precision_neg_unc(uncertainties, unc_detections_text)


['vs.', 'posibles', 'interpreta', 'desconoce', 'orienta como', 'sin', 'sugiera de', 'valorar', 'compatible amb', 'indiquen', 'plantea', 'compatible', 'indeterminado.', 'compatibles con', 'probables', 'sin poder descartar', 'parece', 'puede', 'ssospechosas de', 'descartar', 'clara', 'sugiriendo', 'impresiona de', 'no', 'dubtos', 'dudosa', 'permite descartar', 'sin clara', 'sugieren', 'sugieran', 'sugestivos de', 'no clara', 'sin aparente', 'orientan como', 'dudosamente', 'compatible con', 'posiblemente', 'se orienta', 'falsa', 'pudieran', 'no es posible descartar', 'vs', 'aparente', 'se orientan', 'sospechosa de', 'sugestivas de', 'poco porque', 'sospitosa de', 'sugestiva de', 'sospechosos de', 'posible', 'dudosa.', 'sospecha de', 'aparentemente', 'orientan', 'dudosos', 'probablemente', 'podria', 'dudoso.', 'parecen', 'no permite descartar', 'orienta', 'sin claras', 'atribuida', 'sin aparentes', 'no parece', 'probable', 'sospechosas de', 'al parecer', 'sugestivo de', 'dudoso', 'sugiere'

In [78]:
def precision_unc_scopes(true_scopes, pred_scopes):
    right = 0
    total_pred = len(pred_scopes)

    for prediction in pred_scopes:
        pred_start, pred_end = prediction[0], prediction[1]

        for true in true_scopes:
            true_start, true_end = true[0], true[1]
            if (pred_start == true_start or pred_end == true_end):
                right += 1

    precision = right / total_pred
    return precision

precision_unc_scope = precision_unc_scopes(uncertainty_scopes, unc_detect_scopes)

In [79]:
print('PRECISIONS:  ')
print('Precision for Negations: ', precision_neg)
print('-------------------------------------------')
print('Precision for Negation scopes: ', precision_neg_scope)
print('-------------------------------------------')
print('Precision for Uncertanties: ', precision_unc)
print('-------------------------------------------')
print('Precision for Uncertanties scopes: ', precision_unc_scope)


PRECISIONS:  
Precision for Negations:  0.8483754512635379
-------------------------------------------
Precision for Negation scopes:  0.45095367847411444
-------------------------------------------
Precision for Uncertanties:  0.9516129032258065
-------------------------------------------
Precision for Uncertanties scopes:  0.10136157337367625
