# **Rule-Based Algorithm**
### Group 1 - Detection of Negation and Uncertainty

- Marino Oliveros Blanco NIU:1668563
- 
- 
- 

**Data loading**

In [69]:
# Libraries
import json
import re
from langdetect import detect # Library for language detection
from spellchecker import SpellChecker
import spacy
import string

In [70]:
# Loading the json file
loading = open("negacio_train_v2024.json")
training_data = json.load(loading)

### **Pre-processing**
- Removing redacted entries
- Removing language mixes
- Solving misspelled words
- Removing patient information
- Tokenizing 

In [71]:
# 1 Remove pacient information and redacted entries
def remove_pacient_info(text):
    # Remove lines starting with "nº historia clinica:" and ending with "motiu d'ingres"
    text = re.sub(r'nº historia clinica:.*?motiu d\'ingres', '', text, flags=re.DOTALL)
    # Remove lines starting with "nhc" and ending with "lopd"
    text = re.sub(r'nhc.*?lopd', '', text, flags=re.DOTALL)
    # Remove all asterisks '*'
    text = text.replace('*', '')
    return text


# 2 Remove Punctuation (Able to be turned ON/OFF)
def remove_punctuation(text):
    # Define a translation table to map punctuation to None
    translator = str.maketrans('', '', string.punctuation)
    # Remove punctuation using the translation table
    text = text.translate(translator)
    return text


# 3 Spell cheking with language detection (Able to be turned ON/OFF)
# Load language models for Spanish and Catalan
nlp_es = spacy.load("es_core_news_sm")
nlp_ca = spacy.load("ca_core_news_sm")

def spell_check_and_lemmatize(text):
    # Detect the language of the text
    language = detect(text)
    # Initialize spell checker
    spell = SpellChecker(language='es')  # As most of the text is in Spanish

    # Tokenize the text using the appropriate language model
    if language == 'ca':
        doc = nlp_ca(text)
    else:
        doc = nlp_es(text)

    # Correct misspelled words and lemmatize tokens
    corrected_tokens = []
    for token in doc:
        # Check if the token is a punctuation or whitespace
        if not token.is_punct and not token.is_space:
            # Get the corrected version of the token
            corrected_token = spell.correction(token.text)
            # Lemmatize the corrected token
            corrected_token_lemma = token.lemma_ if token.lemma_ != '-PRON-' else corrected_token
            corrected_tokens.append(corrected_token_lemma)
        else:
            corrected_tokens.append(token.text)

    # Join the tokens back into text
    corrected_text = ' '.join(corrected_tokens)
    return corrected_text


# 4 Tokenization with coordinates of the original text for (evaluation)
def tokenize_with_coordinates(text):
    # Tokenize the text while preserving the coordinates
    tokens_with_coordinates = []
    token_start = 0
    for token in re.finditer(r'\S+', text):
        token_text = token.group(0)
        token_end = token_start + len(token_text)
        tokens_with_coordinates.append((token_text, token_start, token_end))
        # Update token start position for the next token
        token_start = token_end
    return tokens_with_coordinates


# Main function to process the text
def pre_process_text(text, remove_punctuation_call=True, spell_check_call=True):
    
    # 1 Remove pacient information and redacted entries
    preprocessed_text = remove_pacient_info(text)

    # 2 Remove punctuation if specified
    if remove_punctuation_call:
        preprocessed_text = remove_punctuation(preprocessed_text)

    # 3 Spell check and lemmatize if specified
    if spell_check_call:
        preprocessed_text = spell_check_and_lemmatize(preprocessed_text)

    # 4 Tokenize the text with coordinates
    tokens_with_coordinates = tokenize_with_coordinates(preprocessed_text)

    return tokens_with_coordinates

Real Usage

In [82]:
for entry in training_data:
    text = entry["data"]["text"]  # Extract the text from the JSON object
    processed_text = pre_process_text(text, remove_punctuation_call=True, spell_check_call=False)
    # Printing (not necessary)
    for token, start, end in processed_text:
        print(f"Token: {token}, Start: {start}, End: {end}")

Token: paciente, Start: 0, End: 8
Token: que, Start: 8, End: 11
Token: ingresa, Start: 11, End: 18
Token: de, Start: 18, End: 20
Token: forma, Start: 20, End: 25
Token: programada, Start: 25, End: 35
Token: para, Start: 35, End: 39
Token: realizacion, Start: 39, End: 50
Token: de, Start: 50, End: 52
Token: uretrotomia, Start: 52, End: 63
Token: interna, Start: 63, End: 70
Token: antecedents, Start: 70, End: 81
Token: alergia, Start: 81, End: 88
Token: a, Start: 88, End: 89
Token: penicilina, Start: 89, End: 99
Token: y, Start: 99, End: 100
Token: cloramfenicol, Start: 100, End: 113
Token: no, Start: 113, End: 115
Token: habitos, Start: 115, End: 122
Token: toxicos, Start: 122, End: 129
Token: antecedentes, Start: 129, End: 141
Token: medicos, Start: 141, End: 148
Token: bloqueo, Start: 148, End: 155
Token: auriculoventricular, Start: 155, End: 174
Token: de, Start: 174, End: 176
Token: primer, Start: 176, End: 182
Token: grado, Start: 182, End: 187
Token: hipertension, Start: 187, End:

In [74]:
print(processed_text)

[('paciente', 0, 8), ('de', 8, 10), ('8', 10, 11), ('años', 11, 15), ('que', 15, 18), ('ingresa', 18, 25), ('el', 25, 27), ('dia', 27, 30), ('08102019', 30, 38), ('en', 38, 40), ('planta', 40, 46), ('de', 46, 48), ('hospitalizacion', 48, 63), ('de', 63, 65), ('pediatria', 65, 74), ('por', 74, 77), ('sindrome', 77, 85), ('febril', 85, 91), ('antecedents', 91, 102), ('no', 102, 104), ('alergias', 104, 112), ('conocidas', 112, 121), ('calendario', 121, 131), ('vacunal', 131, 138), ('al', 138, 140), ('dia', 140, 143), ('antecedentes', 143, 155), ('patologicos', 155, 166), ('en', 166, 168), ('seguimiento', 168, 179), ('por', 179, 182), ('cardiologia', 182, 193), ('por', 193, 196), ('ligera', 196, 202), ('hipertrofia', 202, 213), ('ventricular', 213, 224), ('en', 224, 226), ('los', 226, 229), ('primeros', 229, 237), ('años', 237, 241), ('de', 241, 243), ('vida', 243, 247), ('estudio', 247, 254), ('cardiologico', 254, 266), ('normal', 266, 272), ('en', 272, 274), ('la', 274, 276), ('actualida

**Pre-processing json object**

- Reading the json object to obtain a list of the Negation and Uncertainty proc words as well as their scopes

- The scopes and annotations will be used for the supervised learning as the 'target'

In [75]:
# Loading the json object
loading = open("negacio_train_v2024.json")
for_object = loading.read()
object = json.loads(for_object)

In [85]:
# Prepare lists to store the results
negations = []
negation_scopes = []
uncertainties = []
uncertainty_scopes = []

# Process the documents and obtain negation and uncertainty annotations
for item in object:
    text_data = item['data']['text']
    for prediction in item['predictions']:
        for result in prediction['result']:
            labels = result['value']['labels']
            start_index = result['value']['start']
            end_index = result['value']['end']
            text_segment = item['data']['text'][start_index:end_index]

            if "NEG" in labels:
                negations.append(text_segment)

                # Find the scope of the negation
                scope_start = -1
                scope_end = -1

                # Search for the scope starting just after the negation
                current_index = prediction['result'].index(result)
                if current_index < len(prediction['result']) - 1:
                    next_result = prediction['result'][current_index + 1]
                    next_start = next_result['value']['start']
                    next_end = next_result['value']['end']
                    next_labels = next_result['value']['labels']
                    if "NSCO" in next_labels and next_start == end_index:
                        scope_start = next_start
                        scope_end = next_end

                current_index = prediction['result'].index(result)
                if current_index > 0:
                    prev_result = prediction['result'][current_index - 1]
                    prev_start = prev_result['value']['start']
                    prev_end = prev_result['value']['end']
                    prev_labels = prev_result['value']['labels']
                    if "NSCO" in prev_labels and prev_end == start_index:
                        scope_start = prev_start
                        scope_end = prev_end

                if scope_start != -1 and scope_end != -1:
                    scope_text = item['data']['text'][scope_start:scope_end]
                    negation_scopes.append((start_index, scope_end))

            if "UNC" in labels:
                uncertainties.append(text_segment)

                # Find the scope of the uncertainty
                scope_start = -1
                scope_end = -1

                # Search for the scope starting just after the uncertainty
                current_index = prediction['result'].index(result)
                if current_index < len(prediction['result']) - 1:
                    next_result = prediction['result'][current_index + 1]
                    next_start = next_result['value']['start']
                    next_end = next_result['value']['end']
                    next_labels = next_result['value']['labels']
                    if "USCO" in next_labels and next_start == end_index:
                        scope_start = next_start
                        scope_end = next_end

                current_index = prediction['result'].index(result)
                if current_index > 0:
                    prev_result = prediction['result'][current_index - 1]
                    prev_start = prev_result['value']['start']
                    prev_end = prev_result['value']['end']
                    prev_labels = prev_result['value']['labels']
                    if "USCO" in prev_labels and prev_end == start_index:
                        scope_start = prev_start
                        scope_end = prev_end

                if scope_start != -1 and scope_end != -1:
                    scope_text = item['data']['text'][scope_start:scope_end]
                    uncertainty_scopes.append((start_index, scope_end))

# Remove duplicates from the annotation lists
negations = list(set(negations))
negation_scopes = list(set(negation_scopes))
uncertainties = list(set(uncertainties))
uncertainty_scopes = list(set(uncertainty_scopes))

# Print the negation/uncertainty annotations and their scopes
print("Negations\n", negations)
print("Negation Scopes\n", negation_scopes)
print("Uncertainties\n", uncertainties)
print("Uncertainty Scopes\n", uncertainty_scopes)

Negations:
 ['atipicos', 'negatiu', 'no', 'inespecifico', ' no ', 'negativas ', 'desorientacion.', 'inespecifico:', 'niega', 'neg;', 'falta de ', 'se desestimo ', 'afebril', 'negativos;', ' negativo', 'negativas.', 'se suspende ', 'tampoco ', 'negativo.', 'desaparicion de ', 'negativo', 'se retira', 'rechaza ', 'negativas', 'sense', ' afebril', 'sin ', 'se retira ', 'ex-', 'descarta ', 'negaitvo', 'descartada ', 'ex fumador ', 'negatividad ', 'negativos', 'negatividad de ', 'negatiu.', 'imposibilidad de ', 'neg', 'irregulares', 'ex ', 'negativa)', 'sense ', 'negativo,', 'arritmicos', 'negativos,', 'negativa', 'negativa ', 'asintomatica,', 'sin', 'desaparecen ', 'asintomatico ', 'indetectable.', 'en ninguna ', 'neg.', 'niegan ', 'neg ', 'ex', 'excepto ', 'negativa.', 'neg,', 'suspendido ', 'asintomatico', 'desaparicion del ', 'negativo)', 'negativos ', 'ceden ', 'asintomatica', 'negativa,', 'afebril.', 'ausencia ', 'impide ', 'inestabilidad', 'inespecifico.', 'retirar ', 'imposibilidad'

**Implementation**

In [77]:

# We can use any of the following methods to detect negation or uncertainty in a sentence

#Pre-defined list of negation trigger words
#Regular expressions
#Part of Speech tagging
#Syntactic parsing

**Results**

In [78]:
# Explore the results

