In [77]:
!pip install langdetect



In [78]:
!pip install pyspellchecker



In [79]:
!pip install nltk



In [80]:
import os

In [81]:
# Libraries
import json
import re
from langdetect import detect # Library for language detection
from spellchecker import SpellChecker
import string

In [82]:
import spacy

In [83]:
#os.chdir('C:\\GitHub Repositories\\NLP-Detection-of-Negation-and-Uncertainty-Project-24\\Data')

In [84]:
# Loading the json file
loading = open("negacio_train_v2024.json")
training_data = json.load(loading)

In [None]:
!python -m spacy download es_core_news_sm
!python -m spacy download ca_core_news_sm

In [86]:
# 1 Remove pacient information and redacted entries
def remove_pacient_info(text):
    # Remove lines starting with "nº historia clinica:" and ending with "motiu d'ingres"
    text = re.sub(r'nº historia clinica:.*?motiu d\'ingres', '', text, flags=re.DOTALL)
    # Remove lines starting with "nhc" and ending with "lopd"
    text = re.sub(r'nhc.*?lopd', '', text, flags=re.DOTALL)
    # Remove all asterisks '*'
    text = text.replace('*', '')
    return text


# 2 Remove Punctuation (Able to be turned ON/OFF)
def remove_punctuation(text):
    # Define a translation table to map punctuation to None
    translator = str.maketrans('', '', string.punctuation)
    # Remove punctuation using the translation table
    text = text.translate(translator)
    return text


# 3 Spell cheking with language detection (Able to be turned ON/OFF)
# Load language models for Spanish and Catalan
nlp_es = spacy.load("es_core_news_sm")
nlp_ca = spacy.load("ca_core_news_sm")

def spell_check_and_lemmatize(text):
    # Detect the language of the text
    language = detect(text)
    # Initialize spell checker
    spell = SpellChecker(language='es')  # As most of the text is in Spanish

    # Tokenize the text using the appropriate language model
    if language == 'ca':
        doc = nlp_ca(text)
    else:
        doc = nlp_es(text)

    # Correct misspelled words and lemmatize tokens
    corrected_tokens = []
    for token in doc:
        # Check if the token is a punctuation or whitespace
        if not token.is_punct and not token.is_space:
            # Get the corrected version of the token
            corrected_token = spell.correction(token.text)
            # Lemmatize the corrected token
            corrected_token_lemma = token.lemma_ if token.lemma_ != '-PRON-' else corrected_token
            corrected_tokens.append(corrected_token_lemma)
        else:
            corrected_tokens.append(token.text)

    # Join the tokens back into text
    corrected_text = ' '.join(corrected_tokens)
    return corrected_text


# 4 Tokenization with coordinates of the original text for (evaluation)
def tokenize_with_coordinates(text):
    # Tokenize the text while preserving the coordinates
    tokens_with_coordinates = []
    token_start = 0
    for token in re.finditer(r'\S+', text):
        token_text = token.group(0)
        token_end = token_start + len(token_text)
        tokens_with_coordinates.append((token_text, token_start, token_end))
        # Update token start position for the next token
        token_start = token_end
    return tokens_with_coordinates


# Main function to process the text
def pre_process_text(text, remove_punctuation_call=True, spell_check_call=True):

    # 1 Remove pacient information and redacted entries
    preprocessed_text = remove_pacient_info(text)

    # 2 Remove punctuation if specified
    if remove_punctuation_call:
        preprocessed_text = remove_punctuation(preprocessed_text)

    # 3 Spell check and lemmatize if specified
    if spell_check_call:
        preprocessed_text = spell_check_and_lemmatize(preprocessed_text)

    # 4 Tokenize the text with coordinates
    tokens_with_coordinates = tokenize_with_coordinates(preprocessed_text)

    return tokens_with_coordinates

## <span style="color:red; font-size:larger;">**FEATURE EXTRACTION FUNCTIONS**</span>


1. WORD OF THE VOCABULARY

In [87]:
def text_to_numbers(text):
    lut = {}
    numbers = []
    current_id = 1

    for word in text:
        if word not in lut:
            lut[word] = current_id
            current_id += 1
        numbers.append(lut[word])

    return numbers, lut

2. INITIATES WITH CAPITALIZATION

In [88]:
def init_cap(text):
    capitalized_words = []
    for word in text:
        if word.istitle():
            capitalized_words.append(1)
        else:
            capitalized_words.append(0)
    return capitalized_words

CONTAINS CAPITALIZATION

In [89]:
def contains_cap(text):
    contain_cap_words = []
    for word in text:
        word = word.split()
        if any(char.isupper() for char in word):
            contain_cap_words.append(1)
        else:
            contain_cap_words.append(0)
    return contain_cap_words

3. IS A NUMBER

In [90]:
def is_a_number(text):
    is_num = []
    for word in text:
        try:
            float(word)
            is_num.append(1)
        except ValueError:
            is_num.append(0)
    return is_num

4. CONTAINS A NUMBER

In [91]:
def contains_number(text):
    contains_num = []
    for word in text:
        if any(char.isdigit() for char in word):
            contains_num.append(1)
        else:
            contains_num.append(0)
    return contains_num

HAS DASH (-)

In [92]:
def contains_dash(text):
    contain_dash_words = []
    for word in text:
        word= word.split()
        if '-' in word:
            contain_dash_words.append(1)
        else:
            contain_dash_words.append(0)
    return contain_dash_words

In [93]:
def contains_underscore(text):
    contain_dash_words = []
    for word in text:
        word = word.split()
        if '_' in word:
            contain_dash_words.append(1)
        else:
            contain_dash_words.append(0)
    return contain_dash_words

In [94]:
def contains_punctuation(text):
    contain_punct_words = []
    for word in text:
        if any(char in string.punctuation for char in word):
            contain_punct_words.append(1)
        else:
            contain_punct_words.append(0)
    return contain_punct_words

In [95]:
import nltk
nltk.download('averaged_perceptron_tagger')

def pos_tag_words(text):
    tagged_words = nltk.pos_tag(text)
    return tagged_words

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [96]:
vocabulary_words = []

for entry in training_data:

    text = entry["data"]["text"]  # Extract the text from the JSON object

    processed_text = pre_process_text(text, remove_punctuation_call=True, spell_check_call=False)

    for token, start, end in processed_text:

        vocabulary_words.append(token)



feature1_raw,_ = text_to_numbers (vocabulary_words)
print(feature1_raw)

feature2 = init_cap(vocabulary_words)
print(feature2)

feature3= contains_cap(vocabulary_words)
print(feature3)

feature4 = is_a_number(vocabulary_words)
print(feature4)

feature5 = contains_number(vocabulary_words)
print(feature5)

feature6 = contains_dash(vocabulary_words)
print(feature6)

feature7 = contains_underscore(vocabulary_words)
print(feature7)

tagged_words = pos_tag_words(vocabulary_words)
pos_tags = [tag for word, tag in tagged_words]
feature8,_ = text_to_numbers(pos_tags)
print(feature8)

[1, 2, 3, 4, 5, 6, 7, 8, 4, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 4, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 20, 34, 35, 4, 36, 37, 38, 39, 40, 41, 42, 4, 43, 44, 45, 46, 47, 48, 49, 4, 50, 2, 13, 51, 4, 52, 4, 53, 54, 55, 56, 57, 2, 58, 59, 7, 36, 60, 61, 55, 62, 63, 4, 64, 55, 65, 66, 67, 68, 69, 70, 4, 71, 72, 73, 55, 74, 75, 4, 76, 2, 77, 78, 79, 4, 63, 13, 80, 81, 64, 82, 83, 84, 2, 55, 85, 86, 10, 87, 88, 89, 90, 91, 92, 78, 93, 90, 94, 95, 64, 96, 38, 97, 63, 98, 13, 80, 4, 64, 99, 15, 82, 100, 55, 101, 102, 4, 103, 104, 13, 105, 105, 4, 106, 107, 108, 109, 4, 103, 4, 110, 111, 15, 38, 112, 69, 113, 4, 114, 68, 115, 116, 117, 72, 118, 119, 4, 120, 78, 93, 121, 94, 95, 64, 122, 123, 72, 124, 63, 98, 125, 78, 79, 4, 126, 97, 63, 4, 64, 96, 127, 128, 129, 130, 131, 68, 105, 132, 133, 68, 69, 134, 135, 136, 116, 137, 138, 139, 140, 141, 142, 13, 80, 4, 64, 99, 143, 78, 64, 55, 144, 145, 63, 2, 17, 146, 69, 102, 4, 95, 147, 148, 149, 69, 150, 4, 151, 4, 152, 38, 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1, 1, 1, 2, 3, 3, 3, 1, 2, 4, 4, 5, 6, 7, 1, 1, 1, 7, 1, 1, 8, 4, 2, 4, 3, 3, 3, 1, 4, 1, 1, 4, 4, 1, 1, 8, 4, 1, 2, 5, 5, 6, 4, 4, 1, 1, 2, 1, 1, 9, 4, 5, 4, 1, 2, 10, 1, 7, 1, 2, 3, 3, 3, 3, 3, 1, 1, 1, 1, 3, 1, 5, 5, 6, 4, 4, 1, 2, 4, 4, 1, 5, 6, 2, 3, 3, 3, 3, 1, 3, 3, 4, 1, 2, 1, 1, 1, 3, 3, 3, 3, 7, 4, 1, 4, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 4, 1, 5, 10, 1, 4, 1, 1, 4, 4, 4, 1, 1, 1, 8, 7, 1, 2, 4, 5, 6, 1, 1, 1, 2, 1, 2, 1, 6, 7, 4, 1, 2, 1, 1, 10, 1, 3, 1, 3, 3, 3, 3, 1, 1, 4, 1, 2, 3, 3, 3, 4, 1, 1, 3, 1, 3, 1, 3, 4, 4, 1, 4, 4, 1, 1, 1, 5, 6, 5, 6, 4, 3, 3, 3, 4, 1, 2, 4, 4, 5, 6, 3, 4, 4, 1, 1, 1, 1, 2, 3, 4, 3, 4, 4, 1, 1, 4, 10, 1, 2, 7, 1, 2, 4, 5, 6, 4, 4, 1, 1, 1, 1, 1, 7, 4, 1, 1, 2, 4, 1, 1, 1, 9, 10, 3, 3, 2, 10, 1, 11, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 4, 1, 1, 1, 5, 6, 2, 1, 1, 3, 8, 7, 1, 3, 3, 1, 3, 1, 2, 3, 4, 1, 1, 3, 1, 3, 1, 3, 4, 1, 1, 4, 1, 1, 1, 1, 8, 1, 1, 1, 2, 3, 3, 1, 1, 4, 5, 6, 4, 4, 1, 1, 1, 1, 1, 1, 8, 7, 4, 4, 1, 1, 4, 1, 9, 4, 1, 5, 6, 1, 2, 3, 4, 1, 1,

In [None]:
# Prepare lists to store the tagged texts
tagged_texts = []

# Process the documents and perform BIO tagging for negation cues
for item in object:
    text_data = item['data']['text'].split()  # Split text into words
    tagged_text = ['O'] * len(text_data)  # Initialize with 'O' tags for all words

    for prediction in item['predictions']:
        for result in prediction['result']:
            labels = result['value']['labels']
            #print(labels)
            if "NEG" in labels:
                start_index = result['value']['start']
                end_index = result['value']['end']

                # Find the words corresponding to the start and end indices of the negation cue
                #start_word_index = len(" ".join(text_data[:start_index]).encode().split())  # Calculate the start word index
                #nd_word_index = len(" ".join(text_data[:end_index]).encode().split())  # Calculate the end word index

                # Assign BIO tags to words inside the negation cue scope
                for i in range(start_index, min(end_index, len(tagged_text))):  # Ensure the end index doesn't exceed the length of tagged_text
                    if i == start_index:
                        tagged_text[i] = 'B'  # 'B' tag for beginning of negation cue
                    else:
                        tagged_text[i] = 'I'  # 'I' tag for inside the negation cue

    # Append the tagged text to the list
    tagged_texts.append(tagged_text)

# Print or store the tagged texts
for idx, (text_data, tagged_text) in enumerate(zip(object, tagged_texts)):
    #print("Document", idx + 1)
    for word, tag in zip(text_data['data']['text'].split(), tagged_text):
        print(word, tag)
