In [2]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993227 sha256=4fd26c5576c71c1f1bfa084b37cf97a27b5acd1ec67ce58f65861f61ca2a4a31
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [3]:
!pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1


In [4]:
!pip install nltk



In [5]:
import os

In [6]:
# Libraries
import json
import math
import re
import string

from langdetect import detect  # Library for language detection
from spellchecker import SpellChecker

In [7]:
import spacy

In [8]:
#os.chdir('C:\\GitHub Repositories\\NLP-Detection-of-Negation-and-Uncertainty-Project-24\\Data')

In [11]:
# Load the training corpus from the JSON file.
# The context manager closes the file handle as soon as parsing finishes, and
# the explicit UTF-8 encoding keeps non-ASCII characters intact regardless of
# the platform's default encoding.
with open("negacio_train_v2024.json", encoding="utf-8") as loading:
    training_data = json.load(loading)

In [12]:
!python -m spacy download es_core_news_sm
!python -m spacy download ca_core_news_sm

Collecting es-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: es-core-news-sm
Successfully installed es-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting ca-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ca_core_news_sm-3.7.0/ca_core_news_sm-3.7.0-py3-none-any.whl (19.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19

In [48]:
# 1 Remove patient information and redacted entries
def remove_pacient_info(text):
    """Strip the patient-identifying header spans and asterisks from `text`.

    Two spans are deleted, each matched non-greedily and across newlines:
    from "nº historia clinica:" up to and including the next "motiu d'ingres",
    and from "nhc" up to and including the next "lopd". Matching is
    case-sensitive. Afterwards every '*' character is removed.

    Args:
        text: The raw document string.

    Returns:
        The string with the matched spans and all asterisks removed.
    """
    # Order matters: the first pattern is applied before the second one.
    redacted_spans = (
        r'nº historia clinica:.*?motiu d\'ingres',
        r'nhc.*?lopd',
    )
    for span_pattern in redacted_spans:
        text = re.sub(span_pattern, '', text, flags=re.DOTALL)

    # Asterisks presumably mark redacted entries (per the section title) -- TODO confirm.
    return text.replace('*', '')


# 2 Remove Punctuation (Able to be turned ON/OFF)
def remove_punctuation(text):
    """Return `text` without any character listed in `string.punctuation`.

    Only the ASCII punctuation characters from `string.punctuation` are
    dropped; characters outside that set (for example '¿' or 'º') are kept.

    Args:
        text: The string to clean.

    Returns:
        A new string with the punctuation characters removed.
    """
    kept_chars = [char for char in text if char not in string.punctuation]
    return ''.join(kept_chars)


# 3 Lemmatization with language detection (Able to be turned ON/OFF through
#   the `spell_check_call` flag of pre_process_text). Despite that flag's name,
#   the visible code performs lemmatization only; no spell checking is applied.
# Load the small spaCy pipelines for Spanish and Catalan once, at notebook
# level, so the functions below can reuse them instead of reloading per call.
nlp_es = spacy.load("es_core_news_sm")
nlp_ca = spacy.load("ca_core_news_sm")

def lemmatize_text(text):
    """Lemmatize `text` with the spaCy pipeline matching its detected language.

    The language is detected with `detect`; the Catalan pipeline (`nlp_ca`) is
    used when the detected code is 'ca', otherwise the Spanish pipeline
    (`nlp_es`). Every token that is neither punctuation nor whitespace is
    replaced by its lemma; punctuation and whitespace tokens keep their
    original text. The resulting tokens are joined with single spaces.

    Args:
        text: The string to lemmatize.

    Returns:
        The lemmatized string, or `text` unchanged when it is empty or
        contains only whitespace.
    """
    # Nothing to detect or lemmatize in blank input. Returning early also keeps
    # the language detector away from text it cannot classify.
    # NOTE(review): langdetect is expected to raise on feature-less input -- confirm.
    if not text or not text.strip():
        return text

    # Detect the language of the text
    language = detect(text)

    # Run the pipeline for the detected language (Spanish is the fallback)
    doc = nlp_ca(text) if language == 'ca' else nlp_es(text)

    lemmatized_tokens = []
    for token in doc:
        if token.is_punct or token.is_space:
            # Punctuation and whitespace are passed through untouched
            lemmatized_tokens.append(token.text)
        elif token.lemma_ == '-PRON-':
            # '-PRON-' looks like a placeholder lemma rather than a real word,
            # so the surface form is kept -- TODO confirm it is still emitted.
            lemmatized_tokens.append(token.text)
        else:
            lemmatized_tokens.append(token.lemma_)

    # Join the tokens back into text
    return ' '.join(lemmatized_tokens)


# 4 Tokenization with character coordinates (for evaluation)
def tokenize_with_coordinates(text):
    """Split `text` on whitespace and report each token's character span.

    Tokens are maximal runs of non-whitespace characters. The offsets come
    directly from the regex match, so `text[start:end] == token` holds for
    every returned triple, whatever amount of whitespace separates the tokens.
    The offsets refer to the string passed in; when called from
    pre_process_text that is the already-preprocessed text.

    Args:
        text: The string to tokenize.

    Returns:
        A list of `(token_text, start, end)` tuples with `end` exclusive.
        An empty or whitespace-only string yields an empty list.
    """
    return [
        (match.group(0), match.start(), match.end())
        for match in re.finditer(r'\S+', text)
    ]


# Main function to process the text
def pre_process_text(text, remove_punctuation_call=True, spell_check_call=True):
    """Run the preprocessing pipeline on `text` and tokenize the result.

    Steps, in order:
      1. remove_pacient_info is always applied.
      2. remove_punctuation is applied when `remove_punctuation_call` is truthy.
      3. lemmatize_text is applied when `spell_check_call` is truthy.
      4. tokenize_with_coordinates is applied to whatever the previous steps
         produced.

    Args:
        text: The raw document string.
        remove_punctuation_call: Enables step 2.
        spell_check_call: Enables step 3 (lemmatization, despite the name).

    Returns:
        The value returned by tokenize_with_coordinates for the cleaned text.
    """
    cleaned = remove_pacient_info(text)
    cleaned = remove_punctuation(cleaned) if remove_punctuation_call else cleaned
    cleaned = lemmatize_text(cleaned) if spell_check_call else cleaned
    return tokenize_with_coordinates(cleaned)

## <span style="color:red; font-size:larger;">**FEATURE EXTRACTION FUNCTIONS**</span>


1. WORD OF THE VOCABULARY

In [14]:
def text_to_numbers(text):
    """Encode each item of `text` as an integer id assigned on first sight.

    Ids start at 1 and grow by one for every previously unseen item, so equal
    items always map to the same id.

    Args:
        text: An iterable of hashable items (e.g. a list of tokens).

    Returns:
        A tuple `(numbers, lut)`: `numbers` is the list of ids in input order
        and `lut` is the dict mapping each distinct item to its id.
    """
    lut = {}
    # The next free id is always one more than the number of known items.
    numbers = [lut.setdefault(item, len(lut) + 1) for item in text]
    return numbers, lut

2. INITIATES WITH CAPITALIZATION

In [15]:
def init_cap(text):
    """Flag the words whose first character is an uppercase letter.

    Only the first character is inspected, so all-caps ("TAC") and mixed-case
    ("McBurney") words count as capitalized too. `str.istitle()` would reject
    both because it requires every later cased character to be lowercase.

    Args:
        text: An iterable of word strings.

    Returns:
        A list with 1 for each word starting with an uppercase letter and 0
        otherwise (including empty strings), in input order.
    """
    # word[:1] is '' for an empty word, and ''.isupper() is False.
    return [1 if word[:1].isupper() else 0 for word in text]

CONTAINS CAPITALIZATION

In [16]:
def contains_cap(text):
    """Flag the words that contain at least one uppercase character.

    Each word is scanned character by character. Splitting the word first
    would turn it into a list of strings, and the check would then only fire
    for words written entirely in uppercase.

    Args:
        text: An iterable of word strings.

    Returns:
        A list with 1 for each word containing an uppercase character and 0
        otherwise, in input order.
    """
    return [1 if any(char.isupper() for char in word) else 0 for word in text]

3. IS A NUMBER

In [17]:
def is_a_number(text):
    """Flag the words that parse as a finite number.

    A word counts as a number when `float(word)` succeeds and the result is
    finite. The finiteness check matters because `float` also accepts the
    words "inf", "infinity" and "nan" (in any letter case), which would
    otherwise mark ordinary text tokens as numeric.

    Args:
        text: An iterable of word strings.

    Returns:
        A list with 1 for each numeric word and 0 otherwise, in input order.
    """
    is_num = []
    for word in text:
        try:
            value = float(word)
        except ValueError:
            is_num.append(0)
        else:
            is_num.append(1 if math.isfinite(value) else 0)
    return is_num

4. CONTAINS A NUMBER

In [18]:
def contains_number(text):
    """Flag the words that contain at least one digit character.

    Args:
        text: An iterable of word strings.

    Returns:
        A list with 1 for each word in which some character satisfies
        `str.isdigit()` and 0 otherwise, in input order.
    """
    flags = []
    for token in text:
        has_digit = any(symbol.isdigit() for symbol in token)
        flags.append(int(has_digit))
    return flags

HAS DASH (-)

In [19]:
def contains_dash(text):
    """Flag the words that contain a dash ('-') character.

    The membership test runs on the word string itself. Splitting the word
    first would make it a list-membership test that is only true for the
    stand-alone token "-".

    Args:
        text: An iterable of word strings.

    Returns:
        A list with 1 for each word containing '-' and 0 otherwise, in input
        order.
    """
    return [1 if '-' in word else 0 for word in text]

In [20]:
def contains_underscore(text):
    """Flag the words that contain an underscore ('_') character.

    The membership test runs on the word string itself. Splitting the word
    first would make it a list-membership test that is only true for the
    stand-alone token "_".

    Args:
        text: An iterable of word strings.

    Returns:
        A list with 1 for each word containing '_' and 0 otherwise, in input
        order.
    """
    return [1 if '_' in word else 0 for word in text]

In [21]:
def contains_punctuation(text):
    """Flag the words that contain a character from `string.punctuation`.

    Only the ASCII punctuation characters listed in `string.punctuation` are
    considered.

    Args:
        text: An iterable of word strings.

    Returns:
        A list with 1 for each word containing such a character and 0
        otherwise, in input order.
    """
    flags = []
    for token in text:
        has_punct = any(symbol in string.punctuation for symbol in token)
        flags.append(int(has_punct))
    return flags

In [55]:
def pos_tag_words(text, chunk_size=10000):
    """Part-of-speech tag a list of words with the Spanish spaCy pipeline.

    The words are processed in consecutive chunks of `chunk_size` items. Each
    chunk is joined with single spaces and passed to `nlp_es` as one string,
    and the `(token.text, token.pos_)` pair of every resulting token is
    collected.

    NOTE(review): the Spanish pipeline is used for every chunk, whatever the
    language of the words. The pipeline also tokenizes the joined string
    itself, so the number of returned pairs may differ from `len(text)`;
    verify the alignment before zipping the tags against the input words.

    Args:
        text: A sequence of word strings.
        chunk_size: Number of words sent to the pipeline per call. Defaults to
            10000, the value that was previously hard-coded.

    Returns:
        A list of `(token_text, pos_tag)` tuples in processing order.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    tagged_words = []
    for chunk_start in range(0, len(text), chunk_size):
        chunk = ' '.join(text[chunk_start:chunk_start + chunk_size])
        # Tokenize and tag the chunk
        doc = nlp_es(chunk)
        tagged_words.extend((token.text, token.pos_) for token in doc)
    return tagged_words

In [34]:
def beforepos(pos_tags, words=None, window=6):
    """Pair each word with its POS tag and the POS tags of the preceding words.

    Words and tags are matched positionally with `zip`, so iteration stops at
    the shorter of the two sequences.

    Args:
        pos_tags: Sequence of POS tags.
        words: Sequence of words matching `pos_tags`. When omitted, the
            notebook-level `vocabulary_words` list is used, which must then
            already be defined.
        window: How many preceding tags to collect per word. Defaults to 6.

    Returns:
        A list of `(word, pos_tag, previous_tags)` tuples. `previous_tags` is
        always a list of exactly `window` items, left-padded with empty
        strings near the beginning of the sequence.
    """
    if words is None:
        # Legacy behaviour: fall back to the notebook-level token list.
        words = vocabulary_words

    features = []
    for i, (word, pos_tag) in enumerate(zip(words, pos_tags)):
        # Tags of the previous `window` words (fewer near the beginning)
        prev_pos_tags = list(pos_tags[max(0, i - window):i])
        # Left-pad with empty strings so every entry has the same length
        prev_pos_tags = [''] * (window - len(prev_pos_tags)) + prev_pos_tags
        features.append((word, pos_tag, prev_pos_tags))

    return features

In [70]:
def afterpos(pos_tags, words=None):
    """Pair each word with the POS tag of the word that follows it.

    Words and tags are matched positionally with `zip`, so iteration stops at
    the shorter of the two sequences. When there is no following tag (the last
    position of `pos_tags`), an empty string is used instead of indexing past
    the end of the list.

    Args:
        pos_tags: Sequence of POS tags.
        words: Sequence of words matching `pos_tags`. When omitted, the
            notebook-level `vocabulary_words` list is used, which must then
            already be defined.

    Returns:
        A list of `(word, next_pos_tag)` tuples in input order.
    """
    if words is None:
        # Legacy behaviour: fall back to the notebook-level token list.
        words = vocabulary_words

    features = []
    for i, (word, _current_tag) in enumerate(zip(words, pos_tags)):
        # '' marks "no following word", mirroring the padding used by beforepos
        next_pos_tag = pos_tags[i + 1] if i + 1 < len(pos_tags) else ''
        features.append((word, next_pos_tag))

    return features

In [131]:
def beforegram(text):
    """Pair each word with the bigrams formed by the words preceding it.

    For every position, up to 7 preceding items are taken and each adjacent
    pair inside that window becomes one bigram (a two-item slice of `text`).
    Positions with fewer than two preceding items get an empty bigram list.

    Args:
        text: A sequence of words that supports slicing.

    Returns:
        A list of `(word, bigrams)` tuples in input order.
    """
    def _adjacent_pairs(window):
        # Every two-item slice starting at each index except the last one
        return [window[k:k + 2] for k in range(len(window) - 1)]

    return [
        (word, _adjacent_pairs(text[max(0, idx - 7):idx]))
        for idx, word in enumerate(text)
    ]

In [147]:
def aftergram(text):
    """Pair each word with the slice made of itself and the following word.

    For every position the feature list holds a single entry: the slice of
    `text` starting at that position with at most two items. The last
    position therefore yields a one-item slice.

    Args:
        text: A sequence of words that supports slicing.

    Returns:
        A list of `(word, [slice])` tuples in input order.
    """
    return [(word, [text[pos:pos + 2]]) for pos, word in enumerate(text)]

In [128]:
# Build the flat list of tokens for the whole training set. beforepos and
# afterpos read this notebook-level list, so it must exist before they run.
vocabulary_words = []

for entry in training_data:

    text = entry["data"]["text"]  # Extract the text from the JSON object

    # Punctuation is removed; lemmatization is skipped (spell_check_call=False).
    processed_text = pre_process_text(text, remove_punctuation_call=True, spell_check_call=False)

    # Only the token text is kept; the start/end coordinates are discarded here.
    for token, start, end in processed_text:

        vocabulary_words.append(token)



# Feature 1: integer id per token (the lookup table is discarded).
feature1_raw,_ = text_to_numbers (vocabulary_words)
#print(feature1_raw)

# Feature 2: output of init_cap for every token.
feature2 = init_cap(vocabulary_words)
#print(feature2)

# Feature 3: output of contains_cap for every token.
feature3= contains_cap(vocabulary_words)
#print(feature3)

# Feature 4: output of is_a_number for every token.
feature4 = is_a_number(vocabulary_words)
#print(feature4)

# Feature 5: output of contains_number for every token.
feature5 = contains_number(vocabulary_words)
#print(feature5)

# Feature 6: output of contains_dash for every token.
feature6 = contains_dash(vocabulary_words)
#print(feature6)

# Feature 7: output of contains_underscore for every token.
feature7 = contains_underscore(vocabulary_words)
#print(feature7)

# Feature 8: POS tags from pos_tag_words, encoded as integer ids.
# NOTE(review): pos_tags comes from the tagger's own tokens; confirm it lines
# up one-to-one with vocabulary_words before relying on positional pairing.
tagged_words = pos_tag_words(vocabulary_words)
pos_tags = [tag for word, tag in tagged_words]
feature8,_ = text_to_numbers(pos_tags)

# Feature 9: POS tags of the preceding words (first 10 entries shown).
before_pos = beforepos(pos_tags)
print((before_pos[:10]))

# Feature 10: POS tag of the following word (first 30 entries shown).
after_pos = afterpos(pos_tags)
print((after_pos[:30]))

# Bigrams built from the words preceding each token (first 20 entries shown).
before_gram = beforegram (vocabulary_words)
print(before_gram[:20])

# Slice of each token together with the following token (first 20 entries shown).
after_gram = aftergram (vocabulary_words)
print(after_gram[:20])

[1, 2, 3, 4, 1, 5, 4, 6, 4, 1, 5, 1, 3, 4, 1, 7, 3, 8, 1, 5, 1, 5, 1, 5, 4, 5, 1, 1, 5, 1, 5, 1, 5, 5, 3, 1, 5, 1, 4, 1, 5, 4, 1, 5, 1, 5, 4, 1, 6, 6, 5, 1, 5, 3, 4, 9, 10, 4, 6, 4, 1, 4, 1, 5, 2, 3, 1, 2, 11, 5, 4, 1, 5, 7, 2, 3, 1, 4, 1, 2, 3, 1, 5, 4, 12, 1, 4, 1, 4, 1, 2, 3, 1, 4, 1, 2, 3, 12, 1, 4, 1, 4, 1, 6, 5, 1, 4, 2, 2, 2, 3, 1, 5, 1, 5, 1, 5, 1, 9, 12, 1, 5, 3, 12, 1, 5, 4, 9, 1, 5, 4, 1, 4, 1, 5, 7, 3, 10, 2, 3, 1, 4, 1, 5, 4, 6, 6, 4, 5, 1, 9, 9, 4, 1, 4, 1, 5, 7, 4, 1, 12, 5, 4, 2, 4, 1, 5, 5, 4, 3, 1, 4, 6, 12, 6, 5, 3, 12, 5, 1, 5, 4, 5, 1, 5, 3, 12, 1, 4, 12, 9, 1, 4, 1, 5, 1, 8, 5, 1, 5, 4, 6, 6, 10, 4, 12, 1, 4, 1, 5, 5, 5, 6, 9, 5, 6, 4, 1, 4, 1, 5, 3, 12, 5, 2, 3, 5, 1, 2, 8, 3, 12, 1, 4, 12, 1, 5, 3, 12, 9, 4, 1, 4, 9, 4, 12, 1, 5, 4, 1, 7, 4, 6, 5, 2, 3, 1, 5, 4, 1, 4, 12, 1, 12, 1, 11, 3, 4, 12, 1, 4, 1, 11, 5, 4, 5, 5, 5, 8, 2, 3, 4, 1, 1, 5, 5, 8, 5, 7, 4, 5, 1, 4, 1, 5, 5, 4, 12, 1, 4, 12, 1, 5, 5, 2, 3, 12, 1, 3, 5, 1, 1, 5, 4, 1, 5, 3, 12, 5, 6, 2, 3, 5, 5,