In [1]:
# Libraries
import os
import json
import re
!pip install sklearn_crfsuite
from langdetect import detect # Library for language detection
from spellchecker import SpellChecker
import string
import spacy
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn.preprocessing import LabelEncoder
import numpy as np



In [42]:
#os.chdir('C:\\GitHub Repositories\\NLP-Detection-of-Negation-and-Uncertainty-Project-24\\Data')

In [2]:
# Loading the json file
loading = open("negacio_train_v2024.json")
training_data = json.load(loading)
print(len(training_data))

254


In [3]:
!python -m spacy download es_core_news_sm
!python -m spacy download ca_core_news_sm

Collecting es-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl (12.9 MB)
     ---------------------------------------- 0.0/12.9 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.9 MB ? eta -:--:--
      --------------------------------------- 0.2/12.9 MB 2.1 MB/s eta 0:00:07
     -- ------------------------------------- 0.7/12.9 MB 5.2 MB/s eta 0:00:03
     --- ------------------------------------ 1.2/12.9 MB 7.6 MB/s eta 0:00:02
     ----- ---------------------------------- 1.8/12.9 MB 9.0 MB/s eta 0:00:02
     ------- -------------------------------- 2.5/12.9 MB 10.8 MB/s eta 0:00:01
     ---------- ----------------------------- 3.3/12.9 MB 11.7 MB/s eta 0:00:01
     ------------ --------------------------- 4.1/12.9 MB 12.5 MB/s eta 0:00:01
     --------------- ------------------------ 4.9/12.9 MB 13.0 MB/s eta 0:00:01
     ----------------- ------------------

In [4]:
# 1 Remove pacient information and redacted entries
def remove_pacient_info(text):
    # Remove lines starting with "nº historia clinica:" and ending with "motiu d'ingres"
    text = re.sub(r'nº historia clinica:.*?motiu d\'ingres', '', text, flags=re.DOTALL)
    # Remove lines starting with "nhc" and ending with "lopd"
    text = re.sub(r'nhc.*?lopd', '', text, flags=re.DOTALL)
    # Remove all asterisks '*'
    text = text.replace('*', '')
    return text


# 2 Remove Punctuation (Able to be turned ON/OFF)
def remove_punctuation(text):
    # Define a translation table to map punctuation to None
    translator = str.maketrans('', '', string.punctuation)
    # Remove punctuation using the translation table
    text = text.translate(translator)
    return text


# 3 Spell cheking with language detection (Able to be turned ON/OFF)
# Load language models for Spanish and Catalan
nlp_es = spacy.load("es_core_news_sm")
nlp_ca = spacy.load("ca_core_news_sm")

def lemmatize_text(text):
    # Detect the language of the text
    language = detect(text)

    # Tokenize the text using the appropriate language model
    if language == 'ca':
        doc = nlp_ca(text)
    else:
        doc = nlp_es(text)

    # Lemmatize tokens
    lemmatized_tokens = []
    for token in doc:
        # Check if the token is a punctuation or whitespace
        if not token.is_punct and not token.is_space:
            # Lemmatize the token
            lemmatized_token = token.lemma_ if token.lemma_ != '-PRON-' else token.text
            lemmatized_tokens.append(lemmatized_token)
        else:
            lemmatized_tokens.append(token.text)

    # Join the tokens back into text
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text


# 4 Tokenization with coordinates of the original text for (evaluation)
def tokenize_with_coordinates(text):
    # Tokenize the text while preserving the coordinates
    tokens_with_coordinates = []
    token_start = 0
    for token in re.finditer(r'\S+', text):
        token_text = token.group(0)
        token_end = token_start + len(token_text)
        tokens_with_coordinates.append((token_text, token_start, token_end))
        # Update token start position for the next token
        token_start = token_end
    return tokens_with_coordinates


# Main function to process the text
def pre_process_text(text, remove_punctuation_call=True, spell_check_call=True):

    # 1 Remove pacient information and redacted entries
    preprocessed_text = remove_pacient_info(text)

    # 2 Remove punctuation if specified
    if remove_punctuation_call:
        preprocessed_text = remove_punctuation(preprocessed_text)

    # 3 Spell check and lemmatize if specified
    if spell_check_call:
        preprocessed_text = lemmatize_text(preprocessed_text)

    # 4 Tokenize the text with coordinates
    tokens_with_coordinates = tokenize_with_coordinates(preprocessed_text)

    return tokens_with_coordinates

## <span style="color:red; font-size:larger;">**DATA ANNOTATION**</span>


In [5]:
#EXTRACTED FROM THE TRAINING SET
neg_pre = ['impide', 'exfumador', 'negativo', 'desaparecen', 'asintomatica', 'afebril', 'se desestimo', 'negativos', 'ninguno', 'asintomatico', 'desorientado', 'inestabilidad', 'atipicos', 'ausencia de', 'ceden', 'negativa', 'negativas', 'excepto', 'desorientacion', 'inespecificos', 'se suspende','inespecifico', 'ex', 'arritmicos', 'cede', 'se retira', 'ex fumador', 'niegan', 'negatiu', 'negaitvo', 'indetectable',' negativo', 'suspendido']
neg_pos = ['retirar','ni' 'desaparicion de', 'descarta', 'ausencia', 'descartada', 'niega', 'nega', 'rechaza', 'desaparicion del', 'imposibilidad', 'retiro', 'irregulares', 'negatividad', 'tampoco', 'sin', 'imposibilidad de', 'en ninguna', 'incapacidad para', ' no', 'neg', ' afebril', 'sense', 'falta de', 'negatividad de', 'negatividad del', 'no']
#EXTRACTED FROM THE GITHUB
neg_pre_filtered= ['gobierna al paciente', 'ninguna otra evidencia', 'la van descartar per', 'no aparece', 'descartaron al paciente por', 'adecuado para descartarla', 'excluir', 'excloure', 'descartar al pacient per', 'sin ninguna evidencia de', 'descartarlo por', 'no tenía', 'lo descartó', 'sin signo de', 'sense indicació de', 'pot descartar', 'excluye', 'ho descarta', 'ninguna evidencia radiográfica de', 'descartarlo', 'negando', 'sense troballes de', 'adequat per a descartar-lo', 'cap senyal de', 'no em queixo de', 'ninguna señal de', 'nunca tuve', 'ho va descartar', 'descartar', 'lo descartaron por', 'la va descartar contra', 'pot descartar-ho', 'lo descartaron en contra', 'libre de', 'pot descartar al pacient', 'no tinc', 'absència de', 'sin quejas de', 'van descartar contra', 'la va descartar', 'no tenia', 'descartado contra', 'ho va descartar contra', 'no me quejo de', 'puede descartar', 'no significativo', 'resuelto', 'puede descartar al paciente', 'no sospitós', 'gens especial per a', 'pot descartar-ho per', 'exclou', 'r / o', 'resolt', 'van descartar per a', 'descartó', 'va descartar', 'puede descartarla contra', 'sin evidencia', 'descartar per a', 'no apareix', 'pot descartar-la per', 'la descartó', 'va descartar al pacient', 'descartar-ho', 'cap altra evidència', 'no saber de', 'ninguna nueva evidencia', 'sin indicación de', 'mai desenvolupat', 'sense queixes de', "l'exclou", 'lo descarta', 'cap suggeriment de', 'cap evidència radiogràfica de', 'sense cap evidència de', 'puede descartarlo', 'pacient no era', 'no apreciar', 'con ningún', 'cap causa de', 'adecuado para descartarlo', 'no associat amb', 'descartarla', 'nunca desarrollado', 'adecuado para descartarla por', 'no poden veure', 'ro', 'ninguna evidencia para sugerir', 'sin hallazgos de', 'no pueden ver', 'pot descartar-la', 'la excluye', 'governa al pacient', 'amb cap', 'descartar al paciente por', 'expulsó al paciente por', 'no anormal', 'no sospechoso', 'pot descartar-la contra', 'nada nuevo', 'paciente no era', 'descartaron para', 'no poder', 'evaluar por', 'suficiente para descartarlo por', 'no exhibir', 'més aviat que', 'fer una prova per', 'puede descartarla por', 'ninguna causa de', 'lliure de', 'mai vaig tenir', 'no tengo', 'descartaron contra', 'puede descartarla', 'adecuado para descartarlo por', 'descartat contra', 'descartar-la', 'adequat per a descartar-la', 'no significatiu', 'negatiu per a', 'sense evidència', 'ho van descartar per', 'descartar al pacient', 'cap nova evidència', 'descartarla por', 'nada especial para', 'ninguna sugerencia de', 'sense signe de', 'no sentir', 'descartar al paciente', 'descartar-ho per', 'puede descartarlo en contra', 'puede descartarlo por', 'más bien que', 'res nou', 'descartó al paciente', 'negant', 'avaluar per', 'no demostrar', 'descartar para', 'no revela', 'no revelar', 'descartó al paciente contra', 'puede descartar contra', 'descartar-la per', 'suficiente para descartar', 'negativo para', 'la descartó contra', 'revisado para', 'suficient per a descartar', 'revisat per a', 'hacer una prueba por', 'pot descartar-ho en contra', 'pot descartar contra', 'la descartaron por', 'descartaron al paciente contra', 'no asociado con', 'suficiente para descartarla por', 'lo descartó contra']
neg_pos_filtered= ['libre', 'podría ser descartado', 'fue descartado', 'rechazado', 'ha de ser descartat', 'lliure', 'rebutjat', 'puede ser descartado', 'ser descartat', 'adecuado para descartar', 'están descartadas', 'podria ser descartat', 'improbable', 'podría ser descartado por', 'debe ser descartado por', 'suficient per a descartar-ho', 'serà descartat per', 'podria descartar-se', 'suficient per a descartar-la', 'va ser descartat', 'han estat descartades', 'ser descartado por', 'declina', 'adequat per a descartar', 'siendo descartado', 'suficiente para descartarla', 'no ver', 'sent descartat', 'està descartat', 'se puede descartar por', 'pot ser descartat', 'ha estat descartat', 'puede ser descartado para', 'ha de descartar-se', 'suficiente para descartarlo', 'es descarta', 'ha sido descartado', 'será descartado', 'ser descartado', 'no veure', 'ser descartat per', 'debe ser descartado', 'serà descartat', 'debe descartarse para', 'debe ser descartado para', 'es pot descartar per', 'negado', 'se descarta', 'lo descartaron', 'no ser', 'se puede descartar', 'está descartado', 'negat', 'podría descartarse', 'es pot descartar', 'ho van descartar', 'podria ser descartat per', 'han sido descartadas', 'será descartado por', 'estan descartades', 'debe descartarse']
neg_pre.extend(neg_pre_filtered)
neg_pos.extend(neg_pos_filtered)

#NOSE COM EMFOCAR: SIN, NO, DESCARTAR
unc_pre = [ 'al parecer' , 'vs', 'dudosamente', 'indeterminado', 'sospecha', 'pudieran', 'aparentes', 'dubtos','permite descartar', 'parece', 'atribuida', 'clara', 'no clara', 'desconocido']
unc_pos = ['compatible amb', 'desconoce', 'indiquen', 'sin aparente', 'sugieren', 'ssospechosas de', 'probablemente', 'posible', 'sugestivo de', 'falsa', 'sospechan de', 'posibilidad de', 'sugiriendo', 'orienta', 'sospechosos de', 'sugestivos de', 'se orienta', 'plantea', 'podria', 'puede', 'podrian', 'probables', 'no', 'sugiere', 'parecen', 'sin', 'sospechosa de', 'sugestivas de', 'orientan como', 'dudosa', 'interpreta', 'compatible con', 'valorar', 'dudosos', 'probable', 'poco porque', 'sugieran', 'sin clara', 'no permite descartar', 'se desconoce', 'impresiona de', 'sugestiva de', 'orienta como', 'orientan', 'sin poder descartar', 'no parece', 'sospitosa de', 'sugestivos con', 'impresiona', 'aparentemente', 'sospecha de', 'no es posible descartar', 'compatibles con', 'compatible', 'aparente', 'sugestiva como', 'posiblemente', 'posibles', 'sugiera de', 'descartar', 'dudoso', 'se orientan', 'sospechosas de', 'sin aparentes', 'sin claras']
#DEMA HO AFEGEIXO
unc_pre_filtered = ['como una causa secundaria para', 'como el origen secundario de', 'secundario a', 'como la causa de', 'como la causa secundaria de', 'como una razón de', 'como una causa secundaria de', 'como una etiología secundaria para', 'excepto', 'aunque', 'como la fuente secundaria para', 'como una etilogía para', 'como la fuente de', 'como el origen secundario para', 'com la font secundària de', 'como una razón secundaria para', 'com una raó secundària per a', 'com la font secundària per a', 'com una etiologia de', 'como una etiología secundaria de', 'secundari a', 'encara que', 'como una razón secundaria de', 'como la razón secundaria de', 'como la fuente secundaria de', 'com la causa secundària de', 'a pesar que', 'como una razón para', 'como la etilogía de', "com l'etiologia de", 'como la razón secundaria para', 'como un origen secundario para', 'com una font secundària per a', 'como una etilogía de', 'como fuente de', "com l'etiologia secundària per a", "com l'origen secundari per a", "com l'origen secundari de", 'però', 'com a causa de', 'com una etiologia secundària per a', 'como la razón de', 'com un origen secundari per a', 'com una raó secundària de', 'no obstant això', 'com una font secundària de', 'com una raó per a', 'pero', 'com la raó secundària de', 'como la etilogía secundaria para', 'a pesar de que', 'sin embargo', "com l'etiologia secundària de", 'com la causa de', 'com una causa secundària per a', 'com la raó de', 'a part de', 'como una fuente secundaria de', 'com una causa secundària de', 'aparte de', 'com a font de', 'como una fuente secundaria para', "com l'origen de", 'com una etiologia per a', 'com la raó secundària per a', 'como el origen de', 'com un origen secundari de', 'com una raó de', 'como un origen secundario de', 'excepte', 'como causa de', 'encara', 'todavía', 'com una etiologia secundària de', 'como la etilogía secundaria de']
unc_post_filtered =['origens de', 'raons de', 'altres possibilitats de', 'font per a', 'desencadenar evento para', 'origen para', 'raons per a', 'fuente para', 'causes de', 'fuentes de', 'causa de', 'etilogia de', 'etilogía para', 'origen per a', 'motivo de', 'fuentes para', 'otras posibilidades de', 'razones de', 'causas de', 'font de', 'razones para', 'etilogía de', 'fonts per a', 'fonts de', 'raó per a', 'razón de', 'orígenes para', 'razón para', 'motiu de', 'desencadenar esdeveniment per a', 'fuente de', 'orígenes de', 'origen de', 'raó de', 'etilogia per a', 'origens per a']
unc_pre.extend(unc_pre_filtered)
unc_pos.extend(unc_post_filtered)   

In [6]:
def annotate_text(text):
    # Define negation and uncertainty cues
    # Tokenize the text
    tokens = text.split()

    # Initialize labels list
    labels = []

    # Iterate through tokens and annotate
    for token in tokens:
        # Check if token is a negation cue
        if token.lower() in neg_pre:
            labels.append("NEG-PRE")
        elif token.lower() in neg_pos:
            labels.append("NEG-POS")
        
        # Check if token is an uncertainty cue
        elif token.lower() in unc_pre:
            labels.append("UNC-PRE")
        elif token.lower() in unc_pos:
            labels.append("UNC-POS")

        else:
            labels.append("O")  # Not negation or uncertainty

    return labels

## <span style="color:red; font-size:larger;">**FEATURE EXTRACTION FUNCTIONS**</span>


1. WORD OF THE VOCABULARY

In [7]:
def text_to_numbers(text):
    lut = {}
    numbers = []
    current_id = 1

    for word in text:
        if word not in lut:
            lut[word] = current_id
            current_id += 1
        numbers.append(lut[word])

    return numbers, lut

2. INITIATES WITH CAPITALIZATION

In [8]:
def init_cap(text):
    capitalized_words = []
    for word in text:
        if word.istitle():
            capitalized_words.append(1)
        else:
            capitalized_words.append(0)
    return capitalized_words

CONTAINS CAPITALIZATION

In [9]:
def contains_cap(text):
    contain_cap_words = []
    for word in text:
        word = word.split()
        if any(char.isupper() for char in word):
            contain_cap_words.append(1)
        else:
            contain_cap_words.append(0)
    return contain_cap_words

3. IS A NUMBER

In [10]:
def is_a_number(text):
    is_num = []
    for word in text:
        try:
            float(word)
            is_num.append(1)
        except ValueError:
            is_num.append(0)
    return is_num

4. CONTAINS A NUMBER

In [11]:
def contains_number(text):
    contains_num = []
    for word in text:
        if any(char.isdigit() for char in word):
            contains_num.append(1)
        else:
            contains_num.append(0)
    return contains_num

HAS DASH (-)

In [12]:
def contains_dash(text):
    contain_dash_words = []
    for word in text:
        word= word.split()
        if '-' in word:
            contain_dash_words.append(1)
        else:
            contain_dash_words.append(0)
    return contain_dash_words

In [13]:
def contains_underscore(text):
    contain_dash_words = []
    for word in text:
        word = word.split()
        if '_' in word:
            contain_dash_words.append(1)
        else:
            contain_dash_words.append(0)
    return contain_dash_words

In [14]:
def contains_punctuation(text):
    contain_punct_words = []
    for word in text:
        if any(char in string.punctuation for char in word):
            contain_punct_words.append(1)
        else:
            contain_punct_words.append(0)
    return contain_punct_words

In [15]:
def pos_tag_words(text):

    chunk_size = 10000

    tagged_words = []

    for i in range(0, len(text), chunk_size):
        chunk = ' '.join(text[i:i+chunk_size])
        # Tokenize the chunk
        doc = nlp_es(chunk)
        # Extract POS tags
        tagged_words.extend([(token.text, token.pos_) for token in doc])
    return tagged_words

In [16]:
def beforepos(pos_tags, vocabulary_words):
    features = []
    # Iterate over each word and its corresponding POS tag
    for i, (word, pos_tag) in enumerate(zip(vocabulary_words, pos_tags)):
        # Extract POS tags of the previous 6 words (or less if near the beginning)
        prev_pos_tags = pos_tags[max(0, i - 6):i]
        # Pad with empty strings if less than 6 previous words
        prev_pos_tags = [''] * (6 - len(prev_pos_tags)) + prev_pos_tags
        # Add the feature for the current word
        features.append((word, pos_tag, prev_pos_tags))

    return features

In [17]:
def afterpos(pos_tags, vocabulary_words):
    features = []
    # Iterate over each word and its corresponding POS tag
    for i, (word, pos_tag) in enumerate(zip(vocabulary_words, pos_tags)):
        # If it's not the last word
        if i < len(pos_tags) - 1:
            # Extract POS tags of the following word
            next_pos_tag = pos_tags[i + 1]
            # Add the feature for the current word
            features.append((vocabulary_words[i], next_pos_tag))
        else:
            # If it's the last word, add a placeholder feature
            features.append((vocabulary_words[i], 'END'))  # Placeholder for end of text
    return features


In [59]:
def beforegram (text):

    gram = []

    for i, word in enumerate(text):

        prev_pos_tags = text[max(0, i - 7):i]

        features = []

        for j in range(len(prev_pos_tags) - 1):

            features.append(prev_pos_tags[j:j+2])

        gram.append((word, features))

    return gram

In [18]:
def aftergram (text):

    gram = []
    for i, word in enumerate(text):
        features = []
        features.append(text[i:i+2])

        gram.append((word, features))

    return gram

In [20]:
possible_suf = ['al', 'ar', 'ble', 'ción', 'dad', 'do', 'er', 'ez', 'ico', 'il', 'io', 'ir', 'ita', 'l', 'o', 'or', 'oso', 'oso', 're', 'to', 'za', 'able', 'ació', 'ada', 'al', 'ar', 'at', 'cia', 'dad', 'dor', 'eza', 'ico', 'idad', 'idad', 'iento', 'il', 'ismo', 'ista', 'izar', 'mento', 'or', 'oso', 'ote', 'ura', 'ació', 'ada', 'al', 'ar', 'at', 'ència', 'enca', 'er', 'eria', 'esc', 'i', 'isme', 'ista', 'ment', 'or', 'òria', 're', 'ssa', 'tat', 'uda', 'ura', 'ver', 'vi', 'xo']

def SUFn(text, suffix_list):
    found_suffixes = []
    for word in text:
        if any(word.endswith(s) for s in suffix_list):
            found_suffixes.append(1) # Break to avoid appending the same word multiple times if it has multiple suffixes
        else:
            found_suffixes.append(0)

    return found_suffixes

In [21]:
possible_prefixes = ['an', 'bi', 'co', 'di', 'en', 'ex', 'in', 'ir', 'mal', 'no', 'ob', 're', 'un', 'a', 'bi', 'ex', 'im', 'in', 'no', 're', 'sobre', 'sub', 'super', 'tele', 'trans', 'ultra', 'un', 'an', 'bi', 'di', 'dis', 'en', 'ex', 'in', 'intra', 'ir', 'mal', 'mi', 'ne', 'no', 'ob', 'pa', 'pe', 're', 'su', 'ta', 'te', 'to', 'an', 'en', 'in', 'ir', 'se', 'ta', 'te', 'to']

def PREFn(text, pref_list):
    found_pref = []
    for word in text:
        if any(word.endswith(p) for p in pref_list):
            found_pref.append(1) # Break to avoid appending the same word multiple times if it has multiple suffixes
        else:
            found_pref.append(0)

    return found_pref

In [22]:
special_words = ["nada", "ni", "nunca", "ningun", "ninguno", "ninguna", "alguna", "apenas","res", "mai", "cap", "ningu", "alguna", "només", "parece","sembla","veces", "vegades"]
def special(text, special_words):
    def_special = []
    for word in text:
        if any(word == words for words in special_words):
            def_special.append(1) # Break to avoid appending the same word multiple times if it has multiple suffixes
        else:
            def_special.append(0)

    return def_special

In [23]:
vocabulary_words = []

for entry in training_data:

    text = entry["data"]["text"]  # Extract the text from the JSON object

    labels = annotate_text(text)


    processed_text = pre_process_text(text, remove_punctuation_call=True, spell_check_call=False)

    for token, start, end in processed_text:

        vocabulary_words.append(token)

In [62]:
def extract_features(entry):
    if isinstance(entry, dict):
        text = entry["data"]["text"]  # Extract the text from the JSON object
    else:
        text = entry  # If entry is already the text string
    
    # Obtain labels using the annotate_text function
    labels = annotate_text(text)

    processed_text = pre_process_text(text, remove_punctuation_call=True, spell_check_call=False)

    vocabulary_words = []
    feature1_raw = []
    feature2 = []
    feature3 = []
    feature4 = []
    feature5 = []
    feature6 = []
    feature7 = []
    feature8 = []
    before_pos = []
    after_pos = []
    before_gram = []
    after_gram = []

    for token, start, end in processed_text:
        vocabulary_words.append(token)

        # Extract features
        feature1_raw.append(token)
        feature2.append(int(token.istitle()))
        feature3.append(int(any(char.isupper() for char in token)))
        try:
            float(token)
            feature4.append(1)
        except ValueError:
            feature4.append(0)
        feature5.append(int(any(char.isdigit() for char in token)))
        feature6.append(int('-' in token))
        feature7.append(int('_' in token))

    # POS tagging
    tagged_words = pos_tag_words(vocabulary_words)
    pos_tags = [tag for word, tag in tagged_words]
    feature8,_ = text_to_numbers(pos_tags)

    # Features related to POS tagging
    before_pos = beforepos(pos_tags, vocabulary_words)
    after_pos = afterpos(pos_tags, vocabulary_words)

    # Features related to n-gram contexts
    before_gram = beforegram(vocabulary_words)
    after_gram = aftergram(vocabulary_words)

    #Related to sufixes and prefixes.
    pref = PREFn(vocabulary_words, possible_prefixes)
    suf = SUFn(vocabulary_words, possible_suf)
    
    #Relevant words for what we need to find.
    spec = special(vocabulary_words,special_words)

    # Combine all features into one list
    all_features = [feature1_raw, feature2, feature3, feature4, feature5, feature6, feature7, feature8,
                    before_pos, after_pos, before_gram, after_gram, suf, pref, spec]
    

    # Transpose the feature matrix so each sublist corresponds to features for one token
    features = list(map(list, zip(*all_features)))
    
    if labels is not None:
        return features, labels
    else:
        return features, None

# Example usage:
X_train = []
y_train = []

for entry in training_data:
    features, labels = extract_features(entry)
    if features is not None:
        X_train.append(features)
        y_train.append(labels)

In [63]:
print(X_train[0])
print(y_train[0])

print(len(X_train))
print(len(y_train))

[['paciente', 0, 0, 0, 0, 0, 0, 1, ('paciente', 'NOUN', ['', '', '', '', '', '']), ('paciente', 'PRON'), ('paciente', []), ('paciente', [['paciente', 'que']])], ['que', 0, 0, 0, 0, 0, 0, 2, ('que', 'PRON', ['', '', '', '', '', 'NOUN']), ('que', 'VERB'), ('que', []), ('que', [['que', 'ingresa']])], ['ingresa', 0, 0, 0, 0, 0, 0, 3, ('ingresa', 'VERB', ['', '', '', '', 'NOUN', 'PRON']), ('ingresa', 'ADP'), ('ingresa', [['paciente', 'que']]), ('ingresa', [['ingresa', 'de']])], ['de', 0, 0, 0, 0, 0, 0, 4, ('de', 'ADP', ['', '', '', 'NOUN', 'PRON', 'VERB']), ('de', 'NOUN'), ('de', [['paciente', 'que'], ['que', 'ingresa']]), ('de', [['de', 'forma']])], ['forma', 0, 0, 0, 0, 0, 0, 1, ('forma', 'NOUN', ['', '', 'NOUN', 'PRON', 'VERB', 'ADP']), ('forma', 'ADJ'), ('forma', [['paciente', 'que'], ['que', 'ingresa'], ['ingresa', 'de']]), ('forma', [['forma', 'programada']])], ['programada', 0, 0, 0, 0, 0, 0, 5, ('programada', 'ADJ', ['', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN']), ('programada', 'ADP')

## <span style="color:red; font-size:larger;">**MODEL**</span>


In [64]:
# Define CRF model
crf_model = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)


# Function to train CRF model
def train_crf(X_train, y_train):
    # Encode features
    encoder = LabelEncoder()
    X_encoded = []
    for sentence in X_train:
        sentence_encoded = []
        for feature in sentence:
            feature_str = str(feature)  # Convert feature to string
            feature_encoded = encoder.fit_transform([feature_str])  # Encode as string
            sentence_encoded.append(feature_encoded[0])  # Append the first element (as it's a list)
        X_encoded.append([{'feature': encoded_feature} for encoded_feature in sentence_encoded])

    # Convert y_train to list of lists (if it's not already)
    if isinstance(y_train[0], str):
        y_train = [[label] for label in y_train]

    crf_model.fit(X_encoded, y_train)


# Example usage:
# Assuming X_train and y_train are obtained as in your example
train_crf(X_train, y_train)


ValueError: The numbers of items and labels differ: |x| = 493, |y| = 554

## <span style="color:red; font-size:larger;">**MODEL EVALUATION**</span>

In [None]:
# To evaluate on test data:
# X_test and y_test should be prepared in the same way as X_train and y_train
# evaluate_crf(X_test, y_test)

# To predict labels on new data:
# Assuming X_new is the feature matrix for new data
# predicted_labels = predict_crf(X_new)


# Function to evaluate CRF model
def evaluate_crf(X_test, y_test):
    y_pred = crf_model.predict(X_test)
    print(flat_classification_report(y_test, y_pred))