# **Deep Learning Method**
### Group 1 - Detection of Negation and Uncertainty

- Marino Oliveros Blanco NIU:1668563
- Pere Mayol Carbonell NIU:1669503
- Andreu Gascón Marzo NIU:1670919
- Judith Zaragoza NIU:1634071

## **Library and Data Loading**

In [88]:
# Libraries
import json
import re
from langdetect import detect # Library for language detection
from spellchecker import SpellChecker
import spacy
import random
import numpy as np
import pandas as pd
import string

In [89]:
# Loading the training json file
loading_train = open("negacio_train_v2024.json",)
training_data = json.load(loading_train)
print("The training set contains: ", len(training_data), " samples")

The training set contains:  254  samples


In [90]:
# Loading the json object
loading_test = open("negacio_test_v2024.json")
test_data = json.load(loading_test)
print("The test set contains: ", len(test_data), " samples")

The test set contains:  64  samples


## **Data Annotation**
- Set using ground truth word to either O, NEG, UNC, NSCO, USCO

In [91]:
def tag_words_from_json(json_data):
    # Initialize lists for all tagged texts and counts
    all_tagged_texts = []
    all_counts = []

    # Iterate through each entry in the JSON data
    for entry in json_data:
        # Extract text and predictions from JSON entry
        text = entry['data']['text']
        predictions = entry['predictions'][0]['result']

        # Initialize counters
        counts = {'NEG': 0, 'NSCO': 0, 'UNC': 0, 'USCO': 0}

        # Initialize tokens and their coordinates
        tokens = []
        start_pos = 0

        # Split text into tokens while tracking their start and end positions
        for word in text.split():
            start = text.find(word, start_pos)
            end = start + len(word)
            tokens.append((word, start, end))
            start_pos = end

        # Initialize tags list
        tags = ['O'] * len(tokens)

        # Tag each word
        for pred in predictions:
            pred_start = pred['value']['start']
            pred_end = pred['value']['end']
            label = pred['value']['labels'][0]

            if label in counts:
                counts[label] += 1
                for i, (word, start, end) in enumerate(tokens):
                    if start < pred_end and end > pred_start:
                        tags[i] = label

        # Combine tokens with tags
        tagged_text = [(token[0], (token[1], token[2]), tags[i]) for i, token in enumerate(tokens)]

        # Append the results to the lists
        all_tagged_texts.append(tagged_text)
        all_counts.append(counts)

    # Return the list of tagged words and counts for all entries
    return all_tagged_texts, all_counts

In [92]:
# Example usage with all entries
labels, counts_list = tag_words_from_json(training_data)

print("Number of entries in the entry:", len(labels))
print(labels[0]) # Print out an example entry
print(labels[0][0]) # Print out an example word
print(labels[0][0][2]) # Tag
len(labels[0])

Number of entries in the entry: 254
[('nº', (1, 3), 'O'), ('historia', (4, 12), 'O'), ('clinica:', (13, 21), 'O'), ('**', (22, 24), 'O'), ('***', (25, 28), 'O'), ('***', (29, 32), 'O'), ('nºepisodi:', (33, 43), 'O'), ('********', (44, 52), 'O'), ('sexe:', (53, 58), 'O'), ('home', (59, 63), 'O'), ('data', (64, 68), 'O'), ('de', (69, 71), 'O'), ('naixement:', (72, 82), 'O'), ('16.05.1936', (83, 93), 'O'), ('edat:', (94, 99), 'O'), ('82', (100, 102), 'O'), ('anys', (103, 107), 'O'), ('procedencia', (108, 119), 'O'), ('cex', (120, 123), 'O'), ('mateix', (124, 130), 'O'), ('hosp', (131, 135), 'O'), ('servei', (136, 142), 'O'), ('urologia', (143, 151), 'O'), ('data', (152, 156), 'O'), ("d'ingres", (157, 165), 'O'), ('24.07.2018', (166, 176), 'O'), ('data', (177, 181), 'O'), ("d'alta", (182, 188), 'O'), ('25.07.2018', (189, 199), 'O'), ('08:54:04', (200, 208), 'O'), ('ates', (209, 213), 'O'), ('per', (214, 217), 'O'), ('***************,', (218, 234), 'O'), ('*****;', (235, 241), 'O'), ('*****

554

In [93]:
word_feature = []
scope_feature = []
tag_feature = []
maj_number = []
contains_num = []
num = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']

def num_in_word(word):
    for thing in word:
        if thing in num:
            return 1
    return 0

def proportion_number(word):
    count= 0
    for thing in word:
        if thing in num:
            count+=1
    if count > (len(word)/2):
        return 1
    return 0


for label in labels:
    for it in label:
        word_feature.append(it[0])
        scope_feature.append(it[1])
        tag_feature.append(it[2])

        word_separate = it[0].split()

        contains_num.append(num_in_word(it[0]))
        maj_number.append(proportion_number(it[0]))


In [94]:
print(f'The lenght of Word Feature is {len(word_feature)}')
print(f'The lenght of Scope Feature is {len(scope_feature)}')
print(f'The lenght of Tag Feature is {len(tag_feature)}')
print(f'The lenght of Majority Number Feature is {len(maj_number)}')
print(f'The lenght of Contains Number Feature is {len(contains_num)}')

The lenght of Word Feature is 185054
The lenght of Scope Feature is 185054
The lenght of Tag Feature is 185054
The lenght of Majority Number Feature is 185054
The lenght of Contains Number Feature is 185054


## **Feature Extraction**

Relevant features to be extracted:
- POS Tags
- NER Tags
- Dependency Parsing trees
- Coordinates
- Negation Cues and Uncertainty Cues
- Context Window????

In [95]:

#load spacy's spanish special.
nlp = spacy.load("es_core_news_md")
#to get the POS as a feature for each word.
def get_pos_tags(words):
    text = ' '.join(words) #convert the words to text.
    doc = nlp(text) #make it a doc using the function dowloaded before from spacy.

    #Create a dictionary where the key = word and the value = POS tag.
    pos_tags_dict = {token.text: token.pos_ for token in doc} 
    lemma_tags_dict = {token.text: token.lemma_ for token in doc} 
    
    pos_tags = [pos_tags_dict.get(word, 'O') for word in words]
    lemma_ = [lemma_tags_dict.get(word, 'O') for word in words]
    #print(f'Length of Words---> {len(pos_tags_dict)}') 
    return pos_tags, lemma_

seperate_texts = []
for label in labels:
    text = []
    for it in label:
        text.append(it[0])
    seperate_texts.append(text)


In [96]:

# Create a list of texts from the labels
separate_texts = []
for label in labels:
    text = []
    for it in label:
        if isinstance(it, str):
            text.append(it)
        elif isinstance(it, tuple) and isinstance(it[0], str):
            text.append(it[0])
    separate_texts.append(text)

# Get POS tags for each text
pos_texts = []
lemma_texts = []

for text in separate_texts:
    pos_tags, lemma_tags = get_pos_tags(text)
    pos_texts.append(pos_tags)
    lemma_texts.append(lemma_tags)


# Print lengths for comparison
print(len(separate_texts[0]))  # Length of the first text in separate_texts
print(len(pos_texts[0]))  # Length of the POS tags dictionary for the first text

# Print the POS tags for the first text as a sample
print(separate_texts[1])
print(pos_texts[1])


554
554
['nº', 'historia', 'clinica:', '**', '***', '***', 'nºepisodi:', '********', 'sexe:', 'dona', 'data', 'de', 'naixement:', '04.08.2000', 'edat:', '19', 'anys', 'procedencia', 'domicil/res.soc', 'servei', 'obstetricia', 'data', "d'ingres", '04.10.2019', 'data', "d'alta", '06.10.2019', '13:02:36', 'ates', 'per', '******************,', '*****;', '****************,', '********', 'informe', "d'alta", "d'hospitalitzacio", 'motiu', "d'ingres", 'treball', 'de', 'part', 'antecedents', 'no', 'al·lergies', 'medicamentoses', 'conegudes.', 'no', 'intervencions', 'quirurgiques', 'ni', 'altres', 'antecedents', 'patologics.', 'nega', 'habits', 'toxics.', 'no', 'medicacio', 'habitual.', 'evolucio', 'clinica', 'evolucion', 'parto', 'finaliza', 'por', 'parto', 'eutocico', 'el', 'dia', '04/10', 'a', 'las', '9:34h', 'obtencion', 'de', 'rn', ',', 'sexo:', 'masculino', ',', 'peso:', '2820', 'apgar', ':', '9/10', 'gs:', 'ab+', 'el', 'pueperio', 'transcurre', 'dentro', 'de', 'la', 'normalidad,', 'perman

In [97]:
pos_feature_pandas = []
lemma_feature_pandas = []

for text in pos_texts:
    for pos in text:
        pos_feature_pandas.append(pos)
for text in lemma_texts:
    for lemma in text:
        lemma_feature_pandas.append(lemma)

In [98]:
initial_scopes = []
final_scopes = []

for start, end in scope_feature:
    initial_scopes.append(start)
    final_scopes.append(end)
    
print(f'The lenght of Word Feature is {len(word_feature)}')
print(f'The lenght of Initial Scopes Feature is {len(initial_scopes)}')
print(f'The lenght of Initial Scopes Feature is {len(final_scopes)}')
print(f'The lenght of Tag Feature is {len(tag_feature)}')
print(f'The lenght of POS Feature is {len(pos_feature_pandas)}')
print(f'The lenght of LEMMA Feature is {len(lemma_feature_pandas)}')


The lenght of Word Feature is 185054
The lenght of Initial Scopes Feature is 185054
The lenght of Initial Scopes Feature is 185054
The lenght of Tag Feature is 185054
The lenght of POS Feature is 185054
The lenght of LEMMA Feature is 185054


In [99]:
import pandas as pd
data_tuples = list(zip(word_feature, initial_scopes, final_scopes, tag_feature, pos_feature_pandas, lemma_feature_pandas, contains_num, maj_number))

data = pd.DataFrame(data_tuples, columns=['Word', 'Initial Scopes','Final Scopes','Tag', 'POS', 'LEMMA', 'Contains NUMBER', 'Maj NUMBER'])

In [100]:
from IPython.display import display

def show_data(datos, first, second):
    styled_data = datos.iloc[first:second].style.set_properties(**{'text-align': 'left'})
    display(styled_data)

show_data(datos=data, first=41, second=100)

Unnamed: 0,Word,Initial Scopes,Final Scopes,Tag,POS,LEMMA,Contains NUMBER,Maj NUMBER
41,paciente,315,323,O,NOUN,paciente,0,0
42,que,324,327,O,SCONJ,que,0,0
43,ingresa,328,335,O,VERB,ingresar,0,0
44,de,336,338,O,ADP,de,0,0
45,forma,339,344,O,NOUN,forma,0,0
46,programada,345,355,O,ADJ,programado,0,0
47,para,356,360,O,ADP,para,0,0
48,realizacion,361,372,O,PROPN,realizacion,0,0
49,de,373,375,O,ADP,de,0,0
50,uretrotomia,376,387,O,NOUN,uretrotomia,0,0


In [101]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185054 entries, 0 to 185053
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Word             185054 non-null  object
 1   Initial Scopes   185054 non-null  int64 
 2   Final Scopes     185054 non-null  int64 
 3   Tag              185054 non-null  object
 4   POS              185054 non-null  object
 5   LEMMA            185054 non-null  object
 6   Contains NUMBER  185054 non-null  int64 
 7   Maj NUMBER       185054 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 11.3+ MB


In [102]:
# Filter the DataFrame where "Maj NUMBER" is equal to 1
filtered_df = data[(data['Maj NUMBER'] == 0) & (data['Contains NUMBER'] == 1)] 
show_data(datos=filtered_df, first=0, second=10)

Unnamed: 0,Word,Initial Scopes,Final Scopes,Tag,POS,LEMMA,Contains NUMBER,Maj NUMBER
505,"38ºc,",3549,3554,O,O,O,1,0
972,4gr,1670,1673,O,NUM,4gr,1,0
974,c/6hrs,1677,1683,O,PROPN,c/6hrs,1,0
1042,c/8hrs,2124,2130,O,PROPN,c/8hrs,1,0
1207,x3,3274,3276,O,PROPN,x3,1,0
1218,4mg/6h,3368,3374,O,NUM,4mg/6h,1,0
1233,500mg/8h,3484,3492,O,NUM,500mg/8h,1,0
1442,1º,620,622,O,NUM,1º,1,0
1492,abril/18.,916,925,O,O,O,1,0
1511,septiembre/18,1082,1095,O,NOUN,septiembre/18,1,0


In [103]:
# Filter the DataFrame where "Maj NUMBER" is equal to 1
filtered_df = data[(data['Maj NUMBER'] == 1) & (data['Contains NUMBER'] == 1)] 
show_data(datos=filtered_df, first=0, second=10)

Unnamed: 0,Word,Initial Scopes,Final Scopes,Tag,POS,LEMMA,Contains NUMBER,Maj NUMBER
13,16.05.1936,83,93,O,NUM,16.05.1936,1,1
15,82,100,102,O,NUM,82,1,1
25,24.07.2018,166,176,O,NUM,24.07.2018,1,1
28,25.07.2018,189,199,O,NUM,25.07.2018,1,1
29,08:54:04,200,208,O,NUM,08:54:04,1,1
98,81a,780,783,O,NUM,81a,1,1
163,(11/2017):,1206,1216,O,O,O,1,1
195,(250,1420,1424,O,O,O,1,1
258,(10/2017),1892,1901,O,O,O,1,1
287,1/2,2057,2060,O,NUM,1/2,1,1


In [104]:
# Filter the DataFrame where "Maj NUMBER" is equal to 1
filtered_df = data[(data['Maj NUMBER'] == 1) & (data['Contains NUMBER'] == 0)] 
show_data(datos=filtered_df, first=0, second=10)

Unnamed: 0,Word,Initial Scopes,Final Scopes,Tag,POS,LEMMA,Contains NUMBER,Maj NUMBER


In [105]:
# Filter the DataFrame where "Maj NUMBER" is equal to 1
filtered_df = data[(data['Maj NUMBER'] == 0) & (data['Contains NUMBER'] == 0)] 
show_data(datos=filtered_df, first=0, second=10)

Unnamed: 0,Word,Initial Scopes,Final Scopes,Tag,POS,LEMMA,Contains NUMBER,Maj NUMBER
0,nº,1,3,O,NUM,nº,0,0
1,historia,4,12,O,NOUN,historia,0,0
2,clinica:,13,21,O,O,O,0,0
3,**,22,24,O,O,O,0,0
4,***,25,28,O,O,O,0,0
5,***,29,32,O,O,O,0,0
6,nºepisodi:,33,43,O,O,O,0,0
7,********,44,52,O,O,O,0,0
8,sexe:,53,58,O,O,O,0,0
9,home,59,63,O,NOUN,home,0,0


## **MODEL**

## **MODEL EVALUATION**