# **Deep Learning Method**
### Group 1 - Detection of Negation and Uncertainty

- Marino Oliveros Blanco NIU:1668563
- Pere Mayol Carbonell NIU:1669503
- Andreu Gascón Marzo NIU:1670919
- Judith Zaragoza NIU:1634071

## **Library and Data Loading**

In [41]:
# Libraries
import json
import re
from langdetect import detect # Library for language detection
from spellchecker import SpellChecker
import spacy
import random
import numpy as np

!pip install tensorflow
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, concatenate
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score



In [42]:
# Loading the training json file
loading_train = open("negacio_train_v2024.json",)
training_data = json.load(loading_train)
print("The training set contains: ", len(training_data), " samples")

The training set contains:  254  samples


In [43]:
# Loading the json object
loading_test = open("negacio_test_v2024.json")
test_data = json.load(loading_test)
print("The test set contains: ", len(test_data), " samples")

The test set contains:  64  samples


## **Data Annotation**
- Set using ground truth word to either O, NEG, UNC, NSCO, USCO

In [44]:
def tag_words_from_json(json_data):
    # Extract text and predictions from JSON data
    text = json_data[0]['data']['text']
    predictions = json_data[0]['predictions'][0]['result']

    # Initialize counters
    counts = {'NEG': 0, 'NSCO': 0, 'UNC': 0, 'USCO': 0}

    # Initialize tokens and their coordinates
    tokens = []
    start_pos = 0

    # Split text into tokens while tracking their start and end positions
    for word in text.split():
        start = text.find(word, start_pos)
        end = start + len(word)
        tokens.append((word, start, end))
        start_pos = end

    # Initialize tags list
    tags = ['O'] * len(tokens)

    # Tag each word
    for pred in predictions:
        pred_start = pred['value']['start']
        pred_end = pred['value']['end']
        label = pred['value']['labels'][0]

        if label in counts:
            counts[label] += 1
            for i, (word, start, end) in enumerate(tokens):
                if start < pred_end and end > pred_start:
                    tags[i] = label

    # Combine tokens with tags
    tagged_text = [(token[0], (token[1], token[2]), tags[i]) for i, token in enumerate(tokens)]

    # Return tagged words with coordinates and counts
    return tagged_text, counts

In [45]:
# EXAMPLE USAGE
# Tag the words from the training data
tags, counts = tag_words_from_json(training_data)
print("Tagged Words:")
print(tags)
print("\nNumber of tokens in the entry:", len(tags))
print("\nCounts:")
print(counts)

Tagged Words:
[('nº', (1, 3), 'O'), ('historia', (4, 12), 'O'), ('clinica:', (13, 21), 'O'), ('**', (22, 24), 'O'), ('***', (25, 28), 'O'), ('***', (29, 32), 'O'), ('nºepisodi:', (33, 43), 'O'), ('********', (44, 52), 'O'), ('sexe:', (53, 58), 'O'), ('home', (59, 63), 'O'), ('data', (64, 68), 'O'), ('de', (69, 71), 'O'), ('naixement:', (72, 82), 'O'), ('16.05.1936', (83, 93), 'O'), ('edat:', (94, 99), 'O'), ('82', (100, 102), 'O'), ('anys', (103, 107), 'O'), ('procedencia', (108, 119), 'O'), ('cex', (120, 123), 'O'), ('mateix', (124, 130), 'O'), ('hosp', (131, 135), 'O'), ('servei', (136, 142), 'O'), ('urologia', (143, 151), 'O'), ('data', (152, 156), 'O'), ("d'ingres", (157, 165), 'O'), ('24.07.2018', (166, 176), 'O'), ('data', (177, 181), 'O'), ("d'alta", (182, 188), 'O'), ('25.07.2018', (189, 199), 'O'), ('08:54:04', (200, 208), 'O'), ('ates', (209, 213), 'O'), ('per', (214, 217), 'O'), ('***************,', (218, 234), 'O'), ('*****;', (235, 241), 'O'), ('****************,', (242, 2

In [46]:
def tag_words_from_json(json_data):
    # Initialize lists for all tagged texts and counts
    all_tagged_texts = []
    all_counts = []

    # Iterate through each entry in the JSON data
    for entry in json_data:
        # Extract text and predictions from JSON entry
        text = entry['data']['text']
        predictions = entry['predictions'][0]['result']

        # Initialize counters
        counts = {'NEG': 0, 'NSCO': 0, 'UNC': 0, 'USCO': 0}

        # Initialize tokens and their coordinates
        tokens = []
        start_pos = 0

        # Split text into tokens while tracking their start and end positions
        for word in text.split():
            start = text.find(word, start_pos)
            end = start + len(word)
            tokens.append((word, start, end))
            start_pos = end

        # Initialize tags list
        tags = ['O'] * len(tokens)

        # Tag each word
        for pred in predictions:
            pred_start = pred['value']['start']
            pred_end = pred['value']['end']
            label = pred['value']['labels'][0]

            if label in counts:
                counts[label] += 1
                for i, (word, start, end) in enumerate(tokens):
                    if start < pred_end and end > pred_start:
                        tags[i] = label

        # Combine tokens with tags
        tagged_text = [(token[0], (token[1], token[2]), tags[i]) for i, token in enumerate(tokens)]

        # Append the results to the lists
        all_tagged_texts.append(tagged_text)
        all_counts.append(counts)

    # Return the list of tagged words and counts for all entries
    return all_tagged_texts, all_counts

In [47]:
# Example usage with all entries
labels, counts_list = tag_words_from_json(training_data)

print("Number of entries in the entry:", len(labels))
print(labels[0]) # Print out an example entry
print(labels[0][0]) # Print out an example word
print(labels[0][0][2]) # Tag
len(labels[0])

Number of entries in the entry: 254
[('nº', (1, 3), 'O'), ('historia', (4, 12), 'O'), ('clinica:', (13, 21), 'O'), ('**', (22, 24), 'O'), ('***', (25, 28), 'O'), ('***', (29, 32), 'O'), ('nºepisodi:', (33, 43), 'O'), ('********', (44, 52), 'O'), ('sexe:', (53, 58), 'O'), ('home', (59, 63), 'O'), ('data', (64, 68), 'O'), ('de', (69, 71), 'O'), ('naixement:', (72, 82), 'O'), ('16.05.1936', (83, 93), 'O'), ('edat:', (94, 99), 'O'), ('82', (100, 102), 'O'), ('anys', (103, 107), 'O'), ('procedencia', (108, 119), 'O'), ('cex', (120, 123), 'O'), ('mateix', (124, 130), 'O'), ('hosp', (131, 135), 'O'), ('servei', (136, 142), 'O'), ('urologia', (143, 151), 'O'), ('data', (152, 156), 'O'), ("d'ingres", (157, 165), 'O'), ('24.07.2018', (166, 176), 'O'), ('data', (177, 181), 'O'), ("d'alta", (182, 188), 'O'), ('25.07.2018', (189, 199), 'O'), ('08:54:04', (200, 208), 'O'), ('ates', (209, 213), 'O'), ('per', (214, 217), 'O'), ('***************,', (218, 234), 'O'), ('*****;', (235, 241), 'O'), ('*****

554

In [48]:
word_feature = []
scope_feature = []
tag_feature = []

for label in labels:
    for it in label:
        word_feature.append(it[0])
        scope_feature.append(it[1])
        tag_feature.append(it[2])


In [49]:
print(f'The lenght of Word Feature is {len(word_feature)}')
print(f'The lenght of Scope Feature is {len(scope_feature)}')
print(f'The lenght of Tag Feature is {len(tag_feature)}')

The lenght of Word Feature is 185054
The lenght of Scope Feature is 185054
The lenght of Tag Feature is 185054


## **Feature Extraction**

Relevant features to be extracted:
- POS Tags
- NER Tags
- Dependency Parsing trees
- Coordinates
- Negation Cues and Uncertainty Cues
- Context Window????

In [57]:

#load spacy's spanish special.
nlp = spacy.load("es_core_news_md")
#to get the POS as a feature for each word.
def get_pos_tags(words):
    text = ' '.join(words) #convert the words to text.
    doc = nlp(text) #make it a doc using the function dowloaded before from spacy.

    #Create a dictionary where the key = word and the value = POS tag.
    pos_tags_dict = {token.text: token.pos_ for token in doc} 
    
    pos_tags = [pos_tags_dict.get(word, 'O') for word in words]
    #print(f'Length of Words---> {len(pos_tags_dict)}') 
    return pos_tags

seperate_texts = []
for label in labels:
    text = []
    for it in label:
        text.append(it[0])
    seperate_texts.append(text)


In [62]:

# Create a list of texts from the labels
separate_texts = []
for label in labels:
    text = []
    for it in label:
        if isinstance(it, str):
            text.append(it)
        elif isinstance(it, tuple) and isinstance(it[0], str):
            text.append(it[0])
    separate_texts.append(text)

# Get POS tags for each text
pos_texts = []
for text in separate_texts:
    pos_tags = get_pos_tags(text)
    pos_texts.append(pos_tags)

# Print lengths for comparison
print(len(separate_texts[0]))  # Length of the first text in separate_texts
print(len(pos_texts[0]))  # Length of the POS tags dictionary for the first text

# Print the POS tags for the first text as a sample
print(separate_texts[0])
print(pos_texts[0])

554
554
['nº', 'historia', 'clinica:', '**', '***', '***', 'nºepisodi:', '********', 'sexe:', 'home', 'data', 'de', 'naixement:', '16.05.1936', 'edat:', '82', 'anys', 'procedencia', 'cex', 'mateix', 'hosp', 'servei', 'urologia', 'data', "d'ingres", '24.07.2018', 'data', "d'alta", '25.07.2018', '08:54:04', 'ates', 'per', '***************,', '*****;', '****************,', '******', 'informe', "d'alta", "d'hospitalitzacio", 'motiu', "d'ingres", 'paciente', 'que', 'ingresa', 'de', 'forma', 'programada', 'para', 'realizacion', 'de', 'uretrotomia', 'interna', '.', 'antecedents', 'alergia', 'a', 'penicilina', 'y', 'cloramfenicol', '.', 'no', 'habitos', 'toxicos.', 'antecedentes', 'medicos:', 'bloqueo', 'auriculoventricular', 'de', 'primer', 'grado', 'hipertension', 'arterial.', 'diverticulosis', 'extensa', 'insuficiencia', 'renal', 'cronica', 'colelitiasis', 'antecedentes', 'quirurgicos:', 'exeresis', 'de', 'lesiones', 'cutaneas', 'con', 'anestesia', 'local', 'protesis', 'total', 'de', 'cader

In [63]:
pos_feature_pandas = []

for text in pos_texts:
    for pos in text:
        pos_feature_pandas.append(pos)

print(f'The lenght of Word Feature is {len(word_feature)}')
print(f'The lenght of Scope Feature is {len(scope_feature)}')
print(f'The lenght of Tag Feature is {len(tag_feature)}')
print(f'The lenght of POS Feature is {len(pos_feature_pandas)}')

The lenght of Word Feature is 185054
The lenght of Scope Feature is 185054
The lenght of Tag Feature is 185054
The lenght of POS Feature is 185054


In [72]:
initial_scopes = []
final_scopes = []

for start, end in scope_feature:
    initial_scopes.append(start)
    final_scopes.append(end)
    

print(f'The lenght of Initial Scopes Feature is {len(initial_scopes)}')
print(f'The lenght of Initial Scopes Feature is {len(final_scopes)}')

The lenght of Initial Scopes Feature is 185054
The lenght of Initial Scopes Feature is 185054


In [73]:
import pandas as pd
data_tuples = list(zip(word_feature, initial_scopes, final_scopes, tag_feature, pos_feature_pandas))

data = pd.DataFrame(data_tuples, columns=['Word', 'Initial Scopes','Final Scopes','Tag', 'POS'])

In [78]:
data.head(60)

Unnamed: 0,Word,Initial Scopes,Final Scopes,Tag,POS
0,nº,1,3,O,NUM
1,historia,4,12,O,NOUN
2,clinica:,13,21,O,O
3,**,22,24,O,O
4,***,25,28,O,O
5,***,29,32,O,O
6,nºepisodi:,33,43,O,O
7,********,44,52,O,O
8,sexe:,53,58,O,O
9,home,59,63,O,NOUN


In [79]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185054 entries, 0 to 185053
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Word            185054 non-null  object
 1   Initial Scopes  185054 non-null  int64 
 2   Final Scopes    185054 non-null  int64 
 3   Tag             185054 non-null  object
 4   POS             185054 non-null  object
dtypes: int64(2), object(3)
memory usage: 7.1+ MB


## **MODEL**

## **MODEL EVALUATION**