# **Deep Learning Method**
### Group 1 - Detection of Negation and Uncertainty

- Marino Oliveros Blanco NIU:1668563
- Pere Mayol Carbonell NIU:1669503
- Andreu Gascón Marzo NIU:1670919
- Judith Zaragoza NIU:1634071

## **Library and Data Loading**

In [None]:
# Libraries
import json
!pip install langdetect
!pip install spacy
import spacy
!python -m spacy download es_core_news_md
from langdetect import detect # Library for language detection
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from matplotlib import pyplot as plt

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.6/981.5 kB[0m [31m5.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━[0m [32m768.0/981.5 kB[0m [31m11.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993227 sha256=ae04c1b361be7d0b12f5ddad36eb4972fd5e57bef14c9de207145b53dc42977f
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b

In [None]:
# Loading the training json file
# The context manager closes the handle as soon as parsing is done, and the
# explicit encoding avoids depending on the platform's default text encoding.
with open("negacio_train_v2024.json", encoding="utf-8") as loading_train:
    training_data = json.load(loading_train)
print("The training set contains: ", len(training_data), " samples")

The training set contains:  254  samples


In [None]:
# Loading the test json file
# The context manager closes the handle as soon as parsing is done, and the
# explicit encoding avoids depending on the platform's default text encoding.
with open("negacio_test_v2024.json", encoding="utf-8") as loading_test:
    test_data = json.load(loading_test)
print("The test set contains: ", len(test_data), " samples")

The test set contains:  64  samples


## **Data Annotation**
- Each word is labelled from the ground-truth annotations with one of the tags O, NEG, UNC, NSCO or USCO

In [None]:
def strip_tokens(tagged_text):
    """Return a copy of `tagged_text` without its boilerplate sections.

    `tagged_text` is a list of 3-item tuples whose first item is the word.
    Two kinds of sections are removed, markers included:
      * from the first "nº" up to the second "d'ingres" (removed once);
      * from the first "nhc" up to the first "lopd" (removed repeatedly until
        no such pair is left).
    A section is only removed when both of its markers are found.
    """
    def locate(sequence, opener, closer, nth=1):
        # Scan up to the nth `closer`; report the first `opener` seen on the
        # way. Either position is None when the corresponding token is missing.
        opener_at = None
        closers_seen = 0
        for position, (token, _, _) in enumerate(sequence):
            if opener_at is None and token == opener:
                opener_at = position
            if token == closer:
                closers_seen += 1
                if closers_seen == nth:
                    return opener_at, position
        return opener_at, None

    # Section running from "nº" to the second "d'ingres"
    head, tail = locate(tagged_text, 'nº', "d'ingres", nth=2)
    if head is None or tail is None:
        remaining = tagged_text[:]
    else:
        remaining = tagged_text[:head] + tagged_text[tail + 1:]

    # Every section running from "nhc" to "lopd"
    while True:
        head, tail = locate(remaining, 'nhc', 'lopd')
        if head is None or tail is None:
            return remaining
        remaining = remaining[:head] + remaining[tail + 1:]

def tag_words_from_json(json_data):
    """Tag every whitespace-separated word of each annotated entry.

    Each entry supplies its text in entry['data']['text'] and its annotations
    in entry['predictions'][0]['result']; every annotation carries
    value['start'], value['end'] and value['labels'].

    Returns a pair of lists with one item per entry:
      * the tagged words, as (word, (start, end), tag) tuples, after
        strip_tokens() has removed the boilerplate sections;
      * a dict counting the NEG, NSCO, UNC and USCO annotations of the entry.
    """
    tracked_labels = ('NEG', 'NSCO', 'UNC', 'USCO')
    tagged_documents = []
    label_counts = []

    for record in json_data:
        raw_text = record['data']['text']
        annotations = record['predictions'][0]['result']

        # Locate each word in the raw text; searching from the end of the
        # previous word keeps repeated words on their own occurrence.
        spans = []
        cursor = 0
        for piece in raw_text.split():
            begin = raw_text.find(piece, cursor)
            cursor = begin + len(piece)
            spans.append((piece, begin, cursor))

        tally = dict.fromkeys(tracked_labels, 0)
        token_tags = ['O' for _ in spans]

        for annotation in annotations:
            value = annotation['value']
            ann_start = value['start']
            ann_end = value['end']
            ann_label = value['labels'][0]

            # Labels outside the tracked set are neither counted nor applied
            if ann_label not in tally:
                continue
            tally[ann_label] += 1

            # A word receives the label when its character span overlaps the
            # annotation; a later annotation overwrites an earlier one.
            for idx, (_, tok_start, tok_end) in enumerate(spans):
                if tok_start < ann_end and tok_end > ann_start:
                    token_tags[idx] = ann_label

        tagged = [
            (piece, (begin, finish), tag)
            for (piece, begin, finish), tag in zip(spans, token_tags)
        ]

        tagged_documents.append(strip_tokens(tagged))
        label_counts.append(tally)

    return tagged_documents, label_counts

In [None]:
# Tag every entry of the training set.
# NOTE(review): the following cell rebinds `labels` and `counts_list` to the
# test-set result, so whichever of the two cells ran last feeds the cells below.
labels, counts_list = tag_words_from_json(training_data)

In [None]:
# Tag every entry of the test set (rebinds `labels` and `counts_list`).
labels, counts_list = tag_words_from_json(test_data)

In [None]:
# Quick look at the structure returned by tag_words_from_json
print("Number of entries in the entry:", len(labels))
first_document = labels[0]
print(first_document) # All tagged words of the first entry
first_token = first_document[0]
print(first_token) # One (word, (start, end), tag) tuple
print(first_token[2]) # Its tag
len(first_document)

Number of entries in the entry: 64
[('induccion', (319, 328), 'O'), ('al', (329, 331), 'O'), ('parto', (332, 337), 'O'), ('por', (338, 341), 'O'), ('pequeño', (342, 349), 'O'), ('para', (350, 354), 'O'), ('la', (355, 357), 'O'), ('edad', (358, 362), 'O'), ('gestacional', (363, 374), 'O'), ('(', (375, 376), 'O'), ('peg', (377, 380), 'O'), (')', (381, 382), 'O'), ('antecedents', (383, 394), 'O'), ('no', (395, 397), 'NEG'), ('alergias', (398, 406), 'NSCO'), ('medicamentosas', (407, 421), 'NSCO'), ('conocidas', (422, 431), 'O'), ('antcededentes', (432, 445), 'O'), ('medico-quirurgicos:', (446, 465), 'O'), ('protesis', (466, 474), 'O'), ('mamaria,', (475, 483), 'O'), ('adenoidectomia', (484, 498), 'O'), ('niega', (499, 504), 'NEG'), ('habitos', (505, 512), 'NSCO'), ('toxicos', (513, 520), 'NSCO'), ('medicacio', (521, 530), 'O'), ('habitual', (531, 539), 'O'), ('anafranil25', (540, 551), 'O'), ('mg/', (552, 555), 'O'), ('diario.', (556, 563), 'O'), ('yodocefol.', (564, 574), 'O'), ('hierro',

578

In [None]:
# Parallel per-token feature columns; each starts out as its own empty list
word_feature, scope_feature, tag_feature = [], [], []
maj_number, contains_num, is_number = [], [], []
# The characters that the three helpers below treat as digits
num = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']

def num_in_word(word):
    """Return 1 if `word` contains at least one character of `num`, else 0."""
    return int(any(char in num for char in word))

def num_(word):
    """Return 1 if `word` is non-empty and made only of characters of `num`, else 0.

    The check uses the same digit set as num_in_word() and proportion_number().
    str.isdigit() also accepts characters such as '²', which would let a word
    count as a number while "containing no number".
    """
    return int(bool(word) and all(char in num for char in word))

def proportion_number(word):
    """Return 1 if more than half of the characters of `word` are in `num`, else 0."""
    digit_count = sum(1 for char in word if char in num)
    return int(digit_count > len(word) / 2)

# Flatten every tagged entry into the parallel per-token feature lists.
# Each `it` is a (word, (start, end), tag) tuple.
for label in labels:
    for it in label:
        word_feature.append(it[0])
        scope_feature.append(it[1])
        tag_feature.append(it[2])

        # Digit-based features of the word
        contains_num.append(num_in_word(it[0]))
        maj_number.append(proportion_number(it[0]))
        is_number.append(num_(it[0]))


In [None]:
# Word -> features lookup.
# NOTE(review): the key is the word itself, so a word that occurs several
# times keeps only the features of its last occurrence.
feature_names = ('Scope', 'Tag', 'Maj NUMBER', 'Contains NUMBER', 'Is NUMBER')
feature_lists = (scope_feature, tag_feature, maj_number, contains_num, is_number)

feature_dict = {}
for position, word in enumerate(word_feature):
    feature_dict[word] = {
        name: values[position]
        for name, values in zip(feature_names, feature_lists)
    }

# Show the first entry in insertion order
first_item = next(iter(feature_dict.items()))
print(first_item)

('induccion', {'Scope': (2091, 2100), 'Tag': 'O', 'Maj NUMBER': 0, 'Contains NUMBER': 0, 'Is NUMBER': 0})


In [None]:
# Sanity check: every feature column should report the same number of tokens
print(
    f'The lenght of Word Feature is {len(word_feature)}',
    f'The lenght of Scope Feature is {len(scope_feature)}',
    f'The lenght of Tag Feature is {len(tag_feature)}',
    f'The lenght of Majority Number Feature is {len(maj_number)}',
    f'The lenght of Contains Number Feature is {len(contains_num)}',
    sep='\n',
)

The lenght of Word Feature is 43850
The lenght of Scope Feature is 43850
The lenght of Tag Feature is 43850
The lenght of Majority Number Feature is 43850
The lenght of Contains Number Feature is 43850


## **Feature Extraction**

In [None]:

# Load spaCy's Spanish pipeline "es_core_news_md"; get_pos_tags() below runs it on each text.
nlp = spacy.load("es_core_news_md")
# Get the POS tag and the lemma of each word as features.
def get_pos_tags(words):
    """Return the POS tags and lemmas of `words`, aligned position by position.

    The spaCy Doc is built directly from the given tokens instead of from the
    joined text. Joining and re-tokenising lets spaCy split words differently
    (e.g. detaching punctuation), so a lookup by word text misses those words,
    and it also gives every repetition of a word the tag of one occurrence.

    Args:
        words: list of token strings of one text.

    Returns:
        (pos_tags, lemma_): two lists with the same length as `words`. 'O' is
        used when spaCy leaves the value empty.
    """
    from spacy.tokens import Doc  # local import: only this function needs it

    # Keep our own tokenisation, then run every pipeline component on the Doc
    doc = Doc(nlp.vocab, words=list(words))
    for _, component in nlp.pipeline:
        doc = component(doc)

    pos_tags = [token.pos_ or 'O' for token in doc]
    lemma_ = [token.lemma_ or 'O' for token in doc]
    return pos_tags, lemma_

# Word strings of every entry, one inner list per entry.
# NOTE(review): the next cell builds a similar structure under the name
# `separate_texts`; this misspelled variable looks redundant - confirm before removing.
seperate_texts = [[it[0] for it in label] for label in labels]

In [None]:
# Create a list of texts from the labels: one list of word strings per entry.
# An item is kept when it is a string, or a tuple whose first element is a
# string; anything else is skipped.
separate_texts = [
    [
        item if isinstance(item, str) else item[0]
        for item in document
        if isinstance(item, str)
        or (isinstance(item, tuple) and isinstance(item[0], str))
    ]
    for document in labels
]

# Get POS tags and lemmas for each text
tagged_documents = [get_pos_tags(words) for words in separate_texts]
pos_texts = [pos for pos, _ in tagged_documents]
lemma_texts = [lemma for _, lemma in tagged_documents]


# Print lengths for comparison
print(len(separate_texts[0]))  # Number of words in the first text
print(len(pos_texts[0]))  # Number of POS tags returned for the first text

# Print the words and POS tags of the second text as a sample
print(separate_texts[1])
print(pos_texts[1])

578
578
['varon', '22', 'años', 'que', 'acude', 'por', 'fiebre.', 'antecedents', 'alergias', 'no', 'refiere', 'alergias', 'medicamentosas', 'conocidas.', 'antecedenes', 'personales', 'niega', 'habitos', 'toxicos.', 'antecedentes', 'medicos:', '-', 'infeccion', 'por', 'vih', 'diagnosticada', 'en', 'octubre/2015,', 'control', 'por', 'dr', '***************,', 'en', 'tar', 'con', 'triumeq', 'desde', 'octubre', '2015', 'con', 'buen', 'cumplimiento', 'terapeutico.', 'en', 'seguimiento', 'en', 'cex', 'infecciosas', '(dr.', '*******).', '**ultima', 'analitica', '(nov', '17)', 'cv', '&lt;50,', 'cd4', '1300.', '**serologies:', 'hbsag', 'negatiu,', 'hbsac', 'positiu,', 'hbcac', 'negatiu,', 'vhc', 'negativa', '-antecedentes', 'its:', '**uretritis', 'gonococica', 'en', 'mayo', '2017,', 'realizo', 'tratamientio.', '**lues', 'latente', 'en', 'sept', '2017', 'tratada', 'con', '3', 'dosis', 'de', 'penicilina', 'im.', '-hsil', 'anal', '(displasia', 'moderada-ain2),', 'vph', '18,42,58,81.', 'en', 'curso'

In [None]:
# Flatten the per-text POS tags and lemmas into single token-level columns
pos_feature_pandas = []
lemma_feature_pandas = []
for text_pos, text_lemma in zip(pos_texts, lemma_texts):
    paired = list(zip(text_pos, text_lemma))
    pos_feature_pandas.extend(pos for pos, _ in paired)
    lemma_feature_pandas.extend(lemma for _, lemma in paired)

# Token k of the flattened stream belongs to block k // 200.
# NOTE(review): the counter is not reset between texts, so a block can span
# the end of one text and the start of the next.
sentence_id = [position // 200 for position in range(len(pos_feature_pandas))]

# Number of complete blocks and size of the trailing partial block
sentence_idx, token_count = divmod(len(pos_feature_pandas), 200)
if token_count > 0:
    sentence_idx += 1  # count the trailing partial block as well

print()




In [None]:
# Split every (start, end) scope into two parallel columns
initial_scopes = [start for start, _ in scope_feature]
final_scopes = [end for _, end in scope_feature]

# Sanity check: all feature columns should have the same length
print(f'The lenght of Word Feature is {len(word_feature)}')
print(f'The lenght of Initial Scopes Feature is {len(initial_scopes)}')
print(f'The lenght of Final Scopes Feature is {len(final_scopes)}')  # label fixed: this is final_scopes
print(f'The lenght of Tag Feature is {len(tag_feature)}')
print(f'The lenght of POS Feature is {len(pos_feature_pandas)}')
print(f'The lenght of LEMMA Feature is {len(lemma_feature_pandas)}')


The lenght of Word Feature is 43850
The lenght of Initial Scopes Feature is 43850
The lenght of Initial Scopes Feature is 43850
The lenght of Tag Feature is 43850
The lenght of POS Feature is 43850
The lenght of LEMMA Feature is 43850


In [None]:
# Column name -> per-token feature list, in the column order of the table.
# 'NUMBER' holds is_number and 'text_id' holds the 200-token block id.
feature_columns = {
    'Word': word_feature,
    'Initial Scopes': initial_scopes,
    'Final Scopes': final_scopes,
    'Tag': tag_feature,
    'POS': pos_feature_pandas,
    'LEMMA': lemma_feature_pandas,
    'NUMBER': is_number,
    'Contains NUMBER': contains_num,
    'Maj NUMBER': maj_number,
    'text_id': sentence_id,
}

# One tuple per token, then one DataFrame row per tuple
data_tuples = list(zip(*feature_columns.values()))
data = pd.DataFrame(data_tuples, columns=list(feature_columns))

In [None]:
# Export the DataFrame to a CSV file
# NOTE(review): the path is absolute and the file name ends in "TEST"; confirm
# that `data` was built from the test split before running this cell.
csv_file_path = '/content/feature_extracted_dataTEST.csv'
data.to_csv(csv_file_path, index=False)  # index=False: the row index is not written

In [None]:
from IPython.display import display

def show_data(datos, first, second):
    """Display rows `first` to `second` (end exclusive) of `datos` with left-aligned cells."""
    selected_rows = datos.iloc[first:second]
    left_aligned = selected_rows.style.set_properties(**{'text-align': 'left'})
    display(left_aligned)

# Preview rows 41 to 99 of the feature table
show_data(datos=data, first=41, second=100)

Unnamed: 0,Word,Initial Scopes,Final Scopes,Tag,POS,LEMMA,NUMBER,Contains NUMBER,Maj NUMBER,text_id
41,lesiones,660,668,O,NOUN,lesión,0,0,0,0
42,cutaneas,669,677,O,ADJ,cutanea,0,0,0,0
43,con,678,681,O,ADP,con,0,0,0,0
44,anestesia,682,691,O,NOUN,anestesia,0,0,0,0
45,local,692,697,O,ADJ,local,0,0,0,0
46,protesis,698,706,O,NOUN,protesis,0,0,0,0
47,total,707,712,O,ADJ,total,0,0,0,0
48,de,713,715,O,ADP,de,0,0,0,0
49,cadera,716,722,O,NOUN,cadera,0,0,0,0
50,cordectomia,723,734,O,ADJ,cordectomio,0,0,0,0


In [None]:
# Summary of the feature table: row count, columns, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168887 entries, 0 to 168886
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Word             168887 non-null  object
 1   Initial Scopes   168887 non-null  int64 
 2   Final Scopes     168887 non-null  int64 
 3   Tag              168887 non-null  object
 4   POS              168887 non-null  object
 5   LEMMA            168887 non-null  object
 6   NUMBER           168887 non-null  int64 
 7   Contains NUMBER  168887 non-null  int64 
 8   Maj NUMBER       168887 non-null  int64 
 9   text_id          168887 non-null  int64 
dtypes: int64(6), object(4)
memory usage: 12.9+ MB


In [None]:
# Words that contain a digit but are not mostly digits
# ("Contains NUMBER" == 1 and "Maj NUMBER" == 0)
digit_minority_mask = (data['Contains NUMBER'] == 1) & (data['Maj NUMBER'] == 0)
filtered_df = data.loc[digit_minority_mask]
show_data(datos=filtered_df, first=0, second=10)

Unnamed: 0,Word,Initial Scopes,Final Scopes,Tag,POS,LEMMA,NUMBER,Contains NUMBER,Maj NUMBER,text_id
456,"38ºc,",3549,3554,O,O,O,0,1,0,2
818,4gr,1670,1673,O,NUM,4gr,0,1,0,4
820,c/6hrs,1677,1683,O,PROPN,c/6hrs,0,1,0,4
880,c/8hrs,2124,2130,O,PROPN,c/8hrs,0,1,0,4
1045,x3,3274,3276,O,PROPN,x3,0,1,0,5
1056,4mg/6h,3368,3374,O,NUM,4mg/6h,0,1,0,5
1071,500mg/8h,3484,3492,O,NUM,500mg/8h,0,1,0,5
1224,1º,620,622,O,NUM,1º,0,1,0,6
1274,abril/18.,916,925,O,O,O,0,1,0,6
1293,septiembre/18,1082,1095,O,NOUN,septiembre/18,0,1,0,6


In [None]:
# Words that contain a digit and are mostly digits
# ("Contains NUMBER" == 1 and "Maj NUMBER" == 1)
digit_majority_mask = (data['Contains NUMBER'] == 1) & (data['Maj NUMBER'] == 1)
filtered_df = data.loc[digit_majority_mask]
show_data(datos=filtered_df, first=0, second=10)

Unnamed: 0,Word,Initial Scopes,Final Scopes,Tag,POS,LEMMA,NUMBER,Contains NUMBER,Maj NUMBER,text_id
57,81a,780,783,O,NUM,81a,0,1,1,0
122,(11/2017):,1206,1216,O,O,O,0,1,1,0
154,(250,1420,1424,O,O,O,0,1,1,0
217,(10/2017),1892,1901,O,O,O,0,1,1,1
243,24,2086,2088,O,NUM,24,1,1,1,1
247,2018,2101,2105,O,NUM,2018,1,1,1,1
335,n40.0,2741,2746,O,ADJ,n40.0,0,1,1,1
347,04.81,2839,2844,O,NUM,04.81,0,1,1,1
357,58.0,2904,2908,O,NUM,58.0,0,1,1,1
433,14,3411,3413,O,NUM,14,1,1,1,2


In [None]:
# Consistency check: "Maj NUMBER" == 1 together with "Contains NUMBER" == 0.
# A word cannot be mostly digits without containing one, so no rows are expected.
inconsistent_mask = (data['Contains NUMBER'] == 0) & (data['Maj NUMBER'] == 1)
filtered_df = data.loc[inconsistent_mask]
show_data(datos=filtered_df, first=0, second=10)

Unnamed: 0,Word,Initial Scopes,Final Scopes,Tag,POS,LEMMA,NUMBER,Contains NUMBER,Maj NUMBER,text_id


In [None]:
# Words without any digit ("Contains NUMBER" == 0 and "Maj NUMBER" == 0)
no_digit_mask = (data['Contains NUMBER'] == 0) & (data['Maj NUMBER'] == 0)
filtered_df = data.loc[no_digit_mask]
show_data(datos=filtered_df, first=0, second=10)

Unnamed: 0,Word,Initial Scopes,Final Scopes,Tag,POS,LEMMA,NUMBER,Contains NUMBER,Maj NUMBER,text_id
0,paciente,315,323,O,NOUN,paciente,0,0,0,0
1,que,324,327,O,SCONJ,que,0,0,0,0
2,ingresa,328,335,O,VERB,ingresar,0,0,0,0
3,de,336,338,O,ADP,de,0,0,0,0
4,forma,339,344,O,NOUN,forma,0,0,0,0
5,programada,345,355,O,ADJ,programado,0,0,0,0
6,para,356,360,O,ADP,para,0,0,0,0
7,realizacion,361,372,O,PROPN,realizacion,0,0,0,0
8,de,373,375,O,ADP,de,0,0,0,0
9,uretrotomia,376,387,O,NOUN,uretrotomia,0,0,0,0


In [None]:
# Words that are entirely a number ("NUMBER" == 1)
is_number_mask = data['NUMBER'] == 1
filtered_df = data.loc[is_number_mask]
show_data(datos=filtered_df, first=0, second=10)

Unnamed: 0,Word,Initial Scopes,Final Scopes,Tag,POS,LEMMA,NUMBER,Contains NUMBER,Maj NUMBER,text_id
243,24,2086,2088,O,NUM,24,1,1,1,1
247,2018,2101,2105,O,NUM,2018,1,1,1,1
433,14,3411,3413,O,NUM,14,1,1,1,2
445,1,3498,3499,O,NUM,1,1,1,1,2
448,8,3507,3508,O,NUM,8,1,1,1,2
540,2820,626,630,O,NUM,2820,1,1,1,2
604,1,1163,1164,O,NUM,1,1,1,1,3
610,1,1190,1191,O,NUM,1,1,1,1,3
679,66,691,693,O,NUM,66,1,1,1,3
694,3,815,816,O,NUM,3,1,1,1,3
