# **Deep Learning Method**
### Group 1 - Detection of Negation and Uncertainty

- Marino Oliveros Blanco NIU:1668563
- Pere Mayol Carbonell NIU:1669503
- Andreu Gascón Marzo NIU:1670919
- Judith Zaragoza NIU:1634071

## **Library and Data Loading**

In [68]:
# Libraries
import json
import re
from langdetect import detect # Library for language detection
from spellchecker import SpellChecker
import spacy
import random
import string

import pandas as pd 


In [69]:
# Loading the training json file
loading_train = open("negacio_train_v2024.json",)
training_data = json.load(loading_train)
print("The training set contains: ", len(training_data), " samples")

The training set contains:  254  samples


In [70]:
# Loading the json object
loading_test = open("negacio_test_v2024.json")
test_data = json.load(loading_test)
print("The test set contains: ", len(test_data), " samples")

The test set contains:  64  samples


In [71]:
def remove_pacient_info(text):
    # Remove lines starting with "nº historia clinica:" and ending with "motiu d'ingres"
    text = re.sub(r'nº historia clinica:.*?motiu d\'ingres', '', text, flags=re.DOTALL)
    # Remove lines starting with "nhc" and ending with "lopd"
    text = re.sub(r'nhc.*?lopd', '', text, flags=re.DOTALL)
    # Remove all asterisks '*'
    text = text.replace('*', '')
    return text


# 2 Remove Punctuation (Able to be turned ON/OFF)
def remove_punctuation(text):
    # Define a translation table to map punctuation to None
    translator = str.maketrans('', '', string.punctuation)
    # Remove punctuation using the translation table
    text = text.translate(translator)
    return text

## **Data Annotation**
- Set using ground truth word to either O, NEG, UNC, NSCO, USCO

In [72]:
def tag_words_from_json(json_data):
    counts = {'NEG': 0, 'NSCO': 0, 'UNC': 0, 'USCO': 0}
    words = []
    tagging = []

    for item in range(len(json_data)):
        text = json_data[item]['data']['text']
        predictions = json_data[0]['predictions'][0]['result']
        text_info = remove_pacient_info(text)
        text_pun = remove_punctuation(text_info)
        # Tokenize text
        tokens = text_pun.split()
        words.append(tokens)

        # Initialize tags list
        tags = ['O'] * len(tokens)

        # Tag each word
        for pred in predictions:
            start = pred['value']['start']
            end = pred['value']['end']
            label = pred['value']['labels'][0]
            
            if label in counts:
                counts[label] += 1
                for i in range(len(tokens)):
                    if start <= len(' '.join(tokens[:i+1])) < end:
                        tags[i] = label
        tagging.append(tags)
    # Return tagged words and counts
    return tagging, counts, words

In [73]:
# EXAMPLE USAGE
# Tag the words from the training data
tags, counts, words = tag_words_from_json(training_data)
print("Tagged Words:")
print(tags)
print("Words:")
print(words[0])
print("\nCounts:")
print(counts)
print('\n-----------------------------------')
print(f'Lenght of Taggs---> {len(tags)} \nLenght of Words---> {len(words)}')

Tagged Words:
[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'NSCO', 'NSCO', 'NSCO', 'NSCO', 'NSCO', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'NEG', 'NSCO', 'NSCO', 'NSCO', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'NEG', 'NSCO', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'

In [74]:
words_feature = []
for i in range(len(words)):
    for word in words[i]:
        words_feature.append(word)


tags_feature = []
for i in range(len(tags)):
    for tag in tags[i]:
        tags_feature.append(tag)

In [75]:
print(words_feature[:100])
print(tags_feature[:100])

['paciente', 'que', 'ingresa', 'de', 'forma', 'programada', 'para', 'realizacion', 'de', 'uretrotomia', 'interna', 'antecedents', 'alergia', 'a', 'penicilina', 'y', 'cloramfenicol', 'no', 'habitos', 'toxicos', 'antecedentes', 'medicos', 'bloqueo', 'auriculoventricular', 'de', 'primer', 'grado', 'hipertension', 'arterial', 'diverticulosis', 'extensa', 'insuficiencia', 'renal', 'cronica', 'colelitiasis', 'antecedentes', 'quirurgicos', 'exeresis', 'de', 'lesiones', 'cutaneas', 'con', 'anestesia', 'local', 'protesis', 'total', 'de', 'cadera', 'cordectomia', 'herniorrafia', 'inguinal', 'proces', 'actual', 'varon', 'de', '81a', 'que', 'a', 'raiz', 'de', 'episodio', 'de', 'hematuria', 'macroscopica', 'se', 'realiza', 'cistoscopia', 'que', 'es', 'negativa', 'para', 'lesiones', 'malignas', 'pero', 'se', 'objetiva', 'estenosis', 'de', 'uretra', 'se', 'intentan', 'dilataciones', 'progresivas', 'en', 'el', 'gabinete', 'de', 'urologia', 'sin', 'exito', 'se', 'solicita', 'estudio', 'de', 'imagen', '

In [76]:
print(f'Lenght of words feature---->{len(words_feature)}')
print(f'Lenght of tags feature---->{len(tags_feature)}')

Lenght of words feature---->165073
Lenght of tags feature---->165073


In [77]:
#load spacy's spanish special.
nlp = spacy.load("es_core_news_md")
#to get the POS as a feature for each word.
def get_pos_tags(words):
    text = ' '.join(words) #convert the words to text.
    doc = nlp(text) #make it a doc using the function dowloaded before from spacy.

    #Create a dictionary where the key = word and the value = POS tag.
    pos_tags_dict = {token.text: token.pos_ for token in doc} 
    
    #print(f'Length of Words---> {len(pos_tags_dict)}') 
    return pos_tags_dict

pos_tags_all = []
for text in words:
    pos_tags_dict = get_pos_tags(words=text)
    pos_tags_all.append(pos_tags_dict)


In [78]:
pos_tags_feature = []
for texts in pos_tags_all:
    for word, tag in texts.items():
        pos_tags_feature.append([word,tag])
print(pos_tags_all[1])

{'treball': 'PROPN', 'de': 'ADP', 'part': 'PROPN', 'antecedents': 'ADJ', 'no': 'ADV', 'al·lergies': 'PROPN', 'medicamentoses': 'PROPN', 'conegudes': 'ADJ', 'intervencions': 'VERB', 'quirurgiques': 'PROPN', 'ni': 'CCONJ', 'altres': 'NUM', 'patologics': 'PROPN', 'nega': 'VERB', 'habits': 'VERB', 'toxics': 'PROPN', 'medicacio': 'NOUN', 'habitual': 'ADJ', 'evolucio': 'NOUN', 'clinica': 'ADJ', 'evolucion': 'PROPN', 'parto': 'PROPN', 'finaliza': 'VERB', 'por': 'ADP', 'eutocico': 'ADJ', 'el': 'DET', 'dia': 'NOUN', '0410': 'NUM', 'a': 'ADP', 'las': 'DET', '934h': 'NUM', 'obtencion': 'PROPN', 'rn': 'PROPN', 'sexo': 'PROPN', 'masculino': 'ADJ', 'peso': 'NOUN', '2820': 'NUM', 'apgar': 'VERB', '910': 'NUM', 'gs': 'NUM', 'ab': 'ADJ', 'pueperio': 'NOUN', 'transcurre': 'VERB', 'dentro': 'ADV', 'la': 'DET', 'normalidad': 'NOUN', 'permaneciendo': 'VERB', 'apiretica': 'ADJ', 'normotensa': 'PROPN', 'y': 'CCONJ', 'con': 'ADP', 'buen': 'ADJ', 'estado': 'NOUN', 'general': 'ADJ', 'lactancia': 'NOUN', 'matern

In [None]:
words_feature = []
for i in range(len(words)):
    for word in words[i]:
        words_feature.append(word)

In [79]:
data_tuples = list(zip(words_feature, tags_feature, pos_tags_feature))

data = pd.DataFrame(data_tuples, columns=['Word', 'Tag', 'POS'])
data.head(200)

Unnamed: 0,Word,Tag,POS
0,paciente,O,"[paciente, NOUN]"
1,que,O,"[que, SCONJ]"
2,ingresa,O,"[ingresa, VERB]"
3,de,O,"[de, ADP]"
4,forma,O,"[forma, NOUN]"
...,...,...,...
195,descritas,O,"[decide, VERB]"
196,previamente,O,"[alta, ADJ]"
197,moderado,O,"[domiciliaria, ADJ]"
198,residuo,O,"[sonda, NOUN]"


In [80]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96497 entries, 0 to 96496
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Word    96497 non-null  object
 1   Tag     96497 non-null  object
 2   POS     96497 non-null  object
dtypes: object(3)
memory usage: 2.2+ MB


## **Feature Extraction**

## **MODEL**

## **MODEL EVALUATION**