# **Deep Learning Method**
### Group 1 - Detection of Negation and Uncertainty

- Marino Oliveros Blanco NIU:1668563
- Pere Mayol Carbonell NIU:1669503
- Andreu Gascón Marzo NIU:1670919
- Judith Zaragoza NIU:1634071

## **Library and Data Loading**

In [99]:
# Libraries
import json
import re
from langdetect import detect # Library for language detection
from spellchecker import SpellChecker
import spacy
import random
import string

import pandas as pd 


In [100]:
# Loading the training json file
loading_train = open("negacio_train_v2024.json",)
training_data = json.load(loading_train)
print("The training set contains: ", len(training_data), " samples")

The training set contains:  254  samples


In [101]:
# Loading the json object
loading_test = open("negacio_test_v2024.json")
test_data = json.load(loading_test)
print("The test set contains: ", len(test_data), " samples")

The test set contains:  64  samples


In [102]:
def remove_pacient_info(text):
    # Remove lines starting with "nº historia clinica:" and ending with "motiu d'ingres"
    text = re.sub(r'nº historia clinica:.*?motiu d\'ingres', '', text, flags=re.DOTALL)
    # Remove lines starting with "nhc" and ending with "lopd"
    text = re.sub(r'nhc.*?lopd', '', text, flags=re.DOTALL)
    # Remove all asterisks '*'
    text = text.replace('*', '')
    return text


# 2 Remove Punctuation (Able to be turned ON/OFF)
def remove_punctuation(text):
    # Define a translation table to map punctuation to None
    translator = str.maketrans('', '', string.punctuation)
    # Remove punctuation using the translation table
    text = text.translate(translator)
    return text

## **Data Annotation**
- Set using ground truth word to either O, NEG, UNC, NSCO, USCO

In [103]:
def tag_words_from_json(json_data):
    # Extract text and predictions from JSON data
    text = json_data[0]['data']['text']
    predictions = json_data[0]['predictions'][0]['result']
    text_info = remove_pacient_info(text)
    text_pun = remove_punctuation(text_info)
    # Tokenize text
    tokens = text_pun.split()

    # Initialize counters
    counts = {'NEG': 0, 'NSCO': 0, 'UNC': 0, 'USCO': 0}

    # Initialize tags list
    tags = ['O'] * len(tokens)

    # Tag each word
    for pred in predictions:
        start = pred['value']['start']
        end = pred['value']['end']
        label = pred['value']['labels'][0]
        
        if label in counts:
            counts[label] += 1
            for i in range(len(tokens)):
                if start <= len(' '.join(tokens[:i+1])) < end:
                    tags[i] = label

    # Return tagged words and counts
    return tags, counts, tokens

In [104]:
# EXAMPLE USAGE
# Tag the words from the training data
tags, counts, words = tag_words_from_json(training_data)
print("Tagged Words:")
print(tags)
print("Words:")
print(words)
print("\nCounts:")
print(counts)
print('\n-----------------------------------')
print(f'Lenght of Taggs---> {len(tags)} \nLenght of Words---> {len(words)}')

Tagged Words:
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'NSCO', 'NSCO', 'NSCO', 'NSCO', 'NSCO', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'NEG', 'NSCO', 'NSCO', 'NSCO', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'NEG', 'NSCO', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',

In [105]:
features = {}
for i in range(len(tags)):
    features[words[i]] = tags[i]

In [106]:
print(features)

{'paciente': 'O', 'que': 'O', 'ingresa': 'O', 'de': 'O', 'forma': 'O', 'programada': 'O', 'para': 'O', 'realizacion': 'O', 'uretrotomia': 'O', 'interna': 'O', 'antecedents': 'O', 'alergia': 'O', 'a': 'O', 'penicilina': 'O', 'y': 'O', 'cloramfenicol': 'O', 'no': 'UNC', 'habitos': 'O', 'toxicos': 'O', 'antecedentes': 'O', 'medicos': 'O', 'bloqueo': 'O', 'auriculoventricular': 'O', 'primer': 'O', 'grado': 'O', 'hipertension': 'O', 'arterial': 'O', 'diverticulosis': 'O', 'extensa': 'O', 'insuficiencia': 'O', 'renal': 'O', 'cronica': 'O', 'colelitiasis': 'O', 'quirurgicos': 'O', 'exeresis': 'O', 'lesiones': 'O', 'cutaneas': 'O', 'con': 'O', 'anestesia': 'O', 'local': 'O', 'protesis': 'O', 'total': 'O', 'cadera': 'O', 'cordectomia': 'O', 'herniorrafia': 'O', 'inguinal': 'O', 'proces': 'O', 'actual': 'O', 'varon': 'NSCO', '81a': 'NSCO', 'raiz': 'O', 'episodio': 'O', 'hematuria': 'O', 'macroscopica': 'O', 'se': 'O', 'realiza': 'O', 'cistoscopia': 'O', 'es': 'O', 'negativa': 'O', 'malignas': 'O

In [107]:
word_feature = []
tag_feature = []

for word, tag in features.items():
    word_feature.append(word)
    tag_feature.append(tag)

In [108]:
#load spacy's spanish special.
nlp = spacy.load("es_core_news_md")
#to get the POS as a feature for each word.
def get_pos_tags(words):
    text = ' '.join(words) #convert the words to text.
    doc = nlp(text) #make it a doc using the function dowloaded before from spacy.

    #Create a dictionary where the key = word and the value = POS tag.
    pos_tags_dict = {token.text: token.pos_ for token in doc} 
    
    print(f'Length of Words---> {len(pos_tags_dict)}') 
    return pos_tags_dict


pos_tags_dict = get_pos_tags(words=words)

pos_tags_feature = []
for word, pos in pos_tags_dict.items():
    pos_tags_feature.append(pos)


Length of Words---> 293


In [109]:
data_tuples = list(zip(word_feature, tag_feature, pos_tags_feature))

data = pd.DataFrame(data_tuples, columns=['Word', 'Tag', 'POS'])
data.head(10)

Unnamed: 0,Word,Tag,POS
0,paciente,O,NOUN
1,que,O,SCONJ
2,ingresa,O,VERB
3,de,O,ADP
4,forma,O,NOUN
5,programada,O,ADJ
6,para,O,ADP
7,realizacion,O,PROPN
8,uretrotomia,O,ADJ
9,interna,O,ADJ


In [110]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292 entries, 0 to 291
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Word    292 non-null    object
 1   Tag     292 non-null    object
 2   POS     292 non-null    object
dtypes: object(3)
memory usage: 7.0+ KB


## **Feature Extraction**

## **MODEL**

## **MODEL EVALUATION**