# **Deep Learning Method**
### Group 1 - Detection of Negation and Uncertainty

- Marino Oliveros Blanco NIU:1668563
- Pere Mayol Carbonell NIU:1669503
- Andreu Gascón Marzo NIU:1670919
- Judith Zaragoza NIU:1634071

## **Library and Data Loading**

In [1]:
# Libraries
import json
import re
from langdetect import detect # Library for language detection
from spellchecker import SpellChecker
import spacy
import random
import string

In [2]:
# Loading the training json file
loading_train = open("negacio_train_v2024.json",)
training_data = json.load(loading_train)
print("The training set contains: ", len(training_data), " samples")

The training set contains:  254  samples


In [3]:
# Loading the json object
loading_test = open("negacio_test_v2024.json")
test_data = json.load(loading_test)
print("The test set contains: ", len(test_data), " samples")

The test set contains:  64  samples


In [4]:
def remove_pacient_info(text):
    # Remove lines starting with "nº historia clinica:" and ending with "motiu d'ingres"
    text = re.sub(r'nº historia clinica:.*?motiu d\'ingres', '', text, flags=re.DOTALL)
    # Remove lines starting with "nhc" and ending with "lopd"
    text = re.sub(r'nhc.*?lopd', '', text, flags=re.DOTALL)
    # Remove all asterisks '*'
    text = text.replace('*', '')
    return text


# 2 Remove Punctuation (Able to be turned ON/OFF)
def remove_punctuation(text):
    # Define a translation table to map punctuation to None
    translator = str.maketrans('', '', string.punctuation)
    # Remove punctuation using the translation table
    text = text.translate(translator)
    return text

## **Data Annotation**
- Set using ground truth word to either O, NEG, UNC, NSCO, USCO

In [5]:
def tag_words_from_json(json_data):
    # Extract text and predictions from JSON data
    text = json_data[0]['data']['text']
    predictions = json_data[0]['predictions'][0]['result']
    text_info = remove_pacient_info(text)
    text_pun = remove_punctuation(text_info)
    # Tokenize text
    tokens = text_pun.split()

    # Initialize counters
    counts = {'NEG': 0, 'NSCO': 0, 'UNC': 0, 'USCO': 0}

    # Initialize tags list
    tags = ['O'] * len(tokens)

    # Tag each word
    for pred in predictions:
        start = pred['value']['start']
        end = pred['value']['end']
        label = pred['value']['labels'][0]
        
        if label in counts:
            counts[label] += 1
            for i in range(len(tokens)):
                if start <= len(' '.join(tokens[:i+1])) < end:
                    tags[i] = label

    # Print tokenized text
    #print("Tokenized Text:")
    #print(tokens)

    # Return tagged words and counts
    return tags, counts, tokens

In [6]:
# EXAMPLE USAGE
# Tag the words from the training data
tags, counts, words = tag_words_from_json(training_data)
print("Tagged Words:")
print(tags)
print("Words:")
print(words)
print("\nCounts:")
print(counts)
print('\n-----------------------------------')
print(f'Lenght of Taggs---> {len(tags)} \nLenght of Words---> {len(words)}')

Tagged Words:
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'NSCO', 'NSCO', 'NSCO', 'NSCO', 'NSCO', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'NEG', 'NSCO', 'NSCO', 'NSCO', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'NEG', 'NSCO', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',

In [7]:
features = {}
for i in range(len(tags)):
    features[words[i]] = tags[i]

In [8]:
print(features)

{'paciente': 'O', 'que': 'O', 'ingresa': 'O', 'de': 'O', 'forma': 'O', 'programada': 'O', 'para': 'O', 'realizacion': 'O', 'uretrotomia': 'O', 'interna': 'O', 'antecedents': 'O', 'alergia': 'O', 'a': 'O', 'penicilina': 'O', 'y': 'O', 'cloramfenicol': 'O', 'no': 'UNC', 'habitos': 'O', 'toxicos': 'O', 'antecedentes': 'O', 'medicos': 'O', 'bloqueo': 'O', 'auriculoventricular': 'O', 'primer': 'O', 'grado': 'O', 'hipertension': 'O', 'arterial': 'O', 'diverticulosis': 'O', 'extensa': 'O', 'insuficiencia': 'O', 'renal': 'O', 'cronica': 'O', 'colelitiasis': 'O', 'quirurgicos': 'O', 'exeresis': 'O', 'lesiones': 'O', 'cutaneas': 'O', 'con': 'O', 'anestesia': 'O', 'local': 'O', 'protesis': 'O', 'total': 'O', 'cadera': 'O', 'cordectomia': 'O', 'herniorrafia': 'O', 'inguinal': 'O', 'proces': 'O', 'actual': 'O', 'varon': 'NSCO', '81a': 'NSCO', 'raiz': 'O', 'episodio': 'O', 'hematuria': 'O', 'macroscopica': 'O', 'se': 'O', 'realiza': 'O', 'cistoscopia': 'O', 'es': 'O', 'negativa': 'O', 'malignas': 'O

In [9]:
word_feature = []
tag_feature = []

for word, tag in features.items():
    word_feature.append(word)
    tag_feature.append(tag)

import pandas as pd 

data_tuples = list(zip(word_feature, tag_feature))

data = pd.DataFrame(data_tuples, columns=['Word', 'Tag'])

In [10]:
data.head(100)

Unnamed: 0,Word,Tag
0,paciente,O
1,que,O
2,ingresa,O
3,de,O
4,forma,O
...,...,...
95,anterior,O
96,dos,O
97,focales,O
98,peneana,O


In [11]:
!pip install spacy
import spacy

# Load the Spanish language model
nlp = spacy.load("es_core_news_md")

def get_pos_tag(words):
    """
    Given a list of words in Spanish, returns a list of POS tags for each word.
    
    Args:
    words (list of str): The words to be analyzed.

    Returns:
    list of tuples: A list of tuples where each tuple contains a word and its corresponding POS tag.
    """
    # Join the words into a single string for processing
    text = ' '.join(words)
    doc = nlp(text)
    
    # Extract POS tags for each word
    pos_tags = [(token.text, token.pos_) for token in doc]
    
    return pos_tags
pos_tags = get_pos_tag(words)
print(pos_tags)


[('paciente', 'ADJ'), ('que', 'SCONJ'), ('ingresa', 'VERB'), ('de', 'ADP'), ('forma', 'NOUN'), ('programada', 'ADJ'), ('para', 'ADP'), ('realizacion', 'PROPN'), ('de', 'ADP'), ('uretrotomia', 'NOUN'), ('interna', 'ADJ'), ('antecedents', 'NOUN'), ('alergia', 'VERB'), ('a', 'ADP'), ('penicilina', 'NOUN'), ('y', 'CCONJ'), ('cloramfenicol', 'ADJ'), ('no', 'ADV'), ('habitos', 'NOUN'), ('toxicos', 'ADJ'), ('antecedentes', 'NOUN'), ('medicos', 'ADJ'), ('bloqueo', 'PROPN'), ('auriculoventricular', 'PROPN'), ('de', 'ADP'), ('primer', 'ADJ'), ('grado', 'NOUN'), ('hipertension', 'ADJ'), ('arterial', 'ADJ'), ('diverticulosis', 'NOUN'), ('extensa', 'ADJ'), ('insuficiencia', 'NOUN'), ('renal', 'ADJ'), ('cronica', 'ADJ'), ('colelitiasis', 'NOUN'), ('antecedentes', 'NOUN'), ('quirurgicos', 'ADJ'), ('exeresis', 'NOUN'), ('de', 'ADP'), ('lesiones', 'NOUN'), ('cutaneas', 'ADJ'), ('con', 'ADP'), ('anestesia', 'NOUN'), ('local', 'ADJ'), ('protesis', 'NOUN'), ('total', 'ADJ'), ('de', 'ADP'), ('cadera', 'NOU

In [12]:
'''import spacy

# Load SpaCy's Spanish language model
nlp = spacy.load("es_core_news_md")

# Function to get part-of-speech tags for a given list of words
def get_pos_tags(words):
    pos_tags_texts = []
    for token in words:
        doc = nlp(token)
        pos_tag = doc[0].pos_
        pos_tags_texts.append(pos_tag)
    print(f'Length of Words---> {len(pos_tags_texts)}')
    return pos_tags_texts
pos_tags_texts = get_pos_tags(words=words)'''

'import spacy\n\n# Load SpaCy\'s Spanish language model\nnlp = spacy.load("es_core_news_md")\n\n# Function to get part-of-speech tags for a given list of words\ndef get_pos_tags(words):\n    pos_tags_texts = []\n    for token in words:\n        doc = nlp(token)\n        pos_tag = doc[0].pos_\n        pos_tags_texts.append(pos_tag)\n    print(f\'Length of Words---> {len(pos_tags_texts)}\')\n    return pos_tags_texts\npos_tags_texts = get_pos_tags(words=words)'

In [14]:
data_tuples = list(zip(word_feature, tag_feature, pos_tags))

data = pd.DataFrame(data_tuples, columns=['Word', 'Tag', 'POS'])
data.head(50)

Unnamed: 0,Word,Tag,POS
0,paciente,O,"(paciente, ADJ)"
1,que,O,"(que, SCONJ)"
2,ingresa,O,"(ingresa, VERB)"
3,de,O,"(de, ADP)"
4,forma,O,"(forma, NOUN)"
5,programada,O,"(programada, ADJ)"
6,para,O,"(para, ADP)"
7,realizacion,O,"(realizacion, PROPN)"
8,uretrotomia,O,"(de, ADP)"
9,interna,O,"(uretrotomia, NOUN)"


## **Feature Extraction**

## **MODEL**

## **MODEL EVALUATION**