# **Deep Learning Method**
### Group 1 - Detection of Negation and Uncertainty

- Marino Oliveros Blanco NIU:1668563
- Pere Mayol Carbonell NIU:1669503
- Andreu Gascón Marzo NIU:1670919
- Judith Zaragoza NIU:1634071

## **Library and Data Loading**

In [231]:
# Libraries
import json
import re
from langdetect import detect # Library for language detection
from spellchecker import SpellChecker
import spacy
import random
import numpy as np
import pandas as pd
import string
import os

In [232]:
os.chdir('C:/GitHub Repositories/NLP-Detection-of-Negation-and-Uncertainty-Project-24/Data/')

In [233]:
# Loading the training json file
loading_train = open("negacio_train_v2024.json",)
training_data = json.load(loading_train)
print("The training set contains: ", len(training_data), " samples")

The training set contains:  254  samples


In [234]:
# Loading the json object
loading_test = open("negacio_test_v2024.json")
test_data = json.load(loading_test)
print("The test set contains: ", len(test_data), " samples")

The test set contains:  64  samples


## **Data Annotation**
- Set using ground truth word to either O, NEG, UNC, NSCO, USCO

In [235]:
def tag_words_from_json(json_data):
    # Initialize lists for all tagged texts and counts
    all_tagged_texts = []
    all_counts = []

    # Iterate through each entry in the JSON data
    for entry in json_data:
        # Extract text and predictions from JSON entry
        text = entry['data']['text']
        predictions = entry['predictions'][0]['result']

        # Initialize counters
        counts = {'NEG': 0, 'NSCO': 0, 'UNC': 0, 'USCO': 0}

        # Initialize tokens and their coordinates
        tokens = []
        start_pos = 0

        # Split text into tokens while tracking their start and end positions
        for word in text.split():
            start = text.find(word, start_pos)
            end = start + len(word)
            tokens.append((word, start, end))
            start_pos = end

        # Initialize tags list
        tags = ['O'] * len(tokens)

        # Tag each word
        for pred in predictions:
            pred_start = pred['value']['start']
            pred_end = pred['value']['end']
            label = pred['value']['labels'][0]

            if label in counts:
                counts[label] += 1
                for i, (word, start, end) in enumerate(tokens):
                    if start < pred_end and end > pred_start:
                        tags[i] = label

        # Combine tokens with tags
        tagged_text = [(token[0], (token[1], token[2]), tags[i]) for i, token in enumerate(tokens)]

        # Append the results to the lists
        all_tagged_texts.append(tagged_text)
        all_counts.append(counts)

    # Return the list of tagged words and counts for all entries
    return all_tagged_texts, all_counts

In [236]:
def strip_tokens(tagged_text):
    def find_indices(tokens, start_token, end_token, occurrence=1):
        start_index = None
        end_index = None
        end_token_count = 0
        for i, (word, _, _) in enumerate(tokens):
            if word == start_token and start_index is None:
                start_index = i
            if word == end_token:
                end_token_count += 1
                if end_token_count == occurrence:
                    end_index = i
                    break
        return start_index, end_index

    # Remove sections from "nº" to the second occurrence of "d'ingres"
    start1, end1 = find_indices(tagged_text, 'nº', "d'ingres", occurrence=2)
    if end1 is not None:
        end1 += 1  # Include the end token

    # If the indices are found, remove the section
    new_tagged_text = []
    if start1 is not None and end1 is not None:
        new_tagged_text = tagged_text[:start1] + tagged_text[end1:]
    else:
        new_tagged_text = tagged_text[:]

    # Continuously find and remove all sections from "nhc" to "lopd"
    while True:
        start2, end2 = find_indices(new_tagged_text, 'nhc', 'lopd')
        if start2 is not None and end2 is not None:
            end2 += 1  # Include the end token
            new_tagged_text = new_tagged_text[:start2] + new_tagged_text[end2:]
        else:
            break

    return new_tagged_text

def tag_words_from_json(json_data):
    # Initialize lists for all tagged texts and counts
    all_tagged_texts = []
    all_counts = []

    # Iterate through each entry in the JSON data
    for entry in json_data:
        # Extract text and predictions from JSON entry
        text = entry['data']['text']
        predictions = entry['predictions'][0]['result']

        # Initialize counters
        counts = {'NEG': 0, 'NSCO': 0, 'UNC': 0, 'USCO': 0}

        # Initialize tokens and their coordinates
        tokens = []
        start_pos = 0

        # Split text into tokens while tracking their start and end positions
        for word in text.split():
            start = text.find(word, start_pos)
            end = start + len(word)
            tokens.append((word, start, end))
            start_pos = end

        # Initialize tags list
        tags = ['O'] * len(tokens)

        # Tag each word
        for pred in predictions:
            pred_start = pred['value']['start']
            pred_end = pred['value']['end']
            label = pred['value']['labels'][0]

            if label in counts:
                counts[label] += 1
                for i, (word, start, end) in enumerate(tokens):
                    if start < pred_end and end > pred_start:
                        tags[i] = label

        # Combine tokens with tags
        tagged_text = [(token[0], (token[1], token[2]), tags[i]) for i, token in enumerate(tokens)]

        # Strip unwanted tokens
        stripped_tagged_text = strip_tokens(tagged_text)

        # Append the results to the lists
        all_tagged_texts.append(stripped_tagged_text)
        all_counts.append(counts)

    # Return the list of tagged words and counts for all entries
    return all_tagged_texts, all_counts

In [237]:
# Example usage with all entries
labels, counts_list = tag_words_from_json(training_data)

print("Number of entries in the entry:", len(labels))
print(labels[0]) # Print out an example entry
print(labels[0][0]) # Print out an example word
print(labels[0][0][2]) # Tag
len(labels[0])

Number of entries in the entry: 254
[('paciente', (315, 323), 'O'), ('que', (324, 327), 'O'), ('ingresa', (328, 335), 'O'), ('de', (336, 338), 'O'), ('forma', (339, 344), 'O'), ('programada', (345, 355), 'O'), ('para', (356, 360), 'O'), ('realizacion', (361, 372), 'O'), ('de', (373, 375), 'O'), ('uretrotomia', (376, 387), 'O'), ('interna', (388, 395), 'O'), ('.', (396, 397), 'O'), ('antecedents', (398, 409), 'O'), ('alergia', (410, 417), 'O'), ('a', (418, 419), 'O'), ('penicilina', (420, 430), 'O'), ('y', (431, 432), 'O'), ('cloramfenicol', (433, 446), 'O'), ('.', (447, 448), 'O'), ('no', (449, 451), 'NEG'), ('habitos', (452, 459), 'NSCO'), ('toxicos.', (460, 468), 'NSCO'), ('antecedentes', (469, 481), 'O'), ('medicos:', (482, 490), 'O'), ('bloqueo', (491, 498), 'O'), ('auriculoventricular', (499, 518), 'O'), ('de', (519, 521), 'O'), ('primer', (522, 528), 'O'), ('grado', (529, 534), 'O'), ('hipertension', (535, 547), 'O'), ('arterial.', (548, 557), 'O'), ('diverticulosis', (558, 572),

497

In [238]:
word_feature = []
scope_feature = []
tag_feature = []
maj_number = []
contains_num = []
is_number = []
num = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']

def num_in_word(word):
    for thing in word:
        if thing in num:
            return 1
    return 0

def num_(word):
    if word.isdigit():
            return 1
    return 0

def proportion_number(word):
    count= 0
    for thing in word:
        if thing in num:
            count+=1
    if count > (len(word)/2):
        return 1
    return 0

for label in labels:
    for it in label:
        word_feature.append(it[0])
        scope_feature.append(it[1])
        tag_feature.append(it[2])

        word_separate = it[0].split()

        contains_num.append(num_in_word(it[0]))
        maj_number.append(proportion_number(it[0]))
        is_number.append(num_(it[0]))


In [239]:
feature_dict = {}
for i in range(len(word_feature)):
    feature_dict[word_feature[i]] = {
        'Scope': scope_feature[i],
        'Tag': tag_feature[i],
        'Maj NUMBER': maj_number[i],
        'Contains NUMBER': contains_num[i],
        'Is NUMBER': is_number[i]
    }
first_item = next(iter(feature_dict.items()))
print(first_item)

('paciente', {'Scope': (3194, 3202), 'Tag': 'O', 'Maj NUMBER': 0, 'Contains NUMBER': 0, 'Is NUMBER': 0})


In [240]:
print(f'The lenght of Word Feature is {len(word_feature)}')
print(f'The lenght of Scope Feature is {len(scope_feature)}')
print(f'The lenght of Tag Feature is {len(tag_feature)}')
print(f'The lenght of Majority Number Feature is {len(maj_number)}')
print(f'The lenght of Contains Number Feature is {len(contains_num)}')

The lenght of Word Feature is 168887
The lenght of Scope Feature is 168887
The lenght of Tag Feature is 168887
The lenght of Majority Number Feature is 168887
The lenght of Contains Number Feature is 168887


## **Feature Extraction**

Relevant features to be extracted:
- POS Tags
- NER Tags
- Dependency Parsing trees
- Coordinates
- Negation Cues and Uncertainty Cues
- Context Window????

In [241]:
!python -m spacy download es_core_news_md

Collecting es-core-news-md==3.7.0
  Using cached https://github.com/explosion/spacy-models/releases/download/es_core_news_md-3.7.0/es_core_news_md-3.7.0-py3-none-any.whl (42.3 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_md')


In [242]:

#load spacy's spanish special.
nlp = spacy.load("es_core_news_md")
#to get the POS as a feature for each word.
def get_pos_tags(words):
    text = ' '.join(words) #convert the words to text.
    doc = nlp(text) #make it a doc using the function dowloaded before from spacy.

    #Create a dictionary where the key = word and the value = POS tag.
    pos_tags_dict = {token.text: token.pos_ for token in doc} 
    lemma_tags_dict = {token.text: token.lemma_ for token in doc} 
    
    pos_tags = [pos_tags_dict.get(word, 'O') for word in words]
    lemma_ = [lemma_tags_dict.get(word, 'O') for word in words]
    #print(f'Length of Words---> {len(pos_tags_dict)}') 
    return pos_tags, lemma_

seperate_texts = []
for label in labels:
    text = []
    for it in label:
        text.append(it[0])
    seperate_texts.append(text)


In [243]:

# Create a list of texts from the labels
separate_texts = []
for label in labels:
    text = []
    for it in label:
        if isinstance(it, str):
            text.append(it)
        elif isinstance(it, tuple) and isinstance(it[0], str):
            text.append(it[0])
    separate_texts.append(text)

# Get POS tags for each text
pos_texts = []
lemma_texts = []

for text in separate_texts:
    pos_tags, lemma_tags = get_pos_tags(text)
    pos_texts.append(pos_tags)
    lemma_texts.append(lemma_tags)


# Print lengths for comparison
print(len(separate_texts[0]))  # Length of the first text in separate_texts
print(len(pos_texts[0]))  # Length of the POS tags dictionary for the first text

# Print the POS tags for the first text as a sample
print(separate_texts[1])
print(pos_texts[1])


497
497
['treball', 'de', 'part', 'antecedents', 'no', 'al·lergies', 'medicamentoses', 'conegudes.', 'no', 'intervencions', 'quirurgiques', 'ni', 'altres', 'antecedents', 'patologics.', 'nega', 'habits', 'toxics.', 'no', 'medicacio', 'habitual.', 'evolucio', 'clinica', 'evolucion', 'parto', 'finaliza', 'por', 'parto', 'eutocico', 'el', 'dia', '04/10', 'a', 'las', '9:34h', 'obtencion', 'de', 'rn', ',', 'sexo:', 'masculino', ',', 'peso:', '2820', 'apgar', ':', '9/10', 'gs:', 'ab+', 'el', 'pueperio', 'transcurre', 'dentro', 'de', 'la', 'normalidad,', 'permaneciendo', 'apiretica,', 'normotensa', 'y', 'con', 'buen', 'estado', 'general.', 'lactancia', 'materna.', 'diagnosticos', 'trabajo', 'de', 'parto', 'desgarros', 'puerperio', 'procedimientos', 'venoclisis', 'monitorizacion', 'nst', 'amniotomia', 'estimulacion', 'oxitocica', 'atencion', 'del', 'parto', 'revision', 'del', 'canal', 'blando', 'del', 'parto', 'sutura', 'de', 'desgarros', 'analgesicos', 'sueroterapia', 'analiticas', 'recomenda

In [244]:
pos_feature_pandas = []
lemma_feature_pandas = []
sentence_id = []
sentence_idx = 0  # Initialize sentence index

# Assuming pos_texts and lemma_texts are lists of lists, where each inner list represents a sentence
for pos_sentence, lemma_sentence in zip(pos_texts, lemma_texts):
    for pos, lemma in zip(pos_sentence, lemma_sentence):
        pos_feature_pandas.append(pos)
        lemma_feature_pandas.append(lemma)
        sentence_id.append(sentence_idx)
    sentence_idx += 1  # Increment sentence index after processing each sentence


In [245]:
initial_scopes = []
final_scopes = []

for start, end in scope_feature:
    initial_scopes.append(start)
    final_scopes.append(end)
    
print(f'The lenght of Word Feature is {len(word_feature)}')
print(f'The lenght of Initial Scopes Feature is {len(initial_scopes)}')
print(f'The lenght of Initial Scopes Feature is {len(final_scopes)}')
print(f'The lenght of Tag Feature is {len(tag_feature)}')
print(f'The lenght of POS Feature is {len(pos_feature_pandas)}')
print(f'The lenght of LEMMA Feature is {len(lemma_feature_pandas)}')


The lenght of Word Feature is 168887
The lenght of Initial Scopes Feature is 168887
The lenght of Initial Scopes Feature is 168887
The lenght of Tag Feature is 168887
The lenght of POS Feature is 168887
The lenght of LEMMA Feature is 168887


In [246]:
import pandas as pd
data_tuples = list(zip(word_feature, initial_scopes, final_scopes, tag_feature, pos_feature_pandas, lemma_feature_pandas, is_number,contains_num, maj_number,sentence_id))

data = pd.DataFrame(data_tuples, columns=['Word', 'Initial Scopes','Final Scopes','Tag', 'POS', 'LEMMA', 'NUMBER', 'Contains NUMBER', 'Maj NUMBER', 'text_id'])

In [247]:
from IPython.display import display

def show_data(datos, first, second):
    styled_data = datos.iloc[first:second].style.set_properties(**{'text-align': 'left'})
    display(styled_data)

show_data(datos=data, first=41, second=100)

Unnamed: 0,Word,Initial Scopes,Final Scopes,Tag,POS,LEMMA,NUMBER,Contains NUMBER,Maj NUMBER,text_id
41,lesiones,660,668,O,NOUN,lesión,0,0,0,0
42,cutaneas,669,677,O,ADJ,cutanea,0,0,0,0
43,con,678,681,O,ADP,con,0,0,0,0
44,anestesia,682,691,O,NOUN,anestesia,0,0,0,0
45,local,692,697,O,ADJ,local,0,0,0,0
46,protesis,698,706,O,NOUN,protesis,0,0,0,0
47,total,707,712,O,ADJ,total,0,0,0,0
48,de,713,715,O,ADP,de,0,0,0,0
49,cadera,716,722,O,NOUN,cadera,0,0,0,0
50,cordectomia,723,734,O,ADJ,cordectomio,0,0,0,0


In [248]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168887 entries, 0 to 168886
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Word             168887 non-null  object
 1   Initial Scopes   168887 non-null  int64 
 2   Final Scopes     168887 non-null  int64 
 3   Tag              168887 non-null  object
 4   POS              168887 non-null  object
 5   LEMMA            168887 non-null  object
 6   NUMBER           168887 non-null  int64 
 7   Contains NUMBER  168887 non-null  int64 
 8   Maj NUMBER       168887 non-null  int64 
 9   text_id          168887 non-null  int64 
dtypes: int64(6), object(4)
memory usage: 12.9+ MB


In [249]:
# Filter the DataFrame where "Maj NUMBER" is equal to 1
filtered_df = data[(data['Maj NUMBER'] == 0) & (data['Contains NUMBER'] == 1)] 
show_data(datos=filtered_df, first=0, second=10)

Unnamed: 0,Word,Initial Scopes,Final Scopes,Tag,POS,LEMMA,NUMBER,Contains NUMBER,Maj NUMBER,text_id
456,"38ºc,",3549,3554,O,O,O,0,1,0,0
818,4gr,1670,1673,O,NUM,4gr,0,1,0,2
820,c/6hrs,1677,1683,O,PROPN,c/6hrs,0,1,0,2
880,c/8hrs,2124,2130,O,PROPN,c/8hrs,0,1,0,2
1045,x3,3274,3276,O,PROPN,x3,0,1,0,2
1056,4mg/6h,3368,3374,O,NUM,4mg/6h,0,1,0,2
1071,500mg/8h,3484,3492,O,NUM,500mg/8h,0,1,0,2
1224,1º,620,622,O,NUM,1º,0,1,0,3
1274,abril/18.,916,925,O,O,O,0,1,0,3
1293,septiembre/18,1082,1095,O,NOUN,septiembre/18,0,1,0,3


In [250]:
# Filter the DataFrame where "Maj NUMBER" is equal to 1
filtered_df = data[(data['Maj NUMBER'] == 1) & (data['Contains NUMBER'] == 1)] 
show_data(datos=filtered_df, first=0, second=10)

Unnamed: 0,Word,Initial Scopes,Final Scopes,Tag,POS,LEMMA,NUMBER,Contains NUMBER,Maj NUMBER,text_id
57,81a,780,783,O,NUM,81a,0,1,1,0
122,(11/2017):,1206,1216,O,O,O,0,1,1,0
154,(250,1420,1424,O,O,O,0,1,1,0
217,(10/2017),1892,1901,O,O,O,0,1,1,0
243,24,2086,2088,O,NUM,24,1,1,1,0
247,2018,2101,2105,O,NUM,2018,1,1,1,0
335,n40.0,2741,2746,O,ADJ,n40.0,0,1,1,0
347,04.81,2839,2844,O,NUM,04.81,0,1,1,0
357,58.0,2904,2908,O,NUM,58.0,0,1,1,0
433,14,3411,3413,O,NUM,14,1,1,1,0


In [251]:
# Filter the DataFrame where "Maj NUMBER" is equal to 1
filtered_df = data[(data['Maj NUMBER'] == 1) & (data['Contains NUMBER'] == 0)] 
show_data(datos=filtered_df, first=0, second=10)

Unnamed: 0,Word,Initial Scopes,Final Scopes,Tag,POS,LEMMA,NUMBER,Contains NUMBER,Maj NUMBER,text_id


In [252]:
# Filter the DataFrame where "Maj NUMBER" is equal to 1
filtered_df = data[(data['Maj NUMBER'] == 0) & (data['Contains NUMBER'] == 0)] 
show_data(datos=filtered_df, first=0, second=10)

Unnamed: 0,Word,Initial Scopes,Final Scopes,Tag,POS,LEMMA,NUMBER,Contains NUMBER,Maj NUMBER,text_id
0,paciente,315,323,O,NOUN,paciente,0,0,0,0
1,que,324,327,O,SCONJ,que,0,0,0,0
2,ingresa,328,335,O,VERB,ingresar,0,0,0,0
3,de,336,338,O,ADP,de,0,0,0,0
4,forma,339,344,O,NOUN,forma,0,0,0,0
5,programada,345,355,O,ADJ,programado,0,0,0,0
6,para,356,360,O,ADP,para,0,0,0,0
7,realizacion,361,372,O,PROPN,realizacion,0,0,0,0
8,de,373,375,O,ADP,de,0,0,0,0
9,uretrotomia,376,387,O,NOUN,uretrotomia,0,0,0,0


In [253]:
# Filter the DataFrame where "Maj NUMBER" is equal to 1
filtered_df = data[data['NUMBER'] == 1] 
show_data(datos=filtered_df, first=0, second=10)

Unnamed: 0,Word,Initial Scopes,Final Scopes,Tag,POS,LEMMA,NUMBER,Contains NUMBER,Maj NUMBER,text_id
243,24,2086,2088,O,NUM,24,1,1,1,0
247,2018,2101,2105,O,NUM,2018,1,1,1,0
433,14,3411,3413,O,NUM,14,1,1,1,0
445,1,3498,3499,O,NUM,1,1,1,1,0
448,8,3507,3508,O,NUM,8,1,1,1,0
540,2820,626,630,O,NUM,2820,1,1,1,1
604,1,1163,1164,O,NUM,1,1,1,1,1
610,1,1190,1191,O,NUM,1,1,1,1,1
679,66,691,693,O,NUM,66,1,1,1,2
694,3,815,816,O,NUM,3,1,1,1,2


## **MODEL**

In [254]:
data.head(10)

Unnamed: 0,Word,Initial Scopes,Final Scopes,Tag,POS,LEMMA,NUMBER,Contains NUMBER,Maj NUMBER,text_id
0,paciente,315,323,O,NOUN,paciente,0,0,0,0
1,que,324,327,O,SCONJ,que,0,0,0,0
2,ingresa,328,335,O,VERB,ingresar,0,0,0,0
3,de,336,338,O,ADP,de,0,0,0,0
4,forma,339,344,O,NOUN,forma,0,0,0,0
5,programada,345,355,O,ADJ,programado,0,0,0,0
6,para,356,360,O,ADP,para,0,0,0,0
7,realizacion,361,372,O,PROPN,realizacion,0,0,0,0
8,de,373,375,O,ADP,de,0,0,0,0
9,uretrotomia,376,387,O,NOUN,uretrotomia,0,0,0,0


In [255]:
from sklearn.preprocessing import LabelEncoder
pos = data['POS']
tag = data['Tag']

# Encoding categorical data to integers
word_encoder = LabelEncoder()
tag_encoder = LabelEncoder()

data['POS_encoded'] = word_encoder.fit_transform(pos)
data['Tag_encoded'] = tag_encoder.fit_transform(tag)

data['POS'] = data['POS_encoded']
data['Tag'] = data['Tag_encoded']

In [256]:
data.head()

Unnamed: 0,Word,Initial Scopes,Final Scopes,Tag,POS,LEMMA,NUMBER,Contains NUMBER,Maj NUMBER,text_id,POS_encoded,Tag_encoded
0,paciente,315,323,2,7,paciente,0,0,0,0,7,2
1,que,324,327,2,14,que,0,0,0,0,14,2
2,ingresa,328,335,2,16,ingresar,0,0,0,0,16,2
3,de,336,338,2,1,de,0,0,0,0,1,2
4,forma,339,344,2,7,forma,0,0,0,0,7,2


In [257]:
from gensim.models import Word2Vec

words = list(data['Word'])
lema = list(data['LEMMA'])

# Train Word2Vec on words
# Optionally, train FastText for handling out-of-vocabulary words
Word2Vec_word = Word2Vec(words, vector_size=300, window=5, min_count=1, workers=4)
Word2Vec_word.save("fasttext_words.model")

Word2Vec_lemma = Word2Vec(lema, vector_size=300, window=5, min_count=1, workers=4)
Word2Vec_lemma.save("fasttext_words.model")


In [258]:
data['Word_encoded'] = Word2Vec_word
data['Lemma_encoded'] = Word2Vec_lemma

In [259]:
data.head()

Unnamed: 0,Word,Initial Scopes,Final Scopes,Tag,POS,LEMMA,NUMBER,Contains NUMBER,Maj NUMBER,text_id,POS_encoded,Tag_encoded,Word_encoded,Lemma_encoded
0,paciente,315,323,2,7,paciente,0,0,0,0,7,2,"Word2Vec<vocab=74, vector_size=300, alpha=0.025>","Word2Vec<vocab=72, vector_size=300, alpha=0.025>"
1,que,324,327,2,14,que,0,0,0,0,14,2,"Word2Vec<vocab=74, vector_size=300, alpha=0.025>","Word2Vec<vocab=72, vector_size=300, alpha=0.025>"
2,ingresa,328,335,2,16,ingresar,0,0,0,0,16,2,"Word2Vec<vocab=74, vector_size=300, alpha=0.025>","Word2Vec<vocab=72, vector_size=300, alpha=0.025>"
3,de,336,338,2,1,de,0,0,0,0,1,2,"Word2Vec<vocab=74, vector_size=300, alpha=0.025>","Word2Vec<vocab=72, vector_size=300, alpha=0.025>"
4,forma,339,344,2,7,forma,0,0,0,0,7,2,"Word2Vec<vocab=74, vector_size=300, alpha=0.025>","Word2Vec<vocab=72, vector_size=300, alpha=0.025>"


In [260]:
data.drop(columns=['POS', 'Tag','Initial Scopes','Final Scopes','NUMBER','Contains NUMBER','Maj NUMBER', 'Word','LEMMA'], inplace = True)

In [261]:
data.head()

Unnamed: 0,text_id,POS_encoded,Tag_encoded,Word_encoded,Lemma_encoded
0,0,7,2,"Word2Vec<vocab=74, vector_size=300, alpha=0.025>","Word2Vec<vocab=72, vector_size=300, alpha=0.025>"
1,0,14,2,"Word2Vec<vocab=74, vector_size=300, alpha=0.025>","Word2Vec<vocab=72, vector_size=300, alpha=0.025>"
2,0,16,2,"Word2Vec<vocab=74, vector_size=300, alpha=0.025>","Word2Vec<vocab=72, vector_size=300, alpha=0.025>"
3,0,1,2,"Word2Vec<vocab=74, vector_size=300, alpha=0.025>","Word2Vec<vocab=72, vector_size=300, alpha=0.025>"
4,0,7,2,"Word2Vec<vocab=74, vector_size=300, alpha=0.025>","Word2Vec<vocab=72, vector_size=300, alpha=0.025>"


In [262]:
print(len(data))

168887


In [263]:
unique_pos_tags = data['POS_encoded'].unique()
print(unique_pos_tags)


[ 7 14 16  1  0 12 13  4  2  9 11  8  3  5 15  6 10 17]


In [264]:
import torch
import torch.nn as nn
import torch.optim as optim

class NLPModel(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, num_pos_tags, pos_embedding_dim, num_embeddings_lemma, lemma_embedding_dim, hidden_dim, lstm_out_dim, output_dim):
        super(NLPModel, self).__init__()
        
        # Embedding layers
        self.word_embeddings = nn.Embedding(num_embeddings, embedding_dim)  # Pre-trained, frozen
        self.pos_embeddings = nn.Embedding(num_pos_tags, pos_embedding_dim)
        self.case_embeddings = nn.Embedding(num_embeddings_lemma, lemma_embedding_dim)
        
        # Concatenation dimension
        concat_dim = embedding_dim + pos_embedding_dim + lemma_embedding_dim
        
        # Dense layer prior to LSTM
        self.dense = nn.Linear(concat_dim, hidden_dim)
        
        # LSTM layer
        self.lstm = nn.LSTM(hidden_dim, lstm_out_dim, batch_first=True, bidirectional=True)
        
        # Output layer
        self.output_layer = nn.Linear(2 * lstm_out_dim, output_dim)  # Correct input dimension

        # Dropout layer
        self.dropout = nn.Dropout(0.25)

    def forward(self, x, pos, lemma):
        # Embed each input type
        x = self.word_embeddings(x)
        pos = self.pos_embeddings(pos)
        lemma = self.case_embeddings(lemma)  # Corrected from 'case' to 'lemma'
        
        # Concatenate embeddings
        x = torch.cat((x, pos, lemma), dim=-1)
        
        # Apply dense and activation
        x = torch.tanh(self.dense(x))
        
        # Apply dropout
        x = self.dropout(x)
        
        # LSTM layer
        lstm_out, _ = self.lstm(x)
        
        # Output layer
        output = self.output_layer(lstm_out)
        
        return output


# Hyperparameters and model instantiation

In [265]:
def train_model(dataloader, model, batch_size, criterion, optimizer, num_epochs):
    model.train()

    losses = []
    for epoch in range(num_epochs):

        for batch, (x, y) in enumerate(dataloader):
            h_state, c_state = model.init_hidden(batch_size)

            optimizer.zero_grad()

            y_pred, h_state, c_state = model(x, h_state, c_state)

            input_lengths = torch.full(size=(batch_size,), fill_value=64, dtype=torch.long)
            ##input_lengths = torch.sum((y_pred != 0), dim=1)
            ##input_lengths = torch.full(size=(batch_size,), fill_value=64, dtype=torch.long)

            target_lengths = torch.full(size=(batch_size,), fill_value=24, dtype=torch.long)
            ##target_lengths = torch.randint(high=max_str_len, size=(batch_size,), dtype=torch.long)
            ##target_lengths = torch.randint(low=1, high=max_str_len, size=(batch_size,), dtype=torch.long)
            ##target_lengths = torch.sum((y != 0), dim=1)

            y_pred = y_pred.transpose(1,0)
            loss = criterion(y_pred, y, input_lengths, target_lengths) #.transpose(1, 2)

            loss.backward()

            # Gradient clipping
            #nn.utils.clip_grad_norm_(model.parameters(), 1)
            #gradient_clipping(model, 0.0001, 1)

            for name, param in model.named_parameters():
              if param.grad is not None:
                print(name, torch.mean(param.grad).item(), loss.item())

            optimizer.step()

            if batch%30 == 0:
                print({ 'epoch': epoch, 'batch': batch, 'loss': loss.item() })
                losses.append(loss.item())
    return losses


In [266]:
num_embeddings = len(data)  # Vocabulary size of word embeddings
embedding_dim = 300    # Dimension of pre-trained word embeddings

num_pos_tags = len(data)      # Number of unique POS tags
pos_embedding_dim = 50 # Dimension of POS embeddings

num_embeddings_lemma = len(data)
lemma_embedding_dim = 300

#num_case_tags = 4      # Number of casing categories
#case_embedding_dim = 8 # Dimension of case embeddings
hidden_dim = 200       # Dense layer size
lstm_out_dim = 300     # LSTM output dimension
output_dim = 4  # Output dimension based on the task (e.g., number of classes)

model = NLPModel(num_embeddings, embedding_dim, num_pos_tags, pos_embedding_dim,
                 num_embeddings_lemma, lemma_embedding_dim, hidden_dim, lstm_out_dim, output_dim)

# Optimizer
optimizer = optim.Adagrad(model.parameters(), lr=0.01)

In [267]:
batch_size = 128
num_epochs = 10
losses = {"train": [], "val": []}
for epoch in range(10):
    train_loss = train_model(train_loader, model, batch_size, criterion, optimizer, num_epochs)
    val_loss = validate(criterion, model, valid_loader)

    losses["train"].append(train_loss)
    losses["val"].append(val_loss)

    plt.plot(losses["train"], label="training loss")
    plt.plot(losses["val"], label="validation loss")

    plt.legend()
    plt.pause(0.000001)
    plt.show()

## Gradients are vanishing

NameError: name 'train_loader' is not defined

## **MODEL EVALUATION**