In [3]:
import os
import time
import json
import random
import torch
import torch
from torch import optim
import torch.nn as nn
import spacy
import pandas as pd
# !spacy download pt_core_news_lg
from dataset import Data, DataBERT
from transformers import AutoTokenizer
from model import CRF, LinearLayerCRF, BERTSlotFilling
from evaluation import Evaluation
from trainer import Trainer
from random import Random
import pickle

In [4]:
# --- Settings shared by every experiment cell --------------------------------
# Number of repetitions each model cell runs; the reported F1 is the mean over them.
NUM_EXPERIMENTS = 1
# Root folder (trailing slash required: cells build sub-folder names by concatenation).
OUTPUT_PATH = "output_files/"
# Curated, annotated CSV that every loader in this notebook reads.
DATA_PATH = "../../data/curated_dataset_2021_10_15.csv"

# --- Training-loop settings ---------------------------------------------------
# Read by both the Linear Layer + CRF cell and the BERT cell.
NUM_EPOCHS, BATCH = 10, 2

# --- BERT settings --------------------------------------------------------------
# First argument of BERTSlotFilling; presumably the encoder hidden size -- TODO confirm.
HIDDEN_DIM = 1024

# Loading and splitting the dataset in the new export format from WebAnno

In [5]:
def _to_iob2(file_tags):
    """Convert the raw per-token entity annotations of one file into IOB2 tags.

    ``'_'`` marks a token outside any entity. Any other value is an entity label,
    optionally followed by a ``[n]`` suffix; the suffix is stripped from the
    emitted tag but still used to tell neighbouring entities apart.
    """
    tags = []
    cur_tag = ''
    for file_tag in file_tags:
        if file_tag == '_':
            tags.append('O')
            # An outside token closes the running entity. Without this reset the
            # next token carrying the same label would wrongly get an 'I-' tag
            # right after an 'O', which is not valid IOB2.
            cur_tag = ''
            continue
        # remove number at the end of file tag
        tag = file_tag.split('[')[0]
        if file_tag != cur_tag:
            tags.append('B-' + tag)
            cur_tag = file_tag
        else:
            tags.append('I-' + tag)
    return tags


def load_data_diabete(path_dir):
    """Load the annotated CSV and build the per-file corpus and the vocabularies.

    The CSV must contain the columns ``File``, ``Token`` and ``Entity``. Rows are
    grouped by ``File`` in order of first appearance, and the name of each file
    is printed while it is processed.

    Args:
        path_dir: path of the CSV file to read.

    Returns:
        A 7-tuple ``(corpus, vocab_in, in_w2id, in_id2w, vocab_out, out_w2id,
        out_id2w)`` where

        * ``corpus`` is a list with one dict per file, with the keys
          ``'tokens'``, ``'tags'`` (IOB2) and ``'postags'`` (spaCy ``pos_``);
        * ``vocab_in`` is the set of all tokens plus ``'PAD'`` and ``'UNK'``;
        * ``in_w2id`` / ``in_id2w`` map token <-> id, with ``'PAD'`` = 0,
          ``'UNK'`` = 1 and the remaining tokens numbered by first occurrence;
        * ``vocab_out`` is the set of IOB2 tags;
        * ``out_w2id`` / ``out_id2w`` map tag <-> id, numbered by first occurrence.
    """
    nlp = spacy.load("pt_core_news_lg")
    df = pd.read_csv(path_dir)
    corpus, vocab_in, vocab_out = [], ['PAD', 'UNK'], []
    # One pass over the frame instead of re-filtering it for every file name;
    # sort=False keeps the files in order of first appearance.
    for file_name, file_df in df.groupby('File', sort=False):
        print(file_name)
        tokens = list(file_df['Token'])

        # TODO get postag of each token -> I should get postag of whole text instead
        # (each token is currently tagged in isolation, i.e. without any context)
        postag = [nlp(token)[0].pos_ for token in tokens]

        # put tags in IOB2 format
        tags = _to_iob2(list(file_df['Entity']))

        assert len(tokens) == len(postag)
        corpus.append({'tokens': tokens, 'tags': tags, 'postags': postag})
        vocab_in.extend(tokens)
        vocab_out.extend(tags)

    # Deduplicate while preserving insertion order so that the ids are identical
    # on every run (enumerating a set of strings is not stable across
    # interpreter sessions) and 'PAD'/'UNK' always receive ids 0 and 1.
    ordered_in = list(dict.fromkeys(vocab_in))
    in_w2id = {w: i for i, w in enumerate(ordered_in)}
    in_id2w = {i: w for i, w in enumerate(ordered_in)}
    vocab_in = set(ordered_in)

    ordered_out = list(dict.fromkeys(vocab_out))
    out_w2id = {w: i for i, w in enumerate(ordered_out)}
    out_id2w = {i: w for i, w in enumerate(ordered_out)}
    vocab_out = set(ordered_out)
    return corpus, vocab_in, in_w2id, in_id2w, vocab_out, out_w2id, out_id2w

In [6]:
# Parse the curated CSV. NOTE: at this point `corpus` holds the whole 7-tuple
# returned by load_data_diabete (corpus list, vocabularies and id mappings);
# it is unpacked into separate names in the following cell.
corpus = load_data_diabete(DATA_PATH)

resposta_0001.txt
resposta_0002.txt
resposta_0003.txt
resposta_0004.txt
resposta_0005.txt
resposta_0006.txt
resposta_0007.txt
resposta_0008.txt
resposta_0009.txt
resposta_0010.txt
resposta_0011.txt
resposta_0012.txt
resposta_0013.txt
resposta_0014.txt
resposta_0015.txt
resposta_0016.txt
resposta_0017.txt
resposta_0018.txt
resposta_0019.txt
resposta_0020.txt
resposta_0021.txt
resposta_0022.txt
resposta_0023.txt
resposta_0024.txt
resposta_0025.txt
resposta_0026.txt
resposta_0027.txt
resposta_0028.txt
resposta_0029.txt
resposta_0030.txt
resposta_0031.txt
resposta_0032.txt
resposta_0033.txt
resposta_0034.txt
resposta_0035.txt
resposta_0036.txt
resposta_0037.txt
resposta_0038.txt
resposta_0039.txt
resposta_0040.txt
resposta_0041.txt
resposta_0042.txt
resposta_0043.txt
resposta_0044.txt
resposta_0045.txt
resposta_0046.txt
resposta_0047.txt
resposta_0048.txt
resposta_0049.txt
resposta_0050.txt
resposta_0051.txt
resposta_0052.txt
resposta_0053.txt
resposta_0054.txt
resposta_0055.txt
resposta_0

In [7]:
# Unpack the 7-tuple returned by load_data_diabete. `corpus` is rebound from that
# tuple to its first element (the list of per-file dicts), so this cell is not
# safe to re-run on its own: re-run the loading cell first.
corpus, vocab_in, in_w2id, in_id2w, vocab_out, out_w2id, out_id2w = corpus

In [8]:
# Split the files into train / dev / test: 10% dev, 10% test, the rest train.
HOLDOUT_FRACTION = 0.1  # share of the files given to dev and, separately, to test
SPLIT_SEED = 42         # fixed seed so the split is reproducible

size = int(HOLDOUT_FRACTION * len(corpus))
# Shuffle a copy instead of `corpus` itself: an in-place shuffle would reshuffle
# the already shuffled list each time this cell is re-run and silently produce a
# different split. On a first run the resulting split is unchanged.
shuffled_corpus = list(corpus)
Random(SPLIT_SEED).shuffle(shuffled_corpus)
dev_data = shuffled_corpus[:size]
test_data = shuffled_corpus[size:(2*size)]
train_data = shuffled_corpus[(2*size):]
# The three slices must partition the corpus.
assert len(train_data) + len(dev_data) + len(test_data) == len(corpus)
print(f'Train: {len(train_data)}')
print(f'Dev: {len(dev_data)}')
print(f'Test: {len(test_data)}')

Train: 244
Dev: 30
Test: 30


In [9]:
# Persist each split as ../../data/<split>/<split>.pkl.
for split_name, split_data in (('train', train_data), ('dev', dev_data), ('test', test_data)):
    split_dir = '../../data/%s' % split_name
    # Create the target folder on a fresh checkout instead of failing in open().
    os.makedirs(split_dir, exist_ok=True)
    # The context manager flushes and closes the file even if pickling fails.
    with open('%s/%s.pkl' % (split_dir, split_name), 'wb') as split_file:
        pickle.dump(split_data, split_file)

# Run models

In [None]:
################################################
#           CRF
################################################
# Every run writes into its own timestamped folder below OUTPUT_PATH.
time_str = time.strftime("%Y_%m_%d-%H:%M:%S")
crf_output_folder = OUTPUT_PATH + ('crf_%s' % time_str) + '/'
# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(crf_output_folder, exist_ok=True)
data_info = Data(DATA_PATH)
crf = CRF()
evaluation = Evaluation(crf_output_folder)

print("Evaluating CRF:")
micro_avg_f1 = 0.0
# Independent placeholders (a chained `a = b = c = []` would bind all three
# names to one shared list); they are overwritten inside the loop.
y_true, y_pred, test_tokens = [], [], []
for num_experiment in range(NUM_EXPERIMENTS):
    x_train, y_train, x_test, y_true, test_tokens = crf.get_train_test_data(data_info)
    crf.fit(x_train, y_train)
    y_pred = crf.predict(x_test)
    # The debugging dumps of the full x_test / y_true were removed: they flooded
    # the cell output with the whole test set.
    micro_avg_f1 += evaluation.evaluate(num_experiment, y_true, y_pred)
# Mean of the per-experiment scores returned by evaluation.evaluate.
micro_avg_f1 /= NUM_EXPERIMENTS
print()
print('\tMicro average F1: %.2f' % micro_avg_f1)

# Only the predictions of the last experiment are exported.
evaluation.generate_output_csv('crf_output', y_true, y_pred, test_tokens)

In [None]:
################################################
#           Linear Layer + CRF
################################################
# Every run writes into its own timestamped folder below OUTPUT_PATH.
time_str = time.strftime("%Y_%m_%d-%H:%M:%S")
linear_layer_crf_output_folder = OUTPUT_PATH + ('linear_layer_crf_%s' % time_str) + '/'
# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(linear_layer_crf_output_folder, exist_ok=True)

data_info = Data(DATA_PATH)
# NOTE: this rebinds train_data / test_data, replacing the splits built (and
# pickled) earlier in the notebook with whatever Data.fit() returns.
train_data, test_data = data_info.fit()
vocab_size = len(data_info.vocab_in)
num_classes = len(data_info.vocab_out)

evaluation = Evaluation(linear_layer_crf_output_folder)

micro_avg_f1 = 0.0
# Independent placeholders (a chained `a = b = c = []` would bind all three
# names to one shared list); they are overwritten inside the loop.
y_true_text, y_pred_text, test_tokens = [], [], []
for num_experiment in range(NUM_EXPERIMENTS):
    # Build a fresh model, optimizer and trainer for every experiment. Creating
    # them once outside the loop would make experiment k+1 continue training the
    # weights left by experiment k, so the averaged runs would not be independent.
    model = LinearLayerCRF(num_classes, vocab_size, data_info.out_w2id)
    optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
    trainer = Trainer(model, BATCH, is_bert=False)

    # Evaluate once before training so y_true / y_pred exist even if NUM_EPOCHS is 0.
    y_true, y_pred = trainer.test(test_data)
    for epoch in range(1, NUM_EPOCHS + 1):
        trainer.train(train_data, optimizer, epoch)
        y_true, y_pred = trainer.test(test_data)

    # get test tokens and convert output from number to text
    test_tokens = [info[-1] for info in test_data]
    y_true_text = evaluation.convert_output_to_text(y_true, data_info.out_id2w)
    y_pred_text = evaluation.convert_output_to_text(y_pred, data_info.out_id2w)

    micro_avg_f1 += evaluation.evaluate(num_experiment, y_true_text, y_pred_text)

# Mean of the per-experiment scores returned by evaluation.evaluate.
micro_avg_f1 /= NUM_EXPERIMENTS
print()
print('Micro avg F1: %.2f' % micro_avg_f1)
# Only the predictions of the last experiment are exported.
evaluation.generate_output_csv('linear_layer_output', y_true_text, y_pred_text, test_tokens)

In [8]:
################################################
#           BERT
################################################
# Every run writes into its own timestamped folder below OUTPUT_PATH.
time_str = time.strftime("%Y_%m_%d-%H:%M:%S")
bert_output_folder = OUTPUT_PATH + ('bert_%s' % time_str) + '/'
# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(bert_output_folder, exist_ok=True)

data_info = DataBERT(DATA_PATH)
# NOTE: this rebinds train_data / test_data, replacing any splits created by
# earlier cells with whatever DataBERT.fit() returns.
train_data, test_data = data_info.fit()

# Use the GPU when one is present and fall back to the CPU otherwise; a
# hard-coded 'cuda' device makes the cell crash on CPU-only machines.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_classes = len(data_info.vocab_out)

# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']

# Class-weighted loss: the 'O' class is down-weighted to 0.01 against 1.0 for
# every other class.
weights = [1.] * num_classes
weights[data_info.out_w2id['O']] = 0.01
weights = torch.tensor(weights, device=device)
criterion = nn.CrossEntropyLoss(weight=weights)

evaluation = Evaluation(bert_output_folder)

micro_avg_f1 = 0.0
# Independent placeholders (a chained `a = b = c = []` would bind all three
# names to one shared list); they are overwritten inside the loop.
y_true_text, y_pred_text, test_tokens = [], [], []
for num_experiment in range(NUM_EXPERIMENTS):
    # Build a fresh model, optimizer and trainer for every experiment. Creating
    # them once outside the loop would make experiment k+1 continue fine-tuning
    # the weights left by experiment k, so the averaged runs would not be independent.
    model = BERTSlotFilling(HIDDEN_DIM, num_classes, device=device)
    model.to(device)

    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = optim.Adam(optimizer_grouped_parameters, lr=1e-5)
    trainer = Trainer(model, BATCH, is_bert=True, criterion=criterion, device=device)

    # Evaluate once before training so y_true / y_pred exist even if NUM_EPOCHS is 0.
    y_true, y_pred = trainer.test(test_data)
    for epoch in range(1, NUM_EPOCHS + 1):
        trainer.train(train_data, optimizer, epoch)
        y_true, y_pred = trainer.test(test_data)

    # get test tokens and convert output from number to text
    test_tokens = [info[-1] for info in test_data]
    y_true_text = evaluation.convert_output_to_text(y_true, data_info.out_id2w)
    y_pred_text = evaluation.convert_output_to_text(y_pred, data_info.out_id2w)

    micro_avg_f1 += evaluation.evaluate(num_experiment, y_true_text, y_pred_text)

# Mean of the per-experiment scores returned by evaluation.evaluate.
micro_avg_f1 /= NUM_EXPERIMENTS
print()
print('Micro avg F1: %.2f' % micro_avg_f1)
# Only the predictions of the last experiment are exported.
evaluation.generate_output_csv('bert_output', y_true_text, y_pred_text, test_tokens)

Some weights of the model checkpoint at neuralmind/bert-large-portuguese-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Train Epoch: 1 	Loss: 3.278683
Train Epoch: 2 	Loss: 3.131175
Train Epoch: 3 	Loss: 2.919506
Train Epoch: 4 	Loss: 2.833187
Train Epoch: 5 	Loss: 2.775946
Train Epoch: 6 	Loss: 2.743813
Train Epoch: 7 	Loss: 2.714914
Train Epoch: 8 	Loss: 2.678570
Train Epoch: 9 	Loss: 2.654851
Train Epoch: 10 	Loss: 2.646982
                     precision    recall  f1-score   support

       Complication       0.49      0.80      0.61        45
       DiabetesType       0.00      0.00      0.00         8
           Duration       0.00      0.00      0.00         1
               Food       0.36      0.82      0.50        11
       GlucoseValue       0.14      0.22      0.17        27
            Insulin       0.00      0.00      0.00         7
         Medication       0.14      0.60      0.23         5
NonMedicalTreatment       0.22      0.83      0.35        18
                Set       0.00      0.00      0.00         1
            Symptom       0.32      0.31      0.32        39
               Te

  _warn_prf(average, modifier, msg_start, len(result))
