# Assignment 2 - Named Entity Recognition (NER) in Spanish

- **Nombre:** Mario Vicuña y Miguel Videla

- **Usuario o nombre de equipo en Codalab:** TeamChalla

In [None]:
# Install the Hugging Face `transformers` package and upgrade `torchtext`; `-q` keeps pip quiet.
# NOTE(review): neither install is version-pinned, so re-running this cell may resolve
# different releases than the ones that produced the outputs below — consider pinning.
!pip install transformers -q
!pip3 install --upgrade torchtext -q

[K     |████████████████████████████████| 778kB 9.6MB/s 
[K     |████████████████████████████████| 1.1MB 35.1MB/s 
[K     |████████████████████████████████| 3.0MB 56.1MB/s 
[K     |████████████████████████████████| 890kB 52.2MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 71kB 4.5MB/s 
[?25h

In [None]:
import os
import shutil
import torch
import warnings
import sklearn.exceptions
import numpy as np
from tqdm.notebook import tqdm
from torchtext import data, datasets
from torch.nn import CrossEntropyLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForTokenClassification, AdamW, get_linear_schedule_with_warmup
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import f1_score, precision_score, recall_score

# Reproducibility: fix the torch RNG seed and ask cuDNN for deterministic kernels.
SEED = 123
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Silence sklearn's UndefinedMetricWarning category for the rest of the notebook.
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

# Display the accelerator name. Querying CUDA unconditionally raises on a CPU-only
# machine, which would defeat the fallback chosen for `device` above.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

Using TensorFlow backend.


'Tesla P100-PCIE-16GB'

In [None]:
%%capture
# Fetch the train / validation / test NER files from the dccuchile/CC6205 GitHub release.
# `-nc` (no-clobber) keeps wget from re-downloading a file that is already present;
# `%%capture` hides the command output.
!wget https://github.com/dccuchile/CC6205/releases/download/Data/train_NER_esp.txt -nc 
!wget https://github.com/dccuchile/CC6205/releases/download/Data/val_NER_esp.txt -nc 
!wget https://github.com/dccuchile/CC6205/releases/download/Data/test_NER_esp.txt -nc 

In [None]:
# Utils
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """Split each word into sub-word pieces and expand the labels to match.

    The first piece of a word keeps the word's label; every following piece of
    the same word is labelled ``"<sub>"``.

    A word for which ``tokenizer.tokenize`` returns no pieces contributes
    neither tokens nor a label. Previously its label was still appended, which
    shifted every later label one position away from its token.

    Args:
        sentence: iterable of words (strings).
        text_labels: iterable of labels, one per word in ``sentence``.
        tokenizer: object exposing ``tokenize(word) -> list of pieces``.

    Returns:
        ``(tokenized_sentence, labels)``: two lists of equal length holding the
        pieces and their per-piece labels.
    """
    tokenized_sentence = []
    labels = []
    for word, label in zip(sentence, text_labels):
        pieces = tokenizer.tokenize(word)
        if not pieces:
            # Nothing to attach the label to; skipping keeps tokens and labels aligned.
            continue
        tokenized_sentence.extend(pieces)
        labels.append(label)
        labels.extend(["<sub>"] * (len(pieces) - 1))
    return tokenized_sentence, labels

def ner_preprocess(text, ner_tags, tokenizer, max_len=100, batch_size=32, is_train=True):
    """Convert word-level sentences and tags into a DataLoader of fixed-length tensors.

    Each sentence is split into word pieces, wrapped in "[CLS]" / "[SEP]"
    (both tagged "O"), mapped to ids, and padded or truncated at the end to
    ``max_len``. A row that was cut (its last id is not "[PAD]") gets its final
    position overwritten with "[SEP]" / "O". Tags are looked up in the
    module-level ``tag2idx``.

    Args:
        text: list of sentences, each a list of words.
        ner_tags: list of tag lists, parallel to ``text``.
        tokenizer: tokenizer exposing ``tokenize``, ``convert_tokens_to_ids``
            and a ``vocab`` mapping containing "[PAD]" and "[SEP]".
        max_len: length every sequence is padded/truncated to.
        batch_size: batch size of the returned loader.
        is_train: random sampling when True, sequential order otherwise.

    Returns:
        DataLoader yielding ``(input_ids, attention_mask, tags)`` batches.
    """
    pad_id = tokenizer.vocab["[PAD]"]
    sep_id = tokenizer.vocab["[SEP]"]
    outside_id = tag2idx["O"]

    # Word-piece tokens and labels, each row framed by the special tokens.
    token_rows, label_rows = [], []
    for sent, labs in zip(text, ner_tags):
        pieces, piece_labels = tokenize_and_preserve_labels(sent, labs, tokenizer)
        token_rows.append(["[CLS]"] + pieces + ["[SEP]"])
        label_rows.append(["O"] + piece_labels + ["O"])

    input_ids = pad_sequences(
        [tokenizer.convert_tokens_to_ids(row) for row in token_rows],
        maxlen=max_len, dtype="long", value=pad_id,
        truncating="post", padding="post",
    )
    tags = pad_sequences(
        [[tag2idx.get(name) for name in row] for row in label_rows],
        maxlen=max_len, value=tag2idx["<pad>"], padding="post",
        dtype="long", truncating="post",
    )

    # 1.0 where there is a real token, 0.0 on padding.
    attention_masks = [[float(tok_id != pad_id) for tok_id in row] for row in input_ids]

    # A row with no trailing padding may have lost its "[SEP]" to truncation: restore it.
    for id_row, tag_row in zip(input_ids, tags):
        if id_row[-1] != pad_id:
            id_row[-1] = sep_id
            tag_row[-1] = outside_id

    dataset = TensorDataset(
        torch.tensor(input_ids),
        torch.tensor(attention_masks),
        torch.tensor(tags),
    )
    sampler = RandomSampler(dataset) if is_train else SequentialSampler(dataset)
    return DataLoader(dataset, sampler=sampler, batch_size=batch_size)

def ner_loss(logits, labels, attention_mask, sub_idx, num_labels):
    """Cross-entropy restricted to the positions that should drive training.

    A position counts only when its attention-mask entry equals 1 and its
    label differs from ``sub_idx``. Every other position has its label replaced
    by the criterion's ``ignore_index`` so it is left out of the loss.

    Args:
        logits: model scores; flattened to ``(-1, num_labels)``.
        labels: integer label tensor; flattened to one dimension.
        attention_mask: tensor with the same number of elements as ``labels``.
        sub_idx: label id marking non-initial word pieces.
        num_labels: size of the last dimension of ``logits``.

    Returns:
        Scalar loss tensor.
    """
    criterion = CrossEntropyLoss()
    flat_labels = labels.view(-1)
    keep = (attention_mask.view(-1) == 1) & (flat_labels != sub_idx)
    # Swap every dropped position's label for ignore_index (out-of-place).
    targets = flat_labels.masked_fill(~keep, criterion.ignore_index)
    return criterion(logits.view(-1, num_labels), targets)

def calculate_metrics(preds, y_true, inputs, o_idx=0, pad_idx=9, sub_idx=10):
    """Macro precision, recall and F1 over positions whose gold tag is an entity.

    Predictions are the argmax of ``preds`` along axis 2. Positions whose gold
    label equals ``o_idx``, ``pad_idx`` or ``sub_idx`` are removed from both the
    gold and the predicted sequence before scoring.

    Args:
        preds: score array indexed so that axis 2 runs over labels.
        y_true: gold label ids with one entry per scored position.
        inputs: accepted for call compatibility; not read here.
        o_idx: id of the "outside" tag.
        pad_idx: id of the padding tag.
        sub_idx: id of the sub-word continuation tag.

    Returns:
        ``(precision, recall, f1)``, each macro-averaged.
    """
    gold = y_true.flatten()
    guess = preds.argmax(axis=2).flatten()
    # Keep a position only when its gold label is none of the three excluded ids.
    is_entity = ~((gold == o_idx) | (gold == pad_idx) | (gold == sub_idx))
    gold, guess = gold[is_entity], guess[is_entity]
    precision, recall, f1 = (
        score(gold, guess, average='macro')
        for score in (precision_score, recall_score, f1_score)
    )
    return precision, recall, f1

def set_unk(data):
    """Rewrite every "<unk>" token to "[UNK]" in place and return ``data``.

    Args:
        data: list of sentences, each a mutable list of token strings.

    Returns:
        The same ``data`` object, with its inner lists modified.
    """
    for sentence in data:
        for position, token in enumerate(sentence):
            if token == "<unk>":
                sentence[position] = "[UNK]"
    return data

In [None]:
# Data Load
# One torchtext field per column of the data files: the token (case left as-is)
# and its NER tag (no unknown-token entry for the tag field).
TEXT = data.Field(lower=False)
NER_TAGS = data.Field(unk_token=None)
fields = (("text", TEXT), ("nertags", NER_TAGS))

# The three files are space-separated, ISO-8859-1 encoded, and sit in the working directory.
train_data, valid_data, test_data = datasets.SequenceTaggingDataset.splits(
    fields=fields,
    path="./",
    train="train_NER_esp.txt",
    validation="val_NER_esp.txt",
    test="test_NER_esp.txt",
    separator=" ",
    encoding="iso-8859-1",
)

# Tag inventory. A tag's position in this list is its integer id; the last two
# entries are bookkeeping tags for padding and non-initial word pieces.
tag_values = ['O', 'B-ORG', 'I-ORG', 'B-LOC', 'B-PER', 'I-PER', 'I-MISC', 'B-MISC', 'I-LOC', '<pad>',  '<sub>']
tag2idx = dict(zip(tag_values, range(len(tag_values))))
tag2idx

{'<pad>': 9,
 '<sub>': 10,
 'B-LOC': 3,
 'B-MISC': 7,
 'B-ORG': 1,
 'B-PER': 4,
 'I-LOC': 8,
 'I-MISC': 6,
 'I-ORG': 2,
 'I-PER': 5,
 'O': 0}

In [None]:
# Data Preprocessing
MAX_LEN = 512
BATCH_SIZE = 16


def _sentences_and_tags(dataset):
    """Return (token lists with "<unk>" rewritten to "[UNK]", tag lists) for one split."""
    sentences = set_unk([example.text for example in dataset.examples])
    tag_rows = [example.nertags for example in dataset.examples]
    return sentences, tag_rows


inputs_train, labels_train = _sentences_and_tags(train_data)
inputs_val, labels_val = _sentences_and_tags(valid_data)
inputs_test, labels_test = _sentences_and_tags(test_data)

bert_tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased', do_lower_case=False)

# The training loader is built from train + validation together, so no separate
# validation loader exists. To hold the validation split out instead, swap in the
# two commented calls below.
train_dataloader = ner_preprocess(
    inputs_train + inputs_val,
    labels_train + labels_val,
    tokenizer=bert_tokenizer,
    max_len=MAX_LEN,
    batch_size=BATCH_SIZE,
)
#train_dataloader = ner_preprocess(inputs_train, labels_train, tokenizer=bert_tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE)
#valid_dataloader = ner_preprocess(inputs_val, labels_val, tokenizer=bert_tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE, is_train=False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=242120.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=43.0, style=ProgressStyle(description_w…




In [None]:
# Model
model = BertForTokenClassification.from_pretrained(
    "dccuchile/bert-base-spanish-wwm-cased",
    # Two entries of tag2idx ("<pad>", "<sub>") are bookkeeping tags, so the
    # classifier gets two fewer outputs than there are tag ids.
    num_labels=len(tag2idx) - 2,
    output_attentions=False,
    output_hidden_states=False
)
# Move to the device selected at setup time instead of hard-coding CUDA, so the
# CPU fallback keeps working.
model.to(device)

FULL_FINETUNING = True
if FULL_FINETUNING:
    # Fine-tune every parameter, with weight decay on all but biases / LayerNorm.
    param_optimizer = list(model.named_parameters())
    # 'LayerNorm.weight' is added because 'gamma' / 'beta' alone do not match
    # parameters named 'LayerNorm.weight' / 'LayerNorm.bias'; the old patterns are
    # kept so checkpoints using those names behave as before.
    no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']
    # The per-group key must be 'weight_decay': AdamW does not read
    # 'weight_decay_rate', so with that key the 0.01 decay was never applied.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    # Train only the classification head.
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)

epochs = 10
max_grad_norm = 1.0
# One scheduler step is taken per training batch, over all epochs.
total_steps = len(train_dataloader) * epochs

# Linear learning-rate schedule over total_steps with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=441944381.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly

In [None]:
# Train loop
VALIDATE = False

# Fail before training starts, not after a full epoch, when validation is
# requested but the validation loader was never built.
if VALIDATE and "valid_dataloader" not in globals():
    raise NameError("name 'valid_dataloader' is not defined; build it before setting VALIDATE = True")


def _score_batch(batch):
    """Run one (input_ids, attention_mask, labels) batch through the model.

    Returns the loss tensor (still attached to the graph when gradients are
    enabled) and the ``(precision, recall, f1)`` tuple from ``calculate_metrics``.
    """
    b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
    logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]
    loss = ner_loss(logits, b_labels, b_input_mask, tag2idx["<sub>"], model.config.num_labels)
    metrics = calculate_metrics(
        logits.detach().cpu().numpy(),
        b_labels.detach().cpu().numpy(),
        b_input_ids.detach().cpu().numpy(),
    )
    return loss, metrics


for i in range(epochs):
    model.train()
    train_loss = 0
    train_pr = 0
    train_rec = 0
    train_f1 = 0
    for batch in tqdm(train_dataloader):
        model.zero_grad()
        loss, (precision, recall, f1) = _score_batch(batch)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        optimizer.step()
        scheduler.step()
        train_loss += loss.item()
        train_pr += precision
        train_rec += recall
        train_f1 += f1
    # Reported numbers are means of per-batch values, not scores over the whole epoch.
    n_train = len(train_dataloader)
    train_loss /= n_train
    train_pr /= n_train
    train_rec /= n_train
    train_f1 /= n_train
    print("Train Epoch {}\nLoss: {} | Precision: {} | Recall: {} | F1: {}".format(i+1, train_loss, train_pr, train_rec, train_f1))
    if VALIDATE:
        model.eval()
        val_loss = 0
        val_pr = 0
        val_rec = 0
        val_f1 = 0
        for batch in valid_dataloader:
            with torch.no_grad():
                loss, (precision, recall, f1) = _score_batch(batch)
            val_loss += loss.item()
            val_pr += precision
            val_rec += recall
            val_f1 += f1
        n_valid = len(valid_dataloader)
        val_loss /= n_valid
        val_pr /= n_valid
        val_rec /= n_valid
        val_f1 /= n_valid
        print("Validation Epoch {}\nLoss: {} | Precision: {} | Recall: {} | F1: {}".format(i+1, val_loss, val_pr, val_rec, val_f1))

HBox(children=(FloatProgress(value=0.0, max=640.0), HTML(value='')))


Train Epoch 1
Loss: 0.09434445534061524 | Precision: 0.7523235682286378 | Recall: 0.7244104137370487 | F1: 0.7219966356411536


HBox(children=(FloatProgress(value=0.0, max=640.0), HTML(value='')))


Train Epoch 2
Loss: 0.03390622173992597 | Precision: 0.8767971213013036 | Recall: 0.8582485647421727 | F1: 0.8591718740545049


HBox(children=(FloatProgress(value=0.0, max=640.0), HTML(value='')))


Train Epoch 3
Loss: 0.020380207003563557 | Precision: 0.9147022935944011 | Recall: 0.9039781701474965 | F1: 0.9043037439506161


HBox(children=(FloatProgress(value=0.0, max=640.0), HTML(value='')))


Train Epoch 4
Loss: 0.012612122387236014 | Precision: 0.9482234056243506 | Recall: 0.9409273121675431 | F1: 0.9416032074716452


HBox(children=(FloatProgress(value=0.0, max=640.0), HTML(value='')))


Train Epoch 5
Loss: 0.009633440436755337 | Precision: 0.9618477842058498 | Recall: 0.9561781016813722 | F1: 0.9565209119693987


HBox(children=(FloatProgress(value=0.0, max=640.0), HTML(value='')))


Train Epoch 6
Loss: 0.006876613855388314 | Precision: 0.9726276241234588 | Recall: 0.9698155181437853 | F1: 0.9695152035981301


HBox(children=(FloatProgress(value=0.0, max=640.0), HTML(value='')))


Train Epoch 7
Loss: 0.005193366799630894 | Precision: 0.9775332222945661 | Recall: 0.975202721077997 | F1: 0.974734614483445


HBox(children=(FloatProgress(value=0.0, max=640.0), HTML(value='')))


Train Epoch 8
Loss: 0.0040791436070094274 | Precision: 0.9787714959462697 | Recall: 0.9771097573776222 | F1: 0.9763589480719121


HBox(children=(FloatProgress(value=0.0, max=640.0), HTML(value='')))


Train Epoch 9
Loss: 0.003198419363843641 | Precision: 0.9822195756297877 | Recall: 0.9810035889021076 | F1: 0.9802928331132665


HBox(children=(FloatProgress(value=0.0, max=640.0), HTML(value='')))


Train Epoch 10
Loss: 0.0023902704910597093 | Precision: 0.9853252622675928 | Recall: 0.9840865986792314 | F1: 0.9838134294536605


In [None]:
# Test prediction
# Switch to evaluation mode before predicting. The training loop only calls
# model.eval() when VALIDATE is True, so with VALIDATE = False the model would
# otherwise still be in training mode here.
model.eval()

predictions = []
for test_sentence in tqdm(inputs_test):
    # NOTE(review): `is_pretokenized` is presumably tied to the transformers release
    # installed by this notebook — confirm the keyword if the library is upgraded.
    tokenized_sentence = bert_tokenizer.encode(test_sentence, is_pretokenized=True)
    # Follow the device chosen at setup time instead of hard-coding CUDA.
    input_ids = torch.tensor([tokenized_sentence]).to(device)
    with torch.no_grad():
        output = model(input_ids)
    label_indices = np.argmax(output[0].cpu().numpy(), axis=2)[0]
    # Start at 1 to skip the leading special token; each word is then labelled
    # with the prediction at its first word piece.
    idx_number = 1
    for word in test_sentence:
        # Undo the "<unk>" -> "[UNK]" rewrite so the output uses the dataset's own token.
        word_sub = '<unk>' if word == '[UNK]' else word
        predictions.append([word_sub, tag_values[label_indices[idx_number]]])
        idx_number += len(bert_tokenizer.tokenize(word))
print(len(predictions))

HBox(children=(FloatProgress(value=0.0, max=1517.0), HTML(value='')))


51533


In [None]:
# Generate submission file
# Remove any archive left over from a previous run.
if os.path.isfile('./predictions.zip'):
    os.remove('./predictions.zip')

# Start from an empty ./predictions directory.
if os.path.isdir('./predictions'):
    shutil.rmtree('./predictions')
os.mkdir('./predictions')

# One "word tag" pair per line, followed by a single trailing blank line.
# The context manager closes the file even if a write fails part-way.
# NOTE(review): no explicit encoding is given, so the platform default is used —
# confirm it matches what the scoring server expects.
with open('predictions/predictions.txt', 'w') as f:
    for word, tag in predictions:
        f.write(word + ' ' + tag + '\n')
    f.write('\n')

# Zip the directory into ./predictions.zip; `a` holds the archive path.
a = shutil.make_archive('predictions', 'zip', './predictions')