In [1]:
import csv
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pickle

Using TensorFlow backend.


In [2]:
DS = pd.read_csv("DataAnnotatedSubset150SentenceLength.csv")

In [3]:
DS.head()

Unnamed: 0,text_ID,word,tag
0,46809,RR,object
1,46809,130/80,value
2,46809,mmHg.,
3,47482,"RR142/89mmHg,",object
4,47482,HR,


In [4]:
DS['tag'].fillna('unk', inplace=True)

In [5]:
words = list(set(DS["word"].values))
n_words = len(words)
n_words

27741

In [6]:
tags = list(set(DS["tag"].values))
n_tags = len(tags); n_tags


3

In [7]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["word"].values.tolist(),
                                                           s["tag"].values.tolist())]
        self.grouped = self.data.groupby("text_ID").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [8]:
getter = SentenceGetter(DS)

In [9]:
sentences =[" ".join([s[0] for s in sent]) for sent in getter.sentences]


In [37]:
sentences[10]

'Kokkuvõte: SKG valem: 1D A(211) C(31) OM(2) D(201) PTCA+BMS C(31)-(11)'

In [11]:
labels = [[s[1] for s in sent] for sent in getter.sentences]
print(labels[0])


['unk', 'unk', 'unk', 'unk', 'unk', 'unk', 'unk', 'object', 'value', 'unk', 'unk', 'unk', 'unk', 'unk', 'unk']


In [12]:
tags_vals = list(set(DS["tag"].values))
tag2idx = {t: i for i, t in enumerate(tags_vals)}


In [13]:
tag2idx

{'object': 0, 'value': 1, 'unk': 2}

In [14]:
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

In [15]:
MAX_LEN = 75
bs = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()


In [16]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')


The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


In [17]:
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print(tokenized_texts[0])

['26', '.', '05', '.', '09', 'pats', '##ient', 'hospital', '##ise', '##eritud', 'er', '##üt', '##rots', '##ü', '##üt', '##ide', 'üle', '##kan', '##dek', '##s', ',', '2', 'do', '##osi', 'RR', '120', '/', '85', 'temperatuur', '37', '.', '7', '.', 'V', '##õe', '##tud', 'vere', '##ana', '##lü', '##üs', '##id', '.', 'En', '##ese', '##tun', '##ne', 'ra', '##huld', '##av', '.']


In [18]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")


Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (530 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (533 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (520 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (560 > 512). Running this sequence through BERT will result in indexing errors


In [19]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["unk"], padding="post",
                     dtype="long", truncating="post")


In [20]:
tag2idx

{'object': 0, 'value': 1, 'unk': 2}

In [21]:
print(input_ids[1])

[56445   119 63164 16107   124 49288 50544 10171   117 13675 10251   176
 57747   119   142 18089 23388 16107 12367 58240 49288 10123   119 80993
 16129   120 10709 10366 12396 10240   119   157 90612 10306   119 10150
   119 11035   119     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0]


#the mask to ignore the padded elements in the sequences.

In [22]:
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]


In [23]:
print(attention_masks[1])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [24]:
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, 
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)


#### Since we’re operating in pytorch, we have to convert the dataset to torch tensors.



In [25]:
tr_inputs = torch.tensor(tr_inputs).to(torch.int64)
val_inputs = torch.tensor(val_inputs).to(torch.int64)
tr_tags = torch.tensor(tr_tags).to(torch.int64)
val_tags = torch.tensor(val_tags).to(torch.int64)
tr_masks = torch.tensor(tr_masks).to(torch.int64)
val_masks = torch.tensor(val_masks).to(torch.int64)


In [26]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)


In [27]:
model = BertForTokenClassification.from_pretrained("bert-base-multilingual-cased", num_labels=len(tag2idx))

In [28]:
# model.cuda();

In [29]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)


In [30]:
from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


In [31]:
epochs = 5
max_grad_norm = 1.0
from tqdm import tqdm, trange


for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    # VALIDATION on validation set
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
    valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    print("F1-Score: {}".format(f1_score(pred_tags, valid_tags)))

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Train loss: 0.1014579097003055
Validation loss: 0.07721738136091898
Validation Accuracy: 0.9599921557770393


Epoch:  20%|██        | 1/5 [1:23:52<5:35:28, 5032.02s/it]

F1-Score: 0.2275873169161436
Train loss: 0.06545691466211186
Validation loss: 0.0650745881937964
Validation Accuracy: 0.9613787375415279


Epoch:  40%|████      | 2/5 [2:47:20<4:11:14, 5024.93s/it]

F1-Score: 0.26681709208856297
Train loss: 0.05557562540100965
Validation loss: 0.06125682284838931
Validation Accuracy: 0.9595574935400517


Epoch:  60%|██████    | 3/5 [4:11:06<2:47:30, 5025.29s/it]

F1-Score: 0.2728904847396768
Train loss: 0.04946502171757553
Validation loss: 0.06034059411044731
Validation Accuracy: 0.9593567737172385


Epoch:  80%|████████  | 4/5 [5:33:14<1:23:16, 4996.01s/it]

F1-Score: 0.28550111912459586
Train loss: 0.044028967786289286
Validation loss: 0.06569268190583517
Validation Accuracy: 0.9605011074197122


Epoch: 100%|██████████| 5/5 [6:55:53<00:00, 4990.74s/it]  

F1-Score: 0.2888402625820569





In [32]:
model.eval()
predictions = []
true_labels = []
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for batch in valid_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                              attention_mask=b_input_mask, labels=b_labels)
        logits = model(b_input_ids, token_type_ids=None,
                       attention_mask=b_input_mask)
        
    logits = logits.detach().cpu().numpy()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    label_ids = b_labels.to('cpu').numpy()
    true_labels.append(label_ids)
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += b_input_ids.size(0)
    nb_eval_steps += 1

pred_tags = [[tags_vals[p_i] for p_i in p] for p in predictions]
valid_tags = [[tags_vals[l_ii] for l_ii in l_i] for l in true_labels for l_i in l ]
print("Validation loss: {}".format(eval_loss/nb_eval_steps))
print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))

Validation loss: 0.06569268190583517
Validation Accuracy: 0.9605011074197122
Validation F1-Score: 0.4077824101647807


In [46]:
i = 10
print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
print(30 * "=")
for w, t, pred in zip(sentences[i].split(), pred_tags[i], valid_tags[i]):
    if w != 0:
        print("{:15}: {:5} {}".format(w, t, pred))


Word           ||True ||Pred
Kokkuvõte:     : object object
SKG            : value value
valem:         : unk   unk
1D             : unk   unk
A(211)         : unk   unk
C(31)          : unk   unk
OM(2)          : unk   unk
D(201)         : unk   unk
PTCA+BMS       : unk   unk
C(31)-(11)     : unk   unk


In [53]:
from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

print("F1-score: {:.1%}".format(f1_score(valid_tags,pred_tags)))
print(classification_report(valid_tags, pred_tags))


F1-score: 40.8%
           precision    recall  f1-score   support

      unk       0.36      0.47      0.41      2237
   object       0.33      0.54      0.41      1044
    value       0.32      0.56      0.41       784

micro avg       0.34      0.51      0.41      4065
macro avg       0.34      0.51      0.41      4065



In [54]:
from sklearn_crfsuite.metrics import flat_classification_report,flat_accuracy_score
report = flat_classification_report(y_pred=pred_tags, y_true=valid_tags)
print(report)
print("Accuracy",flat_accuracy_score(y_pred=pred_tags, y_true=valid_tags))

              precision    recall  f1-score   support

      object       0.21      0.55      0.30      1053
         unk       0.99      0.97      0.98    100537
       value       0.28      0.57      0.38       785

    accuracy                           0.96    102375
   macro avg       0.49      0.70      0.55    102375
weighted avg       0.98      0.96      0.97    102375

Accuracy 0.9604590964590964
