In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import collections
import logging
import json
import re
import os

import numpy as np
import pandas as pd

import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler, RandomSampler
from torch.utils.data.distributed import DistributedSampler

from torch.nn import CrossEntropyLoss

from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertModel, BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME

from tqdm import tqdm, trange

from sklearn.metrics import f1_score

In [2]:
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s', 
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

# Preprocessing and preparation

In [3]:
class InputExample(object):

    def __init__(self, unique_id, text_a, text_b, output_label):
        self.unique_id = unique_id
        self.text_a = text_a
        self.text_b = text_b
        self.output_label = output_label


In [4]:
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids, label_id):
        self.unique_id = unique_id
        self.tokens = tokens
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.input_type_ids = input_type_ids #segment ids
        self.label_id = label_id


In [5]:
def convert_examples_to_features(examples, seq_length, tokenizer):
    """Loads a data file into a list of `InputFeature`s."""

    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > seq_length - 2:
                tokens_a = tokens_a[0:(seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0      0   0    1  1  1   1  1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambigiously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = []
        input_type_ids = []
        tokens.append("[CLS]")
        input_type_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            input_type_ids.append(0)
        tokens.append("[SEP]")
        input_type_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                input_type_ids.append(1)
            tokens.append("[SEP]")
            input_type_ids.append(1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < seq_length:
            input_ids.append(0)
            input_mask.append(0)
            input_type_ids.append(0)

        assert len(input_ids) == seq_length
        assert len(input_mask) == seq_length
        assert len(input_type_ids) == seq_length

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("unique_id: %s" % (example.unique_id))
            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))

        features.append(
            InputFeatures(
                unique_id=example.unique_id,
                tokens=tokens,
                input_ids=input_ids,
                input_mask=input_mask,
                input_type_ids=input_type_ids,
                label_id=example.output_label))
    return features


In [6]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


## Let's begin!

Using GPU:

In [7]:
n_gpu = torch.cuda.device_count()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info("device: {} n_gpu: {}".format(device, n_gpu))

05/30/2019 14:42:00 - INFO - __main__ -   device: cuda n_gpu: 1


In [8]:
layer_indexes = [-1, -2, -3, -4]

Using wordpiece tokenizer for casesd multilingual text

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

05/30/2019 14:42:00 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt from cache at /home/liad/.pytorch_pretrained_bert/96435fa287fbf7e469185f1062386e05a075cadbf6838b74da22bf64b080bc32.99bcd55fc66f4f3360bc49ba472b940b8dcf223ea6a345deb969d607ca900729


In [10]:
df = pd.read_csv('sample_dataset_bert.csv', sep='\t', encoding='utf-8')
df

Unnamed: 0,Class,SentenceA,SentenceB
0,1,Aufgrund dieser Rechtsvorschriften ist von zwe...,· der naturschutzrechtlichen Eingriffsregelung...
1,0,Der Bodenaufbau in der Pflanzwanne setzte sich...,Bewässert wurde bedarfsgerecht über einen Trop...
2,1,Maximal die Hälfte der Fertigstellungskosten z...,Die ersten Maßnahmen wurden bereits umgesetzt.
3,1,Alles was zum Thema Bauwerksbegrünung angebote...,-- Bereitstellung verschiedener Praxisbeispiel...
4,1,Retentionsdächer werden in Deutschland immer h...,Oftmals können nur so die Vorgaben zur einleit...
5,1,Dabei steht ihnen eine große Bandbreite an Mög...,Die gesetzlichen Regelungen basieren auf den V...
6,0,BILDQUELLE: OPTIGRÜN,Der gestalterischen Vielfalt sind somit kaum G...
7,1,Abb.,3: Verbindliche Festlegung von Dachbegrünungen...
8,0,Wolfgang Dickhaut und Dipl.-Geoökol.,"Michael Richter, Fachgebiet umweltgerechte Sta..."
9,1,Eine völlig neue Regelung zugunsten der Bauunt...,"Immer dann, wenn der Besteller zu Recht die Ab..."


Dividing to training / validation set

In [11]:
msk = np.random.rand(len(df)) < 0.8
train = df[msk]
test = df[~msk]

In [12]:
examples = []
for i, row in train.iterrows():
    examples.append(
                InputExample(unique_id=i, 
                             text_a=row['SentenceA'], 
                             text_b=row['SentenceB'],
                             output_label=row['Class']))

In [13]:
validation = []
for i, row in test.iterrows():
    validation.append(
                InputExample(unique_id=i, 
                             text_a=row['SentenceA'], 
                             text_b=row['SentenceB'],
                             output_label=row['Class']))

In [14]:
features = convert_examples_to_features(
    examples=examples, seq_length=128, tokenizer=tokenizer)

05/30/2019 14:42:01 - INFO - __main__ -   *** Example ***
05/30/2019 14:42:01 - INFO - __main__ -   unique_id: 1
05/30/2019 14:42:01 - INFO - __main__ -   tokens: [CLS] Der Boden ##auf ##bau in der P ##f ##lan ##z ##wan ##ne setzte sich aus einem Schutz ##v ##lies , 4 cm Dr ##aina ##ges ##chi ##cht aus B ##lä ##hs ##chie ##fer , einem Fi ##lter ##v ##lies und 16 cm Int ##ensi ##vs ##ub ##strat zusammen . [SEP] Be ##w ##ässer ##t wurde bed ##arf ##sg ##ere ##cht über einen T ##rop ##fs ##ch ##lau ##ch . [SEP]
05/30/2019 14:42:01 - INFO - __main__ -   input_ids: 101 10445 37173 103703 15871 10106 10118 153 10575 12055 10305 14394 10238 24430 10372 10441 10745 40243 10477 22201 117 125 11207 11612 77262 13156 12806 11640 10441 139 26875 22394 50784 14854 117 10745 36448 31897 10477 22201 10130 10250 11207 81687 48019 30168 20664 51351 14531 119 102 14321 10874 75842 10123 10283 30113 66313 84105 12122 11640 10848 10897 157 30698 25743 10269 35166 10269 119 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [15]:
unique_id_to_feature = {}
for feature in features:
    unique_id_to_feature[feature.unique_id] = feature


Preparing the input as a Tensor:

In [16]:
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_segment_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long)

all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
# outputs:
all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

And then aggregating it all into a TensorDataset:

In [17]:
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=12)

Loading the BERT model itself.

We're using the *BertForSequenceClassification* architecture, where the last layer is expected to be the probability output for each class:

In [18]:
num_labels = 2

model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased',
              num_labels=num_labels)

05/30/2019 14:42:02 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz from cache at /home/liad/.pytorch_pretrained_bert/731c19ddf94e294e00ec1ba9a930c69cc2a0fd489b25d3d691373fae4c0986bd.4e367b0d0155d801930846bb6ed98f8a7c23e0ded37888b29caa37009a40c7b9
05/30/2019 14:42:02 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /home/liad/.pytorch_pretrained_bert/731c19ddf94e294e00ec1ba9a930c69cc2a0fd489b25d3d691373fae4c0986bd.4e367b0d0155d801930846bb6ed98f8a7c23e0ded37888b29caa37009a40c7b9 to temp dir /tmp/tmp7qcixd2w
05/30/2019 14:42:06 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_l

Moving the model to GPU:

In [19]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermedia

And setting it into training mode:

In [20]:
model.train()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermedia

Some hyperparameter settings.

The warmup is unique for BERT. It basically affects the learning rate value. 

-1 means avoiding warmup - as we don't have enough data to warmup before the actual training.

In [21]:
gradient_accumulation_steps = 1
fp16 = False #change this to True to use 16bit precision

learning_rate = 5e-5
warmup_proportion = -1

train_batch_size = 4
train_size = 50
num_train_epochs = 10

In [22]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

In [23]:
optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]

In [24]:
num_train_optimization_steps = int(
            train_size / train_batch_size / gradient_accumulation_steps) * num_train_epochs

Setting up the BertAdam optimizer with the values we set above:

In [25]:
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=learning_rate,
                     warmup=warmup_proportion,
                     t_total=num_train_optimization_steps)

Actual Training loop:

In [26]:
global_step = 0
nb_tr_steps = 0
tr_loss = 0


for _ in trange(num_train_epochs, desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        # define a new function to compute loss values for both output_modes
        # logits = model(input_ids, segment_ids, input_mask, labels=None)
        
        
        #ToDo - check if the results are better without CEL and calculating directly on the prev line

        #loss_fct = CrossEntropyLoss()
        # loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
        
        loss = model(input_ids, segment_ids, input_mask, labels=label_ids)

        if n_gpu > 1:
            loss = loss.mean() # mean() to average on multi-gpu.
        if gradient_accumulation_steps > 1:
            loss = loss / gradient_accumulation_steps

        if fp16:
            optimizer.backward(loss)
        else:
            loss.backward()

        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]
Iteration:   0%|          | 0/4 [00:00<?, ?it/s][A
Iteration:  25%|██▌       | 1/4 [00:00<00:01,  2.73it/s][A
Iteration:  50%|█████     | 2/4 [00:00<00:00,  2.88it/s][A
Iteration:  75%|███████▌  | 3/4 [00:00<00:00,  2.98it/s][A
Epoch:  10%|█         | 1/10 [00:01<00:10,  1.14s/it]/s][A
Iteration:   0%|          | 0/4 [00:00<?, ?it/s][A
Iteration:  25%|██▌       | 1/4 [00:00<00:00,  3.22it/s][A
Iteration:  50%|█████     | 2/4 [00:00<00:00,  3.23it/s][A
Iteration:  75%|███████▌  | 3/4 [00:00<00:00,  3.23it/s][A
Epoch:  20%|██        | 2/10 [00:02<00:09,  1.14s/it]/s][A
Iteration:   0%|          | 0/4 [00:00<?, ?it/s][A
Iteration:  25%|██▌       | 1/4 [00:00<00:01,  2.68it/s][A
Iteration:  50%|█████     | 2/4 [00:00<00:00,  2.71it/s][A
Iteration:  75%|███████▌  | 3/4 [00:01<00:00,  2.83it/s][A
Epoch:  30%|███       | 3/10 [00:03<00:08,  1.16s/it]/s][A
Iteration:   0%|          | 0/4 [00:00<?, ?it/s][A
Iteration:  25%|██▌       

And saving the output model:

In [27]:
output_dir = '../models/'
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)

In [28]:
output_model_file

'../models/pytorch_model.bin'

In [29]:

# Save a trained model, configuration and tokenizer
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(output_dir)


'../models/vocab.txt'

## Evaluation

In [30]:
def simple_accuracy(preds, labels):
    return (preds == labels).mean()


def acc_and_f1(preds, labels):
    acc = simple_accuracy(preds, labels)
    f1 = f1_score(y_true=labels, y_pred=preds)
    return {
        "acc": acc,
        "f1": f1,
        "acc_and_f1": (acc + f1) / 2,
    }

We load the trained model and vocabulary that we've have fine-tuned

In [31]:
model = BertForSequenceClassification.from_pretrained(output_dir, num_labels=num_labels)
tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=False)

05/30/2019 14:42:26 - INFO - pytorch_pretrained_bert.modeling -   loading archive file ../models/
05/30/2019 14:42:26 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 119547
}

05/30/2019 14:42:29 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file ../models/vocab.txt


In [32]:
validation_features = convert_examples_to_features(
    examples=validation, seq_length=128, tokenizer=tokenizer)

validation_input_ids = torch.tensor([f.input_ids for f in validation_features], dtype=torch.long)
validation_input_mask = torch.tensor([f.input_mask for f in validation_features], dtype=torch.long)
validation_segment_ids = torch.tensor([f.input_type_ids for f in validation_features], dtype=torch.long)

validation_example_index = torch.arange(validation_input_ids.size(0), dtype=torch.long)
# outputs:
validation_label_ids = torch.tensor([f.label_id for f in validation_features], dtype=torch.long)

05/30/2019 14:42:29 - INFO - __main__ -   *** Example ***
05/30/2019 14:42:29 - INFO - __main__ -   unique_id: 0
05/30/2019 14:42:29 - INFO - __main__ -   tokens: [CLS] Aufgrund dieser Rechts ##vor ##schriften ist von zwei Anwendung ##sbereich ##en aus ##zug ##ehen : [SEP] · der natur ##schutz ##recht ##lichen Ein ##griff ##sr ##ege ##lung und · der Ein ##griff ##sr ##ege ##lung in der Bau ##lei ##t ##plan ##ung . [SEP]
05/30/2019 14:42:29 - INFO - __main__ -   input_ids: 101 31513 11906 79037 19360 64998 10298 10166 11615 53671 68440 10136 10441 21062 35947 131 102 217 10118 84051 53057 20913 12924 12210 88508 106986 46471 25497 10130 217 10118 12210 88508 106986 46471 25497 10106 10118 18727 36777 10123 31609 10716 119 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
05/30/2019 14:42:29 - INFO - __main__ -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

In [33]:
validation_input_ids.shape

torch.Size([11, 128])

In [34]:
eval_data = TensorDataset(validation_input_ids, validation_input_mask, validation_segment_ids, validation_label_ids)

In [35]:
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1)

In [36]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermedia

In [37]:
eval_loss = 0
nb_eval_steps = 0
preds = []

for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)

    with torch.no_grad():
        logits = model(input_ids, segment_ids, input_mask, labels=None)

    loss_fct = CrossEntropyLoss()
    tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

    eval_loss += tmp_eval_loss.mean().item()
    nb_eval_steps += 1
    if len(preds) == 0:
        preds.append(logits.detach().cpu().numpy())
    else:
        preds[0] = np.append(
            preds[0], logits.detach().cpu().numpy(), axis=0)


Evaluating: 100%|██████████| 11/11 [00:00<00:00, 69.28it/s]


In [38]:
eval_loss = eval_loss / nb_eval_steps
preds = preds[0]
preds = np.argmax(preds, axis=1)
result = acc_and_f1(preds, validation_label_ids.numpy())
loss = tr_loss/global_step

result['eval_loss'] = eval_loss
result['global_step'] = global_step
result['loss'] = loss


In [39]:
result

{'acc': 0.36363636363636365,
 'f1': 0.2222222222222222,
 'acc_and_f1': 0.29292929292929293,
 'eval_loss': 0.7274613380432129,
 'global_step': 40,
 'loss': 0.07344732731580735}

In [40]:
from sklearn import metrics
print(metrics.classification_report(validation_label_ids, preds))

              precision    recall  f1-score   support

           0       0.30      1.00      0.46         3
           1       1.00      0.12      0.22         8

   micro avg       0.36      0.36      0.36        11
   macro avg       0.65      0.56      0.34        11
weighted avg       0.81      0.36      0.29        11



In [41]:
preds

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0])

In [42]:
validation_label_ids

tensor([1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0])