# BERT model

# STRUCTURE
1. Load the training data.
2. Adapt data to BERT
3. Obtain a pretrained BERT and fine-tune.
4. Evaluate BERT results.

## 1. Load the training data

In [8]:
import pickle
from sklearn.model_selection import train_test_split
import os

In [6]:
DIRECTORY_NAME = input("Specify the directory you wish to save ALL the data to: ")
os.makedirs(DIRECTORY_NAME)

Specify the directory you wish to save ALL the data to: BERT_results_final


FileExistsError: [Errno 17] File exists: 'BERT_results_final'

In [2]:
def load_pickle_data(filename):
    with open(filename, "rb") as load_file:
        return pickle.load(load_file)

In [9]:
X_data_1 = load_pickle_data("/home/lovhag/storage/data/BERT_data_final/with_eval_X_data_1_train.pickle")
X_data_2 = load_pickle_data("/home/lovhag/storage/data/BERT_data_final/with_eval_X_data_2_train.pickle")
y_data = load_pickle_data("/home/lovhag/storage/data/BERT_data_final/with_eval_y_data_train.pickle")

In [15]:
X_data_1_test = load_pickle_data("/home/lovhag/storage/data/BERT_data_final/with_eval_X_data_1_test.pickle")
X_data_2_test = load_pickle_data("/home/lovhag/storage/data/BERT_data_final/with_eval_X_data_2_test.pickle")
y_data_test = load_pickle_data("/home/lovhag/storage/data/BERT_data_final/with_eval_y_data_test.pickle")

In [10]:
print(f"Number of data samples: {len(y_data)}")

Number of data samples: 121678


In [11]:
X_data_1[1]

'( vii ) BGL - General Trust Fund for the Core Programme Budget for the Biosafety Protocol , which is extended through 31 December 2011 ; ( viii ) BHL - Special Voluntary Trust Fund for Additional Voluntary Contributions in Support of Approved Activities of the Biosafety Protocol , which is extended through 31 December 2011 ; ( ix ) BTL - General Trust Fund for the Conservation of European Bats ( EUROBATS ) , which is extended through 31 December 2014 ;'

In [12]:
X_data_2[1]

'Following consultation in 2007 , the Government is considering replacing the current legislation with a single Equality Act . As part of this , it is considering the case for extending protection from age discrimination outside the workplace and for extending positive duties on public authorities to the other protected grounds . In Northern Ireland , additional protections have been established to promote equality .'

## 2. Adapt data to BERT

In [12]:
import torch
from transformers import BertTokenizer

In [13]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(X_data_1, X_data_2, truncation=True, padding=True)

In [16]:
test_encodings = tokenizer(X_data_1_test, X_data_2_test, truncation=True, padding=True)

In [17]:
class SenseDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SenseDataset(train_encodings, y_data)
test_dataset = SenseDataset(test_encodings, y_data_test)

In [18]:
len(train_dataset)

121678

In [19]:
test_dataset[0]

{'input_ids': tensor([  101,  2112,  1045,  1011,  4955, 24497,  2177,  2003,  1037,  2877,
          1010,  2248,  1010,  2512,  1011, 10605,  5502,  4056,  2000, 12771,
          2152,  3737,  1010,  2711,  1011, 16441,  2740,  1998,  2591,  2729,
          1010,  2731,  1010,  2495,  1010,  6107,  1010, 11252,  1998,  3293,
          2578,  2000,  2111,  2007, 13597,  1010,  3080,  2111,  1998,  2500,
          2040,  2024, 14785,  3550,  1012,  2144,  2494,  1010, 24497,  2177,
          2038,  5281,  6196,  4935,  1012,   102,  2011,  2555,  1010,  5120,
          2001,  2034,  1999,  2885,  1010,  2007,  1996,  5569,  3072,  4577,
          3284,  2426,  1996,  2647,  3032,  2007, 18730,  1999,  6653,  1012,
          1999,  3088,  1010,  5279,  2001,  2877,  1010,  2007,  2148,  3088,
          4577,  3284,  2426,  1996,  3032,  1997,  4942,  1011, 24505,  3088,
          1012,  1999,  4021,  1010,  5264,  2001,  2877,  1010,  2004,  2001,
          4380,  1999,  2148,  2637,  1

In [20]:
def save_data_with_pickle(data_dict, folder_name=None):
    if not folder_name:
        folder_name = input(f"Specify which prefix filename you wish to save {list(data_dict.keys())} to: ")
    if folder_name:
        for key, value in data_dict.items():
            filename = folder_name+"/"+key+".pickle"
            with open(filename, "wb") as fp:   #Pickling
                pickle.dump(value, fp)

In [21]:
save_datasets_to_folder = input("Which folder should the datasets be saved to?: ")
save_data_with_pickle({"train_dataset": train_dataset, "test_dataset": test_dataset}, save_datasets_to_folder)

Which folder should the datasets be saved to?: /home/lovhag/storage/data/BERT_data_final/


## 3. Obtain a pretrained BERT and fine-tune

In [1]:
import pickle
import torch

In [2]:
class SenseDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [3]:
def load_pickle_data(filename):
    with open(filename, "rb") as load_file:
        return pickle.load(load_file)

train_dataset = load_pickle_data("/home/lovhag/storage/data/BERT_data_final/train_dataset.pickle")
test_dataset = load_pickle_data("/home/lovhag/storage/data/BERT_data_final/test_dataset.pickle")

In [4]:
import gc
for obj in gc.get_objects():
    try:
        if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
            print(type(obj), obj.size())
    except:
        pass

  return _bootstrap._gcd_import(name[level:], package, level)


In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('Using', device)

Using cuda


In [6]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
).to(device)

# keep the weights of the pre-trained encoder frozen and optimize only the weights of the head layers
#for param in model.base_model.parameters():
#    param.requires_grad = False

# keep the weights of the embedding layer frozen
for param in model.bert.embeddings.parameters():
    param.requires_grad = False

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [7]:
from transformers import BertForNextSentencePrediction

model = BertForNextSentencePrediction.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.  
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
).to(device)

# keep the weights of the pre-trained encoder frozen and optimize only the weights of the head layers
for param in model.base_model.parameters():
    param.requires_grad = False

# keep the weights of the embedding layer frozen
#for param in model.bert.embeddings.parameters():
#    param.requires_grad = False

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
def print_model_params(model):
    # Get all of the model's parameters as a list of tuples.
    params = list(model.named_parameters())

    print('The BERT model has {:} different named parameters.\n'.format(len(params)))

    print('==== Embedding Layer ====\n')

    for p in params[0:5]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== First Transformer ====\n')

    for p in params[5:21]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== Output Layer ====\n')

    for p in params[-4:]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

In [9]:
print_model_params(model)

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [10]:
from transformers import get_linear_schedule_with_warmup
from transformers.optimization import AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import numpy as np
import time
import datetime

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order.
def get_train_dataloader(train_dataset, batch_size):
    return DataLoader(
                train_dataset,  # The training samples.
                sampler = RandomSampler(train_dataset), # Select batches randomly
                batch_size = batch_size # Trains with this batch size.
            )

# For validation the order doesn't matter, so we'll just read them sequentially.
def get_validation_dataloader(validation_dataset, batch_size):
    return DataLoader(
                validation_dataset, # The validation samples.
                sampler = SequentialSampler(test_dataset), # Pull out batches sequentially.
                batch_size = batch_size # Evaluate with this batch size.
            )

In [11]:
import random
import numpy as np

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

def train(model, epochs, batch_size, optimizer, train_dataset, validation_dataset, use_labels=True):
    # Set the seed value all over the place to make this reproducible.
    seed_val = 42

    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
    
    train_dataloader = get_train_dataloader(train_dataset, batch_size)
    validation_dataloader = get_validation_dataloader(validation_dataset, batch_size)
    
    # Total number of training steps is [number of batches] x [number of epochs]. 
    # (Note that this is not the same as the number of training samples).
    total_steps = len(train_dataloader) * epochs

    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                num_warmup_steps = 0, # Default value in run_glue.py
                                                num_training_steps = total_steps)

    # We'll store a number of quantities such as training and validation loss, 
    # validation accuracy, and timings.
    training_stats = []

    # Measure the total training time for the whole run.
    total_t0 = time.time()

    # For each epoch...
    for epoch_i in range(0, epochs):
        # ========================================
        #               Training
        # ========================================

        # Perform one full pass over the training set.

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Reset the total loss for this epoch.
        total_train_loss = 0

        # Put the model into training mode. Don't be mislead--the call to 
        # `train` just changes the *mode*, it doesn't *perform* the training.
        # `dropout` and `batchnorm` layers behave differently during training
        # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            #print(f"step: {step}")
            
            # Progress update every 40 batches.
            if step % 40 == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)

                # Report progress.
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            # Unpack this training batch from our dataloader. 
            #
            # As we unpack the batch, we'll also copy each tensor to the GPU using the 
            # `to` method.
            #
            # `batch` contains three pytorch tensors:
            #   [0]: input ids 
            #   [1]: attention masks
            #   [2]: labels 
            b_input_ids = batch["input_ids"].to(device)
            b_token_type_ids = batch["token_type_ids"].to(device)
            b_input_mask = batch["attention_mask"].to(device)
            b_labels = batch["labels"].to(device)

            # Always clear any previously calculated gradients before performing a
            # backward pass. PyTorch doesn't do this automatically because 
            # accumulating the gradients is "convenient while training RNNs". 
            # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
            model.zero_grad()        

            # Perform a forward pass (evaluate the model on this training batch).
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            # It returns different numbers of parameters depending on what arguments
            # arge given and what flags are set. For our useage here, it returns
            # the loss (because we provided labels) and the "logits"--the model
            # outputs prior to activation.
            if use_labels:
                loss, logits = model(b_input_ids, 
                                     token_type_ids=b_token_type_ids, 
                                     attention_mask=b_input_mask, 
                                     labels=b_labels)
            else:
                loss, logits = model(b_input_ids, 
                                     token_type_ids=b_token_type_ids, 
                                     attention_mask=b_input_mask, 
                                     next_sentence_label=b_labels)

            # Accumulate the training loss over all of the batches so that we can
            # calculate the average loss at the end. `loss` is a Tensor containing a
            # single value; the `.item()` function just returns the Python value 
            # from the tensor.
            total_train_loss += loss.item()

            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()

            # Update the learning rate.
            scheduler.step()

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_dataloader)            

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(training_time))

        # ========================================
        #               Validation
        # ========================================
        # After the completion of each training epoch, measure our performance on
        # our validation set.

        print("")
        print("Running Validation...")

        t0 = time.time()

        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        model.eval()

        # Tracking variables 
        total_eval_accuracy = 0
        total_eval_loss = 0
        nb_eval_steps = 0

        # Evaluate data for one epoch
        for batch in validation_dataloader:

            # Unpack this training batch from our dataloader. 
            #
            # As we unpack the batch, we'll also copy each tensor to the GPU using 
            # the `to` method.
            #
            # `batch` contains three pytorch tensors:
            #   [0]: input ids 
            #   [1]: attention masks
            #   [2]: labels 
            b_input_ids = batch["input_ids"].to(device)
            b_token_type_ids = batch["token_type_ids"].to(device)
            b_input_mask = batch["attention_mask"].to(device)
            b_labels = batch["labels"].to(device)

            # Tell pytorch not to bother with constructing the compute graph during
            # the forward pass, since this is only needed for backprop (training).
            with torch.no_grad():        

                # Forward pass, calculate logit predictions.
                # token_type_ids is the same as the "segment ids", which 
                # differentiates sentence 1 and 2 in 2-sentence tasks.
                # The documentation for this `model` function is here: 
                # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
                # Get the "logits" output by the model. The "logits" are the output
                # values prior to applying an activation function like the softmax.
                if use_labels:
                    loss, logits = model(b_input_ids, 
                                         token_type_ids=b_token_type_ids, 
                                         attention_mask=b_input_mask, 
                                         labels=b_labels)
                else:
                    loss, logits = model(b_input_ids, 
                                         token_type_ids=b_token_type_ids, 
                                         attention_mask=b_input_mask, 
                                         next_sentence_label=b_labels)

            # Accumulate the validation loss.
            total_eval_loss += loss.item()

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            # Calculate the accuracy for this batch of test sentences, and
            # accumulate it over all batches.
            total_eval_accuracy += flat_accuracy(logits, label_ids)


        # Report the final accuracy for this validation run.
        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

        # Calculate the average loss over all of the batches.
        avg_val_loss = total_eval_loss / len(validation_dataloader)

        # Measure how long the validation run took.
        validation_time = format_time(time.time() - t0)

        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        # Record all statistics from this epoch.
        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Valid. Accur.': avg_val_accuracy,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
        )

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [12]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

# Number of training epochs. The BERT authors recommend between 2 and 4. 
# We chose to run for 4, but we'll see later that this may be over-fitting the
# training data.
# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
train(model, epochs=4, batch_size=16, optimizer=optimizer, train_dataset=train_dataset, validation_dataset=test_dataset, use_labels=False)


Training...
  Batch    40  of  7,605.    Elapsed: 0:00:19.
  Batch    80  of  7,605.    Elapsed: 0:00:38.
  Batch   120  of  7,605.    Elapsed: 0:00:57.
  Batch   160  of  7,605.    Elapsed: 0:01:16.
  Batch   200  of  7,605.    Elapsed: 0:01:36.
  Batch   240  of  7,605.    Elapsed: 0:01:57.
  Batch   280  of  7,605.    Elapsed: 0:02:17.
  Batch   320  of  7,605.    Elapsed: 0:02:37.
  Batch   360  of  7,605.    Elapsed: 0:02:57.
  Batch   400  of  7,605.    Elapsed: 0:03:17.
  Batch   440  of  7,605.    Elapsed: 0:03:37.
  Batch   480  of  7,605.    Elapsed: 0:03:57.
  Batch   520  of  7,605.    Elapsed: 0:04:17.
  Batch   560  of  7,605.    Elapsed: 0:04:37.
  Batch   600  of  7,605.    Elapsed: 0:04:57.
  Batch   640  of  7,605.    Elapsed: 0:05:17.
  Batch   680  of  7,605.    Elapsed: 0:05:37.
  Batch   720  of  7,605.    Elapsed: 0:05:57.
  Batch   760  of  7,605.    Elapsed: 0:06:17.
  Batch   800  of  7,605.    Elapsed: 0:06:37.
  Batch   840  of  7,605.    Elapsed: 0:06:57.


  Batch 7,000  of  7,605.    Elapsed: 0:58:19.
  Batch 7,040  of  7,605.    Elapsed: 0:58:39.
  Batch 7,080  of  7,605.    Elapsed: 0:58:59.
  Batch 7,120  of  7,605.    Elapsed: 0:59:19.
  Batch 7,160  of  7,605.    Elapsed: 0:59:39.
  Batch 7,200  of  7,605.    Elapsed: 0:59:59.
  Batch 7,240  of  7,605.    Elapsed: 1:00:19.
  Batch 7,280  of  7,605.    Elapsed: 1:00:39.
  Batch 7,320  of  7,605.    Elapsed: 1:00:59.
  Batch 7,360  of  7,605.    Elapsed: 1:01:19.
  Batch 7,400  of  7,605.    Elapsed: 1:01:39.
  Batch 7,440  of  7,605.    Elapsed: 1:01:59.
  Batch 7,480  of  7,605.    Elapsed: 1:02:19.
  Batch 7,520  of  7,605.    Elapsed: 1:02:40.
  Batch 7,560  of  7,605.    Elapsed: 1:03:00.
  Batch 7,600  of  7,605.    Elapsed: 1:03:20.

  Average training loss: 0.81
  Training epcoh took: 1:03:22

Running Validation...
  Accuracy: 0.59
  Validation Loss: 0.66
  Validation took: 0:14:52

Training...
  Batch    40  of  7,605.    Elapsed: 0:00:20.
  Batch    80  of  7,605.    Elapse

  Batch 6,240  of  7,605.    Elapsed: 0:52:03.
  Batch 6,280  of  7,605.    Elapsed: 0:52:23.
  Batch 6,320  of  7,605.    Elapsed: 0:52:43.
  Batch 6,360  of  7,605.    Elapsed: 0:53:03.
  Batch 6,400  of  7,605.    Elapsed: 0:53:23.
  Batch 6,440  of  7,605.    Elapsed: 0:53:43.
  Batch 6,480  of  7,605.    Elapsed: 0:54:03.
  Batch 6,520  of  7,605.    Elapsed: 0:54:23.
  Batch 6,560  of  7,605.    Elapsed: 0:54:43.
  Batch 6,600  of  7,605.    Elapsed: 0:55:03.
  Batch 6,640  of  7,605.    Elapsed: 0:55:23.
  Batch 6,680  of  7,605.    Elapsed: 0:55:43.
  Batch 6,720  of  7,605.    Elapsed: 0:56:03.
  Batch 6,760  of  7,605.    Elapsed: 0:56:23.
  Batch 6,800  of  7,605.    Elapsed: 0:56:43.
  Batch 6,840  of  7,605.    Elapsed: 0:57:03.
  Batch 6,880  of  7,605.    Elapsed: 0:57:23.
  Batch 6,920  of  7,605.    Elapsed: 0:57:43.
  Batch 6,960  of  7,605.    Elapsed: 0:58:03.
  Batch 7,000  of  7,605.    Elapsed: 0:58:23.
  Batch 7,040  of  7,605.    Elapsed: 0:58:43.
  Batch 7,080

  Batch 5,480  of  7,605.    Elapsed: 0:45:42.
  Batch 5,520  of  7,605.    Elapsed: 0:46:02.
  Batch 5,560  of  7,605.    Elapsed: 0:46:22.
  Batch 5,600  of  7,605.    Elapsed: 0:46:42.
  Batch 5,640  of  7,605.    Elapsed: 0:47:02.
  Batch 5,680  of  7,605.    Elapsed: 0:47:22.
  Batch 5,720  of  7,605.    Elapsed: 0:47:42.
  Batch 5,760  of  7,605.    Elapsed: 0:48:02.
  Batch 5,800  of  7,605.    Elapsed: 0:48:23.
  Batch 5,840  of  7,605.    Elapsed: 0:48:43.
  Batch 5,880  of  7,605.    Elapsed: 0:49:03.
  Batch 5,920  of  7,605.    Elapsed: 0:49:23.
  Batch 5,960  of  7,605.    Elapsed: 0:49:43.
  Batch 6,000  of  7,605.    Elapsed: 0:50:03.
  Batch 6,040  of  7,605.    Elapsed: 0:50:23.
  Batch 6,080  of  7,605.    Elapsed: 0:50:43.
  Batch 6,120  of  7,605.    Elapsed: 0:51:03.
  Batch 6,160  of  7,605.    Elapsed: 0:51:23.
  Batch 6,200  of  7,605.    Elapsed: 0:51:43.
  Batch 6,240  of  7,605.    Elapsed: 0:52:03.
  Batch 6,280  of  7,605.    Elapsed: 0:52:23.
  Batch 6,320

  Batch 4,720  of  7,605.    Elapsed: 0:39:22.
  Batch 4,760  of  7,605.    Elapsed: 0:39:42.
  Batch 4,800  of  7,605.    Elapsed: 0:40:02.
  Batch 4,840  of  7,605.    Elapsed: 0:40:22.
  Batch 4,880  of  7,605.    Elapsed: 0:40:42.
  Batch 4,920  of  7,605.    Elapsed: 0:41:02.
  Batch 4,960  of  7,605.    Elapsed: 0:41:22.
  Batch 5,000  of  7,605.    Elapsed: 0:41:42.
  Batch 5,040  of  7,605.    Elapsed: 0:42:02.
  Batch 5,080  of  7,605.    Elapsed: 0:42:22.
  Batch 5,120  of  7,605.    Elapsed: 0:42:42.
  Batch 5,160  of  7,605.    Elapsed: 0:43:02.
  Batch 5,200  of  7,605.    Elapsed: 0:43:22.
  Batch 5,240  of  7,605.    Elapsed: 0:43:42.
  Batch 5,280  of  7,605.    Elapsed: 0:44:02.
  Batch 5,320  of  7,605.    Elapsed: 0:44:22.
  Batch 5,360  of  7,605.    Elapsed: 0:44:42.
  Batch 5,400  of  7,605.    Elapsed: 0:45:02.
  Batch 5,440  of  7,605.    Elapsed: 0:45:22.
  Batch 5,480  of  7,605.    Elapsed: 0:45:42.
  Batch 5,520  of  7,605.    Elapsed: 0:46:02.
  Batch 5,560

In [13]:
MODEL_PATH = input("Specify the path you wish to save the Attention model to: ")
torch.save(model.state_dict(), MODEL_PATH)

Specify the path you wish to save the Attention model to: model_3_acc_60


## RESULTS
1. BertForSequenceClassification (num_labels=2) basic layers frozen. Validation accuracy 0.6.
2. BertForSequenceClassification (num_labels=2) basic layers not frozen. MEMORY SHORTAGE
3. BertForNextSentencePrediction basic layers not frozen. Validation accuracy 

In [30]:
batch

{'input_ids': tensor([[  101,  2023,  4489,  ...,     0,     0,     0],
         [  101,  2138,  1996,  ...,     0,     0,     0],
         [  101,  1006,  1037,  ...,     0,     0,     0],
         ...,
         [  101,  2490,  2005,  ...,     0,     0,     0],
         [  101, 23089, 22773,  ...,     0,     0,     0],
         [  101,  1996,  5675,  ...,     0,     0,     0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
         0, 0, 0, 1, 1, 0, 0, 0])}

In [29]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# keep the weights of the pre-trained encoder frozen and optimize only the weights of the head layers
for param in model.base_model.parameters():
    param.requires_grad = False

training_args = TrainingArguments(
    output_dir=DIRECTORY_NAME+'/BERT_results_final',          # output directory
    num_train_epochs=4,              # total # of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=DIRECTORY_NAME+'/BERT_logs_final',            # directory for storing logs
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
)

trainer.train()

trainer.save_model(DIRECTORY_NAME+'/BERT_model_final')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1902.0, style=ProgressStyle(description_w…

{'loss': 0.6875601806640625, 'learning_rate': 5e-05, 'epoch': 0.2628811777076761, 'total_flos': 10762693312512000, 'step': 500}
{'loss': 0.671542236328125, 'learning_rate': 4.648283624085538e-05, 'epoch': 0.5257623554153522, 'total_flos': 21525386625024000, 'step': 1000}
{'loss': 0.6713079833984374, 'learning_rate': 4.296567248171075e-05, 'epoch': 0.7886435331230284, 'total_flos': 32288079937536000, 'step': 1500}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1902.0, style=ProgressStyle(description_w…

{'loss': 0.6677021484375, 'learning_rate': 3.9448508722566127e-05, 'epoch': 1.0515247108307044, 'total_flos': 43033956541747200, 'step': 2000}
{'loss': 0.66580810546875, 'learning_rate': 3.59313449634215e-05, 'epoch': 1.3144058885383807, 'total_flos': 53796649854259200, 'step': 2500}
{'loss': 0.66675439453125, 'learning_rate': 3.241418120427687e-05, 'epoch': 1.5772870662460567, 'total_flos': 64559343166771200, 'step': 3000}
{'loss': 0.665990478515625, 'learning_rate': 2.8897017445132247e-05, 'epoch': 1.840168243953733, 'total_flos': 75322036479283200, 'step': 3500}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1902.0, style=ProgressStyle(description_w…

{'loss': 0.66303662109375, 'learning_rate': 2.537985368598762e-05, 'epoch': 2.103049421661409, 'total_flos': 86067913083494400, 'step': 4000}
{'loss': 0.6658017578125, 'learning_rate': 2.1862689926842993e-05, 'epoch': 2.365930599369085, 'total_flos': 96830606396006400, 'step': 4500}
{'loss': 0.66045068359375, 'learning_rate': 1.8345526167698368e-05, 'epoch': 2.6288117770767614, 'total_flos': 107593299708518400, 'step': 5000}
{'loss': 0.66278076171875, 'learning_rate': 1.4828362408553741e-05, 'epoch': 2.891692954784437, 'total_flos': 118355993021030400, 'step': 5500}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1902.0, style=ProgressStyle(description_w…

{'loss': 0.66191162109375, 'learning_rate': 1.1311198649409118e-05, 'epoch': 3.1545741324921135, 'total_flos': 129101869625241600, 'step': 6000}
{'loss': 0.66073193359375, 'learning_rate': 7.79403489026449e-06, 'epoch': 3.4174553101997898, 'total_flos': 139864562937753600, 'step': 6500}
{'loss': 0.6628544921875, 'learning_rate': 4.276871131119865e-06, 'epoch': 3.680336487907466, 'total_flos': 150627256250265600, 'step': 7000}
{'loss': 0.6613369140625, 'learning_rate': 7.597073719752392e-07, 'epoch': 3.943217665615142, 'total_flos': 161389949562777600, 'step': 7500}




In [32]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# keep the weights of the pre-trained encoder frozen and optimize only the weights of the head layers
for param in model.base_model.parameters():
    param.requires_grad = False

training_args = TrainingArguments(
    output_dir=DIRECTORY_NAME+'/BERT_results_final_larger_lr',          # output directory
    num_train_epochs=4,              # total # of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    weight_decay=0.01,               # strength of weight decay
    learning_rate=0.001,
    logging_dir=DIRECTORY_NAME+'/BERT_logs_final_larger_lr',            # directory for storing logs
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
)

trainer.train()

trainer.save_model(DIRECTORY_NAME+'/BERT_model_final_larger_lr')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1902.0, style=ProgressStyle(description_w…

KeyboardInterrupt: 

In [13]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

pt_model = BertForSequenceClassification.from_pretrained("/home/lovhag/projects/dl4nlp_assignment_1/BERT_results/BERT_model_1")

# keep the weights of the pre-trained encoder frozen and optimize only the weights of the head layers
for param in pt_model.base_model.parameters():
    param.requires_grad = False

training_args = TrainingArguments(
    output_dir=DIRECTORY_NAME+'/BERT_results_pt_1',          # output directory
    num_train_epochs=1,              # total # of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=0,                  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=DIRECTORY_NAME+'/BERT_logs_pt_1',            # directory for storing logs
    evaluation_strategy="epoch",
)

trainer = Trainer(
    model=pt_model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
)

trainer.train()

trainer.save_model(DIRECTORY_NAME+'/BERT_model_pt_1')

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1902.0, style=ProgressStyle(description_w…

{'loss': 0.6672554931640625, 'learning_rate': 3.6855941114616195e-05, 'epoch': 0.2628811777076761, 'total_flos': 10762693312512000, 'step': 500}
{'loss': 0.665552490234375, 'learning_rate': 2.3711882229232387e-05, 'epoch': 0.5257623554153522, 'total_flos': 21525386625024000, 'step': 1000}
{'loss': 0.664870361328125, 'learning_rate': 1.056782334384858e-05, 'epoch': 0.7886435331230284, 'total_flos': 32288079937536000, 'step': 1500}



HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=476.0, style=ProgressStyle(description_w…


{'eval_loss': 0.6630513711166883, 'epoch': 1.0, 'total_flos': 40924468652494848, 'step': 1902}



In [25]:
from transformers import BertForNextSentencePrediction, Trainer, TrainingArguments

model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased', return_dict=True)

# keep the weights of the pre-trained encoder frozen and optimize only the weights of the head layers
for param in model.base_model.parameters():
    param.requires_grad = False

training_args = TrainingArguments(
    output_dir=DIRECTORY_NAME+'/BERT_results_final',          # output directory
    num_train_epochs=4,              # total # of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=DIRECTORY_NAME+'/BERT_logs_final',            # directory for storing logs
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
)

trainer.train()

trainer.save_model(DIRECTORY_NAME+'/BERT_model_final')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1902.0, style=ProgressStyle(description_w…

TypeError: forward() got an unexpected keyword argument 'labels'

In [None]:
trainer.save_model(DIRECTORY_NAME)
tokenizer.save_pretrained(DIRECTORY_NAME)

## 4. Check the results