In [1]:
from transformers import BertConfig , BertTokenizer , BertForSequenceClassification
from transformers import RobertaConfig , RobertaTokenizer , RobertaForSequenceClassification
import torch

MODEL_CLASSES = {
    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
    "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)
}

# In[2]:


model_type = "bert"
model_base = "bert-base-uncased"

config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
config = config_class.from_pretrained(model_base)
tokenizer = tokenizer_class.from_pretrained(model_base, do_lower_case=True)
model = model_class.from_pretrained(model_base)

import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', -1)
pd.set_option('display.max_colwidth', -1)
import warnings

warnings.simplefilter('ignore')

train = pd.read_csv("ready_to_serve_train.csv")
dev = pd.read_csv("ready_to_serve_dev.csv")


In [2]:
train.head(100)

dev.head(5)

sample_sentence = train.edited_head_line[40]
print(' Original: ', sample_sentence)

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(sample_sentence))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sample_sentence)))

# ### convert sentences to ids

# In[8]:


input_ids = []

# For every sentence...
for sent in train.edited_head_line.values:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = tokenizer.encode(
        sent,  # Sentence to encode.
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'

        # This function also supports truncation and conversion
        # to pytorch tensors, but we need to do padding, so we
        # can't use these features :( .
        # max_length = 128,          # Truncate all sentences.
        # return_tensors = 'pt',     # Return pytorch tensors.
    )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_sent)

# Print sentence 0, now as a list of IDs.
print('Original: ', train.edited_head_line.values[0])
print('Token IDs:', input_ids[0])

 Original:  south korea conducts fire drill after north korea nuclear test rattles globe
Tokenized:  ['south', 'korea', 'conducts', 'fire', 'drill', 'after', 'north', 'korea', 'nuclear', 'test', 'rattle', '##s', 'globe']
Token IDs:  [2148, 4420, 17976, 2543, 12913, 2044, 2167, 4420, 4517, 3231, 23114, 2015, 7595]
Original:  france is hunting down its citizens who joined twins ’ without trial in iraq
Token IDs: [101, 2605, 2003, 5933, 2091, 2049, 4480, 2040, 2587, 8178, 1521, 2302, 3979, 1999, 5712, 102]


In [3]:
print('Max sentence length: ', max([len(sen) for sen in input_ids]))

# In[10]:


# We'll borrow the `pad_sequences` utility function to do this.
from keras.preprocessing.sequence import pad_sequences

# Set the maximum sequence length.
# I've chosen 64 somewhat arbitrarily. It's slightly larger than the
# maximum training sentence length of 47...
MAX_LEN = 48

print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)

print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

# Pad our input tokens with value 0.
# "post" indicates that we want to pad and truncate at the end of the sequence,
# as opposed to the beginning.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long",
                          value=0, truncating="post", padding="post")

print('\nDone.')

# In[11]:


# Create attention masks

def get_attention_masks(input_ids):
    attention_masks = []
    for sent in input_ids:
        # Create the attention mask.
        #   - If a token ID is 0, then it's padding, set the mask to 0.
        #   - If a token ID is > 0, then it's a real token, set the mask to 1.
        att_mask = [int(token_id > 0) for token_id in sent]

        # Store the attention mask for this sentence.
        attention_masks.append(att_mask)

    return attention_masks



Max sentence length:  37

Padding/truncating all sentences to 48 values...

Padding token: "[PAD]", ID: 0

Done.


Using TensorFlow backend.


In [4]:
attention_masks = get_attention_masks(input_ids)

# In[12]:


import numpy as np
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def precision_score_flat(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return precision_score(pred_flat, labels_flat, average='weighted')


def recall_score_flat(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return recall_score(pred_flat, labels_flat, average='weighted')


def f1_score_flat(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(pred_flat, labels_flat, average='weighted')


In [5]:
import time
import datetime


def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [6]:
def train_for_cv(train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks,print_full_log=True):
    train_inputs = torch.tensor(train_inputs)
    validation_inputs = torch.tensor(validation_inputs)
    train_labels = torch.tensor(train_labels)
    validation_labels = torch.tensor(validation_labels)
    train_masks = torch.tensor(train_masks)
    validation_masks = torch.tensor(validation_masks)

    from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
    # The DataLoader needs to know our batch size for training, so we specify it
    # here.
    # For fine-tuning BERT on a specific task, the authors recommend a batch size of
    # 16 or 32.
    batch_size = 20
    # Create the DataLoader for our training set.
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    # Create the DataLoader for our validation set.
    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
    # In[15]:
    from transformers import BertForSequenceClassification, AdamW
    device = torch.device("cuda")
    # Load BertForSequenceClassification, the pretrained BERT model with a single
    # linear classification layer on top.
    model = BertForSequenceClassification.from_pretrained(
        "bert-large-uncased",  # Use the 12-layer BERT model, with an uncased vocab.
        num_labels=4,  # The number of output labels--2 for binary classification.
        # You can increase this for multi-class tasks.
        output_attentions=False,  # Whether the model returns attentions weights.
        output_hidden_states=False,  # Whether the model returns all hidden-states.
    )
    # Tell pytorch to run this model on the GPU.
    model.to(device)
    # In[16]:
    params = list(model.named_parameters())
    # if print_full_log:
    #     print('The BERT model has {:} different named parameters.\n'.format(len(params)))
    #     print('==== Embedding Layer ====\n')
    #     for p in params[0:5]:
    #         print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
    #     print('\n==== First Transformer ====\n')
    #     for p in params[5:21]:
    #         print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
    #     print('\n==== Output Layer ====\n')
    #     for p in params[-4:]:
    #         print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    # Note: AdamW is a class from the huggingface library (as opposed to pytorch)
    # I believe the 'W' stands for 'Weight Decay fix"
    optimizer = AdamW(model.parameters(),
                      lr=5e-7,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
                      eps=1e-8  # args.adam_epsilon  - default is 1e-8.
                      )
    # In[18]:
    from transformers import get_linear_schedule_with_warmup
    # Number of training epochs (authors recommend between 2 and 4)
    epochs = 10
    # Total number of training steps is number of batches * number of epochs.
    total_steps = len(train_dataloader) * epochs
    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,  # Default value in run_glue.py
                                                num_training_steps=total_steps)
    # In[19]:
    import numpy as np
    # Function to calculate the accuracy of our predictions vs labels

    # In[20]:
    import random
    # Set the seed value all over the place to make this reproducible.
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
    # Store the average loss after each epoch so we can plot them.
    loss_values = []
    # For each epoch...
    for epoch_i in range(0, epochs):

        # ========================================
        #               Training
        # ========================================

        # Perform one full pass over the training set.
        if print_full_log:
            print("")
            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
            print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Reset the total loss for this epoch.
        total_loss = 0

        # Put the model into training mode. Don't be mislead--the call to
        # `train` just changes the *mode*, it doesn't *perform* the training.
        # `dropout` and `batchnorm` layers behave differently during training
        # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):

            # Progress update every 40 batches.
            # if step % 40 == 0 and not step == 0 and print_full_log:
            #     # Calculate elapsed time in minutes.
            #     elapsed = format_time(time.time() - t0)
            #
            #     # Report progress.
            #     print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            # Unpack this training batch from our dataloader.
            #
            # As we unpack the batch, we'll also copy each tensor to the GPU using the
            # `to` method.
            #
            # `batch` contains three pytorch tensors:
            #   [0]: input ids
            #   [1]: attention masks
            #   [2]: labels
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # Always clear any previously calculated gradients before performing a
            # backward pass. PyTorch doesn't do this automatically because
            # accumulating the gradients is "convenient while training RNNs".
            # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
            model.zero_grad()

            # Perform a forward pass (evaluate the model on this training batch).
            # This will return the loss (rather than the model output) because we
            # have provided the `labels`.
            # The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)

            # The call to `model` always returns a tuple, so we need to pull the
            # loss value out of the tuple.
            loss = outputs[0]

            # Accumulate the training loss over all of the batches so that we can
            # calculate the average loss at the end. `loss` is a Tensor containing a
            # single value; the `.item()` function just returns the Python value
            # from the tensor.
            total_loss += loss.item()

            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()

            # Update the learning rate.
            scheduler.step()

        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)

        # Store the loss value for plotting the learning curve.
        loss_values.append(avg_train_loss)
        if print_full_log:
            print("")
            print("  Average training loss: {0:.2f}".format(avg_train_loss))
            print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

            # ========================================
            #               Validation
            # ========================================
            # After the completion of each training epoch, measure our performance on
            # our validation set.

            print("")
            print("Running Validation...")

        t0 = time.time()

        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        model.eval()

        # Tracking variables
        eval_loss = eval_accuracy = eval_precision = eval_recall = eval_f1 = tmp_eval_loss = 0
        nb_eval_steps, nb_eval_examples = 0, 0

        # Evaluate data for one epoch
        for batch in validation_dataloader:
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)

            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels = batch

            # Telling the model not to compute or store gradients, saving memory and
            # speeding up validation
            with torch.no_grad():
                # Forward pass, calculate logit predictions.
                # This will return the logits rather than the loss because we have
                # not provided labels.
                # token_type_ids is the same as the "segment ids", which
                # differentiates sentence 1 and 2 in 2-sentence tasks.
                # The documentation for this `model` function is here:
                # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
                outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask,
                                labels=b_labels)

            # Get the "logits" output by the model. The "logits" are the output
            # values prior to applying an activation function like the softmax.
            logits = outputs[1]
            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            # Calculate the accuracy for this batch of test sentences.
            tmp_eval_loss = outputs[0]
            tmp_eval_accuracy = flat_accuracy(logits, label_ids)
            tmp_eval_precision = precision_score_flat(logits, label_ids)
            tmp_eval_recall = recall_score_flat(logits, label_ids)
            tmp_eval_f1 = f1_score_flat(logits, label_ids)

            # Accumulate the total accuracy.
            eval_loss += tmp_eval_loss
            eval_accuracy += tmp_eval_accuracy
            eval_precision += tmp_eval_precision
            eval_recall += tmp_eval_recall
            eval_f1 += tmp_eval_f1

            # Track the number of batches
            nb_eval_steps += 1
        if print_full_log:
        # Report the final accuracy for this validation run.
            print("  Average training loss: {0:.2f}".format(avg_train_loss))
            print("  Loss: {0:.5f}".format(eval_loss / nb_eval_steps))
            print("  Accuracy: {0:.2f}".format(eval_accuracy / nb_eval_steps))
            print("  Precision: {0:.2f}".format(eval_precision / nb_eval_steps))
            print("  Recall: {0:.2f}".format(eval_recall / nb_eval_steps))
            print("  F1: {0:.2f}".format(eval_f1 / nb_eval_steps))
            print("  Validation took: {:}".format(format_time(time.time() - t0)))
    if not print_full_log:
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Loss: {0:.5f}".format(eval_loss / nb_eval_steps))
        print("  Accuracy: {0:.2f}".format(eval_accuracy / nb_eval_steps))
        print("  Precision: {0:.2f}".format(eval_precision / nb_eval_steps))
        print("  Recall: {0:.2f}".format(eval_recall / nb_eval_steps))
        print("  F1: {0:.2f}".format(eval_f1 / nb_eval_steps))
        print("  Validation took: {:}".format(format_time(time.time() - t0)))

    print("")
    print("Training complete!")


In [None]:
from sklearn.model_selection import train_test_split , StratifiedKFold
from sklearn.metrics import precision_score,recall_score,f1_score

skf = StratifiedKFold(n_splits=4,shuffle=True)

for grade in ["grades_0","grades_1","grades_2","grades_3","grades_4"]:
    print(" CV for grade : " + grade)
    labels = train[grade].values
    for train_indexes , validation_indexes in skf.split(input_ids,labels):

        train_inputs = input_ids[train_indexes]
        validation_inputs = input_ids[validation_indexes]

        train_labels = labels[train_indexes]
        validation_labels = labels[validation_indexes]

        train_masks = get_attention_masks(train_inputs)
        validation_masks = get_attention_masks(validation_inputs)

        train_for_cv(train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks)


 CV for grade : grades_0

Training...

  Average training loss: 1.32
  Training epcoh took: 0:01:48

Running Validation...
  Average training loss: 1.32
  Loss: 1.26683
  Accuracy: 0.36
  Precision: 0.66
  Recall: 0.36
  F1: 0.44
  Validation took: 0:00:10

Training...

  Average training loss: 1.26
  Training epcoh took: 0:01:59

Running Validation...
  Average training loss: 1.26
  Loss: 1.24949
  Accuracy: 0.37
  Precision: 0.69
  Recall: 0.37
  F1: 0.46
  Validation took: 0:00:11

Training...

  Average training loss: 1.25
  Training epcoh took: 0:02:08

Running Validation...
  Average training loss: 1.25
  Loss: 1.24365
  Accuracy: 0.37
  Precision: 0.68
  Recall: 0.37
  F1: 0.45
  Validation took: 0:00:12

Training...

  Average training loss: 1.24
  Training epcoh took: 0:01:59

Running Validation...
  Average training loss: 1.24
  Loss: 1.23924
  Accuracy: 0.37
  Precision: 0.66
  Recall: 0.37
  F1: 0.45
  Validation took: 0:00:11

Training...

  Average training loss: 1.24
  T