In [None]:
# Copy the `data_json` folder from Drive into the current working directory.
# NOTE(review): assumes Google Drive is already mounted at /content/drive -
# no mount step is visible in this notebook; confirm before Run All.
!cp -r /content/drive/MyDrive/data_json .

In [None]:
# Copy the project's `code` folder from Drive into the current working
# directory; it is added to `sys.path` further down so its modules can be imported.
!cp -r /content/drive/MyDrive/code .

In [None]:
# Install sentencepiece (presumably required by the XLNet tokenizer loaded below - confirm).
# NOTE(review): the version is not pinned, so a fresh run may pick up a different release.
!pip install sentencepiece



In [None]:
# Install transformers straight from the GitHub repository (quiet mode).
# NOTE(review): this tracks the repository's development head rather than a
# pinned release, so the installed code changes over time; consider pinning
# the release this notebook was validated against.
!pip install -q git+https://github.com/huggingface/transformers.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
  Building wheel for transformers (PEP 517) ... [?25l[?25hdone


In [None]:
import torch

In [None]:
# Display the installed PyTorch version string as the cell output.
torch.__version__

'1.8.1+cu101'

In [None]:
import io
import os
import torch
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
# from ml_things import plot_dict, plot_confusion_matrix, fix_text
from sklearn.metrics import classification_report, accuracy_score, f1_score
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          XLNetConfig,
                          XLNetTokenizer,
                          AdamW, 
                          get_linear_schedule_with_warmup,
                          XLNetForSequenceClassification)

In [None]:
import json

In [None]:
# Fix the random seed so runs are repeatable.
set_seed(123)

# Model to fine-tune: either a pretrained model name or a path to a model
# stored on local disk.
model_name_or_path = 'xlnet-base-cased'

# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Number of full passes over the training set
# (authors on fine-tuning Bert recommend between 2 and 4).
epochs = 4

# Examples per batch - bounded by the max sequence length and GPU memory.
# For 512 sequence length batch of 10 works without cuda memory issues;
# for small sequence lengths a batch of 32 or higher can be tried.
batch_size = 8

# Text sequences are padded or truncated to this many tokens.
# If `None` the maximum sequence length allowed by the model is used.
max_length = 128 # 770 for concat

# Learning rate handed to the optimizer.
lr_param = 1e-5

# Mapping loaded from the label dictionary file; used to convert string
# labels to number ids.
with open('data_json/label_dict.json','r') as label_file:
    labels_ids = json.load(label_file)

# Number of distinct labels - decides the size of the classification head.
n_labels = len(labels_ids)

In [None]:
import sys

# Make the modules inside the local `code` folder importable.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate 'code' entry to `sys.path` each time.
if 'code' not in sys.path:
    sys.path.append('code')

In [None]:
from SharedTaskDataset import SharedTaskDataset
from Gpt2ClassificationCollator import Gpt2ClassificationCollator

In [None]:
def train(dataloader, valid_dataloader, optimizer_, scheduler_, device_, eval_every=500):
    """
    Train the global `model` for one full pass over `dataloader`.

    Every `eval_every` training steps the model is also scored on
    `valid_dataloader` with `validation`. Whenever that validation accuracy
    matches or beats the global `best_acc`, the model is written with
    `save_pretrained` and `best_acc` is updated.

    The `dataloader` must yield batches as dictionaries of tensors (including
    a `labels` entry) that can be passed straight into the model as
    `model(**batch)`.

    Arguments:

        dataloader (:obj:`torch.utils.data.dataloader.DataLoader`):
            Training data parsed into batches of tensors.

        valid_dataloader (:obj:`torch.utils.data.dataloader.DataLoader`):
            Validation data used for the periodic mid-epoch evaluations.

        optimizer_ (:obj:`transformers.optimization.AdamW`):
            Optimizer stepped after every batch.

        scheduler_ (:obj:`torch.optim.lr_scheduler.LambdaLR`):
            Learning-rate scheduler stepped after every batch.

        device_ (:obj:`torch.device`):
            Device the batch tensors are moved to before feeding the model.

        eval_every (:obj:`int`, `optional`, defaults to 500):
            Run validation every this many training steps. A falsy value
            (``0`` or ``None``) disables the mid-epoch validation.

    Returns:

        :obj:`Tuple[List[int], List[int], float]`: (True Labels, Predicted
          Labels, Train Average Loss).
    """

    # The model and the best validation accuracy live at notebook level.
    global model
    global best_acc

    # Tracking variables.
    predictions_labels = []
    true_labels = []
    # Total loss for this epoch.
    total_loss = 0

    # Put the model into training mode.
    model.train()

    # For each batch of training data (steps are counted from 1)...
    for n_iter, batch in enumerate(tqdm(dataloader, total=len(dataloader)), start=1):

        # Keep the original labels (still on CPU) for later evaluation.
        true_labels += batch['labels'].numpy().flatten().tolist()

        # Cast every tensor to long and move the batch to the target device.
        batch = {k: v.type(torch.long).to(device_) for k, v in batch.items()}

        # Always clear any previously calculated gradients before performing a
        # backward pass.
        model.zero_grad()

        # Forward pass. Because the batch carries `labels`, the first two
        # outputs are unpacked as the loss and the logits.
        outputs = model(**batch)
        loss, logits = outputs[:2]

        # Accumulate the loss as a plain Python number so the average can be
        # computed at the end of the epoch.
        total_loss += loss.item()

        # Backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0 to help prevent the
        # "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Step the optimizer and scheduler that were passed in, rather than
        # whatever notebook-level globals happen to be called
        # `optimizer` / `scheduler`.
        optimizer_.step()
        scheduler_.step()

        # Convert the logits to a list of predicted label ids on the CPU.
        logits = logits.detach().cpu().numpy()
        predictions_labels += logits.argmax(axis=-1).flatten().tolist()

        if eval_every and n_iter % eval_every == 0:
            # Get predictions from the model on the validation data.
            print('Validation on batches...')
            valid_labels, valid_predict, val_loss = validation(valid_dataloader, device_)
            val_acc = accuracy_score(valid_labels, valid_predict)
            print("step: %d   val_loss: %.5f - valid_acc: %.5f"%(n_iter, val_loss, val_acc))

            if val_acc >= best_acc:
                model.save_pretrained("XLNet/lr_{}_best.pt".format(lr_param))
                best_acc = val_acc
                print("model saved   best_valid_acc: %.5f"%(val_acc))

            # `validation` switches the model to eval mode; switch back so
            # the remaining batches of this epoch train with dropout enabled.
            model.train()

    # Calculate the average loss over the training data.
    avg_epoch_loss = total_loss / len(dataloader)

    # Return all true labels and predictions for future evaluations.
    return true_labels, predictions_labels, avg_epoch_loss

In [None]:
# Get model configuration; `num_labels` sizes the classification head.
print('Loading configuraiton...')
model_config = XLNetConfig.from_pretrained(pretrained_model_name_or_path=model_name_or_path, num_labels=n_labels)

# Get model's tokenizer.
print('Loading tokenizer...')
tokenizer = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path)
# Pad on the left.
tokenizer.padding_side = "left"
# The pad token is deliberately NOT overridden: the earlier
# "PAD = EOS = 50256" override was a GPT-2 workaround (GPT-2 has no pad
# token). XLNet ships with its own pad token, and keeping it means the
# tokenizer and any checkpoint written by `save_pretrained` agree on the
# pad id.


# Get the actual model.
print('Loading model...')
model = XLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, config=model_config)

# Keep the embedding matrix in sync with the tokenizer's vocabulary size.
# No tokens are added above, so this is expected to leave the size unchanged.
model.resize_token_embeddings(len(tokenizer))

# Load model to defined device.
model.to(device)
print('Model loaded to `%s`'%device)

Loading configuraiton...
Loading tokenizer...
Loading model...


Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Model loaded to `cuda`


In [None]:
def validation(dataloader, device_):
    """Score the global `model` on every batch of `dataloader`.

    The model is switched to evaluation mode and is left in that mode when
    the function returns. No gradients are computed during the forward
    passes.

    The `dataloader` must yield batches as dictionaries of tensors (including
    a `labels` entry) that can be passed straight into the model as
    `model(**batch)`.

    Arguments:

      dataloader (:obj:`torch.utils.data.dataloader.DataLoader`):
            Parsed data into batches of tensors.

      device_ (:obj:`torch.device`):
            Device used to load tensors before feeding to model.

    Returns:

      :obj:`Tuple[List[int], List[int], float]`: (True Labels, Predicted
          Labels, Average Loss over the batches of `dataloader`).
    """

    # The model lives at notebook level.
    global model

    # Accumulators for the whole pass.
    gold_labels = []
    predicted_labels = []
    running_loss = 0

    # Evaluation mode - the dropout layers behave differently than in training.
    model.eval()

    for batch in tqdm(dataloader, total=len(dataloader)):

        # Record the reference labels while they are still on the CPU.
        gold_labels.extend(batch['labels'].numpy().flatten().tolist())

        # Cast every tensor to long and move it to the target device.
        batch = {
            name: tensor.type(torch.long).to(device_)
            for name, tensor in batch.items()
        }

        # Forward pass without building the autograd graph.
        with torch.no_grad():
            outputs = model(**batch)

        # The batch includes `labels`, so the first two outputs are unpacked
        # as the loss and the logits.
        loss, logits = outputs[:2]

        # Sum the per-batch loss as a plain Python number.
        running_loss += loss.item()

        # Highest-scoring class per example, as a flat Python list.
        batch_predictions = logits.detach().cpu().numpy().argmax(axis=-1)
        predicted_labels.extend(batch_predictions.flatten().tolist())

    # Mean loss per batch.
    mean_loss = running_loss / len(dataloader)

    return gold_labels, predicted_labels, mean_loss

In [None]:
# Collator that turns raw examples into model inputs. Despite its name it is
# built around whatever tokenizer is passed in - here the XLNet tokenizer.
gpt2_classification_collator = Gpt2ClassificationCollator(
    use_tokenizer=tokenizer,
    max_sequence_len=max_length,
)

# Settings shared by both loaders; only the shuffling differs per split.
loader_kwargs = dict(batch_size=batch_size, collate_fn=gpt2_classification_collator)


print('Dealing with Train...')
# Training split, shuffled on every pass.
train_dataset = SharedTaskDataset(path='data_json', split='train')
print('Created `train_dataset` with %d examples!'%len(train_dataset))

train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
print('Created `train_dataloader` with %d batches!'%len(train_dataloader))

print()

print('Dealing with Validation...')
# Dev split, kept in its original order.
valid_dataset = SharedTaskDataset(path='data_json', split='dev')
print('Created `valid_dataset` with %d examples!'%len(valid_dataset))

valid_dataloader = DataLoader(valid_dataset, shuffle=False, **loader_kwargs)
print('Created `eval_dataloader` with %d batches!'%len(valid_dataloader))

Dealing with Train...
Created `train_dataset` with 39187 examples!
Created `train_dataloader` with 4899 batches!

Dealing with Validation...
Created `valid_dataset` with 3264 examples!
Created `eval_dataloader` with 408 batches!


In [None]:
# `train_dataloader` yields batches, so its length is the number of steps in
# one epoch; the schedule spans that many steps for every epoch.
total_steps = epochs * len(train_dataloader)

# Note: this AdamW is the class imported from the huggingface library
# (as opposed to pytorch).
optimizer = AdamW(
    model.parameters(),
    lr=lr_param,  # default is 5e-5, our notebook had 2e-5
    eps=1e-8,     # default is 1e-8.
)

# Linear learning-rate schedule with no warmup steps
# (the default value in run_glue.py).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Per-epoch metric histories, kept so they can be plotted afterwards.
all_loss = {'train_loss': [], 'val_loss': []}
all_acc = {'train_acc': [], 'val_acc': []}
all_f1 = {'train_f1': [], 'val_f1': []}


In [None]:
# # Loop through each epoch.
# print('Epoch')
# for epoch in tqdm(range(epochs)):
#     print()
#     print('Training {} on batches...'.format(epoch))
#     # Perform one full pass over the training set.
#     train_labels, train_predict, train_loss = train(train_dataloader, optimizer, scheduler, device)
#     train_acc = accuracy_score(train_labels, train_predict)
#     train_f1 = f1_score(train_labels, train_predict)

#     # Get prediction form model on validation data. 
#     print('Validation on batches...')
#     valid_labels, valid_predict, val_loss = validation(valid_dataloader, device)
#     val_acc = accuracy_score(valid_labels, valid_predict)
#     val_f1 = f1_score(valid_labels, valid_predict)

#     # Print loss and accuracy values to see how training evolves.
#     print("epoch: %d  train_loss: %.5f - val_loss: %.5f - train_acc: %.5f - valid_acc: %.5f"%(epoch+1, train_loss, val_loss, train_acc, val_acc))
#     print("epoch: %d  train_f1: %.5f - val_f1: %.5f -"%(epoch+1, train_f1, val_f1))
#     print()

#     # Store the loss value for plotting the learning curve.
#     all_loss['train_loss'].append(train_loss)
#     all_loss['val_loss'].append(val_loss)
#     all_acc['train_acc'].append(train_acc)
#     all_acc['val_acc'].append(val_acc)
#     all_f1['train_f1'].append(train_f1)
#     all_f1['val_f1'].append(val_f1)

#     model.save_pretrained("XLNet/lr_{}_epoch{}.pt".format(lr_param, epoch+1))


In [None]:
# Best validation accuracy seen so far; `train` reads and updates it too.
best_acc = 0.0

print('Epoch')
for epoch in tqdm(range(epochs)):
    epoch_number = epoch + 1

    print()
    print('Training {} on batches...'.format(epoch))

    # One full pass over the training set (with periodic validation inside).
    train_labels, train_predict, train_loss = train(
        train_dataloader, valid_dataloader, optimizer, scheduler, device
    )
    train_acc = accuracy_score(train_labels, train_predict)
    train_f1 = f1_score(train_labels, train_predict)

    # End-of-epoch pass over the validation set.
    print('Validation on batches...')
    valid_labels, valid_predict, val_loss = validation(valid_dataloader, device)
    val_acc = accuracy_score(valid_labels, valid_predict)
    val_f1 = f1_score(valid_labels, valid_predict)

    # Report how training evolves.
    print("epoch: %d  train_loss: %.5f - val_loss: %.5f - train_acc: %.5f - valid_acc: %.5f"%(epoch_number, train_loss, val_loss, train_acc, val_acc))
    print("epoch: %d  train_f1: %.5f - val_f1: %.5f -"%(epoch_number, train_f1, val_f1))
    print()

    # Record this epoch's metrics for plotting the learning curves.
    epoch_metrics = (
        (all_loss, 'train_loss', train_loss),
        (all_loss, 'val_loss', val_loss),
        (all_acc, 'train_acc', train_acc),
        (all_acc, 'val_acc', val_acc),
        (all_f1, 'train_f1', train_f1),
        (all_f1, 'val_f1', val_f1),
    )
    for history, metric_name, metric_value in epoch_metrics:
        history[metric_name].append(metric_value)

    # Save the model whenever validation accuracy matches or beats the best.
    if val_acc >= best_acc:
        model.save_pretrained("XLNet/lr_{}_best.pt".format(lr_param))
        best_acc = val_acc
        print("model saved   best_valid_acc: %.5f"%(val_acc))


Epoch


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


Training 0 on batches...


HBox(children=(FloatProgress(value=0.0, max=4899.0), HTML(value='')))

Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


step: 500   val_loss: 0.67573 - valid_acc: 0.57812
model saved   best_valid_acc: 0.57812
Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


step: 1000   val_loss: 0.64358 - valid_acc: 0.62531
model saved   best_valid_acc: 0.62531
Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


step: 1500   val_loss: 0.61214 - valid_acc: 0.66391
model saved   best_valid_acc: 0.66391
Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


step: 2000   val_loss: 0.64418 - valid_acc: 0.61887
Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


step: 2500   val_loss: 0.61180 - valid_acc: 0.64890
Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


step: 3000   val_loss: 0.59022 - valid_acc: 0.67525
model saved   best_valid_acc: 0.67525
Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


step: 3500   val_loss: 0.63495 - valid_acc: 0.66912
Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


step: 4000   val_loss: 0.59264 - valid_acc: 0.67034
Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


step: 4500   val_loss: 0.58808 - valid_acc: 0.67463

Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


epoch: 1  train_loss: 0.63057 - val_loss: 0.59186 - train_acc: 0.63365 - valid_acc: 0.67341
epoch: 1  train_f1: 0.64458 - val_f1: 0.64842 -


Training 1 on batches...


HBox(children=(FloatProgress(value=0.0, max=4899.0), HTML(value='')))

Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


step: 500   val_loss: 0.65786 - valid_acc: 0.65993
Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


step: 1000   val_loss: 0.59651 - valid_acc: 0.68750
model saved   best_valid_acc: 0.68750
Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


step: 1500   val_loss: 0.59008 - valid_acc: 0.68842
model saved   best_valid_acc: 0.68842
Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


step: 2000   val_loss: 0.63142 - valid_acc: 0.66697
Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


step: 2500   val_loss: 0.58968 - valid_acc: 0.67923
Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


step: 3000   val_loss: 0.58557 - valid_acc: 0.68076
Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


step: 3500   val_loss: 0.60582 - valid_acc: 0.68290
Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


step: 4000   val_loss: 0.60676 - valid_acc: 0.67831
Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


step: 4500   val_loss: 0.61667 - valid_acc: 0.68290

Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


epoch: 2  train_loss: 0.53792 - val_loss: 0.58085 - train_acc: 0.72289 - valid_acc: 0.68045
epoch: 2  train_f1: 0.72272 - val_f1: 0.67212 -


Training 2 on batches...


HBox(children=(FloatProgress(value=0.0, max=4899.0), HTML(value='')))

Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


step: 500   val_loss: 0.79639 - valid_acc: 0.65931
Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


step: 1000   val_loss: 0.74533 - valid_acc: 0.67249
Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


step: 1500   val_loss: 0.82576 - valid_acc: 0.67463
Validation on batches...


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))


step: 2000   val_loss: 0.85931 - valid_acc: 0.67616


KeyboardInterrupt: ignored

In [None]:
# Copy the saved checkpoints to Drive. The target folder is created first:
# copying into a target that does not exist yet fails with
# "cp: target ... is not a directory", as this cell's earlier output shows.
!mkdir -p /content/drive/MyDrive/XLNet-best && cp -r XLNet/* /content/drive/MyDrive/XLNet-best/

cp: target '/content/drive/MyDrive/XLNet-best/' is not a directory
