In [1]:
import pandas as pd
import torch

#read data from csv
df=pd.read_csv('data/train.csv')
input_text = df.text.values
keywords = df.keyword.values
input_str = [' '.join([str(keywords[i]),sen]) for i,sen in enumerate(input_text)]
input_labels = df.target.values

from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from utils import padding

#preprocessing
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lowercase=True)
input_ids = [tokenizer.encode(sen,add_special_tokens=True) for sen in input_str]
input_ids = padding(input_ids)
att_masks = [[int(ids>0) for ids in sen] for sen in input_ids]
train_ids, val_ids, train_labels, val_labels, train_masks, val_masks = train_test_split(input_ids, input_labels, att_masks, test_size=0.1, random_state = 42)

#convert to pytorch tensors
train_ids = torch.tensor(train_ids)
val_ids = torch.tensor(val_ids)
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)
train_masks = torch.tensor(train_masks)
val_masks = torch.tensor(val_masks)

from torch.utils.data import DataLoader,SequentialSampler,RandomSampler,TensorDataset

#wrap into datasets
train_data = TensorDataset(train_ids, train_masks, train_labels)
val_data = TensorDataset(val_ids, val_masks, val_labels)

batch_size = 16

#prepare dataloaders
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)
train_loader = DataLoader(train_data,sampler = train_sampler, batch_size = batch_size)
val_loader = DataLoader(val_data, sampler = val_sampler, batch_size = batch_size)

In [3]:
from model import BertPooled, BertClassifier

#initiate bert model
modelpooled = BertPooled.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2,
    output_attentions = False, 
    output_hidden_states = False, 
)
#classifier has three linear layers. see model.py
classifier = BertClassifier.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2,
    output_attentions = False, 
    output_hidden_states = False, 
)

if torch.cuda.is_available():
    modelpooled.cuda()
    classifier.cuda()
    device=torch.device('cuda')
else:
    device=torch.device('cpu')

In [4]:
#total number of parameters in model
sum([p.numel() for p in modelpooled.parameters()])

109482240

In [5]:
from transformers import AdamW,get_linear_schedule_with_warmup
epochs = 4
total_steps = len(train_loader) * epochs
optimizer = AdamW(list(modelpooled.parameters())+list(classifier.parameters()),lr = 2e-5, eps = 1e-8)
# scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps = 0,num_training_steps = total_steps)
scheduler =  torch.optim.lr_scheduler.ExponentialLR(optimizer,gamma=0.3)

In [6]:
from utils import format_time

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def validate(modelpooled,classifier, val_loader, device):
    #Validation
    print("")
    print("Running Validation...")

    t0 = time.time()

    modelpooled.eval()
    classifier.eval()

    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in val_loader:

        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        

        # Forward pass, calculate logit predictions.
        # This will return the logits rather than the loss because we have
        # not provided labels.
        # token_type_ids is the same as the "segment ids", which 
        # differentiates sentence 1 and 2 in 2-sentence tasks.
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = modelpooled(b_input_ids, 
                                  token_type_ids=None,
                                  attention_mask=b_input_mask)[0]
            outputs = classifier(outputs)

        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

    # Report the final accuracy for this validation run.
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))
    
    return eval_accuracy/nb_eval_steps

In [7]:
import random
import numpy as np
import time

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128


# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

for epoch_i in range(0, epochs):
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_loss = 0
    modelpooled.train()
    classifier.train()

    for step, batch in enumerate(train_loader):

        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_loader), elapsed))
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
#         model.zero_grad()  
        modelpooled.zero_grad()
        classifier.zero_grad()

        # This will return the loss (rather than the model output) because we
        # have provided the `labels`.
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        
        outputs = modelpooled(b_input_ids, 
            token_type_ids=None, 
            attention_mask=b_input_mask)[0]
        outputs = classifier(outputs,labels=b_labels)


#         outputs = model(b_input_ids, 
#                     token_type_ids=None, 
#                     attention_mask=b_input_mask, 
#                     labels=b_labels)
        
        # The call to `model` always returns a tuple, so we need to pull the 
        # loss value out of the tuple.
        loss = outputs[0]

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(modelpooled.parameters(), 1.0)
        torch.nn.utils.clip_grad_norm_(classifier.parameters(), 1.0)

        optimizer.step()
#         scheduler.step()

    avg_train_loss = total_loss / len(train_loader)            

    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
    
    validate(modelpooled,classifier, val_loader, device)
    scheduler.step()

print("")
print("Training complete!")


Training...
  Batch    40  of    429.    Elapsed: 0:00:21.
  Batch    80  of    429.    Elapsed: 0:00:42.
  Batch   120  of    429.    Elapsed: 0:01:03.
  Batch   160  of    429.    Elapsed: 0:01:25.
  Batch   200  of    429.    Elapsed: 0:01:46.
  Batch   240  of    429.    Elapsed: 0:02:08.
  Batch   280  of    429.    Elapsed: 0:02:30.
  Batch   320  of    429.    Elapsed: 0:02:51.
  Batch   360  of    429.    Elapsed: 0:03:13.
  Batch   400  of    429.    Elapsed: 0:03:35.

  Average training loss: 0.51
  Training epoch took: 0:03:51

Running Validation...
  Accuracy: 0.83
  Validation took: 0:00:06

Training...
  Batch    40  of    429.    Elapsed: 0:00:22.
  Batch    80  of    429.    Elapsed: 0:00:44.
  Batch   120  of    429.    Elapsed: 0:01:06.
  Batch   160  of    429.    Elapsed: 0:01:28.
  Batch   200  of    429.    Elapsed: 0:01:50.
  Batch   240  of    429.    Elapsed: 0:02:12.
  Batch   280  of    429.    Elapsed: 0:02:34.
  Batch   320  of    429.    Elapsed: 0:02:56.

In [8]:
#save
modelpooled.save_pretrained('save/modelpooled/')
classifier.save_pretrained('save/classifier/')
tokenizer.save_pretrained('save/')
torch.save(optimizer.state_dict(),'save/optim.bin')

In [5]:
#load
modelpooled = BertPooled.from_pretrained('save/modelpooled/').to(device)
classifier = BertClassifier.from_pretrained('save/classifier/').to(device)
tokenizer = BertTokenizer.from_pretrained('save/')

In [14]:
#load optimizer
checkpoint= torch.load('save/optim.bin')
optimizer.load_state_dict(checkpoint)

In [9]:
#process training data with the trained bert
pretrain_sampler = SequentialSampler(train_data)
pretrain_loader = DataLoader(train_data,sampler = pretrain_sampler, batch_size = batch_size)

In [10]:
from tqdm import tqdm_notebook
bert_output = []
labels=[]
for batch in tqdm_notebook(pretrain_loader):

        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        

        # Forward pass, calculate logit predictions.
        # This will return the logits rather than the loss because we have
        # not provided labels.
        # token_type_ids is the same as the "segment ids", which 
        # differentiates sentence 1 and 2 in 2-sentence tasks.
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = modelpooled(b_input_ids, 
                                  token_type_ids=None,
                                  attention_mask=b_input_mask)[0]
            bert_output.append(outputs.cpu().numpy())
            labels.append(b_labels.cpu().numpy())

HBox(children=(IntProgress(value=0, max=429), HTML(value='')))




In [11]:
#convert to numpy tensors
bert_np = np.concatenate(bert_output)
labels_np = np.concatenate(labels)

In [12]:
bert_np.shape

(6851, 768)

In [13]:
labels_np.shape

(6851,)

In [14]:
np.save('bert_np',bert_np)
np.save('labels_np',labels_np)

In [15]:
#process validation data with the trained bert
bert_output = []
labels=[]
for batch in tqdm_notebook(val_loader):

        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        

        # Forward pass, calculate logit predictions.
        # This will return the logits rather than the loss because we have
        # not provided labels.
        # token_type_ids is the same as the "segment ids", which 
        # differentiates sentence 1 and 2 in 2-sentence tasks.
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = modelpooled(b_input_ids, 
                                  token_type_ids=None,
                                  attention_mask=b_input_mask)[0]
            bert_output.append(outputs.cpu().numpy())
            labels.append(b_labels.cpu().numpy())

HBox(children=(IntProgress(value=0, max=48), HTML(value='')))




In [16]:
#convert to numpy tensors
val_bert_np = np.concatenate(bert_output)
val_labels_np = np.concatenate(labels)

In [17]:
np.save('val_bert_np',val_bert_np)
np.save('val_labels_np',val_labels_np)

In [18]:
#use xgboost to classify output of BERT
from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(bert_np,labels_np)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [19]:
#get accuracy of 0.84 again
y_pred = model.predict(val_bert_np)
predictions = [round(value) for value in y_pred]
from sklearn.metrics import accuracy_score
accuracy_score(list(val_labels_np),predictions)

0.84251968503937