# BERT Question Answering Bot


## Environment Setup

In [None]:
import tensorflow as tf
import random
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_transformers import BertTokenizer, BertConfig, BertModel, AdamW, BertForQuestionAnswering
from tqdm import tqdm, trange
import pandas as pd
import io
import os
import numpy as np
import matplotlib.pyplot as plt
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification, BertTokenizer, BertConfig
from tqdm import tqdm, trange
from utils_squad import (read_squad_examples, convert_examples_to_features,
                         RawResult, write_predictions,
                         RawResultExtended, write_predictions_extended)
from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad, plot_pr_curve
% matplotlib inline

In [None]:
# Abort early unless TensorFlow reports the first GPU.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

# PyTorch-side device handle; batches are moved onto it in the training cell.
n_gpu = torch.cuda.device_count()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Last expression of the cell: the notebook displays the name of GPU 0.
torch.cuda.get_device_name(0)

In [None]:
# !pip install pytorch-pretrained-bert pytorch-nlp pytorch_transformers

## Import Data

In [None]:
# Parse the training file into example objects (training mode, with the
# version-2 "negative" examples enabled).
input_file = '/data/train-v2.0.json'
examples_train = read_squad_examples(
    input_file=input_file,
    version_2_with_negative=True,
    is_training=True,
)

In [None]:
# One row per training example; each example's attribute dict becomes a record.
train_data = pd.DataFrame.from_records(list(map(vars, examples_train)))
train_data.head()

In [None]:
# Draw one random row, then display every row that shares its doc_tokens.
sample = train_data.sample(frac=1).head(1)
# One-element array holding the sampled row's doc_tokens value.
context = sample.doc_tokens.values
# NOTE(review): presumably relies on NumPy broadcasting the one-element object
# array against the whole column and comparing the stored values element by
# element -- confirm this selects the intended rows.
train_data[train_data.doc_tokens.values==context]

In [None]:
# Project helper from utils_preprocess; called here on the full training frame.
from utils_preprocess import print_squad_sample
print_squad_sample(train_data)

In [None]:
# Add length columns for the paragraph and the question of every example.
# NOTE(review): paragraph_len counts the entries of doc_tokens, while
# question_len is len() of question_text -- if that is a str it counts
# characters, not tokens. Confirm the two columns are meant to differ in unit.
train_data['paragraph_len'] = train_data['doc_tokens'].map(len)
train_data['question_len'] = train_data['question_text'].map(len)
# Peek at five randomly ordered rows.
train_data.sample(frac=1).head(5)

## Preprocessing

In [None]:
# Specify tokenizer
# NOTE: BertTokenizer is imported twice in the import cell (from
# pytorch_transformers and later from pytorch_pretrained_bert); the later
# import is the one this name is bound to here.
# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Feature-generation settings passed to create_features, plus the batch size
# passed to the data-loader helpers.
# presumably the stride used when a long paragraph is split into windows -- TODO confirm
doc_stride = 128
# presumably the maximum number of tokens per input sequence -- TODO confirm
max_seq_length = 256
# presumably the maximum number of question tokens kept -- TODO confirm
max_query_length = 64
batch_size = 16

In [None]:
# File in which the training features are cached.
cached_features_file = '/cache/cache_train'
# Create the parent directory of the cache file itself (rather than a relative
# 'cache' folder in the working directory) so the path above is guaranteed to
# exist before anything is written to it.
os.makedirs(os.path.dirname(cached_features_file), exist_ok=True)

In [None]:
from utils_preprocess import create_features

# Build the training features from the parsed examples; the cache file path is
# handed to the helper together with the tokenizer and the length settings.
features_train = create_features(
    cached_features_file=cached_features_file,
    examples=examples_train,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    max_query_length=max_query_length,
    doc_stride=doc_stride,
    is_training=True,
)


In [None]:
def set_seed(seed=42):
    """Seed every random number generator the notebook imports.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU and all CUDA
    devices) with the same value so that repeated runs are reproducible.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    # The stdlib generator must be seeded too, otherwise any use of `random`
    # stays non-deterministic between runs.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    
def to_list(tensor):
    """Return the contents of ``tensor`` as a (nested) Python list.

    The tensor is first detached from the autograd graph and copied to the CPU.
    """
    host_copy = tensor.detach().cpu()
    return host_copy.tolist()

In [None]:
# Build training data
from utils_preprocess import generate_bert_loader_train
# Returns the batch iterator used by the training loop plus a count that the
# training cell reports as "Num examples".
train_dataloader, len_train = generate_bert_loader_train(features_train, batch_size=batch_size, drop_last=True)

## Fine-Tune Model

In [None]:
# Specify architecture for output layer and whether to freeze bert parameters
from models import bert_mod
bert = bert_mod('bert-base-uncased')
bert.freeze_param()
model = bert.bert_linear_linear()

# Move the model onto the same `device` the training loop sends its batches to
# (instead of hard-coding CUDA), so model and inputs always live together.
# As the last expression of the cell this also prints the model architecture.
model.to(device)

In [None]:
# Print out parameters for output layer
# param_optimizer = list(model.named_parameters())
# print(param_optimizer[-2])
# print(param_optimizer[-1])

In [None]:
# Training-state counters shared with the training cell.
global_step = 0       # optimizer steps taken so far
train_loss_set = []   # per-batch loss history, plotted after training
tr_loss = 0.0         # running sum of batch losses

# Optimizer hyperparameters.
learning_rate = 5e-5
adam_epsilon = 1e-8

# Parameters whose names contain one of these substrings are exempt from
# weight decay (biases and LayerNorm weights).
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    # Every other parameter: regularized with weight decay.
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.1},
    # Exempt parameters: decay must be 0.0, otherwise the split into two
    # groups has no effect.
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon, betas=(0.9, 0.999))

In [None]:
# Fine-tune the model: `num_train_epochs` passes over `train_dataloader`,
# one optimizer step per batch.
num_train_epochs = 1
steps_per_epoch = len(train_dataloader)

print("***** Running training *****")
print("  Num examples = %d" % len_train)
print("  Num Epochs = %d" % num_train_epochs)
print("  Batch size = %d" % batch_size)
# One step per batch and no gradient accumulation, so the total is
# batches-per-epoch times the number of epochs.
print("  Total optimization steps = %d" % (steps_per_epoch * num_train_epochs))

model.zero_grad()
train_iterator = trange(num_train_epochs, desc="Epoch")
set_seed()

# `global_step` counts optimizer steps already taken. Capturing it once lets a
# re-run of this cell skip exactly the batches that were already processed,
# without dropping the first batch of a fresh run or whole later epochs.
resume_step = int(global_step)

# Nothing inside the loop switches the model to eval mode, so enabling
# training mode once up front is sufficient.
model.train()

for epoch in train_iterator:
    epoch_iterator = tqdm(train_dataloader,desc="iterations",disable=False,position=0, leave=True)
    for step, batch in enumerate(epoch_iterator):
        if epoch * steps_per_epoch + step < resume_step:
            continue

        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids':       batch[0],
                  'attention_mask':  batch[1],
                  'token_type_ids':  batch[2],
                  'start_positions': batch[3],
                  'end_positions':   batch[4]}

        outputs = model(**inputs)
        loss = outputs[0]  # the model returns the loss as its first output
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Record a plain Python float: keeping the tensor itself would retain
        # the autograd graph of every batch and could not be plotted later.
        loss_value = loss.item()
        train_loss_set.append(loss_value)
        tr_loss += loss_value

        optimizer.step()
        model.zero_grad()
        global_step += 1


In [None]:
# Plot training loss
# Convert every recorded loss to a plain float first: entries may be tensors
# (possibly on the GPU and attached to the autograd graph), which matplotlib
# cannot turn into an array. float() works for scalar tensors and floats alike.
loss_history = [float(batch_loss) for batch_loss in train_loss_set]

plt.figure(figsize=(15,8))
plt.title("Training loss")
plt.xlabel("Batch")
plt.ylabel("Loss")
plt.plot(loss_history)
plt.show()

## Evaluate Result

In [None]:
# Parse the dev file into example objects (evaluation mode, with the
# version-2 "negative" examples enabled).
path_val = '/data/dev-v2.0.json'
examples_val = read_squad_examples(
    input_file=path_val,
    version_2_with_negative=True,
    is_training=False,
)

# Same feature settings as used for the training features.
max_seq_length = 256
max_query_length = 64
doc_stride = 128
cached_features_file = '/cache/cache_validation'

# Cache features
features_val = create_features(
    cached_features_file=cached_features_file,
    examples=examples_val,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    max_query_length=max_query_length,
    doc_stride=doc_stride,
    is_training=False,
)


In [None]:
# Build the validation batches from the cached validation features.
from utils_preprocess import generate_bert_loader_validation
# NOTE(review): if drop_last has the usual DataLoader meaning, the final
# partial batch is discarded and those features never get a prediction --
# confirm this is intended for evaluation.
validation_dataloader, len_val = generate_bert_loader_validation(features_val, batch_size=batch_size, drop_last=True)

In [None]:
# Run the project's evaluate() helper on the validation data.
# NOTE(review): predict_file is not used anywhere in the visible cells; the
# call below passes path_val, which holds the same path.
predict_file = '/data/dev-v2.0.json'
from evaluate import evaluate
results = evaluate(model, 
                   validation_dataloader, 
                   features_val,
                   examples_val,
                   tokenizer, 
                   path_val, 
                   len_val, 
                   batch_size,
                   device)

In [None]:
import json

# Turn the results mapping into a list of single-entry dicts (one per metric),
# echo it, and persist it as JSON in the working directory.
results_json = [{metric: results[metric]} for metric in results.keys()]
print(results_json)
with open('results.json', 'w') as f:
    json.dump(results_json, f)