In [2]:
from fastai.collab import *
from fastai.tabular import *
from google.colab import drive
from google.colab import files
import sys

# Mount Google Drive inside the Colab VM at /content/gdrive.
# force_remount=True asks Colab to mount again even if a mount is already present.
drive.mount('/content/gdrive', force_remount=True)

# Project root on the mounted drive. squad_dir and output_dir are built from
# this value further down in the notebook.
root_dir = "/content/gdrive/My Drive/ml_projects/ml_optimisation/bert_nlp"

Mounted at /content/gdrive


In [0]:
# Installs the third-party packages this notebook needs into the Colab runtime.
# NOTE(review): pytorch-pretrained-bert and spacy are not version-pinned, so a
# fresh runtime may install different releases than the ones used for the
# recorded outputs.
!pip install pytorch-pretrained-bert

# spaCy plus ftfy (pinned to 4.4.3), followed by the spaCy English model.
# NOTE(review): no cell visible in this notebook imports spacy or ftfy directly;
# presumably they are needed by a dependency - confirm before removing.
!pip install spacy ftfy==4.4.3
!python -m spacy download en

In [0]:
# !pip uninstall apex
# !rm -rf apex

# #
# !git clone https://github.com/NVIDIA/apex.git
# %cd apex
# !python setup.py install --cuda_ext --cpp_ext

In [0]:
# Clone NVIDIA apex and build it with the C++ and CUDA extensions.
# NOTE: `%cd apex` changes the kernel's working directory for all later cells,
# and re-running this cell will clone again from inside the checkout.
!git clone https://github.com/NVIDIA/apex.git
%cd apex
!python setup.py install --cuda_ext --cpp_ext

# Editable pip install of the checkout at /content/apex with the same two
# extension flags.
# NOTE(review): this appears to repeat the setup.py install above, and the
# absolute path assumes the clone happened in /content - confirm which install
# is actually required.
!pip install -e "/content/apex" --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext"

In [0]:
from __future__ import absolute_import, division, print_function

# import argparse
# import parser
import collections
import json
import logging
import math
import os
import random
import sys
from io import open

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)

from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.modeling import BertForQuestionAnswering, BertConfig
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
                                                  BertTokenizer,
                                                  whitespace_tokenize)

import apex
import pickle

# Module-level logger used by the setup, training and prediction cells.
logger = logging.getLogger(__name__)

In [0]:
# Turns on IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to helper modules are picked up
# without restarting the kernel. (This cell does not affect plotting.)
%reload_ext autoreload
%autoreload 2

In [0]:
# Make the project's squad_support folder importable so that `squad_helper`
# can be imported. The folder is derived from root_dir instead of repeating the
# full Drive path, and it is only appended once so re-running this cell does
# not pile up duplicate sys.path entries.
squad_support_dir = root_dir + '/squad_support'

if squad_support_dir not in sys.path:
    sys.path.append(squad_support_dir)

In [0]:
# Imports the project-local squad_helper module and pulls all of its public
# names into the notebook namespace.
# NOTE(review): the star import presumably supplies read_squad_examples,
# convert_examples_to_features, RawResult and write_predictions, which later
# cells call without any other visible import - confirm, and prefer importing
# those names explicitly.
import squad_helper
from squad_helper import *

In [11]:
# Directory under the project root that holds the SQuAD json files
# (train_file and predict_file are resolved relative to it).
squad_dir = Path(root_dir) / 'squad_dir'
squad_dir

PosixPath('/content/gdrive/My Drive/ml_projects/ml_optimisation/bert_nlp/squad_dir')

In [12]:
# Directory under the project root that receives the saved model files and the
# prediction json files.
output_dir = Path(root_dir) / 'output_dir'
output_dir

PosixPath('/content/gdrive/My Drive/ml_projects/ml_optimisation/bert_nlp/output_dir')

## Setup

In [0]:
# ---------------------------------------------------------------------------
# Run configuration. Every later cell reads these module-level names.
# ---------------------------------------------------------------------------

# Remote debugging: the ptvsd attach is only attempted when both are non-empty.
server_ip = ''
server_port = ''

# Pretrained model and which phases to run.
brt_model = 'bert-base-uncased'
do_train = False
do_predict = True

# Lower-casing flags: lower_case is used when the tokenizer is first created,
# do_lower_case when it is reloaded after training and by write_predictions.
lower_case = True
do_lower_case = True

# Input files and output location.
train_file = squad_dir/'train-v1.1.json'
predict_file = squad_dir/'dev-v1.1.json'
output_dir = output_dir  # re-binds the Path defined earlier; listed here with the other settings

# Feature extraction.
max_seq_length = 384
doc_stride = 128
max_query_length = 64

# Optimisation.
train_batch_size = 12
predict_batch_size = 8
learning_rate = 3e-5
num_train_epochs = 2.0
warmup_proportion = 0.1
gradient_accumulation_steps = 1
seed = 42

# Hardware and precision.
local_rank = -1   # -1 selects the non-distributed code paths
no_cuda = False
fp16 = True
loss_scale = 0    # 0 selects dynamic loss scaling in the fp16 optimizer setup

# Prediction post-processing.
# NOTE(review): version_2_with_negative is True although the data files are
# the v1.1 ones and the training cell passes False - confirm this is intended.
version_2_with_negative = True
n_best_size = 20
max_answer_length = 30
null_score_diff_threshold = 0
verbose_logging = True

## Device, logging and seeding

In [0]:
# Runtime setup: optional debugger attach, device selection, logging, seeding,
# argument validation and output directory creation.

# Optional remote debugging; skipped unless both server_ip and server_port are set.
if server_ip and server_port:
    # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
    import ptvsd

    print("Waiting for debugger attach")
    ptvsd.enable_attach(address=(server_ip, server_port), redirect_output=True)
    ptvsd.wait_for_attach()

# Device selection: single process (possibly multi-GPU) vs. one GPU per
# distributed process.
if local_rank == -1 or no_cuda:
    device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
else:
    torch.cuda.set_device(local_rank)
    device = torch.device("cuda", local_rank)
    n_gpu = 1
    # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    torch.distributed.init_process_group(backend='nccl')

# Everything below applies to BOTH branches above. It used to be indented under
# the distributed `else:` branch, so with local_rank == -1 the logging setup,
# seeding, validation and output directory creation were silently skipped.
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO if local_rank in [-1, 0] else logging.WARN)

logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
    device, n_gpu, bool(local_rank != -1), fp16))

if gradient_accumulation_steps < 1:
    raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(gradient_accumulation_steps))

# Per-step batch size once gradient accumulation is taken into account.
# NOTE: this overwrites train_batch_size, so re-running the cell with
# gradient_accumulation_steps > 1 divides it again.
train_batch_size = train_batch_size // gradient_accumulation_steps

# Seed every random number generator used in the notebook.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

if n_gpu > 0:
    torch.cuda.manual_seed_all(seed)

# Validate the run mode and its required inputs.
if not do_train and not do_predict:
    raise ValueError("At least one of `do_train` or `do_predict` must be True.")

if do_train:
    if not train_file:
        raise ValueError(
            "If `do_train` is True, then `train_file` must be specified.")

if do_predict:
    if not predict_file:
        raise ValueError(
            "If `do_predict` is True, then `predict_file` must be specified.")

# Refuse to train into a non-empty output directory; create it when missing.
if os.path.exists(output_dir) and os.listdir(output_dir) and do_train:
    raise ValueError("Output directory ({}) already exists and is not empty.".format(output_dir))

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

## Tokenizer and training examples

In [15]:
# Build the tokenizer for the chosen pretrained model.
tokenizer = BertTokenizer.from_pretrained(brt_model, do_lower_case=lower_case)

# Both stay None when training is disabled.
train_examples, num_train_optimization_steps = None, None

if do_train:
    # Read the training examples from train_file (v2 "negative" handling is off here).
    train_examples = read_squad_examples(input_file=train_file,
                                         is_training=True,
                                         version_2_with_negative=False)

    # Optimizer steps per epoch (truncated to an int), times the number of epochs.
    steps_per_epoch = int(len(train_examples) / train_batch_size / gradient_accumulation_steps)
    num_train_optimization_steps = steps_per_epoch * num_train_epochs

    # In distributed mode the step count is divided by the world size.
    if local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

100%|██████████| 231508/231508 [00:00<00:00, 5819757.69B/s]


In [16]:
# Per-rank sub-folder of the pytorch_pretrained_bert cache directory.
rank_folder = 'distributed_' + str(local_rank)
cache_dir = os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), rank_folder)

cache_dir

'/root/.pytorch_pretrained_bert/distributed_-1'

## Model preparation

In [17]:
# Load the pretrained BERT question-answering model named by brt_model.
model = BertForQuestionAnswering.from_pretrained(brt_model)
# model = BertForQuestionAnswering.from_pretrained(model,
#                                                  cache_dir
#                                                  )

# Only the half-precision cast depends on fp16. Moving the model to the device,
# wrapping it for multi-GPU and building the optimizer must happen either way;
# they used to be nested under `if fp16:`, so with fp16=False the model stayed
# off the device and no optimizer was created.
if fp16:
    model.half()

model.to(device)

# Wrap the model for distributed or multi-GPU execution.
if local_rank != -1:
    try:
        from apex.parallel import DistributedDataParallel as DDP
    except ImportError:
        raise ImportError(
            "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

    model = DDP(model)

elif n_gpu > 1:
    model = torch.nn.DataParallel(model)

# Prepare optimizer
if do_train:
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used
    # thus it produce None grad that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    # Parameters whose name contains one of these get no weight decay.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(
            nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(
            nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    if fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr = learning_rate,
                              bias_correction = False,
                              max_grad_norm = 1.0)

        # loss_scale == 0 selects dynamic loss scaling, anything else a static scale.
        if loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale = True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale = loss_scale)

        # Used by the training loop to set the learning rate by hand in fp16 mode.
        warmup_linear = WarmupLinearSchedule(warmup = warmup_proportion,
                                             t_total = num_train_optimization_steps)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr = learning_rate,
                             warmup = warmup_proportion,
                             t_total = num_train_optimization_steps)


100%|██████████| 407873900/407873900 [00:06<00:00, 65070243.47B/s]


## Cached training features

In [18]:
# Cache file name: the training file path followed by the last non-empty
# '/'-separated piece of the model name and the three feature-extraction
# settings, all joined with underscores.
model_tag = [piece for piece in brt_model.split('/') if piece][-1]

cached_train_features_file = '_'.join(
    [str(train_file), model_tag, str(max_seq_length), str(doc_stride), str(max_query_length)])

cached_train_features_file

'/content/gdrive/My Drive/ml_projects/ml_optimisation/bert_nlp/squad_dir/train-v1.1.json_bert-base-uncased_384_128_64'

## Training

In [0]:
# Fine-tuning loop. Runs only when do_train is True; global_step counts the
# optimizer updates performed so far.
global_step = 0

if do_train:

    # Cache file name encodes the model name and the feature-extraction settings.
    cached_train_features_file = str(train_file) + '_{0}_{1}_{2}_{3}'.format(
        list(filter(None, brt_model.split('/'))).pop(), str(max_seq_length), str(doc_stride), str(max_query_length))

    train_features = None

    try:
        # NOTE: pickle.load can execute arbitrary code from the file; only use
        # cache files that this notebook wrote itself.
        with open(cached_train_features_file, "rb") as reader:
            train_features = pickle.load(reader)

    except Exception:
        # Cache missing or unreadable: rebuild the features from the examples.
        # `Exception` (rather than a bare except) keeps KeyboardInterrupt and
        # SystemExit from being swallowed here.
        train_features = convert_examples_to_features(
            examples = train_examples,
            tokenizer = tokenizer,
            max_seq_length = max_seq_length,
            doc_stride = doc_stride,
            max_query_length = max_query_length,
            is_training = True
        )

        # Only the single process / rank 0 writes the cache file.
        if local_rank == -1 or torch.distributed.get_rank() == 0:
            logger.info("  Saving train features into cached file %s", cached_train_features_file)

            with open(cached_train_features_file, "wb") as writer:
                pickle.dump(train_features, writer)

    logger.info("***** Running training *****")
    logger.info("  Num orig examples = %d", len(train_examples))
    logger.info("  Num split examples = %d", len(train_features))
    logger.info("  Batch size = %d", train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)

    # Pack the per-feature fields into long tensors, one row per feature.
    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
    all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)

    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                               all_start_positions, all_end_positions)

    if local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)

    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size = train_batch_size)

    model.train()

    for _ in trange(int(num_train_epochs), desc="Epoch"):

        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable = local_rank not in [-1, 0])):

            if n_gpu == 1:
                batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self

            input_ids, input_mask, segment_ids, start_positions, end_positions = batch

            loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)

            if n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu.

            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            if fp16:
                optimizer.backward(loss)
            else:
                loss.backward()

            # Update the weights once every gradient_accumulation_steps batches.
            if (step + 1) % gradient_accumulation_steps == 0:
                if fp16:
                    # modify learning rate with special warm up BERT uses
                    # if fp16 is False, BertAdam is used and handles this automatically
                    #
                    # warmup_linear was constructed with t_total, so get_lr is
                    # given the raw step count. Passing
                    # global_step / num_train_optimization_steps normalised the
                    # step a second time and kept the multiplier near zero.
                    # NOTE(review): confirm against the installed
                    # pytorch_pretrained_bert version of WarmupLinearSchedule.get_lr.
                    lr_this_step = learning_rate * warmup_linear.get_lr(global_step,
                                                                        warmup_proportion)

                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step

                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

## Saving the model and running predictions

In [20]:
# Save the fine-tuned model (when training ran), then run prediction on
# predict_file and write the prediction files into output_dir.
if do_train and (local_rank == -1 or torch.distributed.get_rank() == 0):

    # Save a trained model, configuration and tokenizer
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

    # If we save using the predefined names, we can load using `from_pretrained`
    output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
    output_config_file = os.path.join(output_dir, CONFIG_NAME)

    torch.save(model_to_save.state_dict(), output_model_file)
    model_to_save.config.to_json_file(output_config_file)
    tokenizer.save_vocabulary(output_dir)

    # Load a trained model and vocabulary that you have fine-tuned
    model = BertForQuestionAnswering.from_pretrained(output_dir)
    tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case = do_lower_case)

else:
    # NOTE(review): without training this loads the base pretrained weights
    # named by brt_model, not anything previously saved in output_dir -
    # confirm that is what is wanted for prediction-only runs.
    model = BertForQuestionAnswering.from_pretrained(brt_model)

# Moving the model to the device and the prediction section apply to both
# branches above. They used to be nested inside the `else:`, so after a
# training run the reloaded model stayed on the CPU and prediction was skipped.
model.to(device)

if do_predict and (local_rank == -1 or torch.distributed.get_rank() == 0):

    eval_examples = read_squad_examples(
        input_file = predict_file, is_training = False, version_2_with_negative = version_2_with_negative)

    eval_features = convert_examples_to_features(
        examples = eval_examples,
        tokenizer = tokenizer,
        max_seq_length = max_seq_length,
        doc_stride = doc_stride,
        max_query_length = max_query_length,
        is_training = False)

    logger.info("***** Running predictions *****")
    logger.info("  Num orig examples = %d", len(eval_examples))
    logger.info("  Num split examples = %d", len(eval_features))
    logger.info("  Batch size = %d", predict_batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)

    # Row index of each feature, carried through the loader so every output
    # can be matched back to its entry in eval_features.
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)

    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size = predict_batch_size)

    model.eval()

    all_results = []

    logger.info("Start evaluating")

    for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable = local_rank not in [-1, 0]):

        if len(all_results) % 1000 == 0:
            logger.info("Processing example: %d", len(all_results))

        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)

        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()

            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            all_results.append(RawResult(unique_id = unique_id,
                                         start_logits = start_logits,
                                         end_logits = end_logits))

    output_prediction_file = os.path.join(output_dir, "predictions.json")
    output_nbest_file = os.path.join(output_dir, "nbest_predictions.json")
    output_null_log_odds_file = os.path.join(output_dir, "null_odds.json")

    write_predictions(eval_examples, eval_features, all_results,
                      n_best_size, max_answer_length,
                      do_lower_case, output_prediction_file,
                      output_nbest_file, output_null_log_odds_file, verbose_logging,
                      version_2_with_negative, null_score_diff_threshold)

Evaluating: 100%|██████████| 1355/1355 [05:11<00:00,  4.35it/s]
