<a href="https://colab.research.google.com/github/manishiitg/ML_Experiments/blob/master/squad_huggingface_experiment_with_Trainer_TPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Sun Jun 14 06:27:17 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.87.01    Driver Version: 418.87.01    CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    32W /  70W |      0MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   72C    P0    34W /  70W |      0MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               


In [2]:
!git clone https://github.com/huggingface/transformers.git
!pip install -U ./transformers

Cloning into 'transformers'...
remote: Enumerating objects: 29234, done.[K
remote: Total 29234 (delta 0), reused 0 (delta 0), pack-reused 29234[K
Receiving objects: 100% (29234/29234), 26.53 MiB | 36.52 MiB/s, done.
Resolving deltas: 100% (20256/20256), done.
Processing ./transformers
Collecting tokenizers==0.7.0
  Downloading tokenizers-0.7.0-cp37-cp37m-manylinux1_x86_64.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 3.0 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.91-cp37-cp37m-manylinux1_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 22.3 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.43.tar.gz (883 kB)
[K     |████████████████████████████████| 883 kB 23.8 MB/s 
Building wheels for collected packages: transformers, sacremoses
  Building wheel for transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for transformers: filename=transformers-2.11.0-py3-none-any.whl size=688546 sha256=70176578

# Download SQUAD v2 Dataset

In [3]:
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json

--2020-06-14 05:50:51--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.111.153, 185.199.108.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.111.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘train-v2.0.json’


2020-06-14 05:50:54 (18.8 MB/s) - ‘train-v2.0.json’ saved [42123633/42123633]

--2020-06-14 05:50:54--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.111.153, 185.199.108.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.111.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4370528 (4.2M) [application/json]
Saving to: ‘dev-v2.0.json’


2020-06-14 05:50:54 (6.53 MB/s) - ‘dev-v2.0.json’ saved [4370528/4370528]



In [0]:
import dataclasses
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Dict, Optional

import numpy as np

from transformers import (
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    glue_compute_metrics,
    glue_output_modes,
    glue_tasks_num_labels,
    set_seed,
)


from transformers import (
    MODEL_FOR_QUESTION_ANSWERING_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    squad_convert_examples_to_features,
    PreTrainedTokenizer,
    DataCollator
)
from transformers.data.metrics.squad_metrics import (
    compute_predictions_log_probs,
    compute_predictions_logits,
    squad_evaluate,
)
from transformers.data.processors.squad import SquadResult, SquadV1Processor, SquadV2Processor, SquadExample, SquadFeatures

from torch.utils.data.dataset import Dataset
from typing import List, Optional, Union
from enum import Enum
from filelock import FileLock
import time
import torch

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm.auto import tqdm
import timeit


class Split(Enum):
    train = "train"
    dev = "dev"
    test = "test"


In [0]:
logger = logging.getLogger(__name__)

@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """
    model_type: str = field(
        metadata={"help": "type of model"}
    )
    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
    )
    model_checkpoint: Optional[str] = field(
        default=None, metadata={"help": "Which checkout do you want to resume check point"}
    )


# Copy your check points to google drive.

Useful if your training stops in between in colab and you need to resume again

In [0]:
# !rm -rf cache
# !rm -rf output

# from google.colab import drive
# drive.mount('/content/drive')

# !cp -rf output/distilbert/distilbert-base-uncased /content/drive/My\ Drive/

# !cp -rf output/distilbert//content/output/distilbert/distilbert-base-uncased/checkpoint-52500/* /content/drive/My\ Drive/distilbert-base-uncased

# !mkdir output
# !mkdir output/distilbert
# !cp -rf /content/drive/My\ Drive/distilbert-base-uncased output/distilbert/

# Define your training configuration

In [0]:
# model_name = 'bert'
# model_name_or_path = 'bert-base-uncased'

# model_name = 'distilbert'
# model_name_or_path = 'distilbert-base-uncased' 

model_name = 'roberta'
model_name_or_path = 'distilroberta-base'

model_checkpoint = './output/' + model_name + '/' + model_name_or_path + '/checkpoint-52500'


base_dir = "./drive/" # on google gcp instance else set this to "./" on colab

config = {
    "model_type" : model_name,
    "model_name_or_path" : model_name_or_path,
    "output_dir": base_dir + "output/" + model_name + "/" + model_name_or_path,
    "model_checkpoint" : base_dir + model_checkpoint,
    "do_train" : True,
    "do_eval" : True,
    "do_predict" : False,
    "data_dir" : "",
    "overwrite_output_dir" : True,
    "overwrite_cache" : False,
    "cache_dir" : base_dir + "cache",
    "limit_length": None,
    "max_seq_length": 328,
    "doc_stride" : 128,
    "train_batch_size" : 8,
    "per_device_train_batch_size" : 48,
    "save_steps": 5000,
    "save_total_limit": 2,
}

import json
with open('args.json', 'w') as f:
    json.dump(config, f)

# Data Training Arguments

In [0]:
@dataclass
class SquadDataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.
    """


    data_dir: str = field(
        metadata={"help": "The input data dir. Should contain the .tsv files (or other data files) for the task."}
    )
    max_seq_length: int = field(
        default=328,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    doc_stride: int = field(
        default=128,
        metadata={
            "help": "When splitting up a long document into chunks, how much stride to take between chunks."
        },
    )
    max_query_length: int = field(
        default=64,
        metadata={
            "help": "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets."}
    )
    threads: int = field(
        default=1, metadata={"help": "multiple threads for converting example to features."}
    )
    version_2_with_negative : bool = field(
        default=True, metadata={"help": "If true, the SQuAD examples contain some that do not have an answer."}
    )
    train_file: str = field(
        default='train-v2.0.json',
        metadata={"help": "name of training file"}
    )
    predict_file: str = field(
        default='dev-v2.0.json',
        metadata={"help": "name of dev file"}
    )
    do_lower_case: bool = field(
        default=False, metadata={"help": "do lower case"}
    )
    limit_length: int = field(
        default=None, metadata={"help": "length of dataset to process optional"}
    )
    n_best_size: int = field(
        default=20,  metadata={"help": "The total number of n-best predictions to generate in the nbest_predictions.json output file."}
    )
    max_answer_length:int = field(
        default=30, metadata={"help": "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another."}
    )
    verbose_logging: bool = field(
        default=False, metadata={"help": "If true, all of the warnings related to data processing will be printed."}
    )
    lang_id:int = field(
        default=0, metadata={"help": "language id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)"}
    )
    null_score_diff_threshold: float = field(
        default=0.0, metadata={"help": "If null_score - best_non_null is greater than the threshold predict null."}
    )

    def __post_init__(self):
        self.task_name = "squad"

# Squad Custom Dataset

In [0]:
from torch.utils.data import TensorDataset

class SquadCustomDataset(Dataset):
    """
    This will be superseded by a framework-agnostic approach
    soon.
    """

    args: SquadDataTrainingArguments
    output_mode: str
    features: List[SquadFeatures]
    examples: List[SquadExample]
    dataset: TensorDataset

    def __init__(
        self,
        args: SquadDataTrainingArguments,
        tokenizer: PreTrainedTokenizer,
        limit_length: Optional[int] = None,
        mode: Union[str, Split] = Split.train,
    ):
        self.args = args
        self.processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
        self.output_mode = "" #need to see for this
        if isinstance(mode, str):
            try:
                mode = Split[mode]
            except KeyError:
                raise KeyError("mode is not a valid split name")
        # Load data features from cache or dataset file
        cached_features_file = os.path.join(
            args.data_dir,
            "cached_{}_{}_{}_{}_{}".format(
                mode.value, tokenizer.__class__.__name__, str(args.max_seq_length), args.task_name,limit_length
            ),
        )
        
        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):

            if os.path.exists(cached_features_file) and not args.overwrite_cache:
                start = time.time()
                obj = torch.load(cached_features_file)
                self.features = obj["features"]
                self.examples = obj["examples"]
                # self.dataset = obj["dataset"]


                logger.info(
                    f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
                )
            else:
                logger.info(f"Creating features from dataset file at {args.data_dir}")

                if mode == Split.dev:
                    examples = self.processor.get_dev_examples(args.data_dir, filename=args.predict_file)
                elif mode == Split.test:
                    examples = self.processor.get_test_examples(args.data_dir, filename=args.predict_file)
                else:
                    examples = self.processor.get_train_examples(args.data_dir, filename=args.train_file)

                if limit_length is not None:
                    logger.info(
                        "limit data to length %s ", limit_length
                    )
                    examples = examples[:limit_length]

                self.examples = examples

                self.features, self.dataset = squad_convert_examples_to_features(
                    examples=examples,
                    tokenizer=tokenizer,
                    max_seq_length=args.max_seq_length,
                    doc_stride=args.doc_stride,
                    max_query_length=args.max_query_length,
                    is_training= mode == Split.test,
                    return_dataset="pt",
                    threads=args.threads,
                )
                
                start = time.time()
                torch.save({
                    "examples" : self.examples,
                    "features" : self.features,
                    # "dataset" : self.dataset
                }, cached_features_file)
                # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
                logger.info(
                    "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )

    def __len__(self):
        return len(self.features)

    def __getitem__(self, i) -> SquadFeatures:
        return self.features[i]

    def get_features(self):
      return self.features

    def get_examples(self):
      return self.examples

In [0]:
from torch import nn

# not used
@dataclass
class PassDataCollator(DataCollator):
  def collate_batch(self, batch: List) -> Dict[str, torch.Tensor]:
    return batch 

@dataclass
class SquadDataCollator(DataCollator):

    model_args: ModelArguments

    def __init__(
        self,
        model_args: ModelArguments):
      
      self.model_args = model_args


    def collate_batch(self, features: List) -> Dict[str, torch.Tensor]:
        # taken from https://github.com/huggingface/transformers/blob/5daca95dddf940139d749b1ca42c59ebc5191979/src/transformers/data/processors/squad.py#L325
        # and https://github.com/huggingface/transformers/blob/5daca95dddf940139d749b1ca42c59ebc5191979/src/transformers/data/processors/squad.py#L325
        # first = features[0]

        

        # for f in features:
        #   logger.warning("unique id collator %s", f.unique_id)
        #   logger.warning("example_index collator %s", f.example_index)

        unique_ids = torch.tensor([f.unique_id for f in features])

        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
        all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
        all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)

        

        # if hasattr(first, "start_position") and first.start_position is not None:
        all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

        inputs = {
            "input_ids" : all_input_ids,
            "attention_mask" : all_attention_masks,
            "token_type_ids" : all_token_type_ids,
            "start_positions" : all_start_positions,
            "end_positions" : all_end_positions,
            "cls_index": all_cls_index,
            "p_mask": all_p_mask,
            "example_index" : all_example_index,
            "unique_ids" : unique_ids
        }

        # if self.model_args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
        #     del inputs["token_type_ids"]

        # if self.model_args.model_type in ["xlnet", "xlm"]:
        #     # inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
        #     # if args.version_2_with_negative:
        #     #     inputs.update({"is_impossible": batch[7]})
        #     # if hasattr(model, "config") and hasattr(model.config, "lang2id"):
        #     #     inputs.update(
        #     #         {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
        #     #     )
        # else:
        #   del inputs["cls_index"]
        #   del inputs["p_mask"]

        return inputs
        
class SquadTrainer(Trainer):
    model_args: ModelArguments
    def __init__(self, model_args, **kwargs):
        self.model_args = model_args
        super().__init__(**kwargs)

    def _training_step(
        self, model: nn.Module, inputs: Dict[str, torch.Tensor], optimizer: torch.optim.Optimizer
    ) -> float:
        model.train()
        for k, v in inputs.items():
          inputs[k] = v.to(self.args.device)

        del inputs["unique_ids"]
        del inputs["example_index"]

        # this can be handled at dataset level as well. 
        # no need to extend SquadTrainer

        if self.model_args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
            del inputs["token_type_ids"]

        if self.model_args.model_type in ["xlnet", "xlm"]:
            inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
            if args.version_2_with_negative:
                inputs.update({"is_impossible": batch[7]})
            if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                inputs.update(
                    {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
                )
        else:
          del inputs["cls_index"]
          del inputs["p_mask"]
        

        outputs = model(**inputs)
        loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

        if self.args.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training
        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        if self.args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        return loss.item()


def to_list(tensor):
    return tensor.detach().cpu().tolist()

# main()

In [0]:
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.


    parser = HfArgumentParser((ModelArguments, SquadDataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_json_file(json_file="args.json")

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )

    if data_args.doc_stride >= data_args.max_seq_length - data_args.max_query_length:
        logger.warning(
            "WARNING - You've set a doc stride which may be superior to the document length in some "
            "examples. This could result in errors when building features from the examples. Please reduce the doc "
            "stride or increase the maximum length to ensure the features are correctly built."
        )

    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir if model_args.cache_dir else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        do_lower_case=data_args.do_lower_case,
        cache_dir=model_args.cache_dir if model_args.cache_dir else None,
        use_fast=False
    )
    model = AutoModelForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir if model_args.cache_dir else None,
    )

    # Get datasets
    train_dataset = SquadCustomDataset(data_args, tokenizer=tokenizer, limit_length=data_args.limit_length) if training_args.do_train else None
    eval_dataset = SquadCustomDataset(data_args, tokenizer=tokenizer, mode="dev", limit_length=data_args.limit_length) if training_args.do_eval else None
    
    # Initialize our Trainer
    trainer = SquadTrainer(
        model_args=model_args,
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=None,
        data_collator=SquadDataCollator(model_args),
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_checkpoint if os.path.isdir(model_args.model_checkpoint) else None
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation

    ## unable to figure out evalution with Trainer due to distrbuted eval. 
    ## will come back to it later on
    eval_results = {}
    all_results = []
    results = {}
    start_time = timeit.default_timer()
    prefix = ''
    features = eval_dataset.get_features()
    if training_args.do_eval:
        logger.warning("*** Evaluate ***")

        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=training_args.eval_batch_size, collate_fn=SquadDataCollator(model_args).collate_batch)

        for inputs in tqdm(eval_dataloader, desc="Evaluating"):
          model.eval()

          with torch.no_grad():
              for k, v in inputs.items():
                inputs[k] = v.to(training_args.device)

              if model_args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
                  del inputs["token_type_ids"]

              feature_indices = inputs["example_index"]
              unique_ids = inputs["unique_ids"]

              del inputs["example_index"]
              del inputs["start_positions"]
              del inputs['end_positions']
              del inputs['unique_ids']

              # XLNet and XLM use more arguments for their predictions
              if model_args.model_type in ["xlnet", "xlm"]:
                  # inputs.update({"cls_index": inputs["cls_index"], "p_mask": inputs["p_mask"]})
                  # for lang_id-sensitive xlm models
                  if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                      inputs.update(
                          {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * model_args.lang_id).to(model_args.device)}
                      )
              else:
                del inputs["cls_index"]
                del inputs["p_mask"]

              outputs = model(**inputs)

              unique_ids = unique_ids.cpu().numpy()
              for i,unique_id in enumerate(unique_ids):
                  # TODO: i and feature_index are the same number! Simplify by removing enumerate?
                  # logger.warning("feature index %s" , feature_index.item())
                  # eval_feature = features[feature_index.item()]
                  # unique_id = int(eval_feature.unique_id)
                  # logger.warn("unique id %s" ,unique_id)
                  output = [to_list(output[i]) for output in outputs]

                  

                  # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
                  # models only use two.
                  if len(output) >= 5:
                      start_logits = output[0]
                      start_top_index = output[1]
                      end_logits = output[2]
                      end_top_index = output[3]
                      cls_logits = output[4]

                      result = SquadResult(
                          unique_id,
                          start_logits,
                          end_logits,
                          start_top_index=start_top_index,
                          end_top_index=end_top_index,
                          cls_logits=cls_logits,
                      )

                  else:
                      start_logits, end_logits = output
                      result = SquadResult(unique_id, start_logits, end_logits)

                  all_results.append(result)

        evalTime = timeit.default_timer() - start_time
        logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(eval_dataset))

        # Compute predictions
        output_prediction_file = os.path.join(training_args.output_dir, "predictions_{}.json".format(prefix))
        output_nbest_file = os.path.join(training_args.output_dir, "nbest_predictions_{}.json".format(prefix))

        if data_args.version_2_with_negative:
            output_null_log_odds_file = os.path.join(training_args.output_dir, "null_odds_{}.json".format(prefix))
        else:
            output_null_log_odds_file = None


        examples = eval_dataset.get_examples()
        
        # XLNet and XLM use a more complex post-processing procedure
        if model_args.model_type in ["xlnet", "xlm"]:
            start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
            end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top

            predictions = compute_predictions_log_probs(
                examples,
                features,
                all_results,
                data_args.n_best_size,
                data_args.max_answer_length,
                output_prediction_file,
                output_nbest_file,
                output_null_log_odds_file,
                start_n_top,
                end_n_top,
                data_args.version_2_with_negative,
                tokenizer,
                data_args.verbose_logging,
            )
        else:
            predictions = compute_predictions_logits(
                examples,
                features,
                all_results,
                data_args.n_best_size,
                data_args.max_answer_length,
                data_args.do_lower_case,
                output_prediction_file,
                output_nbest_file,
                output_null_log_odds_file,
                data_args.verbose_logging,
                data_args.version_2_with_negative,
                data_args.null_score_diff_threshold,
                tokenizer,
            )

        # Compute the F1 and exact scores.
        result = squad_evaluate(examples, predictions)

        logger.info(result)

    return eval_results

def _mp_fn(index):
    # For xla_spawn (TPUs)
    main()

In [9]:
main()   #via gpu or cpu

06/14/2020 06:29:07 - INFO - transformers.training_args -   PyTorch: setting up devices
06/14/2020 06:29:08 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='./drive/output/roberta/distilroberta-base', overwrite_output_dir=True, do_train=True, do_eval=True, do_predict=False, evaluate_during_training=False, per_device_train_batch_size=48, per_device_eval_batch_size=8, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, warmup_steps=0, logging_dir='runs/Jun14_06-29-07_torchvm3', logging_first_step=False, logging_steps=500, save_steps=5000, save_total_limit=2, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, dataloader_drop_last=False)
06/14/2020 06:29:09 - INFO - transformers.configuration_utils -   loading configuration file https://s3.

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1473.0, style=ProgressStyle(description_w…



{"loss": 0.10346589727760147, "learning_rate": 4.434261145055443e-05, "epoch": 0.3394433129667346, "step": 500}
{"loss": 3.783510135690449e-05, "learning_rate": 3.8685222901108846e-05, "epoch": 0.6788866259334692, "step": 1000}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1473.0, style=ProgressStyle(description_w…

{"loss": 2.2400965321139666e-05, "learning_rate": 3.3027834351663274e-05, "epoch": 1.0183299389002036, "step": 1500}
{"loss": 1.4963840394557337e-05, "learning_rate": 2.73704458022177e-05, "epoch": 1.3577732518669383, "step": 2000}
{"loss": 1.1106098689197097e-05, "learning_rate": 2.171305725277212e-05, "epoch": 1.6972165648336728, "step": 2500}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1473.0, style=ProgressStyle(description_w…

{"loss": 8.760819851886481e-06, "learning_rate": 1.6055668703326546e-05, "epoch": 2.0366598778004072, "step": 3000}
{"loss": 7.3004911382668065e-06, "learning_rate": 1.0398280153880969e-05, "epoch": 2.3761031907671417, "step": 3500}
{"loss": 6.398871559213149e-06, "learning_rate": 4.740891604435393e-06, "epoch": 2.7155465037338766, "step": 4000}


06/14/2020 08:34:50 - INFO - transformers.trainer -   

Training completed. Do not forget to share your model on huggingface.co/models =)


06/14/2020 08:34:50 - INFO - transformers.trainer -   Saving model checkpoint to ./drive/output/roberta/distilroberta-base
06/14/2020 08:34:50 - INFO - transformers.configuration_utils -   Configuration saved in ./drive/output/roberta/distilroberta-base/config.json






06/14/2020 08:34:51 - INFO - transformers.modeling_utils -   Model weights saved in ./drive/output/roberta/distilroberta-base/pytorch_model.bin


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=833.0, style=ProgressStyle(description_w…

06/14/2020 08:37:14 - INFO - __main__ -     Evaluation done in total 142.852460 secs (0.010724 sec per example)
06/14/2020 08:37:14 - INFO - transformers.data.metrics.squad_metrics -   Writing predictions to: ./drive/output/roberta/distilroberta-base/predictions_.json
06/14/2020 08:37:14 - INFO - transformers.data.metrics.squad_metrics -   Writing nbest to: ./drive/output/roberta/distilroberta-base/nbest_predictions_.json
06/14/2020 08:37:14 - INFO - transformers.data.metrics.squad_metrics -   Writing null_log_odds to: ./drive/output/roberta/distilroberta-base/null_odds_.json





06/14/2020 08:37:37 - INFO - __main__ -   OrderedDict([('exact', 50.07159100480081), ('f1', 50.07159100480081), ('total', 11873), ('HasAns_exact', 0.0), ('HasAns_f1', 0.0), ('HasAns_total', 5928), ('NoAns_exact', 100.0), ('NoAns_f1', 100.0), ('NoAns_total', 5945), ('best_exact', 50.07159100480081), ('best_exact_thresh', 0.0), ('best_f1', 50.07159100480081), ('best_f1_thresh', 0.0)])


{}

# Publish Your Model To Huggingface

In [0]:
!transformers-cli login

In [0]:
!transformers-cli upload ./drive/output/roberta/distilroberta-base

2020-06-13 15:50:22.603803: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
About to upload file [1m/content/distilbert-squad-256seq-8batch-test/vocab.txt[0m to S3 under filename [1mdistilbert-squad-256seq-8batch-test/vocab.txt[0m and namespace [1mmanishiitg[0m
About to upload file [1m/content/distilbert-squad-256seq-8batch-test/predictions_.json[0m to S3 under filename [1mdistilbert-squad-256seq-8batch-test/predictions_.json[0m and namespace [1mmanishiitg[0m
About to upload file [1m/content/distilbert-squad-256seq-8batch-test/null_odds_.json[0m to S3 under filename [1mdistilbert-squad-256seq-8batch-test/null_odds_.json[0m and namespace [1mmanishiitg[0m
About to upload file [1m/content/distilbert-squad-256seq-8batch-test/tokenizer_config.json[0m to S3 under filename [1mdistilbert-squad-256seq-8batch-test/tokenizer_config.json[0m and namespace [1mmanishiitg[0m
About to upload file [1m/content/

# Do Prediction

In [10]:
tokenizer = AutoTokenizer.from_pretrained('output/roberta/distilroberta-base')
model = AutoModelForQuestionAnswering.from_pretrained('output/roberta/distilroberta-base')

model.eval()


06/14/2020 08:38:57 - INFO - transformers.configuration_utils -   loading configuration file output/roberta/distilroberta-base/config.json
06/14/2020 08:38:57 - INFO - transformers.configuration_utils -   Model config RobertaConfig {
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 50265
}

06/14/2020 08:38:57 - INFO - transformers.tokenization_utils -   Model name 'output/roberta/distilroberta-base' not found in model shortcut name list (roberta-base, roberta-large, roberta-large-mnli, distilroberta-base, roberta-base-openai-detector, roberta-large-openai-detecto

RobertaForQuestionAnswering(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768

In [11]:
with torch.no_grad():
  input_ids = torch.tensor(tokenizer.encode("how much work experiance?", "I have been working with excellence technologies for 10 years now and before that i worked for 2years with headstrong ltd"))  # Batch size 1
  outputs = model(input_ids.unsqueeze(0))
  start_logits = outputs[0]
  end_logits = outputs[1]
  
  # start_idx = torch.argmax(start_logits)
  # end_idx = torch.argmax(end_logits) + 1
  # print(start_idx)
  # print(end_idx)

  all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
  answer = ' '.join(all_tokens[torch.argmax(start_logits) : torch.argmax(end_logits)+1])
  print(answer)


<s>


In [0]:
# Crash on purpose to get more ram : doens't work anymore 
# import torch
# torch.tensor([10.]*10000000000)

In [0]:
VERSION = "nightly"  #@param ["1.5" , "20200325", "nightly"]
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --version $VERSION

In [0]:
!pip install tf-nightly

In [0]:
import tensorflow as tf

import os
import tensorflow_datasets as tfds

resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
tf.config.experimental_connect_to_cluster(resolver)
# This is the TPU initialization code that has to be at the beginning.
tf.tpu.experimental.initialize_tpu_system(resolver)
print("All devices: ", tf.config.list_logical_devices('TPU'))

In [0]:
import torch_xla.distributed.xla_multiprocessing as xmp
xmp.spawn(_mp_fn, args=(), nprocs=8, start_method='fork')



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




100%|██████████| 442/442 [00:50<00:00,  8.76it/s]
convert squad examples to features: 100%|██████████| 10000/10000 [01:05<00:00, 152.35it/s]
add example index and unique id: 100%|██████████| 10000/10000 [00:00<00:00, 479809.65it/s]
100%|██████████| 35/35 [00:08<00:00,  4.08it/s]
convert squad examples to features: 100%|██████████| 10000/10000 [01:46<00:00, 93.87it/s]
add example index and unique id: 100%|██████████| 10000/10000 [00:00<00:00, 442432.46it/s]


Exception: ignored