# Distributed Question Answering on the SQuAD Dataset

## Prerequisites
* Check out the nlp-recipes repo.   
`git clone git@github.com:microsoft/nlp-recipes.git`
* Switch to the branch with the distributed training utilities and notebook.   
`git checkout hlu/distributed_training_utils_and_notebook`
* Follow the instructions [here](https://github.com/microsoft/nlp-recipes/blob/master/SETUP.md) to set up the python environment.
* Optionally, to run Horovod within a local Docker container, follow the instructions [here](https://github.com/horovod/horovod/blob/master/docs/docker.rst) to prepare the Docker environment.

## Single VM, Multi-GPU

### DistributedDataParallel

**Key code changes vs. DataParallel**
```python
def main():
    """Spawn one training worker process per GPU found on this node.

    Excerpt only: ``args`` (the parsed command-line options), ``mp`` and
    ``AnswerExtractor`` come from the full script and are not defined here.
    """
    # The number of visible CUDA devices decides how many workers are spawned.
    ngpus_per_node = torch.cuda.device_count()

    MODEL_NAME = args.model_name
    # Built once in the parent process and handed to every worker through the
    # ``args`` tuple of mp.spawn below.
    qa_extractor = AnswerExtractor(model_name=MODEL_NAME, cache_dir=args.cache_dir)

    # main_worker takes the process index as its first parameter (local_rank),
    # followed by the three values in ``args``.
    # NOTE(review): ``mp`` is presumably torch.multiprocessing -- confirm in the full script.
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, qa_extractor, args))


def main_worker(local_rank, ngpus_per_node, qa_extractor, args):
    """Worker entry point: join the process group and wrap the model in DDP.

    Args:
        local_rank: Index of this worker on its node; also used as the
            device id given to DistributedDataParallel.
        ngpus_per_node: Number of workers (GPUs) per node.
        qa_extractor: Extractor built in the parent process; its use is in
            the elided part of this excerpt.
        args: Parsed options; ``rank``, ``node_count`` and ``dist_url`` are
            read here.
    """
    # Global rank: args.rank is scaled by the per-node worker count, i.e. it
    # acts as the node index, and local_rank is the offset within the node.
    rank = args.rank * ngpus_per_node + local_rank
    # Total number of workers across all nodes.
    world_size = args.node_count * ngpus_per_node

    torch.distributed.init_process_group(
        backend="nccl",
        init_method=args.dist_url,
        world_size=world_size,
        rank=rank
    )

    # Elided: the rest of the setup (model and dataset creation, etc.).
    ...

    # The sampler is given this worker's global rank and the world size so it
    # can partition train_dataset across workers.
    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank)

    # Each process wraps the model and drives exactly one device: its local_rank.
    model = torch.nn.parallel.DistributedDataParallel(
        model,
        device_ids=[local_rank],
        find_unused_parameters=True,
    )
```

**Execute the following command from shell**
```
python ./examples/question_answering/distributed_question_answering_squad_transformers_DDP.py --quick_run True
```

### Horovod

**Key code changes vs. DataParallel**
```python
import horovod.torch as hvd

# Initialize Horovod before querying ranks or sizes.
hvd.init()

# Rank / size information comes from Horovod rather than from script arguments.
rank = hvd.rank()
local_rank = hvd.local_rank()
world_size = hvd.size()

# Elided: the rest of the setup (model and dataset creation, etc.).
...

# The sampler is given this worker's global rank and the world size so it can
# partition train_dataset across workers.
train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank)

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)

# Wrap the local optimizer in Horovod's distributed optimizer.
# backward_passes_per_step is set to the gradient accumulation step count.
optimizer = hvd.DistributedOptimizer(
    optimizer,
    named_parameters=model.named_parameters(),
    backward_passes_per_step=gradient_accumulation_steps,
)

# Broadcast model and optimizer state from rank 0 to the other workers.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)

```

**Execute the following command from shell within the Docker container**
```
# Start 4 Horovod worker processes; the script is run through the Python
# interpreter, matching the DDP command above, so it does not need to be
# marked executable.
mpirun -np 4 \
    python ./examples/question_answering/distributed_question_answering_squad_transformers_HVD.py \
    --quick_run True
```

## Multi-node, multi-GPU AMLCompute

### Import local modules

In [3]:
import os
import sys

# Resolve the directory two levels above the working directory and put it at
# the front of sys.path (only once), so the utils_nlp imports below resolve
# against the local checkout.
nlp_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(nlp_path) == 0:
    sys.path.insert(0, nlp_path)
    
from utils_nlp.azureml import azureml_utils
from utils_nlp.dataset.squad import load_pandas_df

from azureml.core import Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.train.dnn import PyTorch
from azureml.widgets import RunDetails

# Check core SDK version number
import azureml.core

print("SDK version:", azureml.core.VERSION)

SDK version: 1.0.69


### Configurations

In [4]:
# AmlCompute cluster to look up or create: its name, the node count (used both
# as the cluster's max_nodes and as the node_count of each run) and the VM size.
AMLCOMPUTE_CLUSTER_NAME = "qanc24rsv3-2"
NODE_COUNT = 2
VM_SIZE = 'STANDARD_NC24rs_v3'

# Local paths and names:
#   AZUREML_CONFIG_PATH - config path passed to get_or_create_workspace
#   DATA_FOLDER         - local SQuAD cache, also the upload target path on the datastore
#   PROJECT_FOLDER      - folder submitted as the estimator's source_directory
#   EXPERIMENT_NAME     - name of the Azure ML experiment the runs are submitted to
AZUREML_CONFIG_PATH = "./.azureml"
DATA_FOLDER = './squad'
PROJECT_FOLDER = './azureml_exp'
EXPERIMENT_NAME = 'NLP-QA'

# Create the project folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(PROJECT_FOLDER, exist_ok=True)

### Prepare the Azure Machine Learning workspace

**Note**: Horovod is not a general requirement of the NLP recipes repo. Please manually add `- horovod>=0.16.1` at the end of the `nlp_gpu.yaml` file.

In [13]:
# Stage everything the remote run needs into the project folder: both entry
# scripts, the utils_nlp package and the conda environment file.
# {PROJECT_FOLDER} is expanded by IPython from the notebook variable, so the
# destination always matches the source_directory given to the estimators.
!cp distributed_question_answering_squad_transformers_HVD.py '{PROJECT_FOLDER}'
!cp distributed_question_answering_squad_transformers_DDP.py '{PROJECT_FOLDER}'
!cp -r ../../utils_nlp '{PROJECT_FOLDER}'
!cp ../../tools/nlp_gpu.yaml '{PROJECT_FOLDER}'

In [5]:
# Obtain the workspace handle through the repo helper. Replace every <...>
# placeholder below with the values for your own Azure subscription.
ws = azureml_utils.get_or_create_workspace(
    config_path=AZUREML_CONFIG_PATH,
    subscription_id="<subscription id>",
    resource_group="<resource group>",
    workspace_name="<workspace name>",
    workspace_region="<workspace region>",
)

# Print the resolved workspace details, one per line, as a quick sanity check.
print('Workspace name: ' + ws.name,
      'Azure region: ' + ws.location,
      'Subscription id: ' + ws.subscription_id,
      'Resource group: ' + ws.resource_group, sep='\n')

### Download data to local and upload to workspace datastore

In [26]:
# Load the SQuAD v1.1 train and dev splits, using DATA_FOLDER as the local cache path.
train_df = load_pandas_df(local_cache_path=DATA_FOLDER, squad_version="v1.1", file_split="train")
dev_df = load_pandas_df(local_cache_path=DATA_FOLDER, squad_version="v1.1", file_split="dev")

In [6]:
# Default datastore of the workspace; used for the data upload and mounted by the training runs.
ds = ws.get_default_datastore()

In [17]:
# Upload the local data folder to the datastore, using the same path as the target.
ds.upload(src_dir=DATA_FOLDER, target_path=DATA_FOLDER)

### Create the AMLCompute cluster

In [7]:
# Look the cluster up by name first; only provision a new one when the lookup fails.
try:
    gpu_compute_target = ComputeTarget(workspace=ws, name=AMLCOMPUTE_CLUSTER_NAME)
    print('Found existing compute target.')
except ComputeTargetException:
    # The lookup raised, which this cell treats as "cluster does not exist yet".
    print('Creating a new compute target...')
    # Cluster of VM_SIZE machines that can grow to at most NODE_COUNT nodes.
    compute_config = AmlCompute.provisioning_configuration(vm_size=VM_SIZE,
                                                           max_nodes=NODE_COUNT)

    # Create the cluster.
    gpu_compute_target = ComputeTarget.create(ws, AMLCOMPUTE_CLUSTER_NAME, compute_config)

    # Block until provisioning finishes, printing progress along the way.
    gpu_compute_target.wait_for_completion(show_output=True)

# Use get_status() to get a detailed status for the current AmlCompute.
print(gpu_compute_target.get_status().serialize())

Found existing compute target.
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-10-30T16:09:55.440000+00:00', 'errors': None, 'creationTime': '2019-10-28T20:33:25.147673+00:00', 'modifiedTime': '2019-10-28T20:33:41.337052+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 2, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC24RS_V3'}


### DistributedDataParallel on Multi-node AML Compute

In [8]:
from azureml.train.dnn import Nccl

In [23]:
# Estimator for the DistributedDataParallel run, using the Nccl distributed-training configuration.
NcclConfig=Nccl()
ENTRY_SCRIPT = 'distributed_question_answering_squad_transformers_DDP.py'
estimator = PyTorch(source_directory=PROJECT_FOLDER,
                    compute_target=gpu_compute_target,
                    script_params={
                        # The "$AZ_..." values are passed as literal strings.
                        # NOTE(review): presumably resolved from environment variables
                        # on the compute nodes -- confirm.
                        "--dist_url": "$AZ_BATCHAI_PYTORCH_INIT_METHOD",
                        # Disabled alternative for --dist_url: "$AZ_BATCH_MASTER_NODE"
                        "--rank": "$AZ_BATCHAI_TASK_INDEX",
                        "--node_count": NODE_COUNT,
                        # The uploaded SQuAD folder on the datastore, mounted for the run.
                        "--cache_dir": ds.path('squad/').as_mount(),
                        "--model_name": "bert-large-cased-whole-word-masking",
                        "--do_lower_case": False,
                        "--quick_run": True
                    },
                    entry_script= ENTRY_SCRIPT,
                    node_count=NODE_COUNT,
                    distributed_training=NcclConfig,
                    # Environment file copied into PROJECT_FOLDER earlier in the notebook.
                    conda_dependencies_file="nlp_gpu.yaml",
                    use_gpu=True)

In [14]:
# Experiment in the workspace under which the run is submitted.
experiment = Experiment(ws, name=EXPERIMENT_NAME)

In [15]:
# Submit the estimator as a run and display the run-details widget in the notebook.
run = experiment.submit(estimator)
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

### Horovod on Multi-node AML Compute

In [17]:
from azureml.core.runconfig import MpiConfiguration

In [28]:
# Estimator for the Horovod run, launched through an MPI configuration.
mpiConfig=MpiConfiguration()
# NOTE(review): 4 processes per node is hard-coded -- presumably one per GPU on
# the chosen VM_SIZE; confirm and adjust if the VM size changes.
mpiConfig.process_count_per_node=4
ENTRY_SCRIPT = 'distributed_question_answering_squad_transformers_HVD.py'

estimator = PyTorch(source_directory=PROJECT_FOLDER,
                    compute_target=gpu_compute_target,
                    script_params = {
                        # The uploaded SQuAD folder on the datastore, mounted for the run.
                        "--cache_dir": ds.path('squad/').as_mount(),
                        "--model_name": "bert-large-cased-whole-word-masking",
                        "--do_lower_case": False,
                        "--gradient_accumulation_steps":8,
                        "--quick_run": True

                    },
                    # Base image that bundles Intel MPI and CUDA, per its tag.
                    custom_docker_image='mcr.microsoft.com/azureml/base-gpu:intelmpi2018.3-cuda9.0-cudnn7-ubuntu16.04',
                    entry_script=ENTRY_SCRIPT,
                    node_count=NODE_COUNT,
                    distributed_training=mpiConfig,
                    # Environment file copied into PROJECT_FOLDER earlier in the notebook;
                    # it needs the horovod entry mentioned in the note above.
                    conda_dependencies_file="nlp_gpu.yaml",
                    framework_version='1.1',
                    use_gpu=True)


In [19]:
# Experiment in the workspace under which the Horovod run is submitted.
experiment = Experiment(ws, name=EXPERIMENT_NAME)

In [21]:
# Submit the Horovod estimator as a run and display the run-details widget in the notebook.
run = experiment.submit(estimator)
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…