# Distributed Question Answering on the SQuAD Dataset using BERT

## Single VM, Multi-GPU

### DistributedDataParallel

Run the following command from a shell at the repository root, setting `--nproc_per_node` to the number of GPUs to use on the VM:
```
/data/anaconda/envs/nlp_gpu/bin/python -m torch.distributed.launch \
    --nproc_per_node=1 \
    --nnodes=1 \
    ./examples/question_answering/distributed_question_answering_squad_transformers_DDP.py
```

### Horovod

## Multi-node AMLCompute

In [1]:
import os
import sys

# Put the directory two levels up (the one holding `utils_nlp`) at the front of
# the import path, unless it is already listed there.
nlp_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(nlp_path) == 0:
    sys.path.insert(0, nlp_path)
    
from utils_nlp.azureml import azureml_utils
from utils_nlp.dataset.squad import load_pandas_df

from azureml.core import Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.train.dnn import PyTorch
from azureml.widgets import RunDetails

# Print the version of the installed Azure ML core SDK.
import azureml.core

print(f"SDK version: {azureml.core.VERSION}")

SDK version: 1.0.69


In [2]:
# --- Compute cluster ---------------------------------------------------------
AMLCOMPUTE_CLUSTER_NAME = "qanc24rsv3-4"  # cluster looked up (or created) below
VM_SIZE = "STANDARD_NC24rs_v3"            # VM size used if the cluster is created
NODE_COUNT = 4                            # max_nodes at creation; node_count of the Horovod run

# --- Workspace, data and experiment ------------------------------------------
AZUREML_CONFIG_PATH = "./.azureml"  # passed as config_path when getting the workspace
DATA_FOLDER = "./squad"             # local SQuAD cache and datastore upload target
PROJECT_FOLDER = "./azureml_exp"    # source_directory handed to the estimators
EXPERIMENT_NAME = "NLP-QA"

# Make sure the project folder exists before anything is staged into it.
os.makedirs(PROJECT_FOLDER, exist_ok=True)

Prepare the Azure Machine Learning workspace

In [13]:
# !cp distributed_question_answering_squad_transformers.py './azureml_exp'
!cp -r ../../utils_nlp './azureml_exp'

In [None]:
# The estimators below are created with conda_dependencies_file="nlp_gpu.yaml"
# and source_directory=PROJECT_FOLDER. Uncomment the line below to stage that
# environment file — TODO confirm ../../tools/nlp_gpu.yaml exists in your checkout.
# !cp ../../tools/nlp_gpu.yaml './azureml_exp'

In [3]:
# Obtain the workspace through the project helper, passing the local config path
# and the Azure details. Replace every "<...>" placeholder with your own values.
ws = azureml_utils.get_or_create_workspace(
    config_path=AZUREML_CONFIG_PATH,
    subscription_id="<subscription id>",
    resource_group="<resource group>",
    workspace_name="<workspace name>",
    workspace_region="<workspace region>",
)

# f-strings print the same text as before but do not raise a TypeError
# (`str + None`) should one of the workspace attributes be unset.
print(
    f"Workspace name: {ws.name}",
    f"Azure region: {ws.location}",
    f"Subscription id: {ws.subscription_id}",
    f"Resource group: {ws.resource_group}",
    sep="\n",
)

Workspace name: hlu_cc_ws
Azure region: canadacentral
Subscription id: 9086b59a-02d7-4687-b3fd-e39fa5e0fd9b
Resource group: hlu_cc_rg


Download the SQuAD data to the local machine and upload it to the workspace's default datastore

In [4]:
# Load the SQuAD v1.1 train and dev splits through the project helper, with
# DATA_FOLDER as the local cache path shared by both calls.
squad_kwargs = dict(local_cache_path=DATA_FOLDER, squad_version="v1.1")
train_df = load_pandas_df(file_split="train", **squad_kwargs)
dev_df = load_pandas_df(file_split="dev", **squad_kwargs)

In [4]:
# Default datastore of the workspace; used below as the upload target and as the
# source of the `squad/` mount passed to the training scripts.
ds = ws.get_default_datastore()

In [17]:
# Upload the local SQuAD folder to the datastore under the same relative path.
# NOTE(review): the target here is DATA_FOLDER ('./squad') while the estimators
# mount ds.path('squad/') — confirm both resolve to the same datastore folder.
ds.upload(src_dir=DATA_FOLDER, target_path=DATA_FOLDER)

Create the AMLCompute cluster

In [5]:
# Reuse the compute target named AMLCOMPUTE_CLUSTER_NAME when the lookup
# succeeds; on ComputeTargetException create it with the configured VM size and
# node limit, then wait for the creation to complete.
try:
    gpu_compute_target = ComputeTarget(workspace=ws, name=AMLCOMPUTE_CLUSTER_NAME)
except ComputeTargetException:
    print('Creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size=VM_SIZE,
        max_nodes=NODE_COUNT,
    )
    gpu_compute_target = ComputeTarget.create(
        ws, AMLCOMPUTE_CLUSTER_NAME, provisioning_config
    )
    gpu_compute_target.wait_for_completion(show_output=True)
else:
    print('Found existing compute target.')

# Show the detailed, serialized status of the AmlCompute target.
cluster_status = gpu_compute_target.get_status()
print(cluster_status.serialize())

Found existing compute target.
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-11-01T01:47:31.284000+00:00', 'errors': None, 'creationTime': '2019-10-30T19:13:05.937472+00:00', 'modifiedTime': '2019-10-30T19:13:22.101092+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC24RS_V3'}


### DistributedDataParallel

In [11]:
from azureml.train.dnn import Nccl

In [21]:
# Node count of the DistributedDataParallel run. Defined once so that the value
# handed to the entry script ("--node_count") and the value handed to the
# estimator (node_count) cannot drift apart.
DDP_NODE_COUNT = 2

NcclConfig = Nccl()
ENTRY_SCRIPT = 'distributed_question_answering_squad_transformers_DDP.py'

# PyTorch estimator for the DDP entry script. The script is looked up in
# PROJECT_FOLDER and receives the mounted `squad/` datastore path as its cache
# directory; "--quick_run" is switched on for this configuration.
estimator = PyTorch(
    source_directory=PROJECT_FOLDER,
    compute_target=gpu_compute_target,
    script_params={
        # Left as literal "$..." strings, exactly as in the original cell.
        "--dist_url": "$AZ_BATCH_MASTER_NODE",
        "--rank": "$AZ_BATCHAI_TASK_INDEX",
        "--node_count": DDP_NODE_COUNT,
        "--cache_dir": ds.path('squad/').as_mount(),
        "--horovod_dist": False,
        "--model_name": "bert-large-cased-whole-word-masking",
        "--do_lower_case": False,
        "--quick_run": True,
    },
    entry_script=ENTRY_SCRIPT,
    node_count=DDP_NODE_COUNT,
    distributed_training=NcclConfig,
    conda_dependencies_file="nlp_gpu.yaml",
    use_gpu=True,
)



In [45]:
# Experiment in the workspace under which the DistributedDataParallel run is submitted.
experiment = Experiment(workspace=ws, name=EXPERIMENT_NAME)

In [22]:
# Submit the estimator to the experiment and display the run-details widget.
run = experiment.submit(estimator)
details_widget = RunDetails(run)
details_widget.show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [14]:
# Cancel the run submitted in the previous cell.
# NOTE(review): this sits directly after the submission, so a top-to-bottom
# "Run All" would cancel the job right away — presumably meant to be run by hand.
run.cancel()

### Horovod

In [23]:
from azureml.core.runconfig import MpiConfiguration

In [24]:
# MPI configuration for the Horovod run: four processes are started per node.
mpiConfig = MpiConfiguration()
mpiConfig.process_count_per_node = 4
ENTRY_SCRIPT = 'distributed_question_answering_squad_transformers.py'

# Command-line arguments forwarded to the entry script.
horovod_script_params = {
    "--cache_dir": ds.path('squad/').as_mount(),
    "--horovod_dist": True,
    "--model_name": "bert-large-cased-whole-word-masking",
    "--do_lower_case": False,
    "--gradient_accumulation_steps": 8,
}

# PyTorch estimator for the Horovod entry script, run on NODE_COUNT nodes inside
# the custom base-gpu Docker image given below.
estimator = PyTorch(
    source_directory=PROJECT_FOLDER,
    compute_target=gpu_compute_target,
    script_params=horovod_script_params,
    custom_docker_image='mcr.microsoft.com/azureml/base-gpu:intelmpi2018.3-cuda9.0-cudnn7-ubuntu16.04',
    entry_script=ENTRY_SCRIPT,
    node_count=NODE_COUNT,
    distributed_training=mpiConfig,
    conda_dependencies_file="nlp_gpu.yaml",
    framework_version='1.1',
    use_gpu=True,
)




In [8]:
# Experiment in the workspace under which the Horovod run is submitted.
experiment = Experiment(workspace=ws, name=EXPERIMENT_NAME)

In [25]:
# Submit the Horovod estimator to the experiment and display the run-details widget.
run = experiment.submit(estimator)
details_widget = RunDetails(run)
details_widget.show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [10]:
# Cancel the Horovod run submitted in the previous cell.
# NOTE(review): this sits directly after the submission, so a top-to-bottom
# "Run All" would cancel the job right away — presumably meant to be run by hand.
run.cancel()