Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License

# Distributed Training For BertAbs Abstractive Summarization on CNN/DM Dataset

## Summary
This notebook demonstrates how to use Azure Machine Learning to run distributed training using Distributed Data Parallel in Pytorch for abstractive summarization using BertAbs in [Text Summarization with Pretrained Encoders](https://arxiv.org/abs/1908.08345). 

## Prerequisites
If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, refer to the [Configuration Notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) first if you haven't already to establish your connection to the AzureML Workspace. Prerequisites are:

- Azure subscription
- Azure Machine Learning Workspace
- Azure Machine Learning SDK

To run rouge evaluation, please refer to the section of compute_rouge_perl in [summarization_evaluation.ipynb](summarization_evaluation.ipynb). 

## Import Libraries

In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [1]:
import os
import sys
from tempfile import TemporaryDirectory
import torch

import azureml.core
from azureml.core import Experiment, Workspace, Run
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.train.dnn import PyTorch
from azureml.train.dnn import Nccl
from azureml.widgets import RunDetails

nlp_path = os.path.abspath("../../")
if nlp_path not in sys.path:
    sys.path.insert(0, nlp_path)
from utils_nlp.azureml.azureml_utils import get_or_create_workspace
from utils_nlp.dataset.cnndm import CNNDMSummarizationDataset
from utils_nlp.models.transformers.datasets import SummarizationNonIterableDataset
# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.0.85


## Configuration 

In [18]:
# for Azure ML Workspacen
SUBSRIPTION_ID = "YOUR_SUBSCRIPTION_ID"
LOCATION = "YOUR_RESOURCE_GROUP_NAME"  # example "eastus2"
RESOURCE_GROUP = "YOUR_WORKSPACE_NAME"  # modifiy to use your own
WORKSPACE_NAME = "YOUR_WORKSPACE_REGION"  # modifiy to use your own

SUBSRIPTION_ID = "9086b59a-02d7-4687-b3fd-e39fa5e0fd9b"
LOCATION = "eastus2"  # example "eastus2"
RESOURCE_GROUP = "daden1aml"  # modifiy to use your own
WORKSPACE_NAME = "daden1amlws"  # modifiy to use your own


# for creating Azure ML Compute Cluster
AMLCOMPUTE_CLUSTER_NAME = "bertabs1"  # modifiy to use your own
NODE_COUNT = 2
#VM_SIZE = "STANDARD_NC12"  # this should be the VM that's supported by Azure and Azure ML
VM_SIZE = "Standard NC12s_v3"

# for creating Azure ML Experiment
EXPERIMENT_NAME = "NLP-BertAbs"  # modifiy to use your own


# local folder to save the processed data
LOCAL_DATA_FOLDER = (
    "./bert_abs_data/"
)  # modify to use your own, the penultimate level folder should exist

# local cache folder to process data
LOCAL_CACHE_FOLDER = (
    "./cache/"
)  # modify to use your own, the penultimate level folder should exist


# Training related parameter
MODEL_NAME = "bert-base-uncased"  # limited choice

# folder in the workspace where the data is uploaded to
TARGET_DATA_FOLDER = "/bertabs_processed_data"  # modify to use your own
TARGET_OUTPUT_DIR = f"output/{EXPERIMENT_NAME}/"
# cache dir in the workspace
TARGET_CACHE_DIR = f"cache/{EXPERIMENT_NAME}/"
# file name for saving the prediction
SUMMARY_FILENAME = "generated_summaries.txt"
# file name for saving the trained model
MODEL_FILENAME = "dist_bertabs.pt"


# local path to download the output from the cluster
LOCAL_OUTPUT_DIR = "./output"  # modifiy to use your own, the penultimate level folder


# local folder to store all the related files to be copied to the workspace
PROJECT_FOLDER = "./azureml_exp"
# conda environment name, the yaml file will be copied to the workspace
CONDA_ENV_NAME = "nlp_gpu"

## Create an AML Workspace

In [3]:
# Create the workspace using the specified parameters
ws = get_or_create_workspace(
    workspace_name=WORKSPACE_NAME,
    subscription_id=SUBSRIPTION_ID,
    resource_group=RESOURCE_GROUP,
    workspace_region=LOCATION,
)

In [4]:
print(
    "Workspace name: " + ws.name,
    "Azure region: " + ws.location,
    "Subscription id: " + ws.subscription_id,
    "Resource group: " + ws.resource_group,
    sep="\n",
)

Workspace name: daden1amlws
Azure region: eastus2
Subscription id: 9086b59a-02d7-4687-b3fd-e39fa5e0fd9b
Resource group: daden1aml


## Create an AML GPU Compute Cluster

In [5]:
try:
    gpu_compute_target = ComputeTarget(workspace=ws, name=AMLCOMPUTE_CLUSTER_NAME)
    print("Found existing compute target.")
except ComputeTargetException:
    print("Creating a new compute target...")
    compute_config = AmlCompute.provisioning_configuration(
        vm_size=VM_SIZE, max_nodes=NODE_COUNT, 
        idle_seconds_before_scaledown="PT1200S"
    )

    # create the cluster
    gpu_compute_target = ComputeTarget.create(
        ws, AMLCOMPUTE_CLUSTER_NAME, compute_config
    )

    gpu_compute_target.wait_for_completion(show_output=True)

# use get_status() to get a detailed status for the current AmlCompute.
print(gpu_compute_target.get_status().serialize())

Found existing compute target.
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-02-21T17:58:05.717000+00:00', 'errors': None, 'creationTime': '2020-02-21T17:58:02.904515+00:00', 'modifiedTime': '2020-02-21T17:58:18.620746+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 5, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC12'}


## Create an Experiment

In [6]:
experiment = Experiment(ws, name=EXPERIMENT_NAME)

## Preprocess Dataset

In [18]:
!mkdir -p {LOCAL_DATA_FOLDER}
!mkdir -p {LOCAL_CACHE_FOLDER}

In [21]:
TRAIN_FILENAME =  "train_abssum_dataset_full.pt"
TEST_FILENAME = "test_abssum_dataset_full.pt"
train_data_path = os.path.join(LOCAL_DATA_FOLDER, TRAIN_FILENAME)
test_data_path = os.path.join(LOCAL_DATA_FOLDER, TEST_FILENAME)

train_dataset, test_dataset = CNNDMSummarizationDataset(
            top_n=-1, local_cache_path=LOCAL_CACHE_FOLDER, prepare_extractive=False
        )
source = [x[0] for x in list(test_dataset.get_source())]
target = [x[0] for x in list(test_dataset.get_target())]
test_sum_dataset = SummarizationNonIterableDataset(source, target)

source = [x[0] for x in list(train_dataset.get_source())]
target = [x[0] for x in list(train_dataset.get_target())]
train_sum_dataset = SummarizationNonIterableDataset(source, target)


torch.save(train_sum_dataset, train_data_path)
torch.save(test_sum_dataset, test_data_path)


KeyboardInterrupt: 

## Upload the Downloaded Dataset to AML Workspace

In [10]:
ds = ws.get_default_datastore()

In [25]:
ds.upload(src_dir=LOCAL_DATA_FOLDER, target_path=TARGET_DATA_FOLDER)

Uploading an estimated of 2 files
Uploading ./bert_abs_data/train_abssum_dataset_full.pt
Target already exists. Skipping upload for bertabs_processed_data/test_abssum_dataset_full.pt
Uploaded ./bert_abs_data/train_abssum_dataset_full.pt, 1 files out of an estimated total of 2
Uploaded 1 files


$AZUREML_DATAREFERENCE_fb1ab46646bf409496082009759b990b

In [28]:
TARGET_DATA_FOLDER

'/bertabs_processed_data'

## Prepare for the Experiment Run
Prepare the local project folder which is mirror to the workspace for the experiment

In [7]:
ENTRY_SCRIPT = "bertabs_cnndm_distributed_train.py"
!mkdir -p {PROJECT_FOLDER}
#!python ../../tools/generate_conda_file.py --gpu --name {CONDA_ENV_NAME}
#!cp ./nlp_gpu.yaml {PROJECT_FOLDER}
!cp {ENTRY_SCRIPT} {PROJECT_FOLDER}
!cp -r ../../utils_nlp {PROJECT_FOLDER}

Generated conda file: nlp_gpu.yaml

To create the conda environment:
$ conda env create -f nlp_gpu.yaml

To update the conda environment:
$ conda env update -f nlp_gpu.yaml

To register the conda environment in Jupyter:
$ conda activate nlp_gpu
$ python -m ipykernel install --user --name nlp_gpu --display-name "Python (nlp_gpu)"



## Submit Run

In [8]:
os.makedirs(LOCAL_OUTPUT_DIR, exist_ok=True)
os.makedirs(os.path.join(LOCAL_OUTPUT_DIR, EXPERIMENT_NAME), exist_ok=True)

In [16]:
NcclConfig=Nccl()
estimator = PyTorch(source_directory=PROJECT_FOLDER,
                    compute_target=gpu_compute_target,
                    script_params={
                        "--dist_url": "$AZ_BATCHAI_PYTORCH_INIT_METHOD",
                        "--rank": "$AZ_BATCHAI_TASK_INDEX",
                        "--node_count": NODE_COUNT,
                        "--data_dir":ds.path(f'{TARGET_DATA_FOLDER}').as_mount(),
                        "--cache_dir": ds.path(f'{TARGET_CACHE_DIR}').as_mount(),
                        '--output_dir':ds.path(f'{TARGET_OUTPUT_DIR}').as_mount(),
                        "--quick_run": 'true',
                        "--summary_filename": f'{SUMMARY_FILENAME}',
                        "--model_filename": f'{MODEL_FILENAME}',
                    },
                    entry_script= ENTRY_SCRIPT,
                    node_count=NODE_COUNT,
                    distributed_training=NcclConfig,
                    conda_dependencies_file=f'{CONDA_ENV_NAME}.yaml',
                    use_gpu=True)



In [17]:
run = experiment.submit(estimator)

In [15]:
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [None]:
"""
If you stop the notebook and come back, 
you'll need to use the run_id in the output of the previous cell 
to get run details.
"""
# fetched_run = Run(experiment, "NLP-ExtSum_1579816237_ea238f69")
# RunDetails(fetched_run).show()

## Download Generated Summaries 

In [25]:
# need to clear the local output dir as the ds.download won't download if the path exists
!rm -rf {LOCAL_OUTPUT_DIR}/* 

In [26]:
ds.download(target_path=LOCAL_OUTPUT_DIR,
                   prefix=f'{TARGET_OUTPUT_DIR}{SUMMARY_FILENAME}',
                   show_progress=True)

Downloading output/NLP-ExtSum/generated_summaries.txt
Downloaded output/NLP-ExtSum/generated_summaries.txt, 1 files out of an estimated total of 1


1

## Evaluation

In [27]:
from utils_nlp.eval.evaluate_summarization import get_rouge
from utils_nlp.models.transformers.extractive_summarization import ExtSumProcessedData
import pickle
from utils_nlp.models.transformers.extractive_summarization import ExtractiveSummarizer

In [28]:
train_dataset, test_dataset = ExtSumProcessedData().splits(root=LOCAL_DATA_FOLDER)

In [29]:
target = [i['tgt_txt'] for i in test_dataset]

In [30]:
prediction = []
with open(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{SUMMARY_FILENAME}'), "r") as filehandle:
    for cnt, line in enumerate(filehandle):
        prediction.append(line[0:-1]) # remove the ending "\n"

In [36]:
## you can also download the saved model and run prediction if you are running the notebook on a gpu machine
#"""
ds.download(target_path=LOCAL_OUTPUT_DIR,
               prefix=f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}',
               show_progress=True)
summarizer = ExtractiveSummarizer(MODEL_NAME, ENCODER, LOCAL_OUTPUT_DIR)
summarizer.model.load_state_dict(
    torch.load(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}'))
)
prediction = summarizer.predict(test_dataset, num_gpus=torch.cuda.device_count(), batch_size=128)
#"""

Downloading output/NLP-ExtSum/dist_extsum.pt
Downloaded output/NLP-ExtSum/dist_extsum.pt, 1 files out of an estimated total of 1


100%|██████████| 546/546 [00:00<00:00, 306489.56B/s]
100%|██████████| 267967963/267967963 [00:04<00:00, 63548158.24B/s]
Scoring: 100%|██████████| 90/90 [00:41<00:00,  3.68it/s]


In [31]:
test_dataset[0]['src_txt']

['a university of iowa student has died nearly three months after a fall in rome in a suspected robbery attack in rome .',
 'andrew mogni , 20 , from glen ellyn , illinois , had only just arrived for a semester program in italy when the incident happened in january .',
 'he was flown back to chicago via air ambulance on march 20 , but he died on sunday .',
 'andrew mogni , 20 , from glen ellyn , illinois , a university of iowa student has died nearly three months after a fall in rome in a suspected robbery',
 'he was taken to a medical facility in the chicago area , close to his family home in glen ellyn .',
 "he died on sunday at northwestern memorial hospital - medical examiner 's office spokesman frank shuftan says a cause of death wo n't be released until monday at the earliest .",
 'initial police reports indicated the fall was an accident but authorities are investigating the possibility that mogni was robbed .',
 "on sunday , his cousin abby wrote online : ` this morning my cous

In [32]:
prediction[0]

'he was flown back to chicago via air ambulance on march 20 , but he died on sunday .<q>andrew mogni , 20 , from glen ellyn , illinois , had only just arrived for a semester program in italy when the incident happened in january .<q>a university of iowa student has died nearly three months after a fall in rome in a suspected robbery attack in rome .'

In [33]:
target[0]

'andrew mogni , 20 , from glen ellyn , illinois , had only just arrived for a semester program when the incident happened in january<q>he was flown back to chicago via air on march 20 but he died on sunday<q>initial police reports indicated the fall was an accident but authorities are investigating the possibility that mogni was robbed<q>his cousin claims he was attacked and thrown 40ft from a bridge'

In [34]:
RESULT_DIR = TemporaryDirectory().name

In [40]:
rouge_score = get_rouge(prediction, target, RESULT_DIR)

11489
11489


2020-01-31 03:38:44,480 [MainThread  ] [INFO ]  Writing summaries.
2020-01-31 03:38:44,481 [MainThread  ] [INFO ]  Processing summaries. Saving system files to /tmp/tmpz9fasy6o/tmpnhtvedhc/system and model files to /tmp/tmpz9fasy6o/tmpnhtvedhc/model.
2020-01-31 03:38:44,481 [MainThread  ] [INFO ]  Processing files in /tmp/tmpz9fasy6o/rouge-tmp-2020-01-31-03-38-43/candidate/.
2020-01-31 03:38:45,679 [MainThread  ] [INFO ]  Saved processed files to /tmp/tmpz9fasy6o/tmpnhtvedhc/system.
2020-01-31 03:38:45,681 [MainThread  ] [INFO ]  Processing files in /tmp/tmpz9fasy6o/rouge-tmp-2020-01-31-03-38-43/reference/.
2020-01-31 03:38:46,904 [MainThread  ] [INFO ]  Saved processed files to /tmp/tmpz9fasy6o/tmpnhtvedhc/model.
2020-01-31 03:38:46,989 [MainThread  ] [INFO ]  Written ROUGE configuration to /tmp/tmpz9fasy6o/tmpicwmu3se/rouge_conf.xml
2020-01-31 03:38:46,989 [MainThread  ] [INFO ]  Running ROUGE with command /dadendev/pyrouge/tools/ROUGE-1.5.5/ROUGE-1.5.5.pl -e /dadendev/pyrouge/tools/

---------------------------------------------
1 ROUGE-1 Average_R: 0.52322 (95%-conf.int. 0.52036 - 0.52598)
1 ROUGE-1 Average_P: 0.35403 (95%-conf.int. 0.35171 - 0.35628)
1 ROUGE-1 Average_F: 0.40840 (95%-conf.int. 0.40623 - 0.41040)
---------------------------------------------
1 ROUGE-2 Average_R: 0.23066 (95%-conf.int. 0.22789 - 0.23341)
1 ROUGE-2 Average_P: 0.15558 (95%-conf.int. 0.15365 - 0.15744)
1 ROUGE-2 Average_F: 0.17947 (95%-conf.int. 0.17740 - 0.18147)
---------------------------------------------
1 ROUGE-L Average_R: 0.47554 (95%-conf.int. 0.47274 - 0.47834)
1 ROUGE-L Average_P: 0.32224 (95%-conf.int. 0.32001 - 0.32440)
1 ROUGE-L Average_F: 0.37150 (95%-conf.int. 0.36935 - 0.37361)



## Cleanup

In [45]:
import shutil
if os.path.exists(LOCAL_DATA_FOLDER):
    shutil.rmtree(LOCAL_DATA_FOLDER, ignore_errors=True)
if os.path.exists(LOCAL_OUTPUT_DIR):
    shutil.rmtree(LOCAL_OUTPUT_DIR, ignore_errors=True)
if os.path.exists(PROJECT_FOLDER):
    shutil.rmtree(PROJECT_FOLDER, ignore_errors=True)
if os.path.exists(RESULT_DIR):
    shutil.rmtree(RESULT_DIR, ignore_errors=True)