In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one
# (the default is 'last_expr').
InteractiveShell.ast_node_interactivity = 'all'

# Load the autoreload extension in mode 2, so that imported modules are reloaded
# automatically before code is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import azure.batch
# Show the installed azure-batch SDK version (the recorded output below is '10.0.0').
azure.batch.__version__

'10.0.0'

In [3]:
import os

from azure.batch import BatchServiceClient
from azure.batch.batch_auth import SharedKeyCredentials
from azure.batch.models import PoolAddParameter, BatchErrorException, VirtualMachineConfiguration, ImageReference, JobAddParameter,\
TaskAddParameter, PoolInformation
from azure.common.credentials import ServicePrincipalCredentials

# Azure Batch

Documentation
- https://github.com/azurebigcompute/Recipes/tree/master/Azure%20Batch/CustomImages
- https://github.com/Azure-Samples/batch-python-quickstart/blob/master/src/python_quickstart_client.py#L343
- https://github.com/Azure-Samples/azure-batch-samples/tree/master/Python/Batch

TODO

- Turn `enable_auto_scale` on and set the appropriate `auto_scale_formula`. This way we can cap the maximum available nodes. https://docs.microsoft.com/en-us/azure/batch/batch-automatic-scaling

## Create a pool for each instance of the API

To list all image versions available from a publisher (here `microsoft-dsvm`):
```
az vm image list --all --publisher microsoft-dsvm
```

In [4]:
# Connection settings for the Batch account and the service principal.
# The values here are blank placeholders: fill them in locally and do not commit
# real values (in particular APP_CLIENT_SECRET) together with the notebook.
os.environ.update({
    'BATCH_ACCOUNT_NAME': '',
    'BATCH_ACCOUNT_URL': 'https://',
    'APP_CLIENT_ID': '',
    'APP_CLIENT_SECRET': '',
    'APP_TENANT_ID': '',
})

In [5]:
# VM SKU used for every node in the pool.
# https://docs.microsoft.com/en-us/azure/virtual-machines/ncv3-series
POOL_VM_SIZE = 'Standard_NC6s_v3'

# Number of nodes requested for the pool.
POOL_NODE_COUNT = 1

# Identifier of the pool; checked against the 64-character limit up front.
POOL_ID = 'internal_1'
assert not len(POOL_ID) > 64, 'pool_id has more than 64 characters'

In [6]:
def print_batch_exception(batch_exception):
    """
    Print a human-readable summary of a Batch exception to stdout.

    The summary is framed by separator lines. When the exception carries an error
    with a non-empty message value, that message is printed, followed (if the error
    has additional values) by a blank line and one `key:<TAB>value` line per entry.

    Args:
        batch_exception: object exposing `.error`, whose `.message.value` and
            `.values` attributes are read (a BatchErrorException in this notebook).
    """
    separator = '-------------------------------------------'
    print(separator)
    print('Exception encountered:')

    error = batch_exception.error
    message = error.message if error else None
    if message and message.value:
        print(message.value)
        details = error.values
        if details:
            # Blank line between the headline message and the key/value details.
            print()
            for detail in details:
                print(f'{detail.key}:\t{detail.value}')

    print(separator)
    
def create_pool(batch_service_client, pool_id):
    """
    Add a new pool of dedicated nodes to the Batch account.

    The pool uses the module-level POOL_VM_SIZE and POOL_NODE_COUNT settings and the
    `microsoft-azure-batch` / `ubuntu-server-container` (16-04-lts) image.

    Args:
        batch_service_client: client whose `pool.add` method is called with the new
            pool definition (a BatchServiceClient in this notebook).
        pool_id: id of the pool to create; it is also used as the display name.

    Any error raised by `pool.add` propagates to the caller.

    Alternative image: the DSVM image was found among the supported images using
    `az batch pool supported-images list`:
    {
        "batchSupportEndOfLife": null,
        "capabilities": [
          "NvidiaTeslaDriverInstalled"
        ],
        "imageReference": {
          "offer": "ubuntu-1804",
          "publisher": "microsoft-dsvm",
          "sku": "1804",
          "version": "latest",
          "virtualMachineImageId": null
        },
        "nodeAgentSkuId": "batch.node.ubuntu 18.04",
        "osType": "linux",
        "verificationType": "unverified"
      }
    To switch to it, use publisher="microsoft-dsvm", offer="ubuntu-1804", sku="1804",
    version="latest" together with node_agent_sku_id="batch.node.ubuntu 18.04".
    """
    new_pool = PoolAddParameter(
        # Use the id passed by the caller rather than the module-level POOL_ID, so
        # the function creates the pool it was asked to create.
        id=pool_id,
        display_name=pool_id,

        vm_size=POOL_VM_SIZE,

        virtual_machine_configuration=VirtualMachineConfiguration(
            image_reference=ImageReference(
                publisher="microsoft-azure-batch",
                offer="ubuntu-server-container",
                sku="16-04-lts",
                version="latest"
            ),
            node_agent_sku_id="batch.node.ubuntu 16.04"),

        target_dedicated_nodes=POOL_NODE_COUNT,  # we only use dedicated nodes
    )
    batch_service_client.pool.add(new_pool)

def create_job():
    """Placeholder for job creation; not implemented yet and returns None."""
    return None

def create_task():
    """
    Placeholder for task creation; not implemented yet and returns None.

    Design note: every Task should be idempotent, because a Task may need to be
    retried due to a recovery operation.
    """
    return None

In [7]:
# Read the Batch endpoint and the service principal settings from the environment;
# a missing variable raises KeyError here.
account_url = os.environ['BATCH_ACCOUNT_URL']
app_client_id, app_client_secret, app_tenant_id = (
    os.environ[name]
    for name in ('APP_CLIENT_ID', 'APP_CLIENT_SECRET', 'APP_TENANT_ID')
)

# Authenticate as the service principal against the Batch resource.
# Alternative: if the account uses the Batch quota system, it is also okay to
# authenticate with SharedKeyCredentials instead of the service principal, see
# https://docs.microsoft.com/en-us/python/api/azure-batch/azure.batch.batch_auth.sharedkeycredentials?view=azure-python
credentials = ServicePrincipalCredentials(
    resource="https://batch.core.windows.net/",
    tenant=app_tenant_id,
    secret=app_client_secret,
    client_id=app_client_id,
)

batch_client = BatchServiceClient(batch_url=account_url, credentials=credentials)

In [8]:
# Create the pool; on a Batch service error, print its details and then re-raise
# so the failure is still visible in the notebook.
try:
    create_pool(batch_client, POOL_ID)
except BatchErrorException as batch_error:
    print_batch_exception(batch_error)
    raise

## Submitting a job

A Job is what we have been referring to as a Request, and each shard of a Request corresponds to a Task.

The Azure Batch service sets these environment variables on the compute nodes:

- AZ_BATCH_JOB_ID

- AZ_BATCH_TASK_ID
- AZ_BATCH_TASK_DIR
- AZ_BATCH_TASK_WORKING_DIR - currently running task has read/write access to this directory

```
python -c "import tensorflow as tf; print(tf.__version__); print(f'is gpu available {tf.test.is_gpu_available()}')"
```

In [44]:
# Id of the Batch job to create; it is reused when adding tasks to the job.
job_id = 'test2'

In [45]:
# The job id plays the role of the request id in the old API context.
# The job is bound to the pool identified by POOL_ID.
job = JobAddParameter(pool_info=PoolInformation(pool_id=POOL_ID), id=job_id)

batch_client.job.add(job)

## Submit tasks to the job (the shards)

After creating a user and logging into the node, the `py37_tensorflow` conda environment is available. Its TF version is 2.3.1 (boto is 2.2.0), and the command to check for a GPU is `tf.config.list_physical_devices('GPU')`.

```
>>> tf.config.list_physical_devices('GPU')
2021-01-12 05:54:31.745248: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: 
pciBusID: 8bbe:00:00.0 name: Tesla V100-PCIE-16GB computeCapability: 7.0
coreClock: 1.38GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s
2021-01-12 05:54:31.745301: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2021-01-12 05:54:31.745452: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcublas.so.10'; dlerror: libcublas.so.10: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/intel/compilers_and_libraries_2018.1.163/linux/tbb/lib/intel64_lin/gcc4.7:/opt/intel/compilers_and_libraries_2018.1.163/linux/compiler/lib/intel64_lin:/opt/intel/compilers_and_libraries_2018.1.163/linux/mkl/lib/intel64_lin::/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64/
2021-01-12 05:54:31.745479: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10
2021-01-12 05:54:31.745498: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10
2021-01-12 05:54:31.745515: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10
2021-01-12 05:54:31.745531: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10
2021-01-12 05:54:31.745549: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7
2021-01-12 05:54:31.745560: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1753] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
```

GPU information:
```
| NVIDIA-SMI 455.32.00    Driver Version: 455.32.00    CUDA Version: 11.1
```

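According to https://www.tensorflow.org/install/source#gpu, CUDA 11 is only supported by TF 2.4+, while this environment has TF 2.3.1 and the node reports CUDA 11.1, which would explain why some GPU libraries above fail to load. So let's look into containers...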

In [47]:
# Command line for the task: run under bash, activate the conda environment and
# print the TensorFlow version plus whether a GPU is available.
# NOTE(review): `conda activate` may need conda's shell hook to be loaded in a
# non-interactive `bash -c` shell - confirm this works on the node.
command = """/bin/bash -c "conda activate py37_tensorflow && python -c 'import tensorflow as tf; print(tf.__version__); print(tf.test.is_gpu_available())'" """

# Single test task; its id is built from the task index (0).
task = TaskAddParameter(command_line=command, id='task_{}'.format(0))

In [48]:
# Submit the task defined above to the job identified by job_id.
batch_client.task.add(job_id, task)

### Run container applications

https://docs.microsoft.com/en-us/azure/batch/batch-docker-container-workloads

## Monitoring a job

Optimization: remember which tasks have already Completed so that we do not repeatedly query for their status.