In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Show the result of every top-level expression in a cell rather than only the
# last one (the default value of this setting is 'last_expr').
InteractiveShell.ast_node_interactivity = 'all'

# Load the autoreload extension and switch it to mode 2 so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import azure.batch
azure.batch.__version__

In [None]:
import os

from azure.batch import BatchServiceClient
from azure.batch.models import *
from azure.common.credentials import ServicePrincipalCredentials

# Set up an instance of the batch processing API

We create one Azure Batch Pool for each instance of the batch processing API.

The limit for the number of Pools in our Batch account is 100.

## Step 1: Create an Azure Batch Pool

In [None]:
# MODIFY THIS CELL

# POOL_ID is the ID of the Batch pool created below; it should start with the
# name of the API instance this pool will be used for.

POOL_ID = 'zooniverse_0'
# Fail early, before any request is sent, if the ID is longer than 64 characters.
assert len(POOL_ID) <= 64, 'pool_id has more than 64 characters'

# URL of the Batch account to create the pool in;
# choose the account in East US or South Central US
BATCH_ACCOUNT_URL = 'https://cameratraps.eastus.batch.azure.com'  

In [None]:
# secrets read from environment variables; a variable that is not set raises KeyError here
REGISTRY_PASSWORD = os.environ['REGISTRY_PASSWORD']  # password for the container registry login
STORAGE_ACCOUNT_KEY = os.environ['STORAGE_ACCOUNT_KEY']  # key used to mount the blob containers on the nodes

# authenticate with Batch account using the service principal "camera-trap-async-api" in our AAD
APP_CLIENT_ID = os.environ['APP_CLIENT_ID']
APP_CLIENT_SECRET = os.environ['APP_CLIENT_SECRET']
APP_TENANT_ID = os.environ['APP_TENANT_ID']


# other configuration info

# Docker image
# Login server of the container registry; the registry username is the first
# label of the login server name.
REGISTRY_SERVER = 'cameratracrsppftkje.azurecr.io'
REGISTRY_USERNAME = REGISTRY_SERVER.split('.')[0]

# Full image reference in the form "login server/repository:tag". It is derived
# from REGISTRY_SERVER so the image can never point at a different registry
# than the one the pool is given credentials for.
CONTAINER_IMAGE_NAME = f'{REGISTRY_SERVER}/tensorflow:1.14.0-gpu-py3'

# storage
STORAGE_ACCOUNT_NAME = 'cameratrap'  # in the engineering subscription

# names of two containers supporting the API instances in the above storage account
STORAGE_CONTAINER_MODELS = 'models'
STORAGE_CONTAINER_API = 'batch-api'

# Azure Batch node pool VM type
POOL_VM_SIZE = 'Standard_NC6s_v3'  # https://docs.microsoft.com/en-us/azure/virtual-machines/ncv3-series

# Auto-scale formula passed to the pool at creation time - it can also be set manually in the Azure portal.
# The last statement of the formula makes sure that nodes aren't removed until their tasks are finished.
# docs: https://docs.microsoft.com/en-us/azure/batch/batch-automatic-scaling

# MODIFY the "cappedPoolSize" inside the formula if it should be other than 16 dedicated nodes
POOL_AUTO_SCALE_FORMULA = """
// In this formula, the pool size is adjusted based on the number of tasks in the queue. 
// Note that both comments and line breaks are acceptable in formula strings.

// Get pending tasks for the past 15 minutes.
$samples = $ActiveTasks.GetSamplePercent(TimeInterval_Minute * 15);

// If we have fewer than 70 percent data points, we use the last sample point, otherwise we use the maximum of last sample point and the history average.
$tasks = $samples < 70 ? max(0, $ActiveTasks.GetSample(1)) : 
max( $ActiveTasks.GetSample(1), avg($ActiveTasks.GetSample(TimeInterval_Minute * 15)));

// If number of pending tasks is not 0, set targetVM to pending tasks, otherwise set to 0, since there is usually long intervals between job submissions.
$targetVMs = $tasks > 0 ? $tasks : 0;

// The pool size is capped at 16, if target VM value is more than that, set it to 16.
cappedPoolSize = 16;
$TargetDedicatedNodes = max(0, min($targetVMs, cappedPoolSize));

// Set node deallocation mode - keep nodes active only until tasks finish
$NodeDeallocationOption = taskcompletion;
"""

In [None]:
def print_batch_exception(batch_exception):
    """
    Print a human-readable report of a Batch exception to stdout.

    The report is framed by two separator lines. The error message is shown
    only when `batch_exception.error.message.value` is available; in that case
    each entry of `batch_exception.error.values` (if any) follows after a blank
    line, formatted as `key:<TAB>value`.
    """
    rule = '-------------------------------------------'
    print(rule)
    print('Exception encountered:')

    error = batch_exception.error
    message = error.message if error else None
    if message and message.value:
        print(message.value)
        details = error.values
        if details:
            print()
            for detail in details:
                print(f'{detail.key}:\t{detail.value}')

    print(rule)

def create_pool(batch_service_client, pool_id):
    """
    Create an auto-scaling Batch pool that runs the Docker image specified by
    the constants in the cells above.

    Args:
        batch_service_client: Batch client whose `pool.add()` is called to
            submit the new pool.
        pool_id: used as both the ID and the display name of the new pool.

    Exceptions raised by `pool.add()` are not caught here; the caller is
    expected to handle them.
    """
    # blobfuse options shared by both mounts.
    # allow_other needs to be flagged - the task running inside the container
    # needs to access the mounted blob containers
    blobfuse_options = '-o attr_timeout=240 -o entry_timeout=240 -o negative_timeout=120 -o allow_other'

    def blob_mount(container_name):
        """Mount configuration for one blob container of STORAGE_ACCOUNT_NAME."""
        return MountConfiguration(
            azure_blob_file_system_configuration=AzureBlobFileSystemConfiguration(
                account_name=STORAGE_ACCOUNT_NAME,
                container_name=container_name,
                relative_mount_path=container_name,  # use container name as relative path
                account_key=STORAGE_ACCOUNT_KEY,
                blobfuse_options=blobfuse_options
            )
        )

    # we have to use VM images supporting GPU access *and* Docker
    # this VM image will run our custom container
    image_ref = ImageReference(
        publisher='microsoft-azure-batch',
        offer='ubuntu-server-container',
        sku='16-04-lts',
        version='latest'  # URN: microsoft-azure-batch:ubuntu-server-container:16-04-lts:1.1.0
        # The Azure Batch container image only accepts 'latest' version
    )

    # specify a container registry from which to pull the custom container
    # see the `batch_service` folder on instructions for building the container image
    container_registry = ContainerRegistry(
        registry_server=REGISTRY_SERVER,
        user_name=REGISTRY_USERNAME,
        password=REGISTRY_PASSWORD
    )

    container_conf = ContainerConfiguration(
        container_image_names=[CONTAINER_IMAGE_NAME],
        container_registries=[container_registry]
    )

    vm_config = VirtualMachineConfiguration(
        image_reference=image_ref,
        container_configuration=container_conf,
        node_agent_sku_id='batch.node.ubuntu 16.04'
    )

    # mount the `models` and the `batch-api` blob containers
    container_models = blob_mount(STORAGE_CONTAINER_MODELS)
    container_batch_api = blob_mount(STORAGE_CONTAINER_API)

    new_pool = PoolAddParameter(
        # use the ID passed by the caller rather than the POOL_ID global, so the
        # function creates the pool it was asked to create
        id=pool_id,
        display_name=pool_id,

        vm_size=POOL_VM_SIZE,

        enable_auto_scale=True,
        auto_scale_formula=POOL_AUTO_SCALE_FORMULA,

        virtual_machine_configuration=vm_config,

        # default is 1; each task occupies the entire GPU so we can only run one task at a time on a node
        task_slots_per_node=1,

        mount_configuration=[container_models, container_batch_api],
    )
    batch_service_client.pool.add(new_pool)

In [None]:
# Authenticate as the service principal whose client ID, secret and tenant ID
# were read from the environment above; the credentials are requested for the
# Batch service resource.
credentials = ServicePrincipalCredentials(
    client_id=APP_CLIENT_ID,
    secret=APP_CLIENT_SECRET,
    tenant=APP_TENANT_ID,
    resource='https://batch.core.windows.net/'
)

# If using the Batch quota system, it is also okay to authenticate with shared key credentials
# (https://docs.microsoft.com/en-us/python/api/azure-batch/azure.batch.batch_auth.sharedkeycredentials?view=azure-python)
# instead of the service principal.

# Client for the Batch account at BATCH_ACCOUNT_URL; used below to create the pool.
batch_client = BatchServiceClient(credentials=credentials, batch_url=BATCH_ACCOUNT_URL)

In [None]:
%%time
# pool creation should run quickly

try:
    create_pool(batch_client, POOL_ID)
except BatchErrorException as e:
    # print the details carried by the Batch error, then re-raise so the failure is not hidden
    print_batch_exception(e)
    raise

## Step 2: Upload the scoring script

Note that all instances share this scoring script!

In [None]:
# MODIFY THIS CELL

# path to the scoring script; modify if cwd is not `api_core`
path_scoring_script = 'batch_service/score.py'

# SAS with write permission for uploading output JSONs
# NOTE: the string is appended verbatim to the container URL below, so it has to
# include its own leading '?' separator.
sas_query_str = ''  # get a write-enabled SAS for the container below

# URL of the `batch-api` container in the `cameratrap` storage account, with the SAS appended
output_container_url = f'https://cameratrap.blob.core.windows.net/batch-api{sas_query_str}'

In [None]:
# upload the scoring script to the container above; Batch Tasks will retrieve the script from there

# ContainerClient is not imported anywhere else in this notebook, so it is
# imported here to keep this cell runnable on a fresh kernel.
from azure.storage.blob import ContainerClient

output_container_client = ContainerClient.from_container_url(output_container_url)

# the blob is always stored as `scripts/score.py`; an existing blob of that name is overwritten
with open(path_scoring_script, 'rb') as f:
    script_blob_client = output_container_client.upload_blob(name='scripts/score.py', data=f, overwrite=True)

## Useful CLI commands for using Docker images with Batch

List all Batch supported images with their "capabilities" (e.g. "DockerCompatible", "NvidiaTeslaDriverInstalled"):
```
az batch pool supported-images list
```
with the pool information provided in additional parameters.

List all available versions of a publisher's image SKUs:
```
az vm image list --all --publisher microsoft-dsvm
```

You may need to accept the terms of an image before you can use it. First run:
```
az vm image list --all --publisher <publisher>
```
to find the URN for the image you want to use, followed by:

```
az vm image terms accept --urn <corresponding-urn>
```