# Download data into WMLA

This notebook shows how to create and submit a simple training job whose only task is to move data from external storage (in this case, the public DeepLIIF dataset archives hosted on Zenodo) into the WMLA storage, where it can be used by future training runs. The goal is to download the data only once, store it locally, and then reuse it across multiple training jobs.

# 0. Imports

In [40]:
import project_lib

import json
import os
from datetime import datetime
import time

from ibm_watson_machine_learning import APIClient

# Credentials for the WML client, read from the runtime environment rather than hardcoded.
# NOTE(review): 'version' and 'instance_id' are fixed literals here - confirm they match the target cluster.
wml_credentials = dict(
    token=os.environ['USER_ACCESS_TOKEN'],
    url=os.environ['RUNTIME_ENV_APSX_URL'],
    version='4.0',
    instance_id='openshift',
)

# 1. Connect to WML

In [41]:
# Everything in this notebook runs as project training jobs in the current project,
# whose id is taken from the runtime environment.
project_id = os.environ['PROJECT_ID']

# Create the WML client and make the current project its default scope.
client = APIClient(wml_credentials)
client.set.default_project(project_id)

'SUCCESS'

# 3. Write script to read data and store it into WMLA

## 3.1 Write the script

In [42]:
%%writefile download.sh
#!/bin/bash
# Download the DeepLIIF training/validation/testing archives from Zenodo into
# $DATA_DIR/DeepLIIF_Datasets, unpack them, and rename the extracted folders
# to train/val/test.

# Abort on the first failing command, on unset variables (e.g. a missing
# DATA_DIR) and on pipeline failures, so a broken download makes the job exit
# non-zero instead of finishing "successfully" with partial data.
set -euo pipefail

cd "$DATA_DIR"
# -p: do not fail when the folder already exists (e.g. on a re-run).
mkdir -p DeepLIIF_Datasets
cd DeepLIIF_Datasets

wget https://zenodo.org/record/4751737/files/DeepLIIF_Training_Set.zip
wget https://zenodo.org/record/4751737/files/DeepLIIF_Validation_Set.zip
wget https://zenodo.org/record/4751737/files/DeepLIIF_Testing_Set.zip

unzip DeepLIIF_Training_Set.zip
unzip DeepLIIF_Validation_Set.zip
unzip DeepLIIF_Testing_Set.zip
# The archives are no longer needed once extracted.
rm -f DeepLIIF_*.zip

# Assumes each archive extracts to a folder named after the archive - TODO confirm.
mv DeepLIIF_Training_Set train
mv DeepLIIF_Validation_Set val
mv DeepLIIF_Testing_Set test

Overwriting download.sh


In [26]:
%%writefile main.py
"""
Utility script to be run in a training job to download files from COS into the WMLA storage.
Your code folder (submitted as .zip file to WMLA) must contain both this script, and a JSON file
cos_credentials.json containing an access key, secret key, bucket name,
and the correct public endpoint to connect to this bucket.
"""
import os

# STEP 0 - IDENTIFY WHERE THE DATA WILL BE STORED IN WML-A
DATA_DIR = os.environ['DATA_DIR']
RESULT_DIR = os.environ["RESULT_DIR"]
print(f"Data will be stored in $DATA_DIR {DATA_DIR}")
print("Current content of this folder is:")
print(os.listdir(DATA_DIR))

# os.system(f'mv download.sh {DATA_DIR}/download.sh')
# os.system(f'cd {DATA_DIR}; bash download.sh; rm -rf download.sh')
    
# STEP 5 - LET WMLA KNOW WE SUCCEEDED BY SAVING IN THE /model FOLDER
os.makedirs(os.path.join(RESULT_DIR, 'model'), exist_ok=True)
with open(os.path.join(RESULT_DIR, 'model', 'done.txt'), 'w') as f:
    f.write('Done.')

Overwriting main.py


## 3.3 Store the script as training code

Now that we have a shell script that downloads the datasets into the WMLA `$DATA_DIR` directory, we can package it as a zip archive and store it as a model definition:

In [43]:
# Package the download script into the archive that is stored as the model definition.
!zip download_job.zip download.sh

updating: download.sh (deflated 63%)


In [50]:
# Metadata for the model definition: display name, platform, version and the
# command that is executed when the training job starts.
def_meta_names = client.model_definitions.ConfigurationMetaNames
meta_props = {
    def_meta_names.NAME: 'Download data for DeepLiif (data load script)',
    def_meta_names.PLATFORM: {'name': 'python', 'versions': ['3.7']},
    def_meta_names.VERSION: '1',
    def_meta_names.COMMAND: 'bash download.sh',
}

# Upload the zipped script and keep the id of the stored model definition for the training job.
model_def_details = client.model_definitions.store('./download_job.zip', meta_props)
model_def_id = client.model_definitions.get_id(model_def_details)
print(model_def_id)

94715e54-aeb0-4067-bdd6-939af2e51cd7


# 4. Submit the job

In [51]:
train_meta_names = client.training.ConfigurationMetaNames

# Input data reference passed to the job (file-system type, path "wmla-data").
training_data_references = [
    {
        "name": "training_input_data",
        "type": "fs",
        "connection": {},
        "location": {"path": "wmla-data"},
    }
]

# Results are written under the current project's training assets.
training_results_reference = {
    "location": {"path": f"/projects/{project_id}/assets/trainings"},
    "type": "fs",
}

# The stored model definition to run, plus the hardware and software it runs on.
model_definition = {
    'id': model_def_id,
    "hardware_spec": {"name": "v100", "nodes": 1},
    "software_spec": {"name": "pytorch-onnx_1.7-py3.7"},
    # "name" is mandatory; it doesn't need to match the name of the model def
    "parameters": {"name": "my model"},
}

meta_props = {
    train_meta_names.NAME: 'Simple training run',
    train_meta_names.DESCRIPTION: '',
    train_meta_names.TRAINING_DATA_REFERENCES: training_data_references,
    train_meta_names.TRAINING_RESULTS_REFERENCE: training_results_reference,
    train_meta_names.MODEL_DEFINITION: model_definition,
}

# Submit without blocking; the job is monitored separately through its id.
training_details = client.training.run(meta_props, asynchronous=True)
training_id = client.training.get_id(training_details)
print(training_id)

13ac0862-0a5b-4819-9a5c-946994e68827


## 4.1 Monitor job

In [52]:
# Poll the training job until it leaves the 'pending'/'running' states.
# Each poll fetches the status exactly once and prints it with a timestamp, so the
# final state is always shown (even if the job had already finished before this
# cell ran), and no extra 10-second sleep happens once the job is done.
while True:
    status = client.training.get_status(training_id)
    print(datetime.now().strftime("%H:%M:%S"), status)
    if status['state'] not in ('pending', 'running'):
        break
    time.sleep(10)

02:53:02 {'state': 'pending'}
02:53:13 {'state': 'pending'}
02:53:23 {'state': 'pending'}
02:53:33 {'completed_at': '2021-11-26T02:53:26.861Z', 'failure': {'trace': 'na', 'errors': [{'code': 'unknown_job_execution_error', 'message': 'For input string: "default"', 'more_info': 'http://watson-ml-api.mybluemix.net/'}]}, 'state': 'failed'}


## 4.2 Get logs

In [53]:
import requests

# Base URL for the REST calls below: the same URL the WML client was configured with.
host = wml_credentials['url']
# NOTE(review): `_get_headers` is a private method of the client and may change between library versions.
headers = client._get_headers()  # we use the same headers as the WML client to avoid re-authenticating

In [54]:
# List the asset files stored for this training run in the current project.
# A timeout is set so the cell cannot hang forever if the server does not answer.
# NOTE(review): verify=False disables TLS certificate verification; prefer passing
# the cluster's CA bundle via `verify=` if one is available.
r = requests.get(f"{host}/v2/asset_files/trainings/{training_id}/",
                 params={'project_id': project_id},
                 # alternative: params={'space_id': space_id},
                 headers=headers,
                 verify=False,
                 timeout=60
                )
r.json()

{'resources': [{'path': 'trainings/13ac0862-0a5b-4819-9a5c-946994e68827/training-status.json',
   'etag': 'W/"89f-17d5a297956"',
   'size': 2207,
   'last_modifed': 'Fri, 26 Nov 2021 02:53:27 GMT',
   'last_modified': 'Fri, 26 Nov 2021 02:53:27 GMT',
   'type': 'file',
   'mime_type': 'application/json'}]}

In [25]:
# Display the base URL held in `host` (inspection aid).
host

In [55]:
# Fetch the training-status.json file of this training run and print its raw text.
# A timeout is set so the cell cannot hang forever if the server does not answer.
# NOTE(review): verify=False disables TLS certificate verification; prefer passing
# the cluster's CA bundle via `verify=` if one is available.
r = requests.get(f"{host}/v2/asset_files/trainings/{training_id}/training-status.json",
#                  f'{host}/v2/projects/93b9f3ae-f3a7-45c2-aebc-1f146a8c1c21/assets/trainings/32fc3e2b-d2ef-4e08-b488-251d8fb536ea/logs',   
                 params={'project_id': project_id},
                 # alternative: params={'space_id': space_id},
                 headers=headers,
                 verify=False,
                 timeout=60
                )
print(r.text)

{
  "metadata": {
    "created_at": "2021-11-26T02:53:00.305Z",
    "description": "",
    "guid": "13ac0862-0a5b-4819-9a5c-946994e68827",
    "id": "13ac0862-0a5b-4819-9a5c-946994e68827",
    "modified_at": "2021-11-26T02:53:27.056Z",
    "name": "Simple training run",
    "project_id": "93b9f3ae-f3a7-45c2-aebc-1f146a8c1c21"
  },
  "entity": {
    "description": "",
    "model_definition": {
      "hardware_spec": {
        "name": "v100"
      },
      "id": "94715e54-aeb0-4067-bdd6-939af2e51cd7",
      "parameters": {
        "name": "my model"
      },
      "software_spec": {
        "name": "pytorch-onnx_1.7-py3.7"
      }
    },
    "name": "Simple training run",
    "project_id": "93b9f3ae-f3a7-45c2-aebc-1f146a8c1c21",
    "results_reference": {
      "location": {
        "path": "/projects/93b9f3ae-f3a7-45c2-aebc-1f146a8c1c21/assets/trainings",
        "model": "/projects/93b9f3ae-f3a7-45c2-aebc-1f146a8c1c21/assets/trainings/13ac0862-0a5b-4819-9a5c-946994e68827/data/model",
 