# MNIST on a cluster

1. Load the workflow from the data science team

2. Build the images and push them to the registry

3. Start the agents, then start the workflow


In [1]:
import sys
import numpy as np
import matplotlib.pyplot as plt
import os
sys.path.insert(0,'..')

import scanflow

from scanflow.setup import Setup, Executor, Workflow
from scanflow.special import Tracker, Checker, Improver, Planner
from scanflow.deploy import Deploy


# Application folder of the MNIST demo, located under the parent of the
# current working directory. The result therefore depends on the directory
# the notebook is launched from.
base_path = os.path.dirname(os.getcwd())
app_dir = os.path.join(base_path, "examples/demo_mnist/data-science")

# Kubernetes client configuration file. Resolved from the current user's home
# directory (the conventional ~/.kube/config location) rather than an absolute
# path tied to one specific account, so the notebook is usable by other users.
kube_config_path = os.path.expanduser("~/.kube/config")

# The workflow has a single executor: the batch predictor script, run against
# the sample test images with a pinned model name and version.
executors = [
    Executor(
        name='predictor',
        file='predictor-batch.py',
        parameters={
            'input_path': './mnist_sample/test_images.npy',
            'model_name': 'mnist_cnn',
            'model_version': 13,
        },
        requirements='req_predictor.txt',
    ),
]

# Workflow definition: the executor list plus the four special components
# (tracker, checker, improver, planner), all created with mode='online'.
workflow1 = Workflow(
    name='mnist',
    executors=executors,
    tracker=Tracker(mode='online'),
    checker=Checker(mode='online'),
    improver=Improver(mode='online'),
    planner=Planner(mode='online'),
)

# Setup object binding the application folder to its workflows.
setup = Setup(app_dir, workflows=[workflow1], verbose=False)

# Deployment description for the cluster: Argo backend, target namespace,
# image registry address and the Kubernetes configuration resolved above.
# NOTE(review): the registry address is specific to one cluster; adjust it
# when running elsewhere.
deployer = Deploy(
    setup,
    backend="argo",
    namespace="testagent",
    registry="172.30.0.49:5000",
    k8sconfigdir=kube_config_path,
    verbose=True,
)

# Backend handle used by the following cells to start agents, run the
# workflow and clean up.
argo = deployer.get_backend()
                  

09-Apr-21 11:07:13 -  INFO - [Tracker] Port 8002 is in use by Tracker.
09-Apr-21 11:07:13 -  INFO - [Checker] Port 8004 is set successfully.
09-Apr-21 11:07:13 -  INFO - [Improver] Port 8006 is set successfully.
09-Apr-21 11:07:13 -  INFO - [Planner] Port 8007 is set successfully.
09-Apr-21 11:07:14 -  INFO - [++]Creating namespace "testagent"
09-Apr-21 11:07:14 -  ERROR - create_namespace error
09-Apr-21 11:07:14 -  INFO - [++]Creating Role for 'default service account'
09-Apr-21 11:07:14 -  ERROR - create_rolebinding error


In [2]:
# Optional steps, left disabled. Uncomment when needed.
# NOTE(review): presumably these stop previously started agents and (re)build
# the workflow images, respectively; confirm against the scanflow backend.
# argo.stop_agents()
# argo.build_workflows()

# Start the agents of the workflow. The captured output below shows tracker and
# checker resources (PV, PVC, Deployment, Service) being created for "mnist".
argo.start_agents()

09-Apr-21 11:02:46 -  INFO - [++] Starting workflow agents: [mnist].
09-Apr-21 11:02:46 -  INFO - [+] Starting tracker: [tracker-mnist].
09-Apr-21 11:02:46 -  INFO - [+] Create tracker PV
09-Apr-21 11:02:46 -  INFO - [+] Create tracker PVC
09-Apr-21 11:02:46 -  INFO - [+] Create tracker Deployment
09-Apr-21 11:02:46 -  INFO - [+] Create tracker Service
09-Apr-21 11:02:46 -  INFO - [+] Starting tracker agent: [tracker-agent-mnist].
09-Apr-21 11:02:46 -  INFO - [+] Create tracker agent PV
09-Apr-21 11:02:46 -  INFO - [+] Create tracker agent PVC
09-Apr-21 11:02:46 -  INFO - [+] Create tracker agent Service
09-Apr-21 11:02:47 -  INFO - [+] Create tracker Deployment
09-Apr-21 11:02:47 -  INFO - [+] Starting checker: [checker-mnist].
09-Apr-21 11:02:47 -  INFO - [+] Starting checker agent: [checker-agent-mnist].
09-Apr-21 11:02:47 -  INFO - [+] Create checker agent PV
09-Apr-21 11:02:47 -  INFO - [+] Create checker agent PVC
09-Apr-21 11:02:47 -  INFO - [+] Create checker agent Service
09-A

In [4]:
# Run the workflow on the cluster. The captured output below shows the "mnist"
# workflow being submitted to Argo in the "testagent" namespace, followed by
# the submitted workflow specification.
argo.run_workflows()

09-Apr-21 11:14:28 -  INFO - [++] Running workflow: [mnist].
09-Apr-21 11:14:28 -  INFO - [+] Create mnist PV
09-Apr-21 11:14:28 -  ERROR - create_pv error
09-Apr-21 11:14:28 -  INFO - [+] Create mnist PVC
09-Apr-21 11:14:28 -  ERROR - create_pvc error
09-Apr-21 11:14:28 -  INFO - [+] Running workflow: [mnist:predictor].
09-Apr-21 11:14:28 -  INFO - Found local kubernetes config. Initialized with kube_config.
09-Apr-21 11:14:28 -  INFO - Checking workflow name/generatedName runpy-
09-Apr-21 11:14:28 -  INFO - Submitting workflow to Argo
09-Apr-21 11:14:28 -  INFO - Workflow runpy-qnddw has been submitted in "testagent" namespace!
09-Apr-21 11:14:28 -  INFO - [+] Workflow: [mnist] was run successfully.


OrderedDict([('apiVersion', 'argoproj.io/v1alpha1'), ('kind', 'Workflow'), ('metadata', {'generateName': 'runpy-'}), ('spec', {'entrypoint': 'runpy', 'volumes': [OrderedDict([('name', 'apppath'), ('persistentVolumeClaim', {'claimName': 'mnist'})]), OrderedDict([('name', 'mlflowpath'), ('persistentVolumeClaim', {'claimName': 'tracker-mnist'})])], 'templates': [OrderedDict([('name', 'runpy'), ('dag', {'tasks': [OrderedDict([('name', 'predictor'), ('template', 'predictor')])]})]), OrderedDict([('name', 'predictor'), ('container', OrderedDict([('image', '172.30.0.49:5000/predictor'), ('command', None), ('env', [{'name': 'MLFLOW_TRACKING_URI', 'value': 'http://tracker-mnist:8002'}]), ('volumeMounts', [OrderedDict([('name', 'apppath'), ('mountPath', '/app')]), OrderedDict([('name', 'mlflowpath'), ('mountPath', '/mlflow')])])])), ('volumes', [])])]})])


In [4]:
# Tear down the environment. The captured output below shows the "mnist" agents
# being stopped and the rolebinding and the "testagent" namespace being deleted.
# NOTE(review): the timestamps in that output predate those of the other cells,
# so it was recorded from an earlier run of this cell.
argo.clean_environment()

09-Apr-21 10:59:38 -  INFO - [++] Stopping workflow agents: [mnist].
09-Apr-21 10:59:38 -  INFO - [++] Stopping checker: [checker-mnist].
09-Apr-21 10:59:38 -  ERROR - delete_deployment error
09-Apr-21 10:59:38 -  ERROR - delete_service error
09-Apr-21 10:59:38 -  ERROR - delete_pvc error
09-Apr-21 10:59:38 -  ERROR - delete_pv error
09-Apr-21 10:59:38 -  INFO - [++] Stopping checker agent: [checker-agent-mnist].
09-Apr-21 10:59:38 -  INFO - [++] Stopping improver agent: [improver-agent-mnist].
09-Apr-21 10:59:38 -  INFO - [++] Stopping planner agent: [planner-agent-mnist].
09-Apr-21 10:59:38 -  INFO - [++] Stopping tracker agent: [tracker-agent-mnist].
09-Apr-21 10:59:38 -  INFO - [++] Stopping tracker: [tracker-mnist].
09-Apr-21 10:59:38 -  INFO - [+] Workflow Agents: [mnist] agents were deleted successfully.
09-Apr-21 10:59:38 -  INFO - [++]Delete rolebinding default-admin
09-Apr-21 10:59:38 -  INFO - [++]Delete namespace "testagent"
