# Training Process
This notebook walks through an example of running a training job.

1. Creating AI template and instance

In [3]:
import os.path
import shutil

from superai.meta_ai.ai import AITemplate, AI, TrainingOrchestrator
from superai.meta_ai.parameters import Config, HyperParameterSpec, TrainingParameters, ModelParameters
from superai.meta_ai.schema import Schema

In [2]:
# Delete the local `.AISave` directory when it is present.
ai_save_dir = ".AISave"
if os.path.exists(ai_save_dir):
    shutil.rmtree(ai_save_dir)

In [19]:
# Settings for the MNIST training template: empty input/output schemas, a
# default configuration, the model class name, package requirements, and the
# artifact / code paths under resources/runDir.
template_settings = dict(
    input_schema=Schema(),
    output_schema=Schema(),
    configuration=Config(),
    name="MnistTrainingTemplate",
    description="Template of Sample MNIST training",
    model_class="MnistModel",
    requirements=["tensorflow-gpu==2.3.0", "polyaxon"],
    artifacts={"run": "resources/runDir/run_this.sh"},
    code_path=["resources/runDir"],
)
template = AITemplate(**template_settings)

# Build the AI instance on top of the template, taking the input and output
# parameters from the template's own schemas.
instance_settings = dict(
    ai_template=template,
    input_params=template.input_schema.parameters(),
    output_params=template.output_schema.parameters(),
    name="mnist_training",
    version=1,
    description="AI instance of sample MNIST training",
)
ai = AI(**instance_settings)

## Create and push training container
Use the following interface to create and push the training container.

> Note: Later, this interface will also include the implementation that connects to meta-ai to create a training job.

In [4]:
# Give the AI instance a fixed, lower-cased identifier.
ai._id = "DAAD6583-1FD7-4718-8B12-333701894FDB".lower()

# Only the number of epochs is set explicitly for this run.
hyperparams = HyperParameterSpec(epochs=10)
training_params = TrainingParameters(hyperparameters=hyperparams)

# Deploy options: build the image (no skip), enable CUDA, and build all layers.
deploy_options = dict(
    orchestrator=TrainingOrchestrator.AWS_EKS,
    skip_build=False,
    enable_cuda=True,
    build_all_layers=True,
    training_parameters=training_params,
)
ai.training_deploy(**deploy_options)

Output()

# Starting training manually
To start training manually, run `polyaxon port-forward` in a separate terminal to connect to the Polyaxon API.

Then run the following cell
> Note: triggering training will be done from meta-ai in the future.

In [14]:
# Print the Polyaxon component spec found in the current working directory.
!cat polyaxonfile.yaml

version: 1.1
kind: component
tags: [examples, keras]

inputs:
- {name: conv1_size, type: int, value: 32, isOptional: true}
- {name: conv2_size, type: int, value: 64, isOptional: true}
- {name: dropout, type: float, value: 0.8, isOptional: true}
- {name: hidden1_size, type: int, value: 500, isOptional: true}
- {name: optimizer, type: str, value: adam, isOptional: true}
- {name: log_learning_rate, type: int, value: -3, isOptional: true}
- {name: epochs, type: int, value: 10, isOptional: true}

run:
  kind: job
  container:
    image: 185169359328.dkr.ecr.us-east-1.amazonaws.com/models/dev/daad6583-1fd7-4718-8b12-333701894fdb/mnist_training:1
#    workingDir: "{{ globals.artifacts_path }}/polyaxon-examples/in_cluster/keras/mnist"
    command: ["/opt/conda/envs/env/bin/superai", "ai", "method", "train"]
    imagePullPolicy: Always
    args: ["-p", "/home/model-server/",
           "-tp","/tmp",
           "-mp","/tmp",
           "-m","conv1_size={{ conv1_size }}",


In [None]:
!polyaxon run -f resources / polyaxonfile.yaml -u -l

Creating a new run...
[32mA new run `3d56bfb861634b8e92cd11433e23b37a` was created[0m
You can view this run on Polyaxon UI: http://localhost:8000/ui/default/dev/runs/3d56bfb861634b8e92cd11433e23b37a/
[32mArtifacts uploaded[0m
[32mRun is approved[0m
[32mStarting logs for run: <Name: None> - <uuid: 3d56bfb861634b8e92cd11433e23b37a>[0m
[33mrunning[0m
[37m2022-02-18 17:51:29.912739+01:00[0m | Reading configs from /opt/conda/envs/env/lib/python3.7/site-packages/superai/settings.yaml
[37m2022-02-18 17:51:29.912767+01:00[0m | Available envs:
[37m2022-02-18 17:51:29.912770+01:00[0m | - local
[37m2022-02-18 17:51:29.912773+01:00[0m | - dev
[37m2022-02-18 17:51:29.912775+01:00[0m | - sandbox
[37m2022-02-18 17:51:29.912777+01:00[0m | - stg
[37m2022-02-18 17:51:29.912779+01:00[0m | - prod
[37m2022-02-18 17:51:29.912781+01:00[0m | Reading configs from /opt/conda/envs/env/lib/python3.7/site-packages/superai/settings.yaml
[37m2022-02-18 17:51:29.912783+01:00[0m | [02/18/22

[37m2022-02-18 17:51:34.033765+01:00[0m | Epoch 1/10
[37m2022-02-18 17:51:34.033863+01:00[0m | 2022-02-18 16:51:34.033782: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session started.
[37m2022-02-18 17:51:34.033904+01:00[0m | 2022-02-18 16:51:34.033844: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1441] function cupti_interface_->Subscribe( &subscriber_, (CUpti_CallbackFunc)ApiCallback, this)failed with error CUPTI could not be loaded or symbol could not be found.
[37m2022-02-18 17:51:34.167184+01:00[0m |                              hon3.7/site-packages/tensorflow/
[37m2022-02-18 17:51:34.167200+01:00[0m |                              python/ops/summary_ops_v2.py:127
[37m2022-02-18 17:51:34.167204+01:00[0m |                              7: stop (from tensorflow.python.
[37m2022-02-18 17:51:34.167208+01:00[0m |                              eager.profiler) is deprecated
[37m2022-02-18 17:51:34.167211+01:00[0m |                             

# Accessing artifacts stored in S3
We have set the default artifact store in Polyaxon to an S3 bucket: [`s3://canotic-ai/polyaxon-storage/`](https://s3.console.aws.amazon.com/s3/buckets/canotic-ai?prefix=polyaxon-storage/&region=us-east-1).

To access the artifacts, we have added a very simple example which prints the content of a text file stored in a sample folder.

In [3]:
!cat resources / artifacts.yaml

version: 1.1
kind: component
tags: [test]

run:
  init:
    - artifacts:
        dirs: [ "test-storage" ]
  kind: job
  container:
    image: ubuntu:latest
    workingDir: "{{ globals.artifacts_path }}/test-storage"
    command: ["cat", "random_text.txt"]
    imagePullPolicy: IfNotPresent


Run the job

In [4]:
!polyaxon run -f resources / artifacts.yaml -u -l

Creating a new run...
[32mA new run `56e14c5646de46b8a15db42dddae3a0b` was created[0m
You can view this run on Polyaxon UI: http://localhost:8000/ui/default/dev/runs/56e14c5646de46b8a15db42dddae3a0b/
[32mArtifacts uploaded[0m
[32mRun is approved[0m
[32mStarting logs for run: <Name: None> - <uuid: 56e14c5646de46b8a15db42dddae3a0b>[0m
[33mrunning[0m
[37m2022-02-22 13:50:04.610585+01:00[0m | This is some random text for test. Test Text.
[32msucceeded[0m


## Deploying a training instance using SDK functionalities

In [13]:
# Delete the local `.AISave` directory when it is present.
ai_save_dir = ".AISave"
if os.path.exists(ai_save_dir):
    shutil.rmtree(ai_save_dir)

### Adding an app ID to model
The following AI instance can be associated with an app. The `app_id` here is associated with ankit@super.ai, so make sure you use an app under your own user owner ID.

In [14]:
# Settings for the MNIST training template: empty input/output schemas, a
# default configuration, the model class name, package requirements, and the
# artifact / code paths under resources/runDir.
template_settings = dict(
    input_schema=Schema(),
    output_schema=Schema(),
    configuration=Config(),
    name="MnistTrainingTemplate",
    description="Template of Sample MNIST training",
    model_class="MnistModel",
    requirements=["tensorflow-gpu==2.3.0", "polyaxon"],
    artifacts={"run": "resources/runDir/run_this.sh"},
    code_path=["resources/runDir"],
)
template = AITemplate(**template_settings)

# Build the AI instance on top of the template and associate it with an app
# through `app_id`; replace the id with one owned by your own user.
instance_settings = dict(
    ai_template=template,
    input_params=template.input_schema.parameters(),
    output_params=template.output_schema.parameters(),
    name="mnist_training",
    version=1,
    description="AI instance of sample MNIST training",
    app_id="c1bed1ac-6418-40e3-9ce0-15a17d1bc38c",
)
ai = AI(**instance_settings)

In [5]:
# Push the AI instance; the string returned by push() is shown as the cell
# output (presumably the id of the pushed AI — confirm against the SDK docs).
ai.push()

'ca7ef522-e115-4c32-ab91-308d218a111b'

In [6]:
# Hyperparameters handed to the training run.
hyperparameters = HyperParameterSpec(
    trainable=True,
    optimizer="adam",
    log_learning_rate=-3,
    epochs=10,
)

# Model parameters handed to the training run.
model_parameters = ModelParameters(
    conv1_size=32,
    conv2_size=64,
    hidden1_size=500,
    dropout=0.8,
)

# Both the model save path and the training data location point at /tmp.
training_parameters = TrainingParameters(
    model_save_path="/tmp",
    training_data="/tmp",
    hyperparameters=hyperparameters,
    model_parameter=model_parameters,
)

ai.training_deploy(
    orchestrator=TrainingOrchestrator.AWS_EKS,
    training_parameters=training_parameters,
)

Output()

This should start a training job. We will be adding features to monitor the training progress soon.

In [4]:
# Delete the saved `intel_image_classification` folder under `.AISave` when it
# is present.
intel_save_dir = ".AISave/intel_image_classification"
if os.path.exists(intel_save_dir):
    shutil.rmtree(intel_save_dir)

In [5]:
# Template for the Intel image classification example. The descriptions below
# name this example explicitly; they were previously copy-pasted from the MNIST
# cells and mislabelled this template and instance as "MNIST training".
template = AITemplate(
    input_schema=Schema(),
    output_schema=Schema(),
    configuration=Config(),
    name="IntelImageClassificationTemplate",
    description="Template of sample Intel image classification training",
    model_class="IntelImageClassification",
    # Package requirements for the training image; several are pinned to exact versions.
    requirements=[
        "tensorflow-gpu==2.3.0",
        "polyaxon[s3]",
        "scikit-learn",
        "tqdm",
        "opencv-python-headless",
        "aiobotocore==2.2.0",
        "aioitertools==0.10.0",
        "botocore==1.24.21",
    ],
    artifacts={"run": "resources/runDir/run_this.sh"},
    code_path=["resources/runDir"],
)

# AI instance built from the template and associated with an app via `app_id`;
# replace the id with one owned by your own user.
ai = AI(
    ai_template=template,
    input_params=template.input_schema.parameters(),
    output_params=template.output_schema.parameters(),
    name="intel_image_classification",
    version=1,
    description="AI instance of sample Intel image classification training",
    app_id="c1bed1ac-6418-40e3-9ce0-15a17d1bc38c",
)

In [6]:
# Push the AI instance; the string returned by push() is shown as the cell
# output (presumably the id of the pushed AI — confirm against the SDK docs).
ai.push()

'5d03ae31-5b69-4cc1-818b-ec20ae55b8b7'

In [9]:
# Hyperparameters handed to the training run.
hyperparameters = HyperParameterSpec(
    trainable=True,
    batch_size=128,
    validation_split=0.2,
    epochs=10,
)

# Model parameters handed to the training run.
model_parameters = ModelParameters(
    conv1_size=32,
    conv2_size=32,
    dense_size=128,
)

# The model is saved under the SDK's output folder template; the training data
# is referenced by the path "test/intel-image.zip".
training_parameters = TrainingParameters(
    model_save_path=f"{TrainingParameters.output_folder_template}/output-model",
    training_data="test/intel-image.zip",
    hyperparameters=hyperparameters,
    model_parameter=model_parameters,
)

ai.training_deploy(
    orchestrator=TrainingOrchestrator.AWS_EKS,
    training_parameters=training_parameters,
    enable_cuda=True,
)

Output()