### Launch Jobs for training

In [1]:
import io
import os
from contextlib import redirect_stdout

import ads
from ads import set_auth
from ads.jobs import DataScienceJob
from ads.jobs import Job
from ads.jobs import ScriptRuntime

In [2]:
# Print the installed ADS version so the recorded run can be tied to an SDK release.
print(ads.__version__)

2.8.5


In [11]:
# Read the compartment and project OCIDs from the environment.
# Both lookups raise KeyError if the variable is not set.
compartment_id = os.environ['NB_SESSION_COMPARTMENT_OCID']
project_id = os.environ['PROJECT_OCID']

# Authenticate ADS with the resource principal (no API keys or config file in the notebook).
set_auth(auth='resource_principal')

In [12]:
#
# Configuration: every value a reader may need to change lives in this cell.
#

# OCIDs of the Log Group and Log handed to the job infrastructure for logging.
LOG_GROUP_ID = "ocid1.loggroup.oc1.eu-frankfurt-1.amaaaaaangencdyazs4l4rzrzsarlej6mqlwlbz6bmnx4adwdlssveam2jaa"
LOG_ID = "ocid1.log.oc1.eu-frankfurt-1.amaaaaaangencdya47httqmxyiew5tkxa6l7gekev2ljpasixuhmp2fa3v5q"

# Namespace part of the oci://<bucket>@<namespace>/... URIs built below.
NAMESPACE = "frqap2zhtzbe"
# Bucket holding the custom conda environment.
CONDA_BUCKET = "procco-envs"
# Bucket holding the code to execute.
SOURCE_BUCKET = "procco-sw"

# Location of the custom conda environment used by the job runtime.
CUSTOM_ENV_URI = f"oci://{CONDA_BUCKET}@{NAMESPACE}/conda_environments/gpu/prc_pytorch_gpu_/1.0/prc_pytorch_gpu_v1_0"
# Location of the code archive used as the job source.
SOURCE_URI = f"oci://{SOURCE_BUCKET}@{NAMESPACE}/test1.tar.gz"

# Script to execute first; it is passed to the job through the
# JOB_RUN_ENTRYPOINT environment variable.
# Presumably this file sits inside the archive at SOURCE_URI - TODO confirm.
RUN_ENTRYPOINT = "train.sh"

# Compute shape for the job. The commented lines are alternatives kept for reference.
# NOTE(review): the recorded outputs in this notebook show shapeName VM.GPU.A10.2,
# which does not match the value below - the outputs look stale; re-run to refresh them.
# SHAPE_NAME = "VM.Standard2.4"
# SHAPE_NAME = "VM.GPU2.1"
SHAPE_NAME = "VM.GPU.A10.1"
# Block storage size, in GB.
STORAGE_SIZE = 2000

# Display name given to the job.
JOBS_NAME = "job_mnist_01"

In [13]:
# 1. Specify the infrastructure requested: VM shape, block storage and logging.
# The network configuration is not set here; it is taken from the notebook session.
#
# The compartment and project OCIDs read from the environment earlier are passed
# explicitly, so those variables are actually used and the job's placement is
# visible in the notebook instead of relying on implicit defaults.
#
# You need to provide the OCIDs of the Log Group and of the Log.
infrastructure = (
    DataScienceJob()
    .with_compartment_id(compartment_id)
    .with_project_id(project_id)
    .with_shape_name(SHAPE_NAME)
    .with_block_storage_size(STORAGE_SIZE)  # size in GB
    .with_log_group_id(LOG_GROUP_ID)
    .with_log_id(LOG_ID)
)

In [14]:
# 2. Describe the runtime: the code to run, the conda environment and the env variables.
# Each builder call's result is re-bound, which is equivalent to one chained expression.
runtime = ScriptRuntime()
# code archive to execute
runtime = runtime.with_source(SOURCE_URI)
# custom conda environment used to run it
runtime = runtime.with_custom_conda(CUSTOM_ENV_URI)
# the entrypoint script is passed through the JOB_RUN_ENTRYPOINT environment variable
runtime = runtime.with_environment_variable(JOB_RUN_ENTRYPOINT=RUN_ENTRYPOINT)

In [15]:
# 3. Assemble the job definition from its name, the infrastructure and the runtime.
job = Job(name=JOBS_NAME)
job = job.with_infrastructure(infrastructure)
job = job.with_runtime(runtime)

In [16]:
# Create the job from the definition assembled above.
# As the last expression of the cell, the returned job spec (including its OCID)
# is displayed as the cell output.
job.create()


kind: job
spec:
  id: ocid1.datasciencejob.oc1.eu-frankfurt-1.amaaaaaangencdya46azr77xid6lcj6kwdkwbxsb4uvfeyf2j6zwdovrxzpq
  infrastructure:
    kind: infrastructure
    spec:
      blockStorageSize: 2000
      compartmentId: ocid1.compartment.oc1..aaaaaaaag2cpni5qj6li5ny6ehuahhepbpveopobooayqfeudqygdtfe6h3a
      displayName: job_mnist_01
      jobInfrastructureType: ME_STANDALONE
      jobType: DEFAULT
      logGroupId: ocid1.loggroup.oc1.eu-frankfurt-1.amaaaaaangencdyazs4l4rzrzsarlej6mqlwlbz6bmnx4adwdlssveam2jaa
      logId: ocid1.log.oc1.eu-frankfurt-1.amaaaaaangencdya47httqmxyiew5tkxa6l7gekev2ljpasixuhmp2fa3v5q
      projectId: ocid1.datascienceproject.oc1.eu-frankfurt-1.amaaaaaangencdyarxbilkubzgqjom3vpr4ejxpp6xtw3blfdvuyhd6sggta
      shapeName: VM.GPU.A10.2
    type: dataScienceJob
  name: job_mnist_01
  runtime:
    kind: runtime
    spec:
      conda:
        type: published
        uri: oci://procco-envs@frqap2zhtzbe/conda_environments/gpu/prc_pytorch_gpu_/1.0/prc_pytorch_g

In [17]:
# Start a run of the job and keep the returned run object for monitoring.
job_run = job.run()

In [18]:
# Follow the job run: prints the job / job run OCIDs, then status changes and log lines.
job_run.watch()

Job OCID: ocid1.datasciencejob.oc1.eu-frankfurt-1.amaaaaaangencdya46azr77xid6lcj6kwdkwbxsb4uvfeyf2j6zwdovrxzpq
Job Run OCID: ocid1.datasciencejobrun.oc1.eu-frankfurt-1.amaaaaaangencdyax3zmt6of3yh36ev3qxbxo3vl4slessb6jzvxl67dpkca
2023-06-07 13:45:50 - Job Run ACCEPTED
2023-06-07 13:46:03 - Job Run ACCEPTED, Infrastructure provisioning.
2023-06-07 13:47:43 - Job Run ACCEPTED, Infrastructure provisioned.
2023-06-07 13:48:02 - Job Run ACCEPTED, Job run bootstrap starting.
2023-06-07 13:51:08 - Job Run ACCEPTED, Job run bootstrap complete. Artifact execution starting.
2023-06-07 13:51:23 - Job Run IN_PROGRESS, Job run artifact execution in progress.
2023-06-07 13:51:04 - Fontconfig error: Cannot load default config file: No such file: (null)
2023-06-07 13:51:07 - GPU available: True (cuda), used: True
2023-06-07 13:51:07 - HPU available: False, using: 0 HPUs
2023-06-07 13:51:07 - IPU available: False, using: 0 IPUs
2023-06-07 13:51:07 - TPU available: False, using: 0 TPU cores
100% 9912422/

kind: jobRun
spec:
  id: ocid1.datasciencejobrun.oc1.eu-frankfurt-1.amaaaaaangencdyax3zmt6of3yh36ev3qxbxo3vl4slessb6jzvxl67dpkca
  infrastructure:
    kind: infrastructure
    spec:
      blockStorageSize: 2000
      compartmentId: ocid1.compartment.oc1..aaaaaaaag2cpni5qj6li5ny6ehuahhepbpveopobooayqfeudqygdtfe6h3a
      displayName: job_mnist_01-run-2023-06-07-13:45.48
      jobInfrastructureType: ME_STANDALONE
      jobType: DEFAULT
      logGroupId: ocid1.loggroup.oc1.eu-frankfurt-1.amaaaaaangencdyazs4l4rzrzsarlej6mqlwlbz6bmnx4adwdlssveam2jaa
      logId: ocid1.log.oc1.eu-frankfurt-1.amaaaaaangencdya47httqmxyiew5tkxa6l7gekev2ljpasixuhmp2fa3v5q
      projectId: ocid1.datascienceproject.oc1.eu-frankfurt-1.amaaaaaangencdyarxbilkubzgqjom3vpr4ejxpp6xtw3blfdvuyhd6sggta
      shapeName: VM.GPU.A10.2
    type: dataScienceJob
  name: job_mnist_01-run-2023-06-07-13:45.48
  runtime:
    kind: runtime
    spec:
      conda:
        region: eu-frankfurt-1
        type: published
        uri: oci:

In [None]:
# Stream the job run logs once more and save them to a local text file.
#
# The previous version used `%%capture --no-stderr cap` and read `cap.stdout`
# inside the same cell. The magic only binds `cap` after the cell has finished,
# so on a fresh kernel that read raises NameError, and on a re-run it writes the
# output of the previous execution. Redirecting stdout into an in-memory buffer
# makes the captured text available straight away.
log_buffer = io.StringIO()
with redirect_stdout(log_buffer):  # only stdout is redirected; stderr still shows in the cell
    job_run.watch()

# Explicit encoding so the file does not depend on the platform default.
with open('output1000.txt', 'w', encoding='utf-8') as f:
    f.write(log_buffer.getvalue())