### OCI Data Science - ML Pipelines sample notebook with ADS

This sample notebook demonstrates how to use ADS to create a pipeline with 3 steps:
 * the first step sets a parameter for the next step
 * the second step creates a data table with the number of rows defined in the first step. The generated data is random. The data table is then saved to object storage, and the name and location of the data are saved as a parameter for the next step.
 * the third step reads the location of the data and then the data table from the previous step.
 
 Note: the notebook creates the pipeline and runs it, however the code itself for the steps is located in the 3 zip files that should be attached to this notebook.
 
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [None]:
import oci
import ads
import os
from os import environ
from ads.catalog.project import ProjectCatalog

In [None]:
# Print the installed ADS version so it can be checked by eye.
# Make sure you are using ADS version 2.8 or above.
print(ads.__version__)

In [None]:
# Resolve the compartment OCID. An OCI Data Science Notebook Session provides it
# through NB_SESSION_COMPARTMENT_OCID; when working locally, replace the
# placeholder fallback with your own compartment OCID.
compartment_id = os.environ.get('NB_SESSION_COMPARTMENT_OCID', '<YOUR_COMPARTMENT_OCID>')

In [None]:
# Works both on a local machine (API key from the config file) and in an OCI
# Notebook Session (Resource Principal): the presence of
# OCI_RESOURCE_PRINCIPAL_VERSION in the environment selects the auth mode.
auth_mode, auth_notice = (
    ("resource_principal", "using Resource Principal for auth")
    if "OCI_RESOURCE_PRINCIPAL_VERSION" in os.environ
    else ("api_key", "using API key for auth")
)
print(auth_notice)
ads.set_auth(auth=auth_mode)

In [None]:
# Sanity check: list the projects in the compartment to make sure there is
# access to the project and compartment with the configured auth.
pc = ProjectCatalog(compartment_id=compartment_id)
pc.list_projects()

In [None]:
# Replace the placeholder with the OCID of the Data Science project to create the pipeline in.
project_id = "<YOUR_PROJECT_OCID>"

In [None]:
# Replace the placeholder with the OCID of the log group to use for the pipeline's logs.
log_group_id = "<YOUR_LOGGROUP_OCID>"

In [None]:
from random import randrange

# A random numeric suffix makes name clashes between notebook runs unlikely.
name_suffix = randrange(1000, 9999)
pipeline_name = f"ads_pipeline_sample_{name_suffix}"

## Create a new pipeline with ADS

In [None]:
from ads.pipeline.ads_pipeline_step import PipelineStep
from ads.pipeline.ads_pipeline import Pipeline
from ads.pipeline import CustomScriptStep
from ads.jobs import ScriptRuntime

# Infrastructure settings handed to each pipeline step via with_infrastructure().
step_block_storage_size = 50
step_shape_name = "VM.Standard2.4"

infrastructure = CustomScriptStep().with_block_storage_size(
    step_block_storage_size
).with_shape_name(step_shape_name)

def _build_script_step(name, description, archive, entrypoint):
    """Return a PipelineStep built around a ScriptRuntime.

    The runtime takes ``archive`` as its source, uses the
    "onnx110_p37_cpu_v1" service conda environment, and sets the
    PIPELINE_STEP_RUN_ENTRYPOINT environment variable to ``entrypoint``.
    The step uses the module-level ``infrastructure`` object.
    """
    runtime = (
        ScriptRuntime()
        .with_source(archive)
        .with_service_conda("onnx110_p37_cpu_v1")
        .with_environment_variable(PIPELINE_STEP_RUN_ENTRYPOINT=entrypoint)
    )
    return (
        PipelineStep(name)
        .with_description(description)
        .with_infrastructure(infrastructure)
        .with_runtime(runtime)
    )


# The three steps differ only in name, description, zip archive and entrypoint script.
step_one = _build_script_step(
    "step1", "Pass data by value", "mlpipeline_step1.zip", "mlpipeline_step1.py"
)

step_two = _build_script_step(
    "step2", "pass data by reference", "mlpipeline_step2.zip", "mlpipeline_step2.py"
)

step_three = _build_script_step(
    "step3", "read data by reference", "mlpipeline_step3.zip", "mlpipeline_step3.py"
)

In [None]:
# Steps that make up the pipeline.
pipeline_steps = [step_one, step_two, step_three]

# Dependencies between the steps, creating the DAG. Use () to define multiple
# step dependencies, like (step1, step2) >> step3 to run step1 and step2 in
# parallel and step3 when they both finish.
step_dependencies = ["step1 >> step2 >> step3"]

# If you define the LogGroupID but not the LogID, logs will be created
# automatically in the specified LogGroup.
pipeline = (
    Pipeline(pipeline_name)
    .with_compartment_id(compartment_id)
    .with_project_id(project_id)
    .with_log_group_id(log_group_id)
    .with_step_details(pipeline_steps)
    .with_dag(step_dependencies)
)

In [None]:
# View the pipeline graph visually. Make sure the step dependencies are
# defined correctly before creating the pipeline.
pipeline.show()

In [None]:
# Create the pipeline in the OCI Data Science service.
# ADS will take care of uploading all the artifacts.
pipeline.create()

In [None]:
# You can export the pipeline to a YAML file (here "my_pipeline.yaml")
# and later import it back.
pipeline.to_yaml("my_pipeline.yaml")

## Run the pipeline

In [None]:
# A random numeric suffix makes name clashes between pipeline runs unlikely.
run_suffix = randrange(1000, 9999)
pipeline_run_name = f"pipeline-run-{run_suffix}"

In [None]:
# uncomment the next line and set to the OCI Object Storage location to use for passing data. Make sure you have proper permissions.
#data_location = "<YOUR_OBJECT_STORAGE_LOCATION>"  # use: 'oci://<bucket>@<namespace>/'

In [None]:
# Create the pipeline run.
# NOTE: `data_location` must be defined before running this cell (uncomment and
# set it in the Object Storage location cell); otherwise this raises NameError.
# It is supplied to the run as the DATA_LOCATION environment variable override.
pipeline_run = pipeline.run(
    # Use the generated run name; previously it was defined but never passed.
    display_name=pipeline_run_name,
    configuration_override_details={
        "type": "DEFAULT",
        "environment_variables": {"DATA_LOCATION": data_location},
    },
)

In [None]:
# Print the current status of the pipeline run.
print(pipeline_run.status)

In [None]:
# View the pipeline run graph with the status of each step.
pipeline_run.show()

# To keep watching the status updates, use the following command instead:
#pipeline_run.show(wait=True)

## View logs

In [None]:
# Watch the logs of the pipeline run; by default it watches custom logs for all steps.
pipeline_run.watch()
# To watch the custom logs of specific steps only (here "step1" and "step2"):
#pipeline_run.watch("step1","step2")

## View the pipeline run from the OCI console UI

In [None]:
# Print a usage hint, then the OCI console link for this pipeline run.
print("Ctrl-Click the hyperlink to open the pipeline run page in the OCI console UI")
run_console_url = f"https://cloud.oracle.com/data-science/pipeline-runs/{pipeline_run.id}"
print(run_console_url)

## Delete the pipeline

In [None]:
# Delete the pipeline when done.
# The two flags also delete all the pipeline runs and job runs in the pipeline;
# otherwise those have to be deleted before deleting the pipeline.
pipeline.delete(delete_related_pipeline_runs=True, delete_related_job_runs=True)