# Employee attrition sample using ML Pipelines

### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [None]:
import oci
import ads
import os
from os import environ
from ads.catalog.project import ProjectCatalog

#### Make sure you are using ADS version 2.9.0 or above

In [None]:
# Show the installed ADS version so it can be compared against the 2.9.0 minimum this notebook asks for.
print(ads.__version__)

In [None]:
# When running in an OCI Data Science Notebook Session the compartment OCID is
# read from the NB_SESSION_COMPARTMENT_OCID environment variable; otherwise the
# placeholder is used and must be replaced with your compartment OCID when
# working locally.
compartment_id = os.environ.get('NB_SESSION_COMPARTMENT_OCID', '<YOUR_COMPARTMENT_OCID>')
print('compartment OCID = ', compartment_id)

In [None]:
# Works both on a local machine (API key) and in an OCI Notebook Session (Resource Principal).
running_with_resource_principal = "OCI_RESOURCE_PRINCIPAL_VERSION" in os.environ
if running_with_resource_principal:
    # the resource principal variable is present in the environment
    auth_mode, auth_notice = "resource_principal", "using Resource Principal for auth"
else:
    # fall back to api_key with the config file
    auth_mode, auth_notice = "api_key", "using API key for auth"
print(auth_notice)
ads.set_auth(auth=auth_mode)

In [None]:
# Sanity check: make sure there is access to the project and compartment by
# listing the projects of the compartment selected above.
pc = ProjectCatalog(compartment_id=compartment_id)
pc.list_projects()

Fill in your resource details:

In [None]:
# Replace the placeholder with the ID of your project; it is passed to Pipeline.with_project_id() below.
project_id = '<YOUR_PROJECT_ID>'

In [None]:
# Replace the placeholder with the ID of your log group; it is passed to Pipeline.with_log_group_id() below.
log_group_id = "<YOUR_LOG_GROUP_ID>"

In [None]:
from random import randrange

# Name of the pipeline: a fixed prefix followed by a random number drawn from randrange(1000, 9999).
name_suffix = randrange(1000, 9999)
pipeline_name = "pipeline_sample_employee-attrition-{}".format(name_suffix)

In [None]:
from ads.pipeline.ads_pipeline_step import PipelineStep
from ads.pipeline.ads_pipeline import Pipeline
from ads.pipeline import CustomScriptStep
from ads.jobs import ScriptRuntime

# Service conda environment slug used by the runtime of every step.
STEP_CONDA_SLUG = "pypgx2340_p38_cpu_v1"

# A single infrastructure definition, shared by all steps of the pipeline.
infrastructure = CustomScriptStep().with_block_storage_size(50).with_shape_name("VM.Standard2.4")


def _script_step(step_name, description, artifact_stem, max_runtime_minutes):
    """Build one script-based pipeline step.

    Args:
        step_name: Name of the step; the DAG expression refers to steps by this name.
        description: Free-text description attached to the step.
        artifact_stem: Common stem of the step artifact. "<stem>.zip" is used as
            the runtime source and "<stem>.py" is set as the
            PIPELINE_STEP_RUN_ENTRYPOINT environment variable.
        max_runtime_minutes: Value passed to with_maximum_runtime_in_minutes().

    Returns:
        The configured PipelineStep, using the shared ``infrastructure``.
    """
    runtime = (
        ScriptRuntime()
        .with_source(f"{artifact_stem}.zip")
        .with_service_conda(STEP_CONDA_SLUG)
        .with_environment_variable(PIPELINE_STEP_RUN_ENTRYPOINT=f"{artifact_stem}.py")
    )
    return (
        PipelineStep(step_name)
        .with_description(description)
        .with_infrastructure(infrastructure)
        .with_maximum_runtime_in_minutes(max_runtime_minutes)
        .with_runtime(runtime)
    )


step_data_processing = _script_step(
    step_name="data_processing",
    description="Import data, feature engineering, train-test split",
    artifact_stem="employee-attr-dataproc",
    max_runtime_minutes=30,
)

step_train_logistic_regression = _script_step(
    step_name="train_logistic_regression",
    description="Train a Logistic Regression model and save to the model catalog with its AUC score",
    artifact_stem="employee-attr-train-lr",
    max_runtime_minutes=120,
)

step_train_random_forest = _script_step(
    step_name="train_random_forest",
    description="Train a Random Forest model and save to the model catalog with its AUC score",
    artifact_stem="employee-attr-train-rf",
    max_runtime_minutes=120,
)

step_train_xgboost = _script_step(
    step_name="train_xgboost",
    description="Train a model with XGBoost and save to the model catalog with its AUC score",
    artifact_stem="employee-attr-train-xgb",
    max_runtime_minutes=120,
)

step_evaluate_and_deploy = _script_step(
    step_name="evaluate_and_deploy",
    description="Find the best model by their AUC score and deploy",
    artifact_stem="employee-attr-eval-deploy",
    max_runtime_minutes=30,
)

# Steps registered with the pipeline, and the DAG expression that orders them by step name.
pipeline_steps = [
    step_data_processing,
    step_train_logistic_regression,
    step_train_random_forest,
    step_train_xgboost,
    step_evaluate_and_deploy,
]
pipeline_dag = [
    "data_processing >> (train_logistic_regression, train_random_forest, train_xgboost) >> evaluate_and_deploy"
]

# if you define the LogGroupID but not the LogID, logs will be created automatically in the specified LogGroup
pipeline = (
    Pipeline(pipeline_name)
    .with_compartment_id(compartment_id)
    .with_project_id(project_id)
    .with_log_group_id(log_group_id)
    .with_freeform_tags({"pipeline-sample":"employee-attrition-sample"})
    .with_step_details(pipeline_steps)
    .with_dag(pipeline_dag)
)

In [None]:
# Create the pipeline from the definition built above
# (presumably this registers it in OCI; the console link printed later relies on pipeline.id).
pipeline.create()

In [None]:
# Visualize the pipeline (presumably renders the steps and the dependencies declared with with_dag()).
pipeline.show()

## Run the pipeline

In [None]:
# Display name for this pipeline run: a fixed prefix plus a random number drawn from randrange(1000, 9999).
run_suffix = randrange(1000, 9999)
pipeline_run_name = "pipeline-run-" + str(run_suffix)

In [None]:
# Uncomment the next line and set it to the OCI Object Storage bucket used for transferring data between the steps. Make sure permissions are properly set.
#data_location = "<YOUR_OBJECT_STORAGE_BUCKET>"  # use: 'oci://<bucket>@<namespace>/'

In [None]:
# Environment variables passed to the run through the configuration override.
run_environment_variables = {
    "DATA_LOCATION": data_location,     # provide the data location to the run
    "SKIP_MODEL_DEPLOY": "True",        # change to "False" to deploy the best model
}
run_configuration_override = {
    "type": "DEFAULT",
    "environment_variables": run_environment_variables,
}

# Start the run; data_location must have been defined in the cell above.
pipeline_run = pipeline.run(
    display_name=pipeline_run_name,
    configuration_override_details=run_configuration_override,
)

In [None]:
# Watch the pipeline run status visually as it progresses.
# With wait=True the cell keeps watching; interrupt the kernel to stop watching.
pipeline_run.show(wait=True)

## Run the pipeline from the console UI

In [None]:
# Print a console link to the runs page of this pipeline, built from pipeline.id.
print("Ctrl-Click the hyperlink to open the pipeline run page in the OCI console UI")
console_runs_url = f"https://cloud.oracle.com/data-science/pipelines/{pipeline.id}/pipeline-runs"
print(console_runs_url)

#### Don't forget to set the environment variables when running the pipeline: DATA_LOCATION