Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
617e131
create pii operator
mingkang111 Oct 26, 2023
7eaa107
sync basic setup
mingkang111 Oct 30, 2023
3f929f0
updated default schema
mingkang111 Nov 2, 2023
a6bd239
wip
mingkang111 Nov 13, 2023
fdfda60
fixed: change type name used in YAML to list
mingkang111 Nov 13, 2023
5e6d7c8
Merge branch 'feature/forecasting' of https://github.com/oracle/accel…
mingkang111 Nov 13, 2023
2fa52c1
added documentation
mingkang111 Nov 13, 2023
7461c94
wip
mingkang111 Nov 13, 2023
0fb1a4f
added tests
mingkang111 Nov 14, 2023
d9a5af1
added support for return html
mingkang111 Nov 14, 2023
cffa536
implements pii operator
mingkang111 Nov 14, 2023
434e286
fixed file name in readme
mingkang111 Nov 14, 2023
6a304d4
adjust documentation for pii operator
mingkang111 Nov 14, 2023
3fc9da6
fixed file name in readme
mingkang111 Nov 14, 2023
a5e0eb4
fixed file name in readme
mingkang111 Nov 14, 2023
486165b
added optional dependency for pii operator
mingkang111 Nov 14, 2023
cbcf362
Merge branch 'main' of https://github.com/oracle/accelerated-data-sci…
mingkang111 Nov 14, 2023
1a814f5
Merge branch 'feature/forecasting' of https://github.com/oracle/accel…
mingkang111 Nov 14, 2023
4601ddf
updated optional dependency for pii operator
mingkang111 Nov 14, 2023
fb2ba23
Merge branch 'feature/forecasting' of https://github.com/oracle/accel…
mingkang111 Nov 14, 2023
8e14121
resolved conflict
mingkang111 Nov 14, 2023
79d48ca
fixed typo
mingkang111 Nov 14, 2023
1b17d77
added pii into test dependency
mingkang111 Nov 14, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ads/common/decorator/runtime_dependency.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ class OptionalDependency:
SPARK = "oracle-ads[spark]"
HUGGINGFACE = "oracle-ads[huggingface]"
FORECAST = "oracle-ads[forecast]"
PII = "oracle-ads[pii]"


def runtime_dependency(
Expand Down
5 changes: 4 additions & 1 deletion ads/data_labeling/mixin/data_labeling.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

from typing import Dict, List
Expand Down Expand Up @@ -188,6 +188,7 @@ def render_ner(
content_column: str = "Content",
annotations_column: str = "Annotations",
limit: int = ROWS_TO_RENDER_LIMIT,
return_html: bool = False,
) -> None:
"""Renders NER dataset. Displays only first 50 rows.

Expand Down Expand Up @@ -223,6 +224,8 @@ def render_ner(
annotations_column=annotations_column,
)
result_html = text_visualizer.render(items=items, options=options)
if return_html:
return result_html

from IPython.core.display import HTML, Markdown, display

Expand Down
8 changes: 5 additions & 3 deletions ads/opctl/operator/common/operator_yaml_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def _check_condition(
Returns
-------
bool
True if the condition fulfils, false otherwise.
True if the condition fulfills, false otherwise.
"""
for key, value in condition.items():
if key not in example or example[key] != value:
Expand All @@ -103,8 +103,9 @@ def _generate_example(
The result config.
"""
example = {}

for key, value in schema.items():
# only generate values fro required fields
# only generate values for required fields
if (
value.get("required", False)
or value.get("dependencies", False)
Expand All @@ -125,7 +126,8 @@ def _generate_example(
example[key] = 1
elif data_type == "boolean":
example[key] = True
elif data_type == "array":
elif data_type == "list":
# TODO: Handle list of dict
example[key] = ["item1", "item2"]
elif data_type == "dict":
example[key] = self._generate_example(
Expand Down
8 changes: 6 additions & 2 deletions ads/opctl/operator/lowcode/pii/MLoperator
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ conda: pii_v1
gpu: no
keywords:
- PII
backends: []
- Spacy
backends:
- job
description: |
PII operator..."
PII operator that detects and redacts Personally Identifiable Information
(PII) data in datasets by combining pattern matching and machine learning solutions.
Use `ads operator info -t pii` to get more details about the pii operator."
147 changes: 125 additions & 22 deletions ads/opctl/operator/lowcode/pii/README.md
Original file line number Diff line number Diff line change
@@ -1,16 +1,13 @@
# PII Operator

The PII Operator ...

The PII Operator aims to detect and redact Personally Identifiable Information (PII) in datasets. PII data includes information such as names, addresses, and social security numbers, which can be used to identify individuals. This operator combines pattern matching and machine learning solutions to identify PII, and then redacts or anonymizes it to protect the privacy of individuals.

Below are the steps to configure and run the PII Operator on different resources.

## 1. Prerequisites

Follow the [CLI Configuration](https://accelerated-data-science.readthedocs.io/en/latest/user_guide/cli/opctl/configure.html) steps from the ADS documentation. This step is mandatory as it sets up default values for different options while running the PII Operator on OCI Data Science jobs or OCI Data Flow applications. If you have previously done this and used a flexible shape, make sure to adjust `ml_job_config.ini` with shape config details and `docker_registry` information.

- ocpus = 1
- memory_in_gbs = 16
- docker_registry = `<iad.ocir.io/namespace/>`
Follow the [CLI Configuration](https://accelerated-data-science.readthedocs.io/en/latest/user_guide/cli/opctl/configure.html) steps from the ADS documentation. This step is mandatory as it sets up default values for different options while running the PII Operator on OCI Data Science jobs.

## 2. Generating configs

Expand All @@ -22,41 +19,48 @@ ads operator init -t pii --overwrite --output ~/pii/

The most important files expected to be generated are:

- `pii.yaml`: Contains PII-related configuration.
- `backend_operator_local_python_config.yaml`: This includes a local backend configuration for running PII in a local environment. The environment should be set up manually before running the operator.
- `backend_operator_local_container_config.yaml`: This includes a local backend configuration for running PII within a local container. The container should be built before running the operator. Please refer to the instructions below for details on how to accomplish this.
- `pii.yaml`: Contains pii-related configuration.
- `pii_operator_local_python.yaml`: This includes a local backend configuration for running pii operator in a local environment. The environment should be set up manually before running the operator.
- `pii_operator_local_container.yaml`: This includes a local backend configuration for running pii operator within a local container. The container should be built before running the operator. Please refer to the instructions below for details on how to accomplish this.
- `pii_job_container.yaml`: Contains Data Science job-related config to run pii operator in a Data Science job within a container (BYOC) runtime. The container should be built and published before running the operator. Please refer to the instructions below for details on how to accomplish this.
- `pii_job_python.yaml`: Contains Data Science job-related config to run pii operator in a Data Science job within a conda runtime. The conda should be built and published before running the operator.

All generated configurations should be ready to use without the need for any additional adjustments. However, they are provided as starter kit configurations that can be customized as needed.

## 3. Running PII on the local conda environment
## 3. Running pii on the local conda environment

To run PII locally, create and activate a new conda environment (`ads-pii`). Install all the required libraries listed in the `environment.yaml` file.
To run pii operator locally, create and activate a new conda environment (`ads-pii`). Install all the required libraries listed in the `environment.yaml` file.

```yaml
- "git+https://github.com/oracle/accelerated-data-science.git@feature/pii#egg=oracle-ads"
- datapane
- scrubadub
- gender_guesser
- nameparser
- scrubadub_spacy
- "git+https://github.com/oracle/accelerated-data-science.git@feature/ads_pii_operator#egg=oracle-ads"
```

Please review the previously generated `pii.yaml` file using the `init` command, and make any necessary adjustments to the input and output file locations. By default, it assumes that the files should be located in the same folder from which the `init` command was executed.

Use the command below to verify the PII config.
Use the command below to verify the pii config.

```bash
ads operator verify -f ~/pii/pii.yaml
```

Use the following command to run the PII within the `ads-pii` conda environment.
Use the following command to run the pii operator within the `ads-pii` conda environment.

```bash
ads operator run -f ~/pii/pii.yaml -b local
```

The operator will run in your local environment without requiring any additional modifications.

## 4. Running PII on the local container
## 4. Running pii on the local container

To run the PII operator within a local container, follow these steps:
To run the pii operator within a local container, follow these steps:

Use the command below to build the PII container.
Use the command below to build the pii container.

```bash
ads operator build-image -t pii
Expand All @@ -65,14 +69,14 @@ ads operator build-image -t pii
This will create a new `pii:v1` image, with `/etc/operator` as the designated working directory within the container.


Check the `backend_operator_local_container_config.yaml` config file. By default, it should have a `volume` section with the `.oci` configs folder mounted.
Check the `pii_operator_local_container.yaml` config file. By default, it should have a `volume` section with the `.oci` configs folder mounted.

```yaml
volume:
- "/Users/<user>/.oci:/root/.oci"
```

Mounting the OCI configs folder is only required if an OCI Object Storage bucket will be used to store the input PII data or output PII result. The input/output folders can also be mounted to the container.
Mounting the OCI configs folder is only required if an OCI Object Storage bucket will be used to store the input data or output result. The input/output folders can also be mounted to the container.

```yaml
volume:
Expand All @@ -85,7 +89,7 @@ The full config can look like:
```yaml
kind: operator.local
spec:
image: PII:v1
image: pii:v1
volume:
- /Users/<user>/.oci:/root/.oci
- /Users/<user>/pii/data:/etc/operator/data
Expand All @@ -94,8 +98,107 @@ type: container
version: v1
```

Run the PII within a container using the command below:
Run the pii operator within a container using the command below:

```bash
ads operator run -f ~/pii/pii.yaml --backend-config ~/pii/pii_operator_local_container.yaml
```

## 5. Running pii in the Data Science job within container runtime

To execute the pii operator within a Data Science job using container runtime, please follow the steps outlined below:

You can use the following command to build the pii container. This step can be skipped if you have already done this for running the operator within a local container.

```bash
ads operator build-image -t pii
```

This will create a new `pii:v1` image, with `/etc/operator` as the designated working directory within the container.

Publish the `pii:v1` container to the [Oracle Container Registry](https://docs.public.oneportal.content.oci.oraclecloud.com/en-us/iaas/Content/Registry/home.htm). To become familiar with OCI, read the documentation links posted below.

- [Access Container Registry](https://docs.public.oneportal.content.oci.oraclecloud.com/en-us/iaas/Content/Registry/Concepts/registryoverview.htm#access)
- [Create repositories](https://docs.public.oneportal.content.oci.oraclecloud.com/en-us/iaas/Content/Registry/Tasks/registrycreatingarepository.htm#top)
- [Push images](https://docs.public.oneportal.content.oci.oraclecloud.com/en-us/iaas/Content/Registry/Tasks/registrypushingimagesusingthedockercli.htm#Pushing_Images_Using_the_Docker_CLI)

To publish `pii:v1` to OCR, use the command posted below:

```bash
ads operator publish-image pii:v1 --registry <iad.ocir.io/tenancy/>
```

After the container is published to OCR, it can be used within Data Science jobs service. Check the `pii_job_container.yaml` config file. It should contain pre-populated infrastructure and runtime sections. The runtime section should contain an image property, something like `image: iad.ocir.io/<tenancy>/pii:v1`. More details about supported options can be found in the ADS Jobs documentation - [Run a Container](https://accelerated-data-science.readthedocs.io/en/latest/user_guide/jobs/run_container.html).

Adjust the `pii.yaml` config with proper input/output folders. When the operator is run in the Data Science job, it will not have access to local folders. Therefore, input data and output folders should be placed in the Object Storage bucket. Open the `pii.yaml` and adjust the following fields:

```yaml
input_data:
url: oci://bucket@namespace/pii/input_data/data.csv
output_directory:
url: oci://bucket@namespace/pii/result/
```

Run the pii operator on the Data Science jobs using the command posted below:

```bash
ads operator run -f ~/pii/pii.yaml --backend-config ~/pii/pii_job_container.yaml
```

The logs can be monitored using the `ads opctl watch` command.

```bash
ads opctl watch <OCID>
```


## 6. Running pii in the Data Science job within conda runtime

To execute the pii operator within a Data Science job using conda runtime, please follow the steps outlined below:

You can use the following command to build the pii conda environment.

```bash
ads operator build-conda -t pii
```

This will create a new `pii_v1` conda environment and place it in the folder specified within the `ads opctl configure` command.

Use the command below to publish the `pii_v1` conda environment to the Object Storage bucket.

```bash
ads opctl conda publish pii_v1
```
More details about configuring CLI can be found here - [Configuring CLI](https://accelerated-data-science.readthedocs.io/en/latest/user_guide/cli/opctl/configure.html)


After the conda environment is published to Object Storage, it can be used within Data Science jobs service. Check the `pii_job_python.yaml` config file. It should contain pre-populated infrastructure and runtime sections. The runtime section should contain a `conda` section.

```yaml
conda:
type: published
uri: oci://bucket@namespace/conda_environments/cpu/pii/1/pii_v1
```

More details about supported options can be found in the ADS Jobs documentation - [Run a Python Workload](https://accelerated-data-science.readthedocs.io/en/latest/user_guide/jobs/run_python.html).

Adjust the `pii.yaml` config with proper input/output folders. When the pii operator is run in the Data Science job, it will not have access to local folders. Therefore, input data and output folders should be placed in the Object Storage bucket. Open the `pii.yaml` and adjust the following fields:

```yaml
input_data:
url: oci://bucket@namespace/pii/input_data/data.csv
output_directory:
url: oci://bucket@namespace/pii/result/
```

Run the pii operator on the Data Science jobs using the command posted below:

```bash
ads operator run -f ~/pii/pii.yaml --backend-config ~/pii/pii_job_python.yaml
```

The logs can be monitored using the `ads opctl watch` command.

```bash
ads operator run -f ~/pii/pii.yaml --backend-config ~/pii/backend_operator_local_container_config.yaml
ads opctl watch <OCID>
```
13 changes: 7 additions & 6 deletions ads/opctl/operator/lowcode/pii/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,19 @@
from ads.opctl.operator.common.const import ENV_OPERATOR_ARGS
from ads.opctl.operator.common.utils import _parse_input_args

from .operator_config import PIIOperatorConfig
from .model.guardrails import PIIGuardrail
from .operator_config import PiiOperatorConfig


def operate(operator_config: PIIOperatorConfig) -> None:
def operate(operator_config: PiiOperatorConfig) -> None:
"""Runs the PII operator."""

print("The operator is running...")
guard = PIIGuardrail(config=operator_config)
guard.process()


def verify(spec: Dict, **kwargs: Dict) -> bool:
"""Verifies the PII operator config."""
operator = PIIOperatorConfig.from_dict(spec)
operator = PiiOperatorConfig.from_dict(spec)
msg_header = (
f"{'*' * 30} The operator config has been successfully verified {'*' * 30}"
)
Expand Down Expand Up @@ -59,7 +60,7 @@ def main(raw_args: List[str]):
except:
yaml_string = operator_spec_str

operator_config = PIIOperatorConfig.from_yaml(
operator_config = PiiOperatorConfig.from_yaml(
uri=args.file,
yaml_string=yaml_string,
)
Expand Down
10 changes: 6 additions & 4 deletions ads/opctl/operator/lowcode/pii/cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,9 @@

from typing import Dict

import click

from ads.opctl import logger
from ads.opctl.operator.common.utils import _load_yaml_from_uri
from ads.opctl.operator.common.operator_yaml_generator import YamlGenerator
from ads.opctl.operator.common.utils import _load_yaml_from_uri


def init(**kwargs: Dict) -> str:
Expand All @@ -32,6 +30,10 @@ def init(**kwargs: Dict) -> str:
"""
logger.info("==== PII related options ====")

default_detector = [{"name": "<type>.<entity>", "action": "mask"}]

return YamlGenerator(
schema=_load_yaml_from_uri(__file__.replace("cmd.py", "schema.yaml"))
).generate_example_dict(values={"type": kwargs.get("type")})
).generate_example_dict(
values={"type": kwargs.get("type"), "detectors": default_detector}
)
Loading