From 8eebe45aa3f95959719287f261d40da4b3f2d7f9 Mon Sep 17 00:00:00 2001 From: Dmitrii Cherkasov Date: Fri, 27 Oct 2023 10:59:00 -0700 Subject: [PATCH 1/7] PII operator backbone. --- ads/opctl/operator/cmd.py | 5 +- ads/opctl/operator/lowcode/pii/MLoperator | 11 ++ ads/opctl/operator/lowcode/pii/README.md | 101 ++++++++++++++++ ads/opctl/operator/lowcode/pii/__init__.py | 5 + ads/opctl/operator/lowcode/pii/__main__.py | 77 ++++++++++++ ads/opctl/operator/lowcode/pii/cmd.py | 37 ++++++ ads/opctl/operator/lowcode/pii/const.py | 5 + .../operator/lowcode/pii/environment.yaml | 8 ++ ads/opctl/operator/lowcode/pii/errors.py | 27 +++++ .../operator/lowcode/pii/model/__init__.py | 5 + .../operator/lowcode/pii/operator_config.py | 81 +++++++++++++ ads/opctl/operator/lowcode/pii/schema.yaml | 114 ++++++++++++++++++ ads/opctl/operator/lowcode/pii/utils.py | 5 + 13 files changed, 479 insertions(+), 2 deletions(-) create mode 100644 ads/opctl/operator/lowcode/pii/MLoperator create mode 100644 ads/opctl/operator/lowcode/pii/README.md create mode 100644 ads/opctl/operator/lowcode/pii/__init__.py create mode 100644 ads/opctl/operator/lowcode/pii/__main__.py create mode 100644 ads/opctl/operator/lowcode/pii/cmd.py create mode 100644 ads/opctl/operator/lowcode/pii/const.py create mode 100644 ads/opctl/operator/lowcode/pii/environment.yaml create mode 100644 ads/opctl/operator/lowcode/pii/errors.py create mode 100644 ads/opctl/operator/lowcode/pii/model/__init__.py create mode 100644 ads/opctl/operator/lowcode/pii/operator_config.py create mode 100644 ads/opctl/operator/lowcode/pii/schema.yaml create mode 100644 ads/opctl/operator/lowcode/pii/utils.py diff --git a/ads/opctl/operator/cmd.py b/ads/opctl/operator/cmd.py index 0efcc20d5..b6cf9b0a6 100644 --- a/ads/opctl/operator/cmd.py +++ b/ads/opctl/operator/cmd.py @@ -179,9 +179,10 @@ def init( ) as f: f.write(yaml.dump(operator_config)) except Exception as ex: - logger.info( + logger.warning( "The operator's specification was not 
generated " - f"because it is not supported by the `{operator_info.type}` operator." + f"because it is not supported by the `{operator_info.type}` operator. " + "Use --debug option to see the error details." ) logger.debug(ex) diff --git a/ads/opctl/operator/lowcode/pii/MLoperator b/ads/opctl/operator/lowcode/pii/MLoperator new file mode 100644 index 000000000..dd5aed155 --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/MLoperator @@ -0,0 +1,11 @@ +type: pii +version: v1 +name: PII Operator +conda_type: published +conda: pii_v1 +gpu: no +keywords: + - PII +backends: [] +description: | + PII operator..." diff --git a/ads/opctl/operator/lowcode/pii/README.md b/ads/opctl/operator/lowcode/pii/README.md new file mode 100644 index 000000000..60164d25c --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/README.md @@ -0,0 +1,101 @@ +# PII Operator + +The PII Operator ... + +Below are the steps to configure and run the PII Operator on different resources. + +## 1. Prerequisites + +Follow the [CLI Configuration](https://accelerated-data-science.readthedocs.io/en/latest/user_guide/cli/opctl/configure.html) steps from the ADS documentation. This step is mandatory as it sets up default values for different options while running the PII Operator on OCI Data Science jobs or OCI Data Flow applications. If you have previously done this and used a flexible shape, make sure to adjust `ml_job_config.ini` with shape config details and `docker_registry` information. + +- ocpus = 1 +- memory_in_gbs = 16 +- docker_registry = `` + +## 2. Generating configs + +To generate starter configs, run the command below. This will create a list of YAML configs and place them in the `output` folder. + +```bash +ads operator init -t pii --overwrite --output ~/pii/ +``` + +The most important files expected to be generated are: + +- `pii.yaml`: Contains PII-related configuration. 
+- `backend_operator_local_python_config.yaml`: This includes a local backend configuration for running PII in a local environment. The environment should be set up manually before running the operator. +- `backend_operator_local_container_config.yaml`: This includes a local backend configuration for running PII within a local container. The container should be built before running the operator. Please refer to the instructions below for details on how to accomplish this. + +All generated configurations should be ready to use without the need for any additional adjustments. However, they are provided as starter kit configurations that can be customized as needed. + +## 3. Running PII on the local conda environment + +To run PII locally, create and activate a new conda environment (`ads-pii`). Install all the required libraries listed in the `environment.yaml` file. + +```yaml +- "git+https://github.com/oracle/accelerated-data-science.git@feature/pii#egg=oracle-ads" +``` + +Please review the previously generated `pii.yaml` file using the `init` command, and make any necessary adjustments to the input and output file locations. By default, it assumes that the files should be located in the same folder from which the `init` command was executed. + +Use the command below to verify the PII config. + +```bash +ads operator verify -f ~/pii/pii.yaml +``` + +Use the following command to run the PII within the `ads-pii` conda environment. + +```bash +ads operator run -f ~/pii/pii.yaml -b local +``` + +The operator will run in your local environment without requiring any additional modifications. + +## 4. Running PII on the local container + +To run the PII operator within a local container, follow these steps: + +Use the command below to build the PII container. + +```bash +ads operator build-image -t pii +``` + +This will create a new `pii:v1` image, with `/etc/operator` as the designated working directory within the container. 
+ + +Check the `backend_operator_local_container_config.yaml` config file. By default, it should have a `volume` section with the `.oci` configs folder mounted. + +```yaml +volume: + - "/Users//.oci:/root/.oci" +``` + +Mounting the OCI configs folder is only required if an OCI Object Storage bucket will be used to store the input PII data or output PII result. The input/output folders can also be mounted to the container. + +```yaml +volume: + - /Users//.oci:/root/.oci + - /Users//pii/data:/etc/operator/data + - /Users//pii/result:/etc/operator/result +``` + +The full config can look like: +```yaml +kind: operator.local +spec: + image: PII:v1 + volume: + - /Users//.oci:/root/.oci + - /Users//pii/data:/etc/operator/data + - /Users//pii/result:/etc/operator/result +type: container +version: v1 +``` + +Run the PII within a container using the command below: + +```bash +ads operator run -f ~/pii/pii.yaml --backend-config ~/pii/backend_operator_local_container_config.yaml +``` diff --git a/ads/opctl/operator/lowcode/pii/__init__.py b/ads/opctl/operator/lowcode/pii/__init__.py new file mode 100644 index 000000000..b8d0460f5 --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ diff --git a/ads/opctl/operator/lowcode/pii/__main__.py b/ads/opctl/operator/lowcode/pii/__main__.py new file mode 100644 index 000000000..aa1c31e38 --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/__main__.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import json +import os +import sys +from typing import Dict, List + +import yaml + +from ads.opctl import logger +from ads.opctl.operator.common.const import ENV_OPERATOR_ARGS +from ads.opctl.operator.common.utils import _parse_input_args + +from .operator_config import PIIOperatorConfig + + +def operate(operator_config: PIIOperatorConfig) -> None: + """Runs the PII operator.""" + + print("The operator is running...") + + +def verify(spec: Dict, **kwargs: Dict) -> bool: + """Verifies the PII operator config.""" + operator = PIIOperatorConfig.from_dict(spec) + msg_header = ( + f"{'*' * 30} The operator config has been successfully verified {'*' * 30}" + ) + print(msg_header) + print(operator.to_yaml()) + print("*" * len(msg_header)) + + +def main(raw_args: List[str]): + """The entry point of the PII the operator.""" + args, _ = _parse_input_args(raw_args) + if not args.file and not args.spec and not os.environ.get(ENV_OPERATOR_ARGS): + logger.info( + "Please specify -f[--file] or -s[--spec] or " + f"pass operator's arguments via {ENV_OPERATOR_ARGS} environment variable." 
+ ) + return + + logger.info("-" * 100) + logger.info(f"{'Running' if not args.verify else 'Verifying'} the operator...") + + # if spec provided as input string, then convert the string into YAML + yaml_string = "" + if args.spec or os.environ.get(ENV_OPERATOR_ARGS): + operator_spec_str = args.spec or os.environ.get(ENV_OPERATOR_ARGS) + try: + yaml_string = yaml.safe_dump(json.loads(operator_spec_str)) + except json.JSONDecodeError: + yaml_string = yaml.safe_dump(yaml.safe_load(operator_spec_str)) + except: + yaml_string = operator_spec_str + + operator_config = PIIOperatorConfig.from_yaml( + uri=args.file, + yaml_string=yaml_string, + ) + + logger.info(operator_config.to_yaml()) + + # run operator + if args.verify: + verify(operator_config) + else: + operate(operator_config) + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/ads/opctl/operator/lowcode/pii/cmd.py b/ads/opctl/operator/lowcode/pii/cmd.py new file mode 100644 index 000000000..f76b5faaf --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/cmd.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +from typing import Dict + +import click + +from ads.opctl import logger +from ads.opctl.operator.common.utils import _load_yaml_from_uri +from ads.opctl.operator.common.operator_yaml_generator import YamlGenerator + + +def init(**kwargs: Dict) -> str: + """ + Generates operator config by the schema. + + Properties + ---------- + kwargs: (Dict, optional). + Additional key value arguments. + + - type: str + The type of the operator. + + Returns + ------- + str + The YAML specification generated based on the schema. 
+ """ + logger.info("==== PII related options ====") + + return YamlGenerator( + schema=_load_yaml_from_uri(__file__.replace("cmd.py", "schema.yaml")) + ).generate_example_dict(values={"type": kwargs.get("type")}) diff --git a/ads/opctl/operator/lowcode/pii/const.py b/ads/opctl/operator/lowcode/pii/const.py new file mode 100644 index 000000000..b8d0460f5 --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/const.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ diff --git a/ads/opctl/operator/lowcode/pii/environment.yaml b/ads/opctl/operator/lowcode/pii/environment.yaml new file mode 100644 index 000000000..a52cc5949 --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/environment.yaml @@ -0,0 +1,8 @@ +name: PII +channels: + - conda-forge +dependencies: + - python=3.8 + - pip + - pip: + - "git+https://github.com/oracle/accelerated-data-science.git@feature/pii#egg=oracle-ads" diff --git a/ads/opctl/operator/lowcode/pii/errors.py b/ads/opctl/operator/lowcode/pii/errors.py new file mode 100644 index 000000000..73aadaf46 --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/errors.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + +class PIISchemaYamlError(Exception): + """Exception raised when there is an issue with the schema.""" + + def __init__(self, error: str): + super().__init__( + "Invalid PII operator specification. Check the YAML structure and ensure it " + "complies with the required schema for PII operator. \n" + f"{error}" + ) + + +class PIIInputDataError(Exception): + """Exception raised when there is an issue with input data.""" + + def __init__(self, error: str): + super().__init__( + "Invalid input data. 
Check the input data and ensure it " + "complies with the validation criteria. \n" + f"{error}" + ) diff --git a/ads/opctl/operator/lowcode/pii/model/__init__.py b/ads/opctl/operator/lowcode/pii/model/__init__.py new file mode 100644 index 000000000..b8d0460f5 --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/model/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ diff --git a/ads/opctl/operator/lowcode/pii/operator_config.py b/ads/opctl/operator/lowcode/pii/operator_config.py new file mode 100644 index 000000000..1ab8bf96c --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/operator_config.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import os +from dataclasses import dataclass, field +from typing import Dict, List + +from ads.common.serializer import DataClassSerializable +from ads.opctl.operator.common.utils import _load_yaml_from_uri +from ads.opctl.operator.common.operator_config import OperatorConfig + + +@dataclass(repr=True) +class InputData(DataClassSerializable): + """Class representing operator specification input data details.""" + + format: str = None + columns: List[str] = None + url: str = None + options: Dict = None + limit: int = None + + +@dataclass(repr=True) +class OutputDirectory(DataClassSerializable): + """Class representing operator specification output directory details.""" + + connect_args: Dict = None + format: str = None + url: str = None + name: str = None + options: Dict = None + + +@dataclass(repr=True) +class PIIOperatorSpec(DataClassSerializable): + """Class representing PII operator specification.""" + + name: str = None + input_data: InputData = 
field(default_factory=InputData) + output_directory: OutputDirectory = field(default_factory=OutputDirectory) + report_file_name: str = None + report_title: str = None + report_theme: str = None + + def __post_init__(self): + """Adjusts the specification details.""" + self.report_file_name = self.report_file_name or "report.html" + self.report_theme = self.report_theme or "light" + + +@dataclass(repr=True) +class PIIOperatorConfig(OperatorConfig): + """Class representing PII operator config. + + Attributes + ---------- + kind: str + The kind of the resource. For operators it is always - `operator`. + type: str + The type of the operator. For PII operator it is always - `PII` + version: str + The version of the operator. + spec: PIIOperatorSpec + The PII operator specification. + """ + + kind: str = "operator" + type: str = "PII" + version: str = "v1" + spec: PIIOperatorSpec = field(default_factory=PIIOperatorSpec) + + @classmethod + def _load_schema(cls) -> str: + """Loads operator schema.""" + return _load_yaml_from_uri( + os.path.join(os.path.dirname(os.path.abspath(__file__)), "schema.yaml") + ) diff --git a/ads/opctl/operator/lowcode/pii/schema.yaml b/ads/opctl/operator/lowcode/pii/schema.yaml new file mode 100644 index 000000000..1fe39b258 --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/schema.yaml @@ -0,0 +1,114 @@ +kind: + allowed: + - operator + required: true + type: string + default: operator + +version: + allowed: + - "v1" + required: true + type: string + default: v1 + meta: + description: "Operators may change yaml file schemas from version to version, as well as implementation details. Double check the version to ensure compatibility." 
+ +type: + required: true + type: string + default: pii + meta: + description: "Type should always be `pii` when using a PII operator" + +spec: + required: true + schema: + name: + required: false + type: string + default: PII + report_file_name: + required: false + type: string + default: report.html + meta: + description: "Placed into output_directory location. Defaults to report.html" + report_title: + required: false + type: string + default: PII Report + report_theme: + required: false + type: string + default: light + allowed: + - light + - dark + input_data: + required: true + type: dict + meta: + description: "The input data for the PII." + schema: + format: + allowed: + - csv + - json + - clipboard + - excel + - hdf + - feather + - load_files + required: false + type: string + columns: + required: false + type: list + schema: + type: string + options: + nullable: true + required: false + type: dict + url: + required: true + type: string + default: data.csv + meta: + description: "The url can be local, or remote. For example: `oci://@/data.csv`" + limit: + required: false + type: integer + output_directory: + required: false + schema: + connect_args: + nullable: true + required: false + type: dict + format: + required: false + type: string + allowed: + - csv + - json + - clipboard + - excel + - hdf + - sql + url: + required: true + type: string + default: result/ + meta: + description: "The url can be local, or remote. For example: `oci://@/`" + name: + required: false + type: string + options: + nullable: true + required: false + type: dict + type: dict + type: dict diff --git a/ads/opctl/operator/lowcode/pii/utils.py b/ads/opctl/operator/lowcode/pii/utils.py new file mode 100644 index 000000000..b8d0460f5 --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/utils.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ From 3677198ece5e9d69d56044109fee59f79878f524 Mon Sep 17 00:00:00 2001 From: MING KANG Date: Tue, 14 Nov 2023 15:52:21 -0800 Subject: [PATCH 2/7] feature/ads_pii_operator (#424) --- ads/common/decorator/runtime_dependency.py | 1 + ads/data_labeling/mixin/data_labeling.py | 5 +- .../common/operator_yaml_generator.py | 8 +- ads/opctl/operator/lowcode/pii/MLoperator | 8 +- ads/opctl/operator/lowcode/pii/README.md | 147 +++++- ads/opctl/operator/lowcode/pii/__main__.py | 13 +- ads/opctl/operator/lowcode/pii/cmd.py | 10 +- ads/opctl/operator/lowcode/pii/constant.py | 85 +++ .../operator/lowcode/pii/environment.yaml | 13 +- .../operator/lowcode/pii/model/factory.py | 80 +++ .../operator/lowcode/pii/model/guardrails.py | 164 ++++++ ads/opctl/operator/lowcode/pii/model/pii.py | 145 ++++++ .../lowcode/pii/model/processor/__init__.py | 34 ++ .../pii/model/processor/email_replacer.py | 34 ++ .../pii/model/processor/mbi_replacer.py | 35 ++ .../pii/model/processor/name_replacer.py | 225 ++++++++ .../pii/model/processor/number_replacer.py | 73 +++ .../lowcode/pii/model/processor/remover.py | 26 + .../operator/lowcode/pii/model/report.py | 489 ++++++++++++++++++ .../operator/lowcode/pii/operator_config.py | 65 ++- ads/opctl/operator/lowcode/pii/schema.yaml | 122 +++-- ads/opctl/operator/lowcode/pii/utils.py | 120 +++++ dev-requirements.txt | 2 +- docs/source/index.rst | 1 + .../common/yaml_schema/piiOperator.yaml | 108 ++++ .../operators/pii_operator/examples.rst | 53 ++ .../pii_operator/getting_started.rst | 64 +++ .../operators/pii_operator/index.rst | 37 ++ .../operators/pii_operator/install.rst | 13 + .../user_guide/operators/pii_operator/pii.rst | 47 ++ .../operators/pii_operator/yaml_schema.rst | 9 + pyproject.toml | 12 +- .../with_extras/operator/pii/__init__.py | 1 - .../with_extras/operator/pii/test_factory.py | 46 ++ .../operator/pii/test_files/__init__.py | 4 + 
.../operator/pii/test_files/pii_test.yaml | 14 + .../operator/pii/test_files/test_data.csv | 3 + .../operator/pii/test_guardrail.py | 120 +++++ .../operator/pii/test_pii_scrubber.py | 53 ++ 39 files changed, 2359 insertions(+), 130 deletions(-) create mode 100644 ads/opctl/operator/lowcode/pii/constant.py create mode 100644 ads/opctl/operator/lowcode/pii/model/factory.py create mode 100644 ads/opctl/operator/lowcode/pii/model/guardrails.py create mode 100644 ads/opctl/operator/lowcode/pii/model/pii.py create mode 100644 ads/opctl/operator/lowcode/pii/model/processor/__init__.py create mode 100644 ads/opctl/operator/lowcode/pii/model/processor/email_replacer.py create mode 100644 ads/opctl/operator/lowcode/pii/model/processor/mbi_replacer.py create mode 100644 ads/opctl/operator/lowcode/pii/model/processor/name_replacer.py create mode 100644 ads/opctl/operator/lowcode/pii/model/processor/number_replacer.py create mode 100644 ads/opctl/operator/lowcode/pii/model/processor/remover.py create mode 100644 ads/opctl/operator/lowcode/pii/model/report.py create mode 100644 docs/source/user_guide/operators/common/yaml_schema/piiOperator.yaml create mode 100644 docs/source/user_guide/operators/pii_operator/examples.rst create mode 100644 docs/source/user_guide/operators/pii_operator/getting_started.rst create mode 100644 docs/source/user_guide/operators/pii_operator/index.rst create mode 100644 docs/source/user_guide/operators/pii_operator/install.rst create mode 100644 docs/source/user_guide/operators/pii_operator/pii.rst create mode 100644 docs/source/user_guide/operators/pii_operator/yaml_schema.rst rename ads/opctl/operator/lowcode/pii/const.py => tests/unitary/with_extras/operator/pii/__init__.py (87%) create mode 100644 tests/unitary/with_extras/operator/pii/test_factory.py create mode 100644 tests/unitary/with_extras/operator/pii/test_files/__init__.py create mode 100644 tests/unitary/with_extras/operator/pii/test_files/pii_test.yaml create mode 100644 
tests/unitary/with_extras/operator/pii/test_files/test_data.csv create mode 100644 tests/unitary/with_extras/operator/pii/test_guardrail.py create mode 100644 tests/unitary/with_extras/operator/pii/test_pii_scrubber.py diff --git a/ads/common/decorator/runtime_dependency.py b/ads/common/decorator/runtime_dependency.py index 08ae48e78..27473ae9a 100644 --- a/ads/common/decorator/runtime_dependency.py +++ b/ads/common/decorator/runtime_dependency.py @@ -65,6 +65,7 @@ class OptionalDependency: SPARK = "oracle-ads[spark]" HUGGINGFACE = "oracle-ads[huggingface]" FORECAST = "oracle-ads[forecast]" + PII = "oracle-ads[pii]" def runtime_dependency( diff --git a/ads/data_labeling/mixin/data_labeling.py b/ads/data_labeling/mixin/data_labeling.py index 56f85f3a9..e2c65eb20 100644 --- a/ads/data_labeling/mixin/data_labeling.py +++ b/ads/data_labeling/mixin/data_labeling.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8; -*- -# Copyright (c) 2021, 2022 Oracle and/or its affiliates. +# Copyright (c) 2021, 2023 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ from typing import Dict, List @@ -188,6 +188,7 @@ def render_ner( content_column: str = "Content", annotations_column: str = "Annotations", limit: int = ROWS_TO_RENDER_LIMIT, + return_html: bool = False, ) -> None: """Renders NER dataset. Displays only first 50 rows. 
@@ -223,6 +224,8 @@ def render_ner( annotations_column=annotations_column, ) result_html = text_visualizer.render(items=items, options=options) + if return_html: + return result_html from IPython.core.display import HTML, Markdown, display diff --git a/ads/opctl/operator/common/operator_yaml_generator.py b/ads/opctl/operator/common/operator_yaml_generator.py index b2b9e2823..1bbc1ae03 100644 --- a/ads/opctl/operator/common/operator_yaml_generator.py +++ b/ads/opctl/operator/common/operator_yaml_generator.py @@ -76,7 +76,7 @@ def _check_condition( Returns ------- bool - True if the condition fulfils, false otherwise. + True if the condition fulfills, false otherwise. """ for key, value in condition.items(): if key not in example or example[key] != value: @@ -103,8 +103,9 @@ def _generate_example( The result config. """ example = {} + for key, value in schema.items(): - # only generate values fro required fields + # only generate values for required fields if ( value.get("required", False) or value.get("dependencies", False) @@ -125,7 +126,8 @@ def _generate_example( example[key] = 1 elif data_type == "boolean": example[key] = True - elif data_type == "array": + elif data_type == "list": + # TODO: Handle list of dict example[key] = ["item1", "item2"] elif data_type == "dict": example[key] = self._generate_example( diff --git a/ads/opctl/operator/lowcode/pii/MLoperator b/ads/opctl/operator/lowcode/pii/MLoperator index dd5aed155..49dafdb5a 100644 --- a/ads/opctl/operator/lowcode/pii/MLoperator +++ b/ads/opctl/operator/lowcode/pii/MLoperator @@ -6,6 +6,10 @@ conda: pii_v1 gpu: no keywords: - PII -backends: [] + - Spacy +backends: + - job description: | - PII operator..." + PII operator that detects and redacts Personally Identifiable Information + (PII) data in datasets by combining pattern matching and machine learning. + Use `ads operator info -t pii` to get more details about the pii operator.
diff --git a/ads/opctl/operator/lowcode/pii/README.md b/ads/opctl/operator/lowcode/pii/README.md index 60164d25c..156646ef4 100644 --- a/ads/opctl/operator/lowcode/pii/README.md +++ b/ads/opctl/operator/lowcode/pii/README.md @@ -1,16 +1,13 @@ # PII Operator -The PII Operator ... + +The PII Operator aims to detect and redact Personally Identifiable Information (PII) in datasets. PII data includes information such as names, addresses, and social security numbers, which can be used to identify individuals. This operator combines pattern matching and machine learning to identify PII, and then redacts or anonymizes it to protect the privacy of individuals. Below are the steps to configure and run the PII Operator on different resources. ## 1. Prerequisites -Follow the [CLI Configuration](https://accelerated-data-science.readthedocs.io/en/latest/user_guide/cli/opctl/configure.html) steps from the ADS documentation. This step is mandatory as it sets up default values for different options while running the PII Operator on OCI Data Science jobs or OCI Data Flow applications. If you have previously done this and used a flexible shape, make sure to adjust `ml_job_config.ini` with shape config details and `docker_registry` information. - -- ocpus = 1 -- memory_in_gbs = 16 -- docker_registry = `` +Follow the [CLI Configuration](https://accelerated-data-science.readthedocs.io/en/latest/user_guide/cli/opctl/configure.html) steps from the ADS documentation. This step is mandatory as it sets up default values for different options while running the PII Operator on OCI Data Science jobs. ## 2. Generating configs @@ -22,29 +19,36 @@ ads operator init -t pii --overwrite --output ~/pii/ The most important files expected to be generated are: -- `pii.yaml`: Contains PII-related configuration. -- `backend_operator_local_python_config.yaml`: This includes a local backend configuration for running PII in a local environment.
The environment should be set up manually before running the operator. -- `backend_operator_local_container_config.yaml`: This includes a local backend configuration for running PII within a local container. The container should be built before running the operator. Please refer to the instructions below for details on how to accomplish this. +- `pii.yaml`: Contains pii-related configuration. +- `pii_operator_local_python.yaml`: This includes a local backend configuration for running pii operator in a local environment. The environment should be set up manually before running the operator. +- `pii_operator_local_container.yaml`: This includes a local backend configuration for running pii operator within a local container. The container should be built before running the operator. Please refer to the instructions below for details on how to accomplish this. +- `pii_job_container.yaml`: Contains Data Science job-related config to run pii operator in a Data Science job within a container (BYOC) runtime. The container should be built and published before running the operator. Please refer to the instructions below for details on how to accomplish this. +- `pii_job_python.yaml`: Contains Data Science job-related config to run pii operator in a Data Science job within a conda runtime. The conda should be built and published before running the operator. All generated configurations should be ready to use without the need for any additional adjustments. However, they are provided as starter kit configurations that can be customized as needed. -## 3. Running PII on the local conda environment +## 3. Running Pii on the local conda environment -To run PII locally, create and activate a new conda environment (`ads-pii`). Install all the required libraries listed in the `environment.yaml` file. +To run pii operator locally, create and activate a new conda environment (`ads-pii`). Install all the required libraries listed in the `environment.yaml` file. 
```yaml -- "git+https://github.com/oracle/accelerated-data-science.git@feature/pii#egg=oracle-ads" +- datapane +- scrubadub +- gender_guesser +- nameparser +- scrubadub_spacy +- "git+https://github.com/oracle/accelerated-data-science.git@feature/ads_pii_operator#egg=oracle-ads" ``` Please review the previously generated `pii.yaml` file using the `init` command, and make any necessary adjustments to the input and output file locations. By default, it assumes that the files should be located in the same folder from which the `init` command was executed. -Use the command below to verify the PII config. +Use the command below to verify the pii config. ```bash ads operator verify -f ~/pii/pii.yaml ``` -Use the following command to run the PII within the `ads-pii` conda environment. +Use the following command to run the pii operator within the `ads-pii` conda environment. ```bash ads operator run -f ~/pii/pii.yaml -b local @@ -52,11 +56,11 @@ ads operator run -f ~/pii/pii.yaml -b local The operator will run in your local environment without requiring any additional modifications. -## 4. Running PII on the local container +## 4. Running pii on the local container -To run the PII operator within a local container, follow these steps: +To run the pii operator within a local container, follow these steps: -Use the command below to build the PII container. +Use the command below to build the pii container. ```bash ads operator build-image -t pii @@ -65,14 +69,14 @@ ads operator build-image -t pii This will create a new `pii:v1` image, with `/etc/operator` as the designated working directory within the container. -Check the `backend_operator_local_container_config.yaml` config file. By default, it should have a `volume` section with the `.oci` configs folder mounted. +Check the `pii_operator_local_container.yaml` config file. By default, it should have a `volume` section with the `.oci` configs folder mounted. 
```yaml volume: - "/Users//.oci:/root/.oci" ``` -Mounting the OCI configs folder is only required if an OCI Object Storage bucket will be used to store the input PII data or output PII result. The input/output folders can also be mounted to the container. +Mounting the OCI configs folder is only required if an OCI Object Storage bucket will be used to store the input data or output result. The input/output folders can also be mounted to the container. ```yaml volume: @@ -85,7 +89,7 @@ The full config can look like: ```yaml kind: operator.local spec: - image: PII:v1 + image: pii:v1 volume: - /Users//.oci:/root/.oci - /Users//pii/data:/etc/operator/data @@ -94,8 +98,107 @@ type: container version: v1 ``` -Run the PII within a container using the command below: +Run the pii operator within a container using the command below: + +```bash +ads operator run -f ~/pii/pii.yaml --backend-config ~/pii/pii_operator_local_container.yaml +``` + +## 5. Running pii in the Data Science job within container runtime + +To execute the pii operator within a Data Science job using container runtime, please follow the steps outlined below: + +You can use the following command to build the pii container. This step can be skipped if you have already done this for running the operator within a local container. + +```bash +ads operator build-image -t pii +``` + +This will create a new `pii:v1` image, with `/etc/operator` as the designated working directory within the container. + +Publish the `pii:v1` container to the [Oracle Container Registry](https://docs.public.oneportal.content.oci.oraclecloud.com/en-us/iaas/Content/Registry/home.htm). To become familiar with OCI, read the documentation links posted below.
+ +- [Access Container Registry](https://docs.public.oneportal.content.oci.oraclecloud.com/en-us/iaas/Content/Registry/Concepts/registryoverview.htm#access) +- [Create repositories](https://docs.public.oneportal.content.oci.oraclecloud.com/en-us/iaas/Content/Registry/Tasks/registrycreatingarepository.htm#top) +- [Push images](https://docs.public.oneportal.content.oci.oraclecloud.com/en-us/iaas/Content/Registry/Tasks/registrypushingimagesusingthedockercli.htm#Pushing_Images_Using_the_Docker_CLI) + +To publish `pii:v1` to OCR, use the command posted below: + +```bash +ads operator publish-image pii:v1 --registry +``` + +After the container is published to OCR, it can be used within the Data Science jobs service. Check the `pii_job_container.yaml` config file. It should contain pre-populated infrastructure and runtime sections. The runtime section should contain an image property, something like `image: iad.ocir.io//pii:v1`. More details about supported options can be found in the ADS Jobs documentation - [Run a Container](https://accelerated-data-science.readthedocs.io/en/latest/user_guide/jobs/run_container.html). + +Adjust the `pii.yaml` config with proper input/output folders. When the operator is run in the Data Science job, it will not have access to local folders. Therefore, input data and output folders should be placed in the Object Storage bucket. Open the `pii.yaml` and adjust the following fields: + +```yaml +input_data: + url: oci://bucket@namespace/pii/input_data/data.csv +output_directory: + url: oci://bucket@namespace/pii/result/ +``` + +Run the pii operator on the Data Science jobs using the command posted below: + +```bash +ads operator run -f ~/pii/pii.yaml --backend-config ~/pii/pii_job_container.yaml +``` + +The logs can be monitored using the `ads opctl watch` command. + +```bash +ads opctl watch +``` + + +## 6. 
Running pii in the Data Science job within conda runtime + +To execute the pii operator within a Data Science job using conda runtime, please follow the steps outlined below: + +You can use the following command to build the pii conda environment. + +```bash +ads operator build-conda -t pii +``` + +This will create a new `pii_v1` conda environment and place it in the folder specified within the `ads opctl configure` command. + +Use the command below to publish the `pii_v1` conda environment to the Object Storage bucket. + +```bash +ads opctl conda publish pii_v1 +``` +More details about configuring CLI can be found here - [Configuring CLI](https://accelerated-data-science.readthedocs.io/en/latest/user_guide/cli/opctl/configure.html) + + +After the conda environment is published to Object Storage, it can be used within the Data Science jobs service. Check the `pii_job_python.yaml` config file. It should contain pre-populated infrastructure and runtime sections. The runtime section should contain a `conda` section. + +```yaml +conda: + type: published + uri: oci://bucket@namespace/conda_environments/cpu/pii/1/pii_v1 +``` + +More details about supported options can be found in the ADS Jobs documentation - [Run a Python Workload](https://accelerated-data-science.readthedocs.io/en/latest/user_guide/jobs/run_python.html). + +Adjust the `pii.yaml` config with proper input/output folders. When the pii operator is run in the Data Science job, it will not have access to local folders. Therefore, input data and output folders should be placed in the Object Storage bucket. 
Open the `pii.yaml` and adjust the following fields: + +```yaml +input_data: + url: oci://bucket@namespace/pii/input_data/data.csv +output_directory: + url: oci://bucket@namespace/pii/result/ +``` + +Run the pii on the Data Science jobs using the command posted below: + +```bash +ads operator run -f ~/pii/pii.yaml --backend-config ~/pii/pii_job_python.yaml +``` + +The logs can be monitored using the `ads opctl watch` command. ```bash -ads operator run -f ~/pii/pii.yaml --backend-config ~/pii/backend_operator_local_container_config.yaml +ads opctl watch ``` diff --git a/ads/opctl/operator/lowcode/pii/__main__.py b/ads/opctl/operator/lowcode/pii/__main__.py index aa1c31e38..111b7ed3f 100644 --- a/ads/opctl/operator/lowcode/pii/__main__.py +++ b/ads/opctl/operator/lowcode/pii/__main__.py @@ -15,18 +15,19 @@ from ads.opctl.operator.common.const import ENV_OPERATOR_ARGS from ads.opctl.operator.common.utils import _parse_input_args -from .operator_config import PIIOperatorConfig +from .model.guardrails import PIIGuardrail +from .operator_config import PiiOperatorConfig -def operate(operator_config: PIIOperatorConfig) -> None: +def operate(operator_config: PiiOperatorConfig) -> None: """Runs the PII operator.""" - - print("The operator is running...") + guard = PIIGuardrail(config=operator_config) + guard.process() def verify(spec: Dict, **kwargs: Dict) -> bool: """Verifies the PII operator config.""" - operator = PIIOperatorConfig.from_dict(spec) + operator = PiiOperatorConfig.from_dict(spec) msg_header = ( f"{'*' * 30} The operator config has been successfully verified {'*' * 30}" ) @@ -59,7 +60,7 @@ def main(raw_args: List[str]): except: yaml_string = operator_spec_str - operator_config = PIIOperatorConfig.from_yaml( + operator_config = PiiOperatorConfig.from_yaml( uri=args.file, yaml_string=yaml_string, ) diff --git a/ads/opctl/operator/lowcode/pii/cmd.py b/ads/opctl/operator/lowcode/pii/cmd.py index f76b5faaf..67bf14d27 100644 --- 
a/ads/opctl/operator/lowcode/pii/cmd.py +++ b/ads/opctl/operator/lowcode/pii/cmd.py @@ -6,11 +6,9 @@ from typing import Dict -import click - from ads.opctl import logger -from ads.opctl.operator.common.utils import _load_yaml_from_uri from ads.opctl.operator.common.operator_yaml_generator import YamlGenerator +from ads.opctl.operator.common.utils import _load_yaml_from_uri def init(**kwargs: Dict) -> str: @@ -32,6 +30,10 @@ def init(**kwargs: Dict) -> str: """ logger.info("==== PII related options ====") + default_detector = [{"name": ".", "action": "mask"}] + return YamlGenerator( schema=_load_yaml_from_uri(__file__.replace("cmd.py", "schema.yaml")) - ).generate_example_dict(values={"type": kwargs.get("type")}) + ).generate_example_dict( + values={"type": kwargs.get("type"), "detectors": default_detector} + ) diff --git a/ads/opctl/operator/lowcode/pii/constant.py b/ads/opctl/operator/lowcode/pii/constant.py new file mode 100644 index 000000000..5c75ae74c --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/constant.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
+from ads.common.extended_enum import ExtendedEnumMeta
+
+# Module-level defaults for the PII operator.
+DEFAULT_SHOW_ROWS = 25
+DEFAULT_TIME_OUT = 5
+DEFAULT_COLOR = "#D6D3D1"
+DEFAULT_REPORT_FILENAME = "report.html"
+DEFAULT_TARGET_COLUMN = "target"
+
+
+class SupportedAction(str, metaclass=ExtendedEnumMeta):
+    """Supported action to process detected entities."""
+
+    MASK = "mask"
+    REMOVE = "remove"
+    ANONYMIZE = "anonymize"
+
+
+class SupportedDetector(str, metaclass=ExtendedEnumMeta):
+    """Supported pii detectors."""
+
+    DEFAULT = "default"
+    SPACY = "spacy"
+
+
+class DataFrameColumn(str, metaclass=ExtendedEnumMeta):
+    """Names of the columns the operator adds to the processed data."""
+
+    REDACTED_TEXT: str = "redacted_text"
+    ENTITIES: str = "entities_cols"
+
+
+class YamlKey(str, metaclass=ExtendedEnumMeta):
+    """Yaml key used in pii.yaml."""
+
+    pass
+
+
+# Plain-string list of keys; kept separate from the (currently empty) YamlKey enum.
+YAML_KEYS = [
+    "detectors",
+    "custom_detectors",
+    "spacy_detectors",
+    "anonymization",
+    "name",
+    "label",
+    "patterns",
+    "model",
+    "named_entities",
+    "entities",
+]
+
+################
+# Report Const #
+################
+# Adjacent string literals are concatenated verbatim, so every fragment but the
+# last must end with a space; otherwise the sentences run together in the report.
+PII_REPORT_DESCRIPTION = (
+    "This report will offer a comprehensive overview of the redaction of personal identifiable information (PII) from the provided data. "
+    "The `Summary` section will provide an executive summary of this process, including key statistics, configuration, and model usage. "
+    "The `Details` section will offer a more granular analysis of each row of data, including relevant statistics."
+)
+DETAILS_REPORT_DESCRIPTION = "The following report will show the details on each row. You can view the highlighted named entities and their labels in the text under `TEXT` tab."
+
+# Colour palette (hex codes) imported by the report module.
+FLAT_UI_COLORS = [
+    "#1ABC9C",
+    "#2ECC71",
+    "#3498DB",
+    "#9B59B6",
+    "#34495E",
+    "#16A085",
+    "#27AE60",
+    "#2980B9",
+    "#8E44AD",
+    "#2C3E50",
+    "#F1C40F",
+    "#E67E22",
+    "#E74C3C",
+    "#ECF0F1",
+    "#95A5A6",
+    "#F39C12",
+    "#D35400",
+    "#C0392B",
+    "#BDC3C7",
+    "#7F8C8D",
+]
diff --git a/ads/opctl/operator/lowcode/pii/environment.yaml b/ads/opctl/operator/lowcode/pii/environment.yaml
index a52cc5949..ca5b65680 100644
--- a/ads/opctl/operator/lowcode/pii/environment.yaml
+++ b/ads/opctl/operator/lowcode/pii/environment.yaml
@@ -1,8 +1,15 @@
-name: PII
+name: pii
 channels:
   - conda-forge
 dependencies:
-  - python=3.8
+  - python=3.9
   - pip
   - pip:
-    - "git+https://github.com/oracle/accelerated-data-science.git@feature/pii#egg=oracle-ads"
+    - aiohttp
+    - datapane
+    - gender_guesser
+    - nameparser
+    - plotly
+    - scrubadub
+    - scrubadub_spacy
+    - "git+https://github.com/oracle/accelerated-data-science.git@feature/ads_pii_operator#egg=oracle-ads"
diff --git a/ads/opctl/operator/lowcode/pii/model/factory.py b/ads/opctl/operator/lowcode/pii/model/factory.py
new file mode 100644
index 000000000..102204ea3
--- /dev/null
+++ b/ads/opctl/operator/lowcode/pii/model/factory.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*--
+
+# Copyright (c) 2023 Oracle and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
+
+import uuid
+
+from ads.common.decorator.runtime_dependency import (
+    OptionalDependency,
+    runtime_dependency,
+)
+from ads.opctl.operator.lowcode.pii.constant import SupportedDetector
+from ads.opctl.operator.lowcode.pii.utils import construct_filth_cls_name
+
+
+class UnSupportedDetectorError(Exception):
+    """Raised when a detector type has no entry in the detector factory."""
+
+    def __init__(self, dtype: str):
+        # `values` must be called; interpolating the attribute itself would
+        # print the bound method instead of the supported names.
+        super().__init__(
+            f"Detector: `{dtype}` "
+            f"is not supported. Supported models: {SupportedDetector.values()}"
+        )
+
+
+class PiiBaseDetector:
+    """Base class for detector builders; subclasses implement `construct`."""
+
+    @classmethod
+    def construct(cls, **kwargs):
+        raise NotImplementedError
+
+
+class BuiltInDetector(PiiBaseDetector):
+    """Builder for built-in detectors: the entity name is returned unchanged."""
+
+    @classmethod
+    def construct(cls, entity, **kwargs):
+        return entity
+
+
+class SpacyDetector(PiiBaseDetector):
+    """Builder for spaCy-backed entity detectors."""
+
+    DEFAULT_SPACY_NAMED_ENTITIES = ["DATE", "FAC", "GPE", "LOC", "ORG", "PER", "PERSON"]
+    DEFAULT_SPACY_MODEL = "en_core_web_trf"
+
+    @classmethod
+    @runtime_dependency(module="scrubadub", install_from=OptionalDependency.PII)
+    @runtime_dependency(module="scrubadub_spacy", install_from=OptionalDependency.PII)
+    def construct(cls, entity, model, **kwargs):
+        """Build a `SpacyEntityDetector` for a single named entity.
+
+        The detector gets a unique `spacy_<uuid4>` name. For an entity outside
+        `DEFAULT_SPACY_NAMED_ENTITIES`, a new `Filth` subclass typed with the
+        upper-cased entity is created and registered in the detector's
+        `filth_cls_map`.
+        """
+        spacy_entity_detector = scrubadub_spacy.detectors.spacy.SpacyEntityDetector(
+            named_entities=[entity],
+            name=f"spacy_{uuid.uuid4()}",
+            model=model,
+        )
+        if entity.upper() not in cls.DEFAULT_SPACY_NAMED_ENTITIES:
+            filth_cls = type(
+                construct_filth_cls_name(entity),
+                (scrubadub.filth.Filth,),
+                {"type": entity.upper()},
+            )
+            spacy_entity_detector.filth_cls_map[entity.upper()] = filth_cls
+        return spacy_entity_detector
+
+
+class PiiDetectorFactory:
+    """
+    The factory class helps to instantiate proper detector object based on the detector config.
+    """
+
+    _MAP = {
+        SupportedDetector.DEFAULT: BuiltInDetector,
+        SupportedDetector.SPACY: SpacyDetector,
+    }
+
+    @classmethod
+    def get_detector(
+        cls,
+        detector_type,
+        entity,
+        model=None,
+    ):
+        """Return the detector built for `detector_type`.
+
+        Raises
+        ------
+        UnSupportedDetectorError
+            If `detector_type` is not a key of `_MAP`.
+        """
+        if detector_type not in cls._MAP:
+            raise UnSupportedDetectorError(detector_type)
+
+        return cls._MAP[detector_type].construct(entity=entity, model=model)
diff --git a/ads/opctl/operator/lowcode/pii/model/guardrails.py b/ads/opctl/operator/lowcode/pii/model/guardrails.py
new file mode 100644
index 000000000..41dc3514b
--- /dev/null
+++ b/ads/opctl/operator/lowcode/pii/model/guardrails.py
@@ -0,0 +1,164 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*--
+
+# Copyright (c) 2023 Oracle and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
+
+import os
+import time
+from datetime import datetime
+
+from ads.common.object_storage_details import ObjectStorageDetails
+from ads.opctl import logger
+from ads.opctl.operator.lowcode.pii.constant import DataFrameColumn
+from ads.opctl.operator.lowcode.pii.model.pii import PiiScrubber, detect, scrub
+from ads.opctl.operator.lowcode.pii.model.report import (
+    PIIOperatorReport,
+    PiiReportPageSpec,
+    PiiReportSpec,
+)
+from ads.opctl.operator.lowcode.pii.operator_config import PiiOperatorConfig
+from ads.opctl.operator.lowcode.pii.utils import (
+    _load_data,
+    _write_data,
+    default_signer,
+    get_output_name,
+)
+
+
+class PIIGuardrail:
+    """Runs the PII operator: load data, redact the target column, write output and report."""
+
+    def __init__(self, config: PiiOperatorConfig):
+        """Configure the scrubber, the output/report locations and the report context."""
+        self.config = config
+        self.spec = config.spec
+        self.pii_scrubber = PiiScrubber(config=config)
+        self.scrubber = self.pii_scrubber.config_scrubber()
+
+        output_filename = get_output_name(
+            target_name=self.spec.output_directory.name,
+            given_name=self.spec.input_data.url,
+        )
+        self.dst_uri = os.path.join(self.spec.output_directory.url, output_filename)
+        # The resolved file name is written back into the config.
+        self.config.spec.output_directory.name = output_filename
+
+        self.report_uri = os.path.join(
+            self.spec.output_directory.url,
+            self.spec.report.report_filename,
+        )
+
+        self.report_context: PiiReportSpec = PiiReportSpec.from_dict(
+            {
+                "run_summary": {
+                    "config": self.config,
+                    "selected_detectors": self.pii_scrubber.detectors,
+                    "selected_entities": self.pii_scrubber.entities,
+                    "selected_spacy_model": self.pii_scrubber.spacy_model_detectors,
+                    "show_rows": self.spec.report.show_rows,
+                    "show_sensitive_info": self.spec.report.show_sensitive_content,
+                    "src_uri": self.spec.input_data.url,
+                    "total_tokens": 0,
+                },
+                "run_details": {"rows": []},
+            }
+        )
+
+        # Signer is only used when the output directory is an OCI path.
+        self.storage_options = (
+            default_signer()
+            if ObjectStorageDetails.is_oci_path(self.spec.output_directory.url)
+            else {}
+        )
+        self.datasets = None
+
+    def load_data(self, uri=None, storage_options=None):
+        """Loads input data."""
+        input_data_uri = uri or self.spec.input_data.url
+        logger.info(f"Loading input data from `{input_data_uri}` ...")
+
+        self.datasets = _load_data(
+            filename=input_data_uri,
+            storage_options=storage_options or self.storage_options,
+        )
+        return self
+
+    def process(self, **kwargs):
+        """Process input data.
+
+        Optional keyword arguments: `input_data`, `report_uri`, `dst_uri`
+        and `storage_options`; each falls back to the value prepared in
+        `__init__` / `load_data`.
+        """
+        self.report_context.run_summary.timestamp = datetime.now().strftime(
+            "%d/%m/%Y %H:%M:%S"
+        )
+        start_time = time.time()
+
+        # `data` is used as a pandas-style DataFrame below (`.apply`, `.loc`,
+        # `.index`). The truth value of a DataFrame is ambiguous and raises,
+        # so test against `None` explicitly instead of using `or` / `not`.
+        data = kwargs.pop("input_data", None)
+        if data is None:
+            data = self.datasets
+        report_uri = kwargs.pop("report_uri", None) or self.report_uri
+        dst_uri = kwargs.pop("dst_uri", None) or self.dst_uri
+        # Pop once: popping separately for the output and the report meant the
+        # report write never saw caller-supplied options.
+        storage_options = kwargs.pop("storage_options", None) or self.storage_options
+
+        if data is None:
+            try:
+                self.load_data()
+                data = self.datasets
+            except Exception:
+                logger.warning(
+                    f"Failed to load data from `{self.spec.input_data.url}`."
+                )
+                raise
+
+        # process user data
+        data[DataFrameColumn.REDACTED_TEXT] = data[self.spec.target_column].apply(
+            lambda x: scrub(x, scrubber=self.scrubber)
+        )
+        self.report_context.run_summary.elapsed_time = time.time() - start_time
+        self.report_context.run_summary.total_rows = len(data.index)
+
+        # save output data
+        if dst_uri:
+            logger.info(f"Saving data into `{dst_uri}` ...")
+
+            # The original (unredacted) target column is excluded from the output.
+            _write_data(
+                data=data.loc[:, data.columns != self.spec.target_column],
+                filename=dst_uri,
+                storage_options=storage_options,
+            )
+
+        # prepare pii report
+        if report_uri:
+            logger.info(f"Generating report to `{report_uri}` ...")
+
+            data[DataFrameColumn.ENTITIES] = data[self.spec.target_column].apply(
+                lambda x: detect(text=x, scrubber=self.scrubber)
+            )
+
+            for i in data.index:
+                text = data[self.spec.target_column][i]
+                ent_col = data[DataFrameColumn.ENTITIES][i]
+                page = PiiReportPageSpec.from_dict(
+                    {
+                        "id": i,
+                        "total_tokens": len(ent_col),
+                        "entities": ent_col,
+                        "raw_text": text,
+                    }
+                )
+                self.report_context.run_details.rows.append(page)
+                self.report_context.run_summary.total_tokens += len(ent_col)
+
+            self._process_context()
+            PIIOperatorReport(
+                report_spec=self.report_context, report_uri=report_uri
+            ).make_view().save_report(storage_options=storage_options)
+
+    def _process_context(self):
+        """Count different type of filth."""
+        statics = {}  # statics : count Filth type in total
+        rows = self.report_context.run_details.rows
+        for row in rows:
+            entities = row.entities
+            row_statics = {}  # count row
+            for ent in entities:
+                row_statics[ent.type] = row_statics.get(ent.type, 0) + 1
+                statics[ent.type] = statics.get(ent.type, 0) + 1
+
+            row.statics = row_statics.copy()
+
+        self.report_context.run_summary.statics = statics
diff --git a/ads/opctl/operator/lowcode/pii/model/pii.py b/ads/opctl/operator/lowcode/pii/model/pii.py
new file mode 100644
index 000000000..ba036d05e
--- /dev/null
+++ b/ads/opctl/operator/lowcode/pii/model/pii.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*--
+
+# Copyright (c) 2023 Oracle and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
+
+from ads.common.decorator.runtime_dependency import (
+    OptionalDependency,
+    runtime_dependency,
+)
+from ads.opctl import logger
+from ads.opctl.operator.common.utils import _load_yaml_from_uri
+from ads.opctl.operator.lowcode.pii.model.factory import PiiDetectorFactory
+from ads.opctl.operator.lowcode.pii.constant import (
+    SupportedAction,
+    SupportedDetector,
+)
+from ads.opctl.operator.lowcode.pii.model.processor import (
+    POSTPROCESSOR_MAP,
+    SUPPORTED_REPLACER,
+    Remover,
+)
+
+
+class PiiScrubber:
+    """Class used for config scrubber and count the detectors in use."""
+
+    @runtime_dependency(module="scrubadub", install_from=OptionalDependency.PII)
+    def __init__(self, config):
+        """Accepts a config URI (str), a dict, or an object exposing `.spec`."""
+        logger.info(f"Loading config from {config}")
+        if isinstance(config, str):
+            config = _load_yaml_from_uri(config)
+
+        self.config = config
+        self.spec = (
+            self.config["spec"] if isinstance(self.config, dict) else self.config.spec
+        )
+        self.detector_spec = (
+            self.spec["detectors"]
+            if isinstance(self.spec, dict)
+            else self.spec.detectors
+        )
+
+        self.scrubber = scrubadub.Scrubber()
+
+        self.detectors = []
+        self.entities = []
+        self.spacy_model_detectors = []
+        self.post_processors = {}
+
+        self._reset_scrubber()
+
+    def _reset_scrubber(self):
+        """Remove the detectors that a fresh `Scrubber` enables by default."""
+        # Iterate over a copy: detectors are removed while looping.
+        defaults_enabled = self.scrubber._detectors.copy()
+        for d in defaults_enabled:
+            self.scrubber.remove_detector(d)
+
+    def _register(self, name, dtype, model, action, mask_with: str = None):
+        """Add one detector to the scrubber and wire the post-processor for `action`.
+
+        Raises
+        ------
+        ValueError
+            If `action` is unsupported, or `anonymize` is requested for an
+            entity that has no replacer.
+        """
+        if action not in SupportedAction.values():
+            raise ValueError(
+                f"Not supported `action`: {action}. Please select from {SupportedAction.values()}."
+            )
+
+        detector = PiiDetectorFactory.get_detector(
+            detector_type=dtype, entity=name, model=model
+        )
+        self.scrubber.add_detector(detector)
+        self.entities.append(name)
+
+        if action == SupportedAction.ANONYMIZE:
+            entity = (
+                detector
+                if isinstance(detector, str)
+                else detector.filth_cls_map[name.upper()].type
+            )
+            if entity in SUPPORTED_REPLACER.keys():
+                replacer_name = SUPPORTED_REPLACER.get(entity).name
+                # Instantiate only when missing; a `dict.get` default would be
+                # constructed on every call even when it is then discarded.
+                replacer = self.post_processors.get(replacer_name)
+                if replacer is None:
+                    replacer = POSTPROCESSOR_MAP.get(replacer_name)()
+                if hasattr(replacer, "_ENTITIES"):
+                    # `_ENTITIES` is declared on the class; rebinding on the
+                    # instance keeps the class-level list from being mutated
+                    # and shared with every other scrubber.
+                    replacer._ENTITIES = [*replacer._ENTITIES, name]
+                self.post_processors[replacer_name] = replacer
+            else:
+                raise ValueError(
+                    f"Not supported `action` {action} for this entity `{name}`. Please try with other action."
+                )
+
+        if action == SupportedAction.REMOVE:
+            remover = self.post_processors.get("remover")
+            if remover is None:
+                remover = Remover()
+            # Same reasoning as above: never append to the class-level list.
+            remover._ENTITIES = [*remover._ENTITIES, name]
+            self.post_processors["remover"] = remover
+
+    def config_scrubber(self):
+        """Returns an instance of scrubadub.Scrubber."""
+
+        self.scrubber.redact_spec_file = self.spec
+
+        for detector in self.detector_spec:
+            # example format for detector["name"]: default.phone or spacy.en_core_web_trf.person
+            d = detector["name"].split(".")
+            if len(d) < 2:
+                # Fail with a readable message instead of an IndexError below.
+                raise ValueError(
+                    f"Invalid detector name `{detector['name']}`. "
+                    "Expected `<type>.<entity>` or `<type>.<model>.<entity>`."
+                )
+            dtype = d[0]
+            dname = d[1] if len(d) == 2 else d[2]
+            model = None if len(d) == 2 else d[1]
+
+            action = detector.get("action", SupportedAction.MASK)
+            self._register(
+                name=dname,
+                dtype=dtype,
+                model=model,
+                action=action,
+            )
+            if dtype == SupportedDetector.SPACY:
+                # Group spaCy entities by model for the report summary.
+                exist = False
+                for spacy_detectors in self.spacy_model_detectors:
+                    if spacy_detectors["model"] == model:
+                        spacy_detectors["spacy_entites"].append(dname)
+                        exist = True
+                        break
+                if not exist:
+                    self.spacy_model_detectors.append(
+                        {"model": model, "spacy_entites": [dname]}
+                    )
+
+        self._register_post_processor()
+
+        self.detectors = list(self.scrubber._detectors.values())
+        return self.scrubber
+
+    def _register_post_processor(self):
+        """Attach every collected post-processor to the scrubber."""
+        for _, v in self.post_processors.items():
+            self.scrubber.add_post_processor(v)
+
+
+def scrub(text, config=None, scrubber=None):
+    """Return `text` cleaned by `scrubber`; build one from `config` when none is given."""
+    if not scrubber:
+        scrubber = PiiScrubber(config=config).config_scrubber()
+    return scrubber.clean(text)
+
+
+def detect(text, config=None, scrubber=None):
+    """Return the list of filth found in `text`; build a scrubber from `config` when none is given."""
+    if not scrubber:
+        scrubber = PiiScrubber(config=config).config_scrubber()
+    return list(scrubber.iter_filth(text, document_name=None))
diff --git a/ads/opctl/operator/lowcode/pii/model/processor/__init__.py b/ads/opctl/operator/lowcode/pii/model/processor/__init__.py
new file mode 100644
index 000000000..062a61aa7
--- /dev/null
+++ b/ads/opctl/operator/lowcode/pii/model/processor/__init__.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*--
+
+# Copyright (c) 2023 Oracle and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
+
+from .email_replacer import EmailReplacer
+from .mbi_replacer import MBIReplacer
+from .name_replacer import NameReplacer
+from .number_replacer import NumberReplacer
+from .remover import Remover
+
+POSTPROCESSOR_MAP = {
+    item.name.lower(): item
+    for item in [
+        NameReplacer,
+        NumberReplacer,
+        EmailReplacer,
+        MBIReplacer,
+        Remover,
+    ]
+}
+
+# Currently only support anonymization for the following entity.
+SUPPORTED_REPLACER = {
+    "name": NameReplacer,
+    "number": NumberReplacer,
+    "phone": NumberReplacer,
+    "social_security_number": NumberReplacer,
+    "fin": NumberReplacer,
+    "mrn": NumberReplacer,
+    "email": EmailReplacer,
+    "mbi": MBIReplacer,
+}
diff --git a/ads/opctl/operator/lowcode/pii/model/processor/email_replacer.py b/ads/opctl/operator/lowcode/pii/model/processor/email_replacer.py
new file mode 100644
index 000000000..69a9d92ef
--- /dev/null
+++ b/ads/opctl/operator/lowcode/pii/model/processor/email_replacer.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*--
+
+# Copyright (c) 2023 Oracle and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
+
+from ads.common.decorator.runtime_dependency import (
+    OptionalDependency,
+    runtime_dependency,
+)
+
+try:
+    import scrubadub
+except ImportError:
+    raise ModuleNotFoundError(
+        f"`scrubadub` module was not found. Please run "
+        f"`pip install {OptionalDependency.PII}`."
+    )
+
+
+class EmailReplacer(scrubadub.post_processors.PostProcessor):
+    """Post-processor that replaces `email` filth with a generated address."""
+
+    name = "email_replacer"
+
+    @runtime_dependency(module="faker", install_from=OptionalDependency.PII)
+    def process_filth(self, filth_list):
+        """Set a fake e-mail as replacement on every `email` filth that has none yet.
+
+        Returns the same `filth_list`, modified in place.
+        """
+        from faker import Faker
+
+        # One generator for the whole batch instead of a new one per filth.
+        fake = Faker()
+        for filth in filth_list:
+            if filth.replacement_string:
+                continue
+            if filth.type.lower() != "email":
+                continue
+            filth.replacement_string = fake.email()
+        return filth_list
diff --git a/ads/opctl/operator/lowcode/pii/model/processor/mbi_replacer.py b/ads/opctl/operator/lowcode/pii/model/processor/mbi_replacer.py
new file mode 100644
index 000000000..013526cad
--- /dev/null
+++ b/ads/opctl/operator/lowcode/pii/model/processor/mbi_replacer.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*--
+
+# Copyright (c) 2023 Oracle and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
+
+import random
+import string
+
+from ads.common.decorator.runtime_dependency import OptionalDependency
+
+try:
+    import scrubadub
+except ImportError:
+    raise ModuleNotFoundError(
+        f"`scrubadub` module was not found. Please run "
+        f"`pip install {OptionalDependency.PII}`."
+    )
+
+
+class MBIReplacer(scrubadub.post_processors.PostProcessor):
+    """Post-processor that swaps `mbi` filth for a random 11-character token."""
+
+    name = "mbi_replacer"
+    CHAR_POOL = "ACDEFGHJKMNPQRTUVWXY"
+
+    def generate_mbi(self):
+        """Return 11 characters drawn at random from `CHAR_POOL` plus the digits."""
+        alphabet = self.CHAR_POOL + string.digits
+        return "".join(random.choices(alphabet, k=11))
+
+    def process_filth(self, filth_list):
+        """Assign a generated token to each `mbi` filth lacking a replacement."""
+        for item in filth_list:
+            # Filth that already carries a replacement is left untouched.
+            if not item.replacement_string and item.type.lower() == "mbi":
+                item.replacement_string = self.generate_mbi()
+        return filth_list
diff --git a/ads/opctl/operator/lowcode/pii/model/processor/name_replacer.py b/ads/opctl/operator/lowcode/pii/model/processor/name_replacer.py
new file mode 100644
index 000000000..2c7dde747
--- /dev/null
+++ b/ads/opctl/operator/lowcode/pii/model/processor/name_replacer.py
@@ -0,0 +1,225 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*--
+
+# Copyright (c) 2023 Oracle and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
+
+
+from ads.common.decorator.runtime_dependency import (
+    OptionalDependency,
+    runtime_dependency,
+)
+
+try:
+    import scrubadub
+except ImportError:
+    raise ModuleNotFoundError(
+        f"`scrubadub` module was not found. Please run "
+        f"`pip install {OptionalDependency.PII}`."
+    )
+
+
+class NameReplacer(scrubadub.post_processors.PostProcessor):
+    """Post-processor that replaces `name` filth with generated fake names.
+
+    A mapping from original name parts to fake ones is kept on the instance so
+    that a name seen again is replaced with the same fake value.
+    """
+
+    name = "name_replacer"
+
+    @runtime_dependency(module="faker", install_from=OptionalDependency.PII)
+    @runtime_dependency(module="gender_guesser", install_from=OptionalDependency.PII)
+    def __init__(self, name: str = None, mapping: dict = None):
+        """Set up the fake-name generators.
+
+        Parameters
+        ----------
+        name : str, optional
+            Passed to the parent post-processor constructor.
+        mapping : dict, optional
+            Existing original-to-fake mapping to start from; a new empty dict
+            is used when omitted or empty.
+        """
+        import gender_guesser.detector as gender_detector
+        from faker import Faker
+
+        if mapping:
+            self.mapping = mapping
+        else:
+            self.mapping = {}
+
+        self.gender_detector = gender_detector.Detector()
+        self.fake = Faker()
+        # Generator used for each component of a parsed name; suffixes are dropped.
+        self.groups = {
+            "first": self.first_name_generator,
+            "middle": self.first_name_generator,
+            "last": self.last_name_generator,
+            "suffix": lambda x: "",
+        }
+        super().__init__(name)
+
+    def first_name_generator(self, name):
+        """Return a fake first name chosen by the gender detected for `name`."""
+        detected_gender = self.gender_detector.get_gender(name)
+        # "female" must be tested first: the string "male" is a substring of it.
+        if "female" in detected_gender:
+            return self.fake.first_name_female()
+        elif "male" in detected_gender:
+            return self.fake.first_name_male()
+        return self.fake.first_name_nonbinary()
+
+    def last_name_generator(self, *args):
+        """Return a fake last name; arguments are ignored."""
+        return self.fake.last_name()
+
+    def unwrap_filth(self, filth_list):
+        """Un-merge the filths if they have different types."""
+        processed = []
+        for filth in filth_list:
+            # MergedFilths has the property "filths"
+            # Do nothing if filth has a type already
+            if filth.type in ["unknown", "", None] and hasattr(filth, "filths"):
+                filth_types = set([f.type.lower() for f in filth.filths])
+                # Do nothing if the filth does not contain a name
+                if "name" not in filth_types:
+                    processed.append(filth)
+                    continue
+                if len(filth_types) > 1:
+                    processed.extend(filth.filths)
+                    continue
+                # Single-type merge: adopt the type and detector of the first member.
+                filth.type = filth.filths[0].type
+                filth.detector_name = filth.filths[0].detector_name
+            processed.append(filth)
+        return processed
+
+    @staticmethod
+    def has_initial(name: "nameparser.HumanName") -> bool:
+        """Return True if first, middle or last is a single character (dots stripped)."""
+        for attr in ["first", "middle", "last"]:
+            if len(str(getattr(name, attr)).strip(".")) == 1:
+                return True
+        return False
+
+    @staticmethod
+    def has_non_initial(name: "nameparser.HumanName") -> bool:
+        """Return True if first, middle or last is longer than one character (dots stripped)."""
+        for attr in ["first", "middle", "last"]:
+            if len(str(getattr(name, attr)).strip(".")) > 1:
+                return True
+        return False
+
+    @staticmethod
+    def generate_component(name_component: str, generator):
+        """Generate a fake value for one name component, keeping the initial form."""
+        fake_component = generator(name_component)
+        # An initial is replaced by an initial, with its trailing dot if it had one.
+        if len(name_component.rstrip(".")) == 1:
+            fake_component = fake_component[0]
+            if name_component.endswith("."):
+                fake_component += "."
+        return fake_component
+
+    def save_name_mapping(
+        self, name: "nameparser.HumanName", fake_name: "nameparser.HumanName"
+    ):
+        """Saves the names with initials to the mapping so that a new name will not be generated.
+        For example, if name is "John Richard Doe", this method will save the following keys to the mapping:
+        - J Doe
+        - John D
+        - J R Doe
+        - John R D
+        - John R Doe
+        """
+        # Both first name and last name must be presented
+        if not name.first or not name.last:
+            return
+        # Remove any dot at the end of the name component.
+        for attr in ["first", "middle", "last"]:
+            setattr(name, attr, getattr(name, attr).rstrip("."))
+
+        self.mapping[
+            f"{name.first[0]} {name.last}"
+        ] = f"{fake_name.first[0]} {fake_name.last}"
+
+        self.mapping[
+            f"{name.first} {name.last[0]}"
+        ] = f"{fake_name.first} {fake_name.last[0]}"
+
+        if name.middle:
+            self.mapping[
+                f"{name.first[0]} {name.middle[0]} {name.last}"
+            ] = f"{fake_name.first[0]} {fake_name.middle[0]} {fake_name.last}"
+
+            self.mapping[
+                f"{name.first} {name.middle[0]} {name.last[0]}"
+            ] = f"{fake_name.first} {fake_name.middle[0]} {fake_name.last[0]}"
+
+            self.mapping[
+                f"{name.first} {name.middle[0]} {name.last}"
+            ] = f"{fake_name.first} {fake_name.middle[0]} {fake_name.last}"
+
+    @runtime_dependency(module="nameparser", install_from=OptionalDependency.PII)
+    def replace(self, text):
+        """Replaces a name with fake name.
+
+        Parameters
+        ----------
+        text : str or HumanName
+            The name to be replaced.
+            If text is a HumanName object, the object will be modified to have the new fake names.
+
+        Returns
+        -------
+        str
+            The replaced name as text.
+        """
+        from nameparser import HumanName
+
+        if isinstance(text, HumanName):
+            name = text
+        else:
+            name = HumanName(text)
+        skip = []
+        # Check if the name is given with initial for one of the first name/last name
+        key = None
+        if self.has_initial(name) and self.has_non_initial(name):
+            if name.middle:
+                key = f'{name.first.rstrip(".")} {name.middle.rstrip(".")} {name.last.rstrip(".")}'
+            else:
+                key = f'{name.first.rstrip(".")} {name.last.rstrip(".")}'
+            fake_name = self.mapping.get(key)
+            # If a fake name is found matching the first initial + last name or first name + last initial
+            # Replace the the initial with the corresponding initial
+            # and skip processing the first and last name in the replacement.
+            if fake_name:
+                fake_name = HumanName(fake_name)
+                name.first = fake_name.first
+                name.last = fake_name.last
+                skip = ["first", "last"]
+                if name.middle:
+                    name.middle = fake_name.middle
+                    skip.append("middle")
+        # Replace each component in the name
+        for attr, generator in self.groups.items():
+            if attr in skip:
+                continue
+            name_component = getattr(name, attr, None)
+            if not name_component:
+                continue
+            # Check if a fake name has been generated for this name
+            fake_component = self.mapping.get(name_component)
+            if not fake_component:
+                fake_component = self.generate_component(name_component, generator)
+                # Generate a unique fake name that is not already in the mapping
+                while fake_component and (
+                    fake_component in self.mapping.keys()
+                    or fake_component in self.mapping.values()
+                ):
+                    fake_component = self.generate_component(name_component, generator)
+                self.mapping[name_component] = fake_component
+            setattr(name, attr, fake_component)
+
+        # Save name with initials to mapping
+        original_name = text if isinstance(text, HumanName) else HumanName(text)
+        self.save_name_mapping(original_name, name)
+        return str(name)
+
+    @runtime_dependency(module="nameparser", install_from=OptionalDependency.PII)
+    def process_filth(self, filth_list):
+        """Set a fake name as replacement on every `name` filth that has none yet.
+
+        Merged filths are un-merged first (see `unwrap_filth`), so the returned
+        list may differ from the one passed in.
+        """
+        from nameparser import HumanName
+
+        filth_list = self.unwrap_filth(filth_list)
+
+        name_filths = []
+        # Filter to keep only the names
+        for filth in filth_list:
+            if filth.replacement_string:
+                continue
+            if filth.type.lower() != "name":
+                continue
+            name_filths.append(filth)
+
+        # Sort reverse by last name so that names having a last name will be processed first.
+        # When a name is referred by last name (e.g. Mr. White), HumanName will parse it as first name.
+        name_filths.sort(key=lambda x: HumanName(x.text).last, reverse=True)
+        for filth in name_filths:
+            filth.replacement_string = self.replace(filth.text)
+        return filth_list
diff --git a/ads/opctl/operator/lowcode/pii/model/processor/number_replacer.py b/ads/opctl/operator/lowcode/pii/model/processor/number_replacer.py
new file mode 100644
index 000000000..5bf678991
--- /dev/null
+++ b/ads/opctl/operator/lowcode/pii/model/processor/number_replacer.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*--
+
+# Copyright (c) 2023 Oracle and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
+
+import datetime
+import random
+import re
+
+from ads.common.decorator.runtime_dependency import OptionalDependency
+
+try:
+    import scrubadub
+except ImportError:
+    raise ModuleNotFoundError(
+        f"`scrubadub` module was not found. Please run "
+        f"`pip install {OptionalDependency.PII}`."
class NumberReplacer(scrubadub.post_processors.PostProcessor):
    """Post-processor that scrambles the digits of number-like filth.

    Filth whose type is listed in ``_ENTITIES`` (or merged ``unknown`` filth
    containing at least one such type) gets a replacement string in which
    every digit is replaced by a random digit. Non-digit characters are kept,
    so separators and overall layout survive. Text that parses as a date
    between 1900 and the current year is kept as is.
    """

    name = "number_replacer"
    _ENTITIES = [
        "number",
        "mrn",
        "fin",
        "phone",
        "social_security_number",
    ]

    @staticmethod
    def replace_digit(obj):
        """Return one random decimal digit.

        ``obj`` is ignored; the parameter exists so the method can be used
        directly as the replacement callback of ``re.sub``.
        """
        return random.choice("0123456789")

    def match_entity_type(self, filth_types):
        """Return True if any of ``filth_types`` is a supported entity type.

        The comparison is case-insensitive, matching how ``process_filth``
        treats the type of a single (non-merged) filth.
        """
        normalized = {str(filth_type).lower() for filth_type in filth_types}
        return not normalized.isdisjoint(self._ENTITIES)

    def replace_date(self, text):
        """Return ``text`` unchanged if it looks like a plausible date.

        The text is tried against month-day-year and day-month-year layouts
        (two- and four-digit years, dash separated). It counts as a date only
        if the parsed year lies between 1900 and the current year.

        Returns
        -------
        str or None
            The original text when it is a plausible date, otherwise None.
        """
        current_year = datetime.datetime.now().year
        for date_format in ("%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y"):
            try:
                date = datetime.datetime.strptime(text, date_format)
            except ValueError:
                continue
            if 1900 <= date.year <= current_year:
                # A valid date between 1900 and now is kept as is.
                return text
        return None

    def replace(self, text):
        """Return ``text`` with every digit randomized, unless it is a date."""
        date = self.replace_date(text)
        if date:
            return date
        return re.sub(r"\d", self.replace_digit, text)

    def process_filth(self, filth_list):
        """Fill in replacement strings for supported filth and return the list.

        Filth that already has a replacement string is skipped.
        """
        for filth in filth_list:
            # Do not process it if it already has a replacement.
            if filth.replacement_string:
                continue
            filth_type = filth.type.lower()
            if filth_type in self._ENTITIES:
                filth.replacement_string = self.replace(filth.text)
            elif filth_type == "unknown" and hasattr(filth, "filths"):
                # Merged filth: replace the numbers if any of the merged
                # parts is a supported entity.
                if self.match_entity_type(f.type for f in filth.filths):
                    filth.replacement_string = self.replace(filth.text)
        return filth_list
class Remover(scrubadub.post_processors.PostProcessor):
    """Post-processor that blanks out filth of selected types.

    Every filth whose lower-cased type appears in ``_ENTITIES`` gets an empty
    replacement string. ``_ENTITIES`` is empty here, so the class removes
    nothing until the list is populated.
    """

    name = "remover"
    _ENTITIES = []

    def process_filth(self, filth_list):
        """Set an empty replacement on matching filth and return the list."""
        removable_types = self._ENTITIES
        for item in filth_list:
            if item.type.lower() not in removable_types:
                continue
            item.replacement_string = ""

        return filth_list
@dataclass(repr=True)
class PiiReportPageSpec(DataClassSerializable):
    """Class representing each page under Run Details in pii operator report.

    One instance describes a single processed row; the report renders it as a
    "Row Id" page with a statistics tab and an annotated-text tab.
    """

    # Detected entities for this row. The report reads `text`, `type`,
    # `replacement_string`, `placeholder`, `beg` and `end` from each item.
    entities: list = field(default_factory=list)
    # Row identifier, shown as "Row Id" and in the "Row ID" table column.
    id: int = None
    # Row text that the detected entities are highlighted in.
    raw_text: str = None
    # Mapping passed to the pie chart: keys are entity names, values are counts.
    statics: dict = field(default_factory=dict)
    # Value shown in the row's "Total No. Of Entites Proceed" big number.
    total_tokens: int = None


@dataclass(repr=True)
class RunDetails(DataClassSerializable):
    """Class representing Run Details Page in pii operator report."""

    # Per-row specs; the report keeps only the first `show_rows` of them.
    rows: list = field(default_factory=list)


@dataclass(repr=True)
class RunSummary(DataClassSerializable):
    """Class representing Run Summary Page in pii operator report."""

    # Operator config; its `to_yaml()` output is shown on the YAML tab.
    config: PiiOperatorConfig = None
    # Passed to `compute_rate` and `human_time_friendly` for the summary cards.
    elapsed_time: str = None
    selected_detectors: list = field(default_factory=list)
    # Entity names listed in the summary description.
    selected_entities: List[str] = field(default_factory=list)
    # One dict per spaCy model; the report reads the "model" and
    # "spacy_entites" keys.
    selected_spacy_model: List[Dict] = field(default_factory=list)
    # Maximum number of rows rendered on the Details page; a falsy value
    # falls back to DEFAULT_SHOW_ROWS.
    show_rows: int = None
    # When True, tables of resolved entities are added to the report.
    show_sensitive_info: bool = False
    src_uri: str = None
    # Mapping passed to the summary pie chart (entity name -> count).
    statics: dict = None
    # Shown in the "Ran at" big number; "today" is shown when falsy.
    timestamp: str = None
    total_rows: int = None
    total_tokens: int = None


@dataclass(repr=True)
class PiiReportSpec(DataClassSerializable):
    """Class representing pii operator report.

    Aggregates the per-row details and the run-level summary.
    """

    run_details: RunDetails = field(default_factory=RunDetails)
    run_summary: RunSummary = field(default_factory=RunSummary)


# Cache of label -> color, so a label keeps its color across charts.
LABEL_TO_COLOR_MAP = {}
+ """ + + readme_path = ( + f"https://huggingface.co/spacy/{model_name}/raw/main/README.md" + if model_name + else readme_path + ) + if not readme_path: + raise NotImplementedError("Does not support other spacy model so far.") + + try: + requests.get(readme_path, timeout=DEFAULT_TIME_OUT) + with fsspec.open(readme_path, "r") as file: + content = file.read() + _, front_matter, text = content.split("---", 2) + data = yaml.safe_load(front_matter) + except requests.ConnectionError: + logger.warning( + "You don't have internet connection. Therefore, we are not able to generate model card." + ) + return dp.Group( + dp.Text("-"), + columns=1, + ) + + try: + import plotly.graph_objects as go + + eval_res = data["model-index"][0]["results"] + metrics = [] + values = [] + for eval in eval_res: + metric = [x["name"] for x in eval["metrics"]] + value = [x["value"] for x in eval["metrics"]] + metrics = metrics + metric + values = values + value + df = pd.DataFrame({"Metrics": metrics, "Values": values}) + fig = go.Figure( + data=[ + go.Table( + header=dict(values=list(df.columns)), + cells=dict(values=[df.Metrics, df.Values]), + ) + ] + ) + eval_res_tb = dp.Plot(data=fig, caption="Evaluation Results") + except: + eval_res_tb = dp.Text("-") + logger.warning( + "The given readme.md doesn't have correct template for Evaluation Results." 
def build_entity_df(entites, id) -> pd.DataFrame:
    """Build the "resolved entities" table for one row of input data.

    Parameters
    ----------
    entites
        Iterable of detected entities. Each item must expose ``text``,
        ``type`` and ``replacement_string``; ``placeholder`` is read only
        when ``replacement_string`` is empty.
    id
        Row identifier repeated in the ``Row ID`` column.

    Returns
    -------
    pd.DataFrame
        One line per entity with the columns ``Row ID``,
        ``Entity (Original Text)``, ``Type`` and ``Redacted To``. When there
        are no entities, a single placeholder line filled with ``"-"`` is
        returned instead of an empty frame.
    """
    columns = ["Row ID", "Entity (Original Text)", "Type", "Redacted To"]
    records = [
        {
            "Row ID": id,
            "Entity (Original Text)": ent.text,
            "Type": ent.type,
            # Fall back to the "{{PLACEHOLDER}}" form when nothing was
            # substituted for the entity.
            "Redacted To": ent.replacement_string or "{{" + ent.placeholder + "}}",
        }
        for ent in entites
    ]
    if not records:
        # Datapane does not support empty dataframe, add a dummy row.
        # Built directly: DataFrame.append was removed in pandas 2.0.
        records.append(
            {
                "Row ID": id,
                "Entity (Original Text)": "-",
                "Type": "-",
                "Redacted To": "-",
            }
        )
    return pd.DataFrame(records, columns=columns)
class PIIOperatorReport:
    """Builds and saves the HTML report of a PII operator run.

    Typical use is ``PIIOperatorReport(spec, uri).make_view().save_report()``.
    """

    def __init__(self, report_spec: PiiReportSpec, report_uri: str):
        """Prepare the per-row report builders.

        Parameters
        ----------
        report_spec
            Run summary and per-row details to render.
        report_uri
            Default destination used by ``save_report``.
        """
        # set useful field for generating report from context
        self.report_spec = report_spec
        self.show_rows = report_spec.run_summary.show_rows or DEFAULT_SHOW_ROWS

        # Only the first `show_rows` rows get a details page.
        rows = report_spec.run_details.rows[: self.show_rows]
        self.rows_details = [
            RowReportFields(r, report_spec.run_summary.show_sensitive_info)
            for r in rows
        ]

        self.report_uri = report_uri
        # Filled by make_view(); defined here so save_report() can detect
        # that the view has not been built yet.
        self.report_sections = None

    def make_view(self):
        """Assemble the report sections and return ``self`` for chaining."""
        title_text = dp.Text("# Personally Identifiable Information Operator Report")
        time_proceed = dp.BigNumber(
            heading="Ran at",
            value=self.report_spec.run_summary.timestamp or "today",
        )
        report_description = dp.Text(PII_REPORT_DESCRIPTION)

        structure = dp.Blocks(
            dp.Select(
                blocks=[
                    dp.Group(
                        self._build_summary_page(),
                        label="Summary",
                    ),
                    dp.Group(
                        self._build_details_page(),
                        label="Details",
                    ),
                ],
                type=dp.SelectType.TABS,
            )
        )
        self.report_sections = [title_text, report_description, time_proceed, structure]
        return self

    def save_report(self, report_sections=None, report_uri=None, storage_options=None):
        """Render the report to HTML and write it to ``report_uri``.

        Parameters
        ----------
        report_sections
            Sections to render. Defaults to the sections built by
            ``make_view``, which is called on demand if it has not run yet.
        report_uri
            Destination opened with ``fsspec``. Defaults to the URI given
            to the constructor.
        storage_options
            Extra keyword arguments for ``fsspec.open``. Defaults to none.
        """
        # A fresh dict per call instead of a shared mutable default.
        storage_options = storage_options or {}
        sections = report_sections or self.report_sections
        if not sections:
            sections = self.make_view().report_sections

        with tempfile.TemporaryDirectory() as temp_dir:
            report_local_path = os.path.join(temp_dir, "___report.html")
            block_print()
            try:
                dp.save_report(
                    sections,
                    path=report_local_path,
                    open=False,
                )
            finally:
                # Always give stdout back, even if rendering fails.
                enable_print()

            report_uri = report_uri or self.report_uri
            with open(report_local_path) as f1:
                with fsspec.open(
                    report_uri,
                    "w",
                    **storage_options,
                ) as f2:
                    f2.write(f1.read())

    def _build_summary_page(self):
        """Return the Summary page: description plus STATS/YAML/MODEL CARD tabs."""
        summary = dp.Blocks(
            dp.Text("# PII Summary"),
            dp.Text(self._get_summary_desc()),
            dp.Select(
                blocks=[
                    self._make_summary_stats_card(),
                    self._make_yaml_card(),
                    self._make_model_card(),
                ],
                type=dp.SelectType.TABS,
            ),
        )

        return summary

    def _build_details_page(self):
        """Return the Details page with one drop-down entry per reported row."""
        details = dp.Blocks(
            dp.Text(DETAILS_REPORT_DESCRIPTION),
            dp.Select(
                blocks=[
                    row.build_report() for row in self.rows_details
                ],  # RowReportFields
                type=dp.SelectType.DROPDOWN,
                label="Details",
            ),
        )

        return details

    def _make_summary_stats_card(self) -> dp.Group:
        """
        Shows summary statics
        1. total rows
        2. total entites
        3. time_spent/row
        4. entities distribution
        5. resolved Entities in sample data - optional
        """
        run_summary = self.report_spec.run_summary
        try:
            process_rate = compute_rate(
                run_summary.elapsed_time,
                run_summary.total_rows,
            )
        except Exception as e:
            logger.warning("Failed to compute processing rate.")
            logger.debug(f"Full traceback: {e}")
            process_rate = "-"

        # elapsed_time defaults to None in RunSummary; show a placeholder
        # instead of failing while formatting it.
        if run_summary.elapsed_time is None:
            time_spent = "-"
        else:
            time_spent = human_time_friendly(run_summary.elapsed_time)

        summary_stats = [
            dp.Text("## Summary Statistics"),
            dp.Group(
                dp.BigNumber(
                    heading="Total No. Of Rows",
                    value=run_summary.total_rows or "unknown",
                ),
                dp.BigNumber(
                    heading="Total No. Of Entites Proceed",
                    value=run_summary.total_tokens,
                ),
                dp.BigNumber(
                    # compute_rate divides elapsed time by the row count, so
                    # the figure is time per row, not rows per second.
                    heading="Seconds per row processed",
                    value=process_rate,
                ),
                dp.BigNumber(
                    heading="Total Time Spent",
                    value=time_spent,
                ),
                columns=2,
            ),
            dp.Text("### Entities Distribution"),
            plot_pie(run_summary.statics),
        ]
        if run_summary.show_sensitive_info:
            entites_df = self._build_total_entity_df()
            summary_stats.append(dp.Text("### Resolved Entities"))
            summary_stats.append(dp.DataTable(entites_df))
        return dp.Group(blocks=summary_stats, label="STATS")

    def _make_yaml_card(self) -> dp.Group:
        """Shows the full pii config yaml."""
        yaml_string = self.report_spec.run_summary.config.to_yaml()
        yaml_appendix_title = dp.Text("## Reference: YAML File")
        yaml_appendix = dp.Code(code=yaml_string, language="yaml")
        return dp.Group(blocks=[yaml_appendix_title, yaml_appendix], label="YAML")

    def _make_model_card(self) -> dp.Group:
        """Generates model card."""
        spacy_models = self.report_spec.run_summary.selected_spacy_model
        if not spacy_models:
            return dp.Group(
                dp.Text("No model used."),
                label="MODEL CARD",
            )

        model_cards = [
            dp.Group(
                make_model_card(model_name=x.get("model")),
                label=x.get("model"),
            )
            for x in spacy_models
        ]

        # A single card is shown directly; several cards get their own tabs.
        if len(model_cards) <= 1:
            return dp.Group(
                blocks=model_cards,
                label="MODEL CARD",
            )
        return dp.Group(
            dp.Select(
                blocks=model_cards,
                type=dp.SelectType.TABS,
            ),
            label="MODEL CARD",
        )

    def _build_total_entity_df(self) -> pd.DataFrame:
        """Return the resolved entities of all reported rows as one table."""
        frames = [
            build_entity_df(entites=row.spec.entities, id=row.spec.id)
            for row in self.rows_details  # RowReportFields
        ]
        if not frames:
            # pd.concat raises on an empty list; fall back to the one-line
            # placeholder table that build_entity_df returns for no entities.
            return build_entity_df(entites=[], id="-")
        return pd.concat(frames)

    def _get_summary_desc(self) -> str:
        """Return the Markdown description shown at the top of the Summary page."""
        run_summary = self.report_spec.run_summary
        # Join the names; interpolating the list itself would print a Python
        # list repr (brackets and quotes) into the report.
        entities_mark_down = ", ".join(
            "**" + ent + "**" for ent in run_summary.selected_entities
        )

        model_sentences = [
            f"You chose the **{spacy_model.get('model', 'unknown model')}** model "
            f"for **{spacy_model.get('spacy_entites', 'unknown entities')}** detection."
            for spacy_model in run_summary.selected_spacy_model
        ]
        if model_sentences:
            model_sentences.append(
                "You can view the model details under the ``MODEL CARD`` tab."
            )

        intro = (
            "This report will detail the statistics and configuration of the "
            "redaction process. The report will contain information such as the "
            "number of rows processed, the number of entities redacted, and so on. "
            "The report will provide valuable insight into the performance of the "
            "PII tool and facilitate any necessary adjustments to improve its "
            "performance."
        )
        selection = (
            "Based on the configuration file (you can view the YAML details under "
            f"the ``YAML`` tab), you selected the following entities: {entities_mark_down}."
        )
        # Paragraphs are assembled without source-code indentation so that
        # Markdown cannot mistake indented lines for a code block.
        return intro + "\n\n" + " ".join([selection] + model_sentences)
@dataclass(repr=True) class OutputDirectory(DataClassSerializable): """Class representing operator specification output directory details.""" - connect_args: Dict = None - format: str = None url: str = None name: str = None - options: Dict = None @dataclass(repr=True) -class PIIOperatorSpec(DataClassSerializable): - """Class representing PII operator specification.""" +class Report(DataClassSerializable): + """Class representing operator specification report details.""" + + report_filename: str = None + show_rows: int = None + show_sensitive_content: bool = False + + +@dataclass(repr=True) +class Detector(DataClassSerializable): + """Class representing operator specification redactor directory details.""" name: str = None + action: str = None + + +@dataclass(repr=True) +class PiiOperatorSpec(DataClassSerializable): + """Class representing pii operator specification.""" + input_data: InputData = field(default_factory=InputData) output_directory: OutputDirectory = field(default_factory=OutputDirectory) - report_file_name: str = None - report_title: str = None - report_theme: str = None + report: Report = field(default_factory=Report) + target_column: str = None + detectors: List[Dict] = field(default_factory=list) def __post_init__(self): """Adjusts the specification details.""" - self.report_file_name = self.report_file_name or "report.html" - self.report_theme = self.report_theme or "light" + + self.target_column = self.target_column or DEFAULT_TARGET_COLUMN + self.report = self.report or Report.from_dict( + { + "report_filename": DEFAULT_REPORT_FILENAME, + "show_rows": DEFAULT_SHOW_ROWS, + "show_sensitive_content": False, + } + ) @dataclass(repr=True) -class PIIOperatorConfig(OperatorConfig): - """Class representing PII operator config. +class PiiOperatorConfig(OperatorConfig): + """Class representing pii operator config. Attributes ---------- kind: str The kind of the resource. For operators it is always - `operator`. type: str - The type of the operator. 
For PII operator it is always - `PII` + The type of the operator. For pii operator it is always - `pii` version: str The version of the operator. - spec: PIIOperatorSpec - The PII operator specification. + spec: PiiOperatorSpec + The pii operator specification. """ kind: str = "operator" - type: str = "PII" + type: str = "pii" version: str = "v1" - spec: PIIOperatorSpec = field(default_factory=PIIOperatorSpec) + spec: PiiOperatorSpec = field(default_factory=PiiOperatorSpec) @classmethod def _load_schema(cls) -> str: diff --git a/ads/opctl/operator/lowcode/pii/schema.yaml b/ads/opctl/operator/lowcode/pii/schema.yaml index 1fe39b258..ff295c7fa 100644 --- a/ads/opctl/operator/lowcode/pii/schema.yaml +++ b/ads/opctl/operator/lowcode/pii/schema.yaml @@ -4,6 +4,8 @@ kind: required: true type: string default: operator + meta: + description: "Which service are you trying to use? Common kinds: `operator`, `job`" version: allowed: @@ -19,84 +21,28 @@ type: type: string default: pii meta: - description: "Type should always be `pii` when using a PII operator" + description: "Type should always be `pii` when using a pii operator" + spec: required: true schema: - name: - required: false - type: string - default: PII - report_file_name: - required: false - type: string - default: report.html - meta: - description: "Placed into output_directory location. Defaults to report.html" - report_title: - required: false - type: string - default: PII Report - report_theme: - required: false - type: string - default: light - allowed: - - light - - dark input_data: required: true type: dict meta: - description: "The input data for the PII." + description: "This should be indexed by target column." 
schema: - format: - allowed: - - csv - - json - - clipboard - - excel - - hdf - - feather - - load_files - required: false - type: string - columns: - required: false - type: list - schema: - type: string - options: - nullable: true - required: false - type: dict url: required: true type: string default: data.csv meta: description: "The url can be local, or remote. For example: `oci://@/data.csv`" - limit: - required: false - type: integer + output_directory: - required: false + required: true schema: - connect_args: - nullable: true - required: false - type: dict - format: - required: false - type: string - allowed: - - csv - - json - - clipboard - - excel - - hdf - - sql url: required: true type: string @@ -106,9 +52,57 @@ spec: name: required: false type: string - options: - nullable: true + default: data-out.csv + type: dict + + report: + required: false + schema: + report_filename: + required: true + type: string + default: report.html + meta: + description: "Placed into `output_directory` location. Defaults to `report.html`" + show_rows: required: false - type: dict + type: number + meta: + description: "The number of rows that shows in the report. Defaults to `10`" + show_sensitive_content: + required: true + default: false + type: boolean + meta: + description: "Whether to show sensitive content in the report. Defaults to `False`" type: dict + + target_column: + type: string + required: true + default: target + meta: + description: "Column with user data." + + detectors: + type: list + required: true + schema: + type: dict + schema: + name: + required: true + type: string + meta: + description: "The name of the detector. THe format is `.`." + action: + required: true + type: string + default: mask + allowed: + - anonymize + - mask + - remove + meta: + description: "The way to process the detected entity. Default to `mask`." 
def default_signer(**kwargs):
    """Return the ADS default signer, tagging requests as coming from this operator.

    The ``EXTRA_USER_AGENT_INFO`` environment variable is set before the ADS
    auth module is imported.
    """
    os.environ["EXTRA_USER_AGENT_INFO"] = "Pii-Operator"
    from ads.common.auth import default_signer

    return default_signer(**kwargs)


def _call_pandas_fsspec(pd_fn, filename, storage_options, **kwargs):
    """Call a pandas reader/writer, adding storage options for remote paths.

    Local files are handled without ``storage_options``. For other
    protocols, the given options are used; if none are given, the default
    signer is used for OCI paths and an empty dict otherwise.
    """
    if fsspec.utils.get_protocol(filename) == "file":
        return pd_fn(filename, **kwargs)

    storage_options = storage_options or (
        default_signer() if ObjectStorageDetails.is_oci_path(filename) else {}
    )

    return pd_fn(filename, storage_options=storage_options, **kwargs)


def _load_data(filename, format=None, storage_options=None, columns=None, **kwargs):
    """Load ``filename`` into a DataFrame.

    Parameters
    ----------
    filename
        Local path or remote URI.
    format
        ``csv``, ``json`` or ``tsv`` (case-insensitive). Derived from the
        file extension when omitted.
    storage_options
        Passed through to pandas for non-local paths.
    columns
        If given, only these columns are kept.
    **kwargs
        Accepted for call compatibility; not forwarded to pandas.

    Raises
    ------
    PIIInputDataError
        If the format is not recognized.
    """
    if not format:
        _, format = os.path.splitext(filename)
        format = format[1:]
    # Treat "data.CSV" or format="JSON" like their lower-case forms.
    format = format.lower()
    if format in ("json", "csv"):
        read_fn = getattr(pd, f"read_{format}")
        data = _call_pandas_fsspec(read_fn, filename, storage_options=storage_options)
    elif format == "tsv":
        data = _call_pandas_fsspec(
            pd.read_csv, filename, storage_options=storage_options, sep="\t"
        )
    else:
        raise PIIInputDataError(f"Unrecognized format: {format}")
    if columns:
        # keep only these columns, done after load because only CSV supports stream filtering
        data = data[columns]
    return data


def _write_data(
    data, filename, format=None, storage_options=None, index=False, **kwargs
):
    """Write ``data`` to ``filename`` as CSV or JSON.

    Parameters
    ----------
    data
        DataFrame to write.
    filename
        Local path or remote URI.
    format
        ``csv`` or ``json`` (case-insensitive). Derived from the file
        extension when omitted.
    storage_options
        Passed through to pandas for non-local paths.
    index
        Whether to write the index. For JSON, a false value falls back to the
        pandas default instead of being passed on.
    **kwargs
        Accepted for call compatibility; not forwarded to pandas.

    Raises
    ------
    PIIInputDataError
        If the format is not recognized.
    """
    if not format:
        _, format = os.path.splitext(filename)
        format = format[1:]
    format = format.lower()
    if format == "csv":
        return _call_pandas_fsspec(
            data.to_csv, filename, index=index, storage_options=storage_options
        )
    if format == "json":
        # DataFrame.to_json rejects index=False with its default orient, so
        # the flag is only passed on when the index is explicitly requested.
        json_kwargs = {"index": index} if index else {}
        return _call_pandas_fsspec(
            data.to_json, filename, storage_options=storage_options, **json_kwargs
        )
    raise PIIInputDataError(f"Unrecognized format: {format}")


def get_output_name(given_name, target_name=None):
    """Return the output file name.

    ``target_name`` wins when given. Otherwise ``_out`` is inserted before
    the extension of the base name of ``given_name``, e.g.
    ``dir/data.csv`` -> ``data_out.csv``.
    """
    if not target_name:
        basename = os.path.basename(given_name)
        fn, ext = os.path.splitext(basename)
        target_name = fn + "_out" + ext
    return target_name


def construct_filth_cls_name(name: str) -> str:
    """Constructs the filth class name from the given name.
    For example, "name" -> "NameFilth".

    Args:
        name (str): filth class name.

    Returns:
        str: The filth class name.
    """
    return "".join(part.capitalize() for part in name.split("_")) + "Filth"


################
# Report utils #
################
def compute_rate(elapsed_time, num_unit):
    """Return the elapsed time per unit (``elapsed_time / num_unit``)."""
    return elapsed_time / num_unit


def human_time_friendly(seconds):
    """Format a duration in seconds as e.g. ``"2 hours, 5.5 secs"``.

    Weeks, days, hours and minutes are listed only when non-zero; the
    remaining seconds are always listed, rounded to two decimals. A duration
    of exactly 0 returns ``"inf"``.
    """
    TIME_DURATION_UNITS = (
        ("week", 60 * 60 * 24 * 7),
        ("day", 60 * 60 * 24),
        ("hour", 60 * 60),
        ("min", 60),
    )
    if seconds == 0:
        return "inf"
    accumulator = []
    for unit, div in TIME_DURATION_UNITS:
        amount, seconds = divmod(float(seconds), div)
        if amount > 0:
            accumulator.append(
                "{} {}{}".format(int(amount), unit, "" if amount == 1 else "s")
            )
    accumulator.append("{} secs".format(round(seconds, 2)))
    return ", ".join(accumulator)


# State shared by block_print()/enable_print(): the stream that was active
# before printing was blocked, and the devnull handle that replaced it.
_saved_stdout = None
_devnull_handle = None


# Disable
def block_print():
    """Redirect ``sys.stdout`` to the null device until ``enable_print`` is called."""
    global _saved_stdout, _devnull_handle
    if _devnull_handle is not None:
        # Already blocked; keep the stream saved by the first call.
        return
    _saved_stdout = sys.stdout
    _devnull_handle = open(os.devnull, "w")
    sys.stdout = _devnull_handle


# Restore
def enable_print():
    """Restore the stream that was active before ``block_print``.

    Without a preceding ``block_print`` this falls back to
    ``sys.__stdout__``.
    """
    global _saved_stdout, _devnull_handle
    if _devnull_handle is None:
        sys.stdout = sys.__stdout__
        return
    sys.stdout = _saved_stdout
    # Close the handle opened by block_print so it does not leak.
    _devnull_handle.close()
    _saved_stdout = None
    _devnull_handle = None
b/docs/source/index.rst index 0aee74570..ca4e6b4d2 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -43,6 +43,7 @@ Oracle Accelerated Data Science (ADS) user_guide/operators/index user_guide/operators/common/index user_guide/operators/forecasting_operator/index + user_guide/operators/pii_operator/index .. toctree:: :hidden: diff --git a/docs/source/user_guide/operators/common/yaml_schema/piiOperator.yaml b/docs/source/user_guide/operators/common/yaml_schema/piiOperator.yaml new file mode 100644 index 000000000..ff295c7fa --- /dev/null +++ b/docs/source/user_guide/operators/common/yaml_schema/piiOperator.yaml @@ -0,0 +1,108 @@ +kind: + allowed: + - operator + required: true + type: string + default: operator + meta: + description: "Which service are you trying to use? Common kinds: `operator`, `job`" + +version: + allowed: + - "v1" + required: true + type: string + default: v1 + meta: + description: "Operators may change yaml file schemas from version to version, as well as implementation details. Double check the version to ensure compatibility." + +type: + required: true + type: string + default: pii + meta: + description: "Type should always be `pii` when using a pii operator" + + +spec: + required: true + schema: + input_data: + required: true + type: dict + meta: + description: "This should be indexed by target column." + schema: + url: + required: true + type: string + default: data.csv + meta: + description: "The url can be local, or remote. For example: `oci://@/data.csv`" + + output_directory: + required: true + schema: + url: + required: true + type: string + default: result/ + meta: + description: "The url can be local, or remote. For example: `oci://@/`" + name: + required: false + type: string + default: data-out.csv + type: dict + + report: + required: false + schema: + report_filename: + required: true + type: string + default: report.html + meta: + description: "Placed into `output_directory` location. 
Defaults to `report.html`" + show_rows: + required: false + type: number + meta: + description: "The number of rows that shows in the report. Defaults to `10`" + show_sensitive_content: + required: true + default: false + type: boolean + meta: + description: "Whether to show sensitive content in the report. Defaults to `False`" + type: dict + + target_column: + type: string + required: true + default: target + meta: + description: "Column with user data." + + detectors: + type: list + required: true + schema: + type: dict + schema: + name: + required: true + type: string + meta: + description: "The name of the detector. THe format is `.`." + action: + required: true + type: string + default: mask + allowed: + - anonymize + - mask + - remove + meta: + description: "The way to process the detected entity. Default to `mask`." + type: dict diff --git a/docs/source/user_guide/operators/pii_operator/examples.rst b/docs/source/user_guide/operators/pii_operator/examples.rst new file mode 100644 index 000000000..037bee176 --- /dev/null +++ b/docs/source/user_guide/operators/pii_operator/examples.rst @@ -0,0 +1,53 @@ +======== +Examples +======== + +**Simple Example** + +The simplest yaml file is generated by the ``ads operator init --type pii`` and looks like the following: + +.. code-block:: yaml + + kind: operator + type: pii + version: v1 + spec: + input_data: + url: mydata.csv + target_column: target + output_directory: + url: result/ + detectors: + - name: default.phone + action: mask + + + +**Complex Example** + +The yaml can also be maximally stated as follows: + + +.. 
code-block:: yaml + + kind: operator + type: pii + version: v1 + spec: + output_directory: + url: oci://my-bucket@my-tenancy/results + name: myProcessedData.csv + report: + report_filename: report.html + show_rows: 10 + show_sensitive_content: true + input_data: + url: oci://my-bucket@my-tenancy/mydata.csv + target_column: target + detectors: + - name: default.phone + action: mask + - name: default.social_security_number + action: remove + - name: spacy.en_core_web_trf.person + action: anonymize diff --git a/docs/source/user_guide/operators/pii_operator/getting_started.rst b/docs/source/user_guide/operators/pii_operator/getting_started.rst new file mode 100644 index 000000000..a8c455ded --- /dev/null +++ b/docs/source/user_guide/operators/pii_operator/getting_started.rst @@ -0,0 +1,64 @@ +=============== +Getting Started +=============== + +Configure +--------- + +After having set up ``ads opctl`` on your desired machine using ``ads opctl configure``, you are ready to begin using pii operator. At a bare minimum, you will need to provide the following details about your tasks: + +- Path to the input data (input_data) +- Path to the output directory, where the operator will place the processed data and report.html produced from the run (output_directory) +- Name of the column with user data (target_column) +- Name of the detector will be used in the operator (detectors) + +These details exactly match the initial pii.yaml file generated by running ``ads operator init --type pii``: + +.. code-block:: yaml + + kind: operator + type: pii + version: v1 + spec: + input_data: + url: mydata.csv + target_column: target + output_directory: + url: result/ + detectors: + - name: default.phone + action: mask + + +Optionally, you are able to specify much more. The most common additions are: + +- Whether to show sensitive content in the report. (show_sensitive_content) +- Way to process the detected entity. 
(action) + +An extensive list of parameters can be found in the ``YAML Schema`` section. + + +Run +--- + +After you have your pii.yaml written, you simply run the operator using: + +.. code-block:: bash + + ads operator run -f pii.yaml + + +Interpret Results +----------------- + +The pii operator produces the following output files: ``mydata-out.csv`` and ``report.html``. + +We will go through each of these output files in turn. + +**mydata-out.csv** + +The name of this file can be customized based on output_directory parameters in the configuration yaml. This file contains the processed dataset. + +**report.html** + +The report.html file is customized based on report parameters in the configuration yaml. It contains a summary of statistics, a plot of entities distributions, details of the resolved entities, and details about any model used. By default sensitive information is not shown in the report, but for debugging purposes you can show it by enabling ``show_sensitive_content``. It also includes a copy of the YAML file, providing a fully detailed version of the original specification. diff --git a/docs/source/user_guide/operators/pii_operator/index.rst b/docs/source/user_guide/operators/pii_operator/index.rst new file mode 100644 index 000000000..cdf5d962b --- /dev/null +++ b/docs/source/user_guide/operators/pii_operator/index.rst @@ -0,0 +1,37 @@ +============ +PII Operator +============ + +The PII operator aims to detect and redact Personally Identifiable Information (PII) in datasets by combining pattern matching and machine learning solutions. + +Overview +-------- + +**Introduction to PII** + +Personally Identifiable Information (PII) refers to any information that can identify an individual, encompassing financial, medical, educational, and employment records.
Failure to protect Personally Identifiable Information (PII) can lead to identity theft, financial loss, and reputational damage of individuals and businesses alike, highlighting the importance of taking appropriate measures to safeguard sensitive information. The Operators framework is OCI's most extensible, low-code, managed ecosystem for detecting and redacting PII in datasets. + +This technical documentation introduces using ``ads opctl`` for PII detection and redaction tasks. This module is engineered with the principles of low-code development in mind, making it accessible to users with varying degrees of technical expertise. It operates on managed infrastructure, ensuring reliability and scalability, while its configurability through YAML allows users to customize redaction to their specific needs. + +**Automated Detection and Classification** + +By leveraging pattern matching and AI-powered solutions, the ADS PII Operator efficiently identifies sensitive data in free-form text. + +**Intelligent Co-reference Resolution** + +A standout feature of the ADS PII Operator is its ability to maintain co-reference entity relationships even after anonymization; this not only anonymizes the data but also preserves the statistical properties of the data. + +**PII Operator Documentation** + +This documentation will explore the key concepts and capabilities of the PII operator, providing examples and practical guidance on how to use its various functions and modules. By the end of this guide, users will have a solid understanding of the PII operator and its capabilities, as well as the knowledge and tools needed to make informed decisions when designing solutions tailored to their specific requirements. + +.. versionadded:: 2.9.0 + +..
toctree:: + :maxdepth: 1 + + ./install + ./getting_started + ./pii + ./examples + ./yaml_schema diff --git a/docs/source/user_guide/operators/pii_operator/install.rst b/docs/source/user_guide/operators/pii_operator/install.rst new file mode 100644 index 000000000..ae581315b --- /dev/null +++ b/docs/source/user_guide/operators/pii_operator/install.rst @@ -0,0 +1,13 @@ +=========================== +Installing the PII Operator +=========================== + +The PII Operator can be installed from PyPi. + + +.. code-block:: bash + + python3 -m pip install oracle_ads[pii] + + +After that, the Operator is ready to go! diff --git a/docs/source/user_guide/operators/pii_operator/pii.rst b/docs/source/user_guide/operators/pii_operator/pii.rst new file mode 100644 index 000000000..617467e8b --- /dev/null +++ b/docs/source/user_guide/operators/pii_operator/pii.rst @@ -0,0 +1,47 @@ +============= +Configure PII +============= + +Let's explore each line of the pii.yaml so we can better understand options for extending and customizing the operator to our use case. + +Here is an example pii.yaml with every parameter specified: + +.. code-block:: yaml + + kind: operator + type: pii + version: v1 + spec: + output_directory: + url: oci://my-bucket@my-tenancy/results + name: mydata-out.csv + report: + report_filename: report.html + show_rows: 10 + show_sensitive_content: true + input_data: + url: oci://my-bucket@my-tenancy/mydata.csv + target_column: target + detectors: + - name: default.phone + action: anonymize + + +* **Kind**: The yaml file always starts with ``kind: operator``. There are many other kinds of yaml files that can be run by ``ads opctl``, so we need to specify this is an operator. +* **Type**: The type of operator is ``pii``. +* **Version**: The only available version is ``v1``. +* **Spec**: Spec contains the bulk of the information for the specific problem. + * **input_data**: This dictionary contains the details for how to read the input data. 
+ * **url**: Insert the uri for the dataset if it's on object storage using the URI pattern ``oci://@/path/to/data.csv``. + * **target_column**: This string specifies the name of the column where the user data is within the input data. + * **detectors**: This list contains the details for each detector and action that will be taken. + * **name**: The string specifies the name of the detector. The format should be ``.``. + * **action**: The string specifies the way to process the detected entity. Default to mask. + * **output_directory**: This dictionary contains the details for where to put the output artifacts. The directory need not exist, but must be accessible by the Operator during runtime. + * **url**: Insert the uri for the dataset if it's on object storage using the URI pattern ``oci://@/subfolder/``. + * **name**: The string specifies the name of the processed data file. + + * **report**: (optional) This dictionary specific details for the generated report. + * **report_filename**: Placed into output_directory location. Defaults to report.html. + * **show_sensitive_content**: Whether to show sensitive content in the report. Defaults to false. + * **show_rows**: The number of rows that shows in the report. diff --git a/docs/source/user_guide/operators/pii_operator/yaml_schema.rst b/docs/source/user_guide/operators/pii_operator/yaml_schema.rst new file mode 100644 index 000000000..10cdb58ce --- /dev/null +++ b/docs/source/user_guide/operators/pii_operator/yaml_schema.rst @@ -0,0 +1,9 @@ +=========== +YAML Schema +=========== + +Following is the YAML schema for validating the YAML using `Cerberus `_: + +.. 
literalinclude:: ../common/yaml_schema/piiOperator.yaml + :language: yaml + :linenos: diff --git a/pyproject.toml b/pyproject.toml index d861f5bee..da01c2a71 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,7 @@ classifiers = [ # In dependencies se "; platform_machine == 'aarch64'" to specify ARM underlying platform # Copied from install_requires list in setup.py, setup.py got removed in favor of this config file dependencies = [ + "PyYAML>=6", # pyyaml 5.4 is broken with cython 3 "asteval>=0.9.25", "cerberus>=1.3.4", "cloudpickle>=1.6.0", @@ -67,7 +68,6 @@ dependencies = [ "pandas>1.2.1,<2.1", "psutil>=5.7.2", "python_jsonschema_objects>=0.3.13", - "PyYAML>=6", # pyyaml 5.4 is broken with cython 3 "requests", "scikit-learn>=1.0", "tabulate>=0.8.9", @@ -168,6 +168,16 @@ forecast = [ "autots[additional]", "neuralprophet", ] +pii = [ + "aiohttp", + "datapane", + "gender_guesser", + "nameparser", + "oracle_ads[opctl]", + "plotly", + "scrubadub", + "scrubadub_spacy", +] [project.urls] "Github" = "https://github.com/oracle/accelerated-data-science" diff --git a/ads/opctl/operator/lowcode/pii/const.py b/tests/unitary/with_extras/operator/pii/__init__.py similarity index 87% rename from ads/opctl/operator/lowcode/pii/const.py rename to tests/unitary/with_extras/operator/pii/__init__.py index b8d0460f5..fe904ad27 100644 --- a/ads/opctl/operator/lowcode/pii/const.py +++ b/tests/unitary/with_extras/operator/pii/__init__.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023 Oracle and/or its affiliates. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ diff --git a/tests/unitary/with_extras/operator/pii/test_factory.py b/tests/unitary/with_extras/operator/pii/test_factory.py new file mode 100644 index 000000000..431034bda --- /dev/null +++ b/tests/unitary/with_extras/operator/pii/test_factory.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import pytest +from scrubadub_spacy.detectors.spacy import SpacyEntityDetector + +from ads.opctl.operator.lowcode.pii.model.factory import ( + PiiDetectorFactory, + UnSupportedDetectorError, +) + + +class TestPiiDetectorFactory: + def test_get_default_detector(self): + detector_type = "default" + entity = "phone" + model = None + expected_result = "phone" + detector = PiiDetectorFactory.get_detector( + detector_type=detector_type, entity=entity, model=model + ) + assert detector == expected_result + + @pytest.mark.parametrize( + "detector_type, entity, model", + [ + ("spacy", "person", "en_core_web_trf"), + ("spacy", "other", "en_core_web_trf"), + ], + ) + def test_get_spacy_detector(self, detector_type, entity, model): + detector = PiiDetectorFactory.get_detector( + detector_type=detector_type, entity=entity, model=model + ) + assert isinstance(detector, SpacyEntityDetector) + assert entity.upper() in detector.filth_cls_map + + def test_get_detector_fail(self): + detector_type = "unknow" + entity = "myentity" + model = None + with pytest.raises(UnSupportedDetectorError): + PiiDetectorFactory.get_detector( + detector_type=detector_type, entity=entity, model=model + ) diff --git a/tests/unitary/with_extras/operator/pii/test_files/__init__.py b/tests/unitary/with_extras/operator/pii/test_files/__init__.py new file mode 100644 index 000000000..fe904ad27 --- /dev/null +++ 
b/tests/unitary/with_extras/operator/pii/test_files/__init__.py @@ -0,0 +1,4 @@ +#!/usr/bin/env python + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ diff --git a/tests/unitary/with_extras/operator/pii/test_files/pii_test.yaml b/tests/unitary/with_extras/operator/pii/test_files/pii_test.yaml new file mode 100644 index 000000000..b9ef962b4 --- /dev/null +++ b/tests/unitary/with_extras/operator/pii/test_files/pii_test.yaml @@ -0,0 +1,14 @@ +kind: operator +spec: + detectors: + - action: anonymize + name: default.phone + - action: mask + name: default.text_blob_name + input_data: + url: ./test_data.csv + output_directory: + url: ./test_result/ + target_column: text +type: pii +version: v1 diff --git a/tests/unitary/with_extras/operator/pii/test_files/test_data.csv b/tests/unitary/with_extras/operator/pii/test_files/test_data.csv new file mode 100644 index 000000000..250e24577 --- /dev/null +++ b/tests/unitary/with_extras/operator/pii/test_files/test_data.csv @@ -0,0 +1,3 @@ +id,text +00001cee341fdb12,"Hi, this is John Doe, my number is (805) 555-1234." +00097b6214686db5,"John has a beautiful puppy." diff --git a/tests/unitary/with_extras/operator/pii/test_guardrail.py b/tests/unitary/with_extras/operator/pii/test_guardrail.py new file mode 100644 index 000000000..ae8c7be60 --- /dev/null +++ b/tests/unitary/with_extras/operator/pii/test_guardrail.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import os +import tempfile +from io import StringIO + +import yaml + +from ads.opctl.operator.lowcode.pii.constant import DEFAULT_REPORT_FILENAME +from ads.opctl.operator.lowcode.pii.model.guardrails import PIIGuardrail +from ads.opctl.operator.lowcode.pii.operator_config import PiiOperatorConfig + + +class TestPiiGuardrail: + test_files_uri = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "test_files" + ) + + def yaml_content_simple(self): + content = StringIO( + f""" +kind: operator +spec: + detectors: + - action: anonymize + name: default.phone + input_data: + url: {self.test_files_uri}/test_data.csv + output_directory: + url: {self.test_files_uri} + target_column: text +type: pii +version: v1 + +""" + ) + return content + + def yaml_content_complex(self): + content = StringIO( + """ +kind: operator +spec: + detectors: + - action: anonymize + name: default.phone + - action: mask + name: default.social_security_number + input_data: + url: oci://my-bucket@my-tenancy/input_data/mydata.csv + output_directory: + name: myProcesseData.csv + url: oci://my-bucket@my-tenancy/result/ + report: + report_filename: myreport.html + show_sensitive_content: true + show_rows: 10 + target_column: text +type: pii +version: v1 + +""" + ) + return content + + def test_init(self): + conf = yaml.load(self.yaml_content_complex(), yaml.SafeLoader) + operator_config = PiiOperatorConfig.from_yaml( + yaml_string=self.yaml_content_complex() + ) + guardrail = PIIGuardrail(config=operator_config) + + assert guardrail.dst_uri == os.path.join( + conf["spec"]["output_directory"]["url"], + conf["spec"]["output_directory"]["name"], + ) + assert guardrail.report_uri == os.path.join( + conf["spec"]["output_directory"]["url"], + conf["spec"]["report"]["report_filename"], + ) + assert len(guardrail.scrubber._detectors) == 2 + assert not guardrail.storage_options == {} + + def 
test_load_data(self): + conf = yaml.load(self.yaml_content_simple(), yaml.SafeLoader) + + operator_config = PiiOperatorConfig.from_yaml( + yaml_string=self.yaml_content_simple() + ) + guardrail = PIIGuardrail(config=operator_config) + guardrail.load_data() + + assert guardrail.datasets is not None + assert guardrail.storage_options == {} + assert guardrail.dst_uri == os.path.join( + conf["spec"]["output_directory"]["url"], + "test_data_out.csv", + ) + assert guardrail.report_uri == os.path.join( + conf["spec"]["output_directory"]["url"], + DEFAULT_REPORT_FILENAME, + ) + + def test_process(self): + operator_config = PiiOperatorConfig.from_yaml( + yaml_string=self.yaml_content_simple() + ) + guardrail = PIIGuardrail(config=operator_config) + with tempfile.TemporaryDirectory() as temp_dir: + dst_uri = os.path.join(temp_dir, "test_out.csv") + report_uri = os.path.join(temp_dir, DEFAULT_REPORT_FILENAME) + guardrail.process( + dst_uri=dst_uri, + report_uri=report_uri, + ) + assert os.path.exists(dst_uri) + assert os.path.exists(report_uri) diff --git a/tests/unitary/with_extras/operator/pii/test_pii_scrubber.py b/tests/unitary/with_extras/operator/pii/test_pii_scrubber.py new file mode 100644 index 000000000..df2929a06 --- /dev/null +++ b/tests/unitary/with_extras/operator/pii/test_pii_scrubber.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import os + +import pytest + +from ads.opctl.operator.common.utils import _load_yaml_from_uri +from ads.opctl.operator.lowcode.pii.model.pii import PiiScrubber +from ads.opctl.operator.lowcode.pii.operator_config import PiiOperatorConfig + + +class TestPiiScrubber: + test_yaml_uri = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "test_files", "pii_test.yaml" + ) + operator_config = PiiOperatorConfig.from_yaml(uri=test_yaml_uri) + config_dict = _load_yaml_from_uri(uri=test_yaml_uri) + + name_entity = "John Doe" + phone_entity = "(800) 223-1711" + text = f""" + This is {name_entity}. My number is {phone_entity}. + """ + + @pytest.mark.parametrize( + "config", + [ + test_yaml_uri, + operator_config, + config_dict, + ], + ) + def test_init(self, config): + pii_scrubber = PiiScrubber(config=config) + + assert isinstance(pii_scrubber.detector_spec, list) + assert len(pii_scrubber.detector_spec) == 2 + assert pii_scrubber.detector_spec[0]["name"] == "default.phone" + + assert len(pii_scrubber.scrubber._detectors) == 0 + + def test_config_scrubber(self): + scrubber = PiiScrubber(config=self.test_yaml_uri).config_scrubber() + + assert len(scrubber._detectors) == 2 + assert len(scrubber._post_processors) == 1 + + processed_text = scrubber.clean(self.text) + + assert self.name_entity not in processed_text + assert self.phone_entity not in processed_text From 16c62222ce4a3fe7ef08c727e55f51eda0035b4c Mon Sep 17 00:00:00 2001 From: MING KANG Date: Tue, 14 Nov 2023 16:00:58 -0800 Subject: [PATCH 3/7] updated pii operator's dependency --- ads/opctl/operator/lowcode/pii/README.md | 7 +++++-- ads/opctl/operator/lowcode/pii/environment.yaml | 3 ++- pyproject.toml | 1 + 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/ads/opctl/operator/lowcode/pii/README.md b/ads/opctl/operator/lowcode/pii/README.md index 156646ef4..f24cda8ce 100644 --- 
a/ads/opctl/operator/lowcode/pii/README.md +++ b/ads/opctl/operator/lowcode/pii/README.md @@ -32,12 +32,15 @@ All generated configurations should be ready to use without the need for any add To run pii operator locally, create and activate a new conda environment (`ads-pii`). Install all the required libraries listed in the `environment.yaml` file. ```yaml +- aiohttp - datapane -- scrubadub - gender_guesser - nameparser +- plotly +- spacy_transformers +- scrubadub - scrubadub_spacy -- "git+https://github.com/oracle/accelerated-data-science.git@feature/ads_pii_operator#egg=oracle-ads" +- oracle_ads[opctl] ``` Please review the previously generated `pii.yaml` file using the `init` command, and make any necessary adjustments to the input and output file locations. By default, it assumes that the files should be located in the same folder from which the `init` command was executed. diff --git a/ads/opctl/operator/lowcode/pii/environment.yaml b/ads/opctl/operator/lowcode/pii/environment.yaml index ca5b65680..4f5b75b67 100644 --- a/ads/opctl/operator/lowcode/pii/environment.yaml +++ b/ads/opctl/operator/lowcode/pii/environment.yaml @@ -10,6 +10,7 @@ dependencies: - gender_guesser - nameparser - plotly + - spacy_transformers - scrubadub - scrubadub_spacy - - "git+https://github.com/oracle/accelerated-data-science.git@feature/ads_pii_operator#egg=oracle-ads" + - oracle_ads[opctl] diff --git a/pyproject.toml b/pyproject.toml index da01c2a71..e38cdf8c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -175,6 +175,7 @@ pii = [ "nameparser", "oracle_ads[opctl]", "plotly", + "spacy_transformers, "scrubadub", "scrubadub_spacy", ] From c45427dc5b52c4b803c009aa5be1a3ea962685dc Mon Sep 17 00:00:00 2001 From: MING KANG Date: Tue, 14 Nov 2023 16:15:12 -0800 Subject: [PATCH 4/7] fixed docs --- .../common/yaml_schema/piiOperator.yaml | 108 ------------------ .../operators/pii_operator/yaml_schema.rst | 2 +- 2 files changed, 1 insertion(+), 109 deletions(-) delete mode 100644 
docs/source/user_guide/operators/common/yaml_schema/piiOperator.yaml diff --git a/docs/source/user_guide/operators/common/yaml_schema/piiOperator.yaml b/docs/source/user_guide/operators/common/yaml_schema/piiOperator.yaml deleted file mode 100644 index ff295c7fa..000000000 --- a/docs/source/user_guide/operators/common/yaml_schema/piiOperator.yaml +++ /dev/null @@ -1,108 +0,0 @@ -kind: - allowed: - - operator - required: true - type: string - default: operator - meta: - description: "Which service are you trying to use? Common kinds: `operator`, `job`" - -version: - allowed: - - "v1" - required: true - type: string - default: v1 - meta: - description: "Operators may change yaml file schemas from version to version, as well as implementation details. Double check the version to ensure compatibility." - -type: - required: true - type: string - default: pii - meta: - description: "Type should always be `pii` when using a pii operator" - - -spec: - required: true - schema: - input_data: - required: true - type: dict - meta: - description: "This should be indexed by target column." - schema: - url: - required: true - type: string - default: data.csv - meta: - description: "The url can be local, or remote. For example: `oci://@/data.csv`" - - output_directory: - required: true - schema: - url: - required: true - type: string - default: result/ - meta: - description: "The url can be local, or remote. For example: `oci://@/`" - name: - required: false - type: string - default: data-out.csv - type: dict - - report: - required: false - schema: - report_filename: - required: true - type: string - default: report.html - meta: - description: "Placed into `output_directory` location. Defaults to `report.html`" - show_rows: - required: false - type: number - meta: - description: "The number of rows that shows in the report. 
Defaults to `10`" - show_sensitive_content: - required: true - default: false - type: boolean - meta: - description: "Whether to show sensitive content in the report. Defaults to `False`" - type: dict - - target_column: - type: string - required: true - default: target - meta: - description: "Column with user data." - - detectors: - type: list - required: true - schema: - type: dict - schema: - name: - required: true - type: string - meta: - description: "The name of the detector. THe format is `.`." - action: - required: true - type: string - default: mask - allowed: - - anonymize - - mask - - remove - meta: - description: "The way to process the detected entity. Default to `mask`." - type: dict diff --git a/docs/source/user_guide/operators/pii_operator/yaml_schema.rst b/docs/source/user_guide/operators/pii_operator/yaml_schema.rst index 10cdb58ce..ee1318f8c 100644 --- a/docs/source/user_guide/operators/pii_operator/yaml_schema.rst +++ b/docs/source/user_guide/operators/pii_operator/yaml_schema.rst @@ -4,6 +4,6 @@ YAML Schema Following is the YAML schema for validating the YAML using `Cerberus `_: -.. literalinclude:: ../common/yaml_schema/piiOperator.yaml +.. 
literalinclude:: ../../../../../ads/opctl/operator/lowcode/pii/schema.yaml :language: yaml :linenos: From 893656c3637998be34cc63771d64f14e42a44313 Mon Sep 17 00:00:00 2001 From: MING KANG Date: Tue, 14 Nov 2023 16:59:54 -0800 Subject: [PATCH 5/7] fixed tests --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e38cdf8c2..a950d663e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -175,7 +175,7 @@ pii = [ "nameparser", "oracle_ads[opctl]", "plotly", - "spacy_transformers, + "spacy_transformers", "scrubadub", "scrubadub_spacy", ] From 52280d17ba567805c46bd3e87bc4718f2417336a Mon Sep 17 00:00:00 2001 From: MING KANG Date: Wed, 15 Nov 2023 18:17:00 -0800 Subject: [PATCH 6/7] improved documentation for configuring detectors --- .../pii_operator/getting_started.rst | 12 +-- .../operators/pii_operator/install.rst | 13 ++- .../user_guide/operators/pii_operator/pii.rst | 91 ++++++++++++++++++- .../operators/pii_operator/yaml_schema.rst | 2 + 4 files changed, 108 insertions(+), 10 deletions(-) diff --git a/docs/source/user_guide/operators/pii_operator/getting_started.rst b/docs/source/user_guide/operators/pii_operator/getting_started.rst index a8c455ded..a5ce67d6a 100644 --- a/docs/source/user_guide/operators/pii_operator/getting_started.rst +++ b/docs/source/user_guide/operators/pii_operator/getting_started.rst @@ -10,9 +10,9 @@ After having set up ``ads opctl`` on your desired machine using ``ads opctl conf - Path to the input data (input_data) - Path to the output directory, where the operator will place the processed data and report.html produced from the run (output_directory) - Name of the column with user data (target_column) -- Name of the detector will be used in the operator (detectors) +- The detector will be used in the operator (detectors) -These details exactly match the initial pii.yaml file generated by running ``ads operator init --type pii``: +You can check :ref:`Configure Detector ` 
for more details on how to configure ``detectors`` parameter. These details exactly match the initial pii.yaml file generated by running ``ads operator init --type pii``: .. code-block:: yaml @@ -32,10 +32,10 @@ These details exactly match the initial pii.yaml file generated by running ``ads Optionally, you are able to specify much more. The most common additions are: -- Whether to show sensitive content in the report. (show_sensitive_content) -- Way to process the detected entity. (action) +- Whether to show sensitive content in the report (show_sensitive_content) +- Way to process the detected entity (action) -An extensive list of parameters can be found in the ``YAML Schema`` section. +An extensive list of parameters can be found in the :ref:`YAML Schema `. Run @@ -57,7 +57,7 @@ We will go through each of these output files in turn. **mydata-out.csv** -The name of this file can be customized based on output_directory parameters in the configuration yaml. This file contains the processed dataset. +The name of this file can be customized based on ``output_directory`` parameters in the configuration yaml. This file contains the processed dataset. **report.html** diff --git a/docs/source/user_guide/operators/pii_operator/install.rst b/docs/source/user_guide/operators/pii_operator/install.rst index ae581315b..7386f69cf 100644 --- a/docs/source/user_guide/operators/pii_operator/install.rst +++ b/docs/source/user_guide/operators/pii_operator/install.rst @@ -7,7 +7,18 @@ The PII Operator can be installed from PyPi. .. code-block:: bash - python3 -m pip install oracle_ads[pii] + python3 -m pip install oracle_ads[pii]==2.9 After that, the Operator is ready to go! + +In order to run on a job, you will need to create and publish a conda pack with ``oracle_ads[pii]`` installed. The simplest way to do this is from a Notebook Session, running the following commands: + +.. 
code-block:: bash + + odsc conda create -n ads_pii -e + conda activate /home/datascience/conda/ads_pii_v1_0 + python3 -m pip install oracle-ads[pii]==2.9 + odsc conda publish -s /home/datascience/conda/ads_pii_v1_0 + +Ensure that you have properly configured your conda pack namespace and bucket in the Launcher -> Settings -> Object Storage Settings. For more details, see :doc:`ADS Conda Set Up <../../cli/opctl/configure>` diff --git a/docs/source/user_guide/operators/pii_operator/pii.rst b/docs/source/user_guide/operators/pii_operator/pii.rst index 617467e8b..92cc47254 100644 --- a/docs/source/user_guide/operators/pii_operator/pii.rst +++ b/docs/source/user_guide/operators/pii_operator/pii.rst @@ -35,13 +35,98 @@ Here is an example pii.yaml with every parameter specified: * **url**: Insert the uri for the dataset if it's on object storage using the URI pattern ``oci://@/path/to/data.csv``. * **target_column**: This string specifies the name of the column where the user data is within the input data. * **detectors**: This list contains the details for each detector and action that will be taken. - * **name**: The string specifies the name of the detector. The format should be ``.``. + * **name**: The string specifies the name of the detector. The format should be ``.``. Check :ref:`Configure Detector ` for more details. * **action**: The string specifies the way to process the detected entity. Default to mask. * **output_directory**: This dictionary contains the details for where to put the output artifacts. The directory need not exist, but must be accessible by the Operator during runtime. * **url**: Insert the uri for the dataset if it's on object storage using the URI pattern ``oci://@/subfolder/``. * **name**: The string specifies the name of the processed data file. * **report**: (optional) This dictionary specific details for the generated report. - * **report_filename**: Placed into output_directory location. Defaults to report.html. 
- * **show_sensitive_content**: Whether to show sensitive content in the report. Defaults to false. + * **report_filename**: Placed into output_directory location. Defaults to ``report.html``. + * **show_sensitive_content**: Whether to show sensitive content in the report. Defaults to ``false``. * **show_rows**: The number of rows that shows in the report. + + +.. _config_detector: + +Configure Detector +------------------ + +A detector consists of ``name`` and ``action``. The **name** parameter defines the detector that will be used, and the **action** parameter defines the way to process the entity. + +Configure Name +~~~~~~~~~~~~~~ + +We currently support the following type of detectors: + +* default +* spacy + +Default +^^^^^^^ + +Here scrubadub's pre-defined detector is used. You can designate the name in the format of ``default.`` (e.g., ``default.phone``). Check the supported detectors from `scrubadub `_. + +.. note:: + + If you want to de-identify `address` by this tool, `scrubadub_address` is required. + You will need to follow the `instructions`_ to install the required dependencies. + + .. _instructions: https://scrubadub.readthedocs.io/en/stable/addresses.html/ + + +spaCy +^^^^^ + +To use spaCy’s NER to identify entity, you can designate the name in the format of ``spacy..`` (e.g., ``spacy.en_core_web_sm.person``). +The "entity" value can correspond to any entity that spaCy recognizes. For a list of available models and entities, please refer to the `spaCy documentation `_. + + + +Configure Action +~~~~~~~~~~~~~~~~ + +We currently support the following types of actions: + +* mask +* remove +* anonymize + +Mask +^^^^ + +The ``mask`` action is used to mask the detected entity with the name of the entity type. It replaces the entity with a placeholder. For example, with the following configured detector: + +.. code-block:: yaml + + name: spacy.en_core_web_sm.person + action: mask + +After processing, the input text "Hi, my name is John Doe." 
will become "Hi, my name is {{NAME}}." + +Remove +^^^^^^ + +The ``remove`` action is used to delete the detected entity from the text. It completely removes the entity without replacement. For example, with the following configured detector: + +.. code-block:: yaml + + name: spacy.en_core_web_sm.person + action: remove + +After processing, the input text "Hi, my name is John Doe." will become "Hi, my name is ." + + +Anonymize +^^^^^^^^^ + +The ``anonymize`` action can be used to obfuscate the detected sensitive information. +Currently, we provide context-aware anonymization for name, email, and number-like entities. +For example, with the following configured detector: + +.. code-block:: yaml + + name: spacy.en_core_web_sm.person + action: anonymize + +After processing, the input text "Hi, my name is John Doe." will become "Hi, my name is Joe Blow." diff --git a/docs/source/user_guide/operators/pii_operator/yaml_schema.rst b/docs/source/user_guide/operators/pii_operator/yaml_schema.rst index ee1318f8c..6a887b5e1 100644 --- a/docs/source/user_guide/operators/pii_operator/yaml_schema.rst +++ b/docs/source/user_guide/operators/pii_operator/yaml_schema.rst @@ -1,3 +1,5 @@ +.. _pii-yaml-schema: + =========== YAML Schema =========== From 2929f8bbef87b2cbcc490bf3f9db2687b9d903a3 Mon Sep 17 00:00:00 2001 From: MING KANG Date: Wed, 15 Nov 2023 18:18:19 -0800 Subject: [PATCH 7/7] fixed ValueError: [E002] Can't find factory for 'curated_transformer' for language English (en). 
for loading en_core_web_trf --- ads/opctl/operator/lowcode/pii/README.md | 5 ++-- .../operator/lowcode/pii/environment.yaml | 5 ++-- pyproject.toml | 27 ++++++++++--------- .../with_extras/operator/pii/test_factory.py | 7 +++-- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/ads/opctl/operator/lowcode/pii/README.md b/ads/opctl/operator/lowcode/pii/README.md index f24cda8ce..59b2c43f8 100644 --- a/ads/opctl/operator/lowcode/pii/README.md +++ b/ads/opctl/operator/lowcode/pii/README.md @@ -36,11 +36,12 @@ To run pii operator locally, create and activate a new conda environment (`ads-p - datapane - gender_guesser - nameparser +- oracle_ads[opctl] - plotly -- spacy_transformers - scrubadub - scrubadub_spacy -- oracle_ads[opctl] +- spacy-transformers==1.2.5 +- spacy==3.6.1 ``` Please review the previously generated `pii.yaml` file using the `init` command, and make any necessary adjustments to the input and output file locations. By default, it assumes that the files should be located in the same folder from which the `init` command was executed. 
diff --git a/ads/opctl/operator/lowcode/pii/environment.yaml b/ads/opctl/operator/lowcode/pii/environment.yaml index 4f5b75b67..ffd60045e 100644 --- a/ads/opctl/operator/lowcode/pii/environment.yaml +++ b/ads/opctl/operator/lowcode/pii/environment.yaml @@ -9,8 +9,9 @@ dependencies: - datapane - gender_guesser - nameparser + - oracle_ads[opctl] - plotly - - spacy_transformers - scrubadub - scrubadub_spacy - - oracle_ads[opctl] + - spacy-transformers==1.2.5 + - spacy==3.6.1 diff --git a/pyproject.toml b/pyproject.toml index 32c34574a..c8caf66bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -123,8 +123,8 @@ opctl = [ "nbconvert", "nbformat", "oci-cli", - "rich", "py-cpuinfo", + "rich", ] optuna = [ "optuna==2.9.0", @@ -154,20 +154,20 @@ viz = [ "seaborn>=0.11.0", ] forecast = [ + "autots[additional]", "datapane", - "prophet", - "pmdarima", - "statsmodels", - "sktime", - "optuna==2.9.0", - "oci-cli", - "shap", - "numpy", "holidays==0.21.13", + "neuralprophet", + "numpy", + "oci-cli", + "optuna==2.9.0", "oracle-ads[opctl]", "oracle-automlx==23.2.3", - "autots[additional]", - "neuralprophet", + "pmdarima", + "prophet", + "shap", + "sktime", + "statsmodels", ] pii = [ "aiohttp", @@ -176,9 +176,10 @@ pii = [ "nameparser", "oracle_ads[opctl]", "plotly", - "spacy_transformers", - "scrubadub", + "scrubadub==2.0.1", "scrubadub_spacy", + "spacy-transformers==1.2.5", + "spacy==3.6.1", ] [project.urls] diff --git a/tests/unitary/with_extras/operator/pii/test_factory.py b/tests/unitary/with_extras/operator/pii/test_factory.py index 431034bda..04e153cd0 100644 --- a/tests/unitary/with_extras/operator/pii/test_factory.py +++ b/tests/unitary/with_extras/operator/pii/test_factory.py @@ -25,8 +25,11 @@ def test_get_default_detector(self): @pytest.mark.parametrize( "detector_type, entity, model", [ - ("spacy", "person", "en_core_web_trf"), - ("spacy", "other", "en_core_web_trf"), + ("spacy", "person", "en_core_web_sm"), + ("spacy", "other", "en_core_web_sm"), + # ("spacy", "org", 
"en_core_web_trf"), + # ("spacy", "loc", "en_core_web_md"), + # ("spacy", "date", "en_core_web_lg"), ], ) def test_get_spacy_detector(self, detector_type, entity, model):