In [None]:
from operator_pool import OperatorPool

# Create the operator pool from the config file at ./configs/default_ops.yaml.
# `op_pool` is the shared object every later cell in this notebook works on.
op_pool = OperatorPool(config_path="./configs/default_ops.yaml")

In [2]:
# Access an operator by its integer position in the pool;
# `.name` is the operator's name.
op_pool[0].name

'image_watermark_filter'

In [3]:
# Access an operator using its name as the key;
# `.desc` is the operator's human-readable description.
op_pool["alphanumeric_filter"].desc

'Filter to keep samples with alphabet/numeric ratio within a specific range.'

In [4]:
# Iteration: looping over `op_pool.pool` yields the operator names,
# which are printed one per line.
for op_name in op_pool.pool:
    print(op_name)

image_watermark_filter
language_id_score_filter
alphanumeric_filter


In [5]:
# `state` is used for LLM queries and for saving/loading the pool.
# As the displayed output shows, it is a nested dict keyed by operator name;
# each entry carries the operator's name, desc, enabled flag, and an `args`
# dict whose entries have name/desc/type/default/v/options/min/max.
op_pool.state

{'image_watermark_filter': {'name': 'image_watermark_filter',
  'desc': 'Filter to keep samples whose images have no watermark with high probability.',
  'enabled': True,
  'args': {'hf_watermark_model': {'name': 'hf_watermark_model',
    'desc': 'watermark detection model name on huggingface.',
    'type': 'str',
    'default': 'amrul-hzz/watermark_detector',
    'v': 'amrul-hzz/watermark_detector',
    'options': None,
    'min': None,
    'max': None},
   'trust_remote_code': {'name': 'trust_remote_code',
    'desc': 'Whether to trust the remote code for loading huggingface model.',
    'type': 'bool',
    'default': False,
    'v': False,
    'options': [True, False],
    'min': None,
    'max': None},
   'prob_threshold': {'name': 'prob_threshold',
    'desc': 'the predicted watermark probability threshold for samples. range from 0 to 1. Samples with watermark probability less than this threshold will be kept.',
    'type': 'float',
    'default': 0.8,
    'v': 0.8,
    'options':

In [6]:
# Export the current pool as a recipe file.
# The displayed result is the same path that was passed as `config_path`.
# NOTE(review): `dataset_path`, `export_path`, and `nproc` are presumably
# written into the recipe rather than acted on here -- confirm against
# OperatorPool.export_config.
op_pool.export_config(
    project_name="demo",
    dataset_path="./data/demo-dataset.jsonl",
    nproc=4,
    export_path="./outputs/processed_data.jsonl",
    config_path="./configs/demo-recipe.yaml"
)

'./configs/demo-recipe.yaml'

In [7]:
# Enable/disable an operator: show the initial `enabled` flag, then apply
# "disable" followed by "enable", printing the flag after each action.
print(op_pool["alphanumeric_filter"].enabled)
for action_type in ("disable", "enable"):
    op_pool.act(op_name="alphanumeric_filter", action_type=action_type)
    print(op_pool["alphanumeric_filter"].enabled)

True
False
True


In [8]:
# Set an argument to an explicit value: the "set_arg" action takes the
# argument's name and the new value `v`.
op_pool.act(
    op_name="alphanumeric_filter",
    action_type="set_arg",
    arg_name="min_ratio",
    v=0.2,
)
# Read the value back from the operator's args to confirm the update.
op_pool["alphanumeric_filter"].args["min_ratio"].v

0.2

In [9]:
# Example summary statistics for the filter's metric. `quantiles` holds 101
# placeholder values (index p = the p-th percentile, here simply p / 100).
# `i / 100` is used rather than `0.01 * i`: the product inherits the
# representation error of the literal 0.01 and can land one ulp away from the
# intended decimal for some i, whereas the quotient is correctly rounded.
stats = {
    "mean": 0.3,
    "std": 0.04,
    "min": 0.12,
    "max": 0.89,
    "quantiles": [i / 100 for i in range(101)],
}
# Attach the statistics to the operator.
# NOTE(review): presumably required before a quantile-based `set_arg` -- confirm.
op_pool["alphanumeric_filter"].update_with_stats(stats)
# set arg value as the p% quantile (p=0.3 is also acceptable in place of p=30)
op_pool.act(
    op_name="alphanumeric_filter",
    action_type="set_arg",
    arg_name="min_ratio",
    p=30,
)
op_pool["alphanumeric_filter"].args["min_ratio"].v

0.3

In [10]:
# Set the filter's range args as mean \pm k * std. No `arg_name` is given
# here; the displayed result shows both `min_ratio` and `max_ratio` updated.
op_pool.act(
    op_name="alphanumeric_filter",
    action_type="set_arg",
    k=3,
)
# Show the resulting (min_ratio, max_ratio) pair.
tuple(
    op_pool["alphanumeric_filter"].args[bound].v
    for bound in ("min_ratio", "max_ratio")
)

(0.18, 0.42)