configs/config_all.yaml

# Process config example including:
#   - all global arguments
#   - all ops and their arguments

# global parameters
project_name: 'all'                                         # project name for distinguish your configs
dataset_path: '/path/to/your/dataset'                       # path to your dataset directory or file with weights(0.0-1.0), 1.0 as default.
                                                            # accepted format: 'weight1(optional) dataset1-path weight2(optional) dataset2-path'
export_path: '/path/to/result/dataset.jsonl'                # path to processed result dataset. Supported suffixes include ['jsonl', 'json', 'parquet']
export_shard_size: 0                                        # shard size of exported dataset in Byte. In default, it's 0, which means export the whole dataset into only one file. If it's set a positive number, the exported dataset will be split into several dataset shards, and the max size of each shard won't larger than the export_shard_size
export_in_parallel: false                                   # whether to export the result dataset in parallel to a single file, which usually takes less time. It only works when export_shard_size is 0, and its default number of processes is the same as the argument np. **Notice**: If it's True, sometimes exporting in parallel might require much more time due to the IO blocking, especially for very large datasets. When this happens, False is a better choice, although it takes more time.
np: 4                                                       # number of subprocess to process your dataset
text_keys: 'text'                                           # the key name of field where the sample texts to be processed, e.g., `text`, `instruction`, `output`, ...
                                                            # Note: currently, we support specify only ONE key for each op, for cases requiring multiple keys, users can specify the op multiple times. We will only use the first key of `text_keys` when you set multiple keys.
suffixes: []                                                # the suffix of files that will be read. For example: '.txt', 'txt' or ['txt', '.pdf', 'docx']
use_cache: true                                             # whether to use the cache management of Hugging Face datasets. It might take up lots of disk space when using cache
ds_cache_dir: null                                          # cache dir for Hugging Face datasets. In default, it\'s the same as the environment variable `HF_DATASETS_CACHE`, whose default value is usually "~/.cache/huggingface/datasets". If this argument is set to a valid path by users, it will override the default cache dir
use_checkpoint: false                                       # whether to use the checkpoint management to save the latest version of dataset to work dir when processing. Rerun the same config will reload the checkpoint and skip ops before it. Cache will be disabled when using checkpoint. If args of ops before the checkpoint are changed, all ops will be rerun from the beginning.
temp_dir: null                                              # the path to the temp directory to store intermediate caches when cache is disabled, these cache files will be removed on-the-fly. In default, it's None, so the temp dir will be specified by system. NOTICE: you should be caution when setting this argument because it might cause unexpected program behaviors when this path is set to an unsafe directory.
open_tracer: false                                          # whether to open the tracer to trace the changes during process. It might take more time when opening tracer
op_list_to_trace: []                                        # only ops in this list will be traced by tracer. If it's empty, all ops will be traced. Only available when tracer is opened.
trace_num: 10                                               # number of samples to show the differences between datasets before and after each op. Only available when tracer is opened.
op_fusion: false                                            # whether to fuse operators that share the same intermediate variables automatically. Op fusion might reduce the memory requirements slightly but speed up the whole process.
cache_compress: null                                        # the compression method of the cache file, which can be specified in ['gzip', 'zstd', 'lz4']. If this parameter is None, the cache file will not be compressed. We recommend you turn on this argument when your input dataset is larger than tens of GB and your disk space is not enough.
keep_stats_in_res_ds: false                                 # whether to keep the computed stats in the result dataset. The intermediate fields to store the stats computed by Filters will be removed if it's False. It's False in default.
keep_hashes_in_res_ds: false                                # whether to keep the computed hashes in the result dataset. The intermediate fields to store the hashes computed by Deduplicators will be removed if it's False. It's False in default.

# for multimodal data processing
image_key: 'images'                                         # key name of field to store the list of sample image paths.
image_special_token: '<__dj__image>'                        # the special token that represents an image in the text. In default, it's "<__dj__image>". You can specify your own special token according to your input dataset.
audio_key: 'audios'                                         # key name of field to store the list of sample audio paths.
audio_special_token: '<__dj__audio>'                        # the special token that represents an audio in the text. In default, it's "<__dj__audio>". You can specify your own special token according to your input dataset.
video_key: 'videos'                                         # key name of field to store the list of sample video paths.
video_special_token: '<__dj__video>'                        # the special token that represents a video in the text. In default, it's "<__dj__video>". You can specify your own special token according to your input dataset.

eoc_special_token: '<|__dj__eoc|>'                          # the special token that represents the end of a chunk in the text. In default, it's "<|__dj__eoc|>". You can specify your own special token according to your input dataset.

# for distributed processing
executor_type: default                                      # type of executor, support "default" or "ray" for now.
ray_address: auto                                           # the address of the Ray cluster.

# only for data analysis
percentiles: [0.25, 0.5, 0.75]                              # percentiles to analyze the dataset distribution
export_original_dataset: false                              # whether to export the original dataset with stats. If you only need the stats of the dataset, setting it to false could speed up the exporting.
save_stats_in_one_file: false                               # whether to store all stats result into one file

# for sandbox or hpo
data_probe_algo: 'uniform'                                  # sampling algorithm for dataset. Should be one of ["uniform", "frequency_specified_field_selector", "topk_specified_field_selector"]. It's "uniform" in default. Only used for dataset sampling.
data_probe_ratio: 1.0                                       # the sampling ratio to the original dataset size. It's 1.0 in default. Only used for dataset sampling.
hpo_config: null                                            # path to a configuration file when using auto-HPO tool.

# process schedule: a list of several process operators with their arguments
process:
  # Mapper ops. Most of these ops need no arguments.
  - audio_ffmpeg_wrapped_mapper:                            # simple wrapper for FFmpeg audio filters
  - chinese_convert_mapper:                                 # convert Chinese between Traditional Chinese, Simplified Chinese and Japanese Kanji.
      mode: 's2t'                                             # choose the mode to convert Chinese: ['s2t', 't2s', 's2tw', 'tw2s', 's2hk', 'hk2s', 's2twp', 'tw2sp', 't2tw', 'tw2t', 'hk2t', 't2hk', 't2jp', 'jp2t']
  - clean_email_mapper:                                     # remove emails from text.
  - clean_html_mapper:                                      # remove html formats form text.
  - clean_ip_mapper:                                        # remove ip addresses from text.
  - clean_links_mapper:                                     # remove web links from text.
  - clean_copyright_mapper:                                 # remove copyright comments.
  - expand_macro_mapper:                                    # expand macro definitions in Latex text.
  - extract_qa_mapper:                                      # mapper to extract question and answer pair from text.
      hf_model: 'alibaba-pai/pai-qwen1_5-7b-doc2qa'           # model name on huggingface to extract question and answer pair.
      pattern: null                                           # regular expression pattern to search for within text.
      qa_format: 'chatml'                                     # Output format of question and answer pair.
      enable_vllm: true                                       # Whether to use vllm for inference acceleration.
      tensor_parallel_size: null                              # It is only valid when enable_vllm is True. The number of GPUs to use for distributed execution with tensor parallelism.
      max_model_len: null                                     # It is only valid when enable_vllm is True. Model context length. If unspecified, will be automatically derived from the model config.
      max_num_seqs: 256                                       # It is only valid when enable_vllm is True. Maximum number of sequences to be processed in a single iteration.
      sampling_params: {}                                     # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95}
  - fix_unicode_mapper:                                     # fix unicode errors in text.
  - generate_instruction_mapper:                            # generate new instruction text data.
      hf_model: 'Qwen/Qwen-7B-Chat'                           # model name on huggingface to generate instruction.
      seed_file: 'demos/data/demo-dataset-chatml.jsonl'       # Seed file as instruction samples to generate new instructions, chatml format.
      instruct_num: 3                                         # the number of generated samples.
      similarity_threshold: 0.7                               # the similarity score threshold between the generated samples and the seed samples.Range from 0 to 1. Samples with similarity score less than this threshold will be kept.
      prompt_template: null                                   # Prompt template for generate samples. Please make sure the template contains "{augmented_data}", which corresponds to the augmented samples.
      qa_pair_template: null                                  # Prompt template for generate question and answer pair description. Please make sure the template contains two "{}" to format question and answer. Default: '【问题】\n{}\n【回答】\n{}\n'.
      example_template: null                                  # Prompt template for generate examples. Please make sure the template contains "{qa_pairs}", which corresponds to the question and answer pair description generated by param `qa_pair_template`.
      qa_extraction_pattern: null                             # Regular expression pattern for parsing question and answer from model response.
      enable_vllm: true                                       # Whether to use vllm for inference acceleration.
      tensor_parallel_size: null                              # It is only valid when enable_vllm is True. The number of GPUs to use for distributed execution with tensor parallelism.
      max_model_len: null                                     # It is only valid when enable_vllm is True. Model context length. If unspecified, will be automatically derived from the model config.
      max_num_seqs: 256                                       # It is only valid when enable_vllm is True. Maximum number of sequences to be processed in a single iteration.
      sampling_params: {}                                     # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95}
  - image_blur_mapper:                                      # mapper to blur images.
      p: 0.2                                                  # probability of the image being blured
      blur_type: 'gaussian'                                   # type of blur kernel, including ['mean', 'box', 'gaussian']
      radius: 2                                               # radius of blur kernel
  - image_captioning_from_gpt4v_mapper:                     # generate samples whose texts are generated based on gpt-4-visison and the image
      mode: 'description'                                     # mode of text generated from images, can be one of ['resoning', 'description', 'conversation', 'custom']
      api_key: ''                                             # the API key to authenticate the request
      max_token: 500                                          # the maximum number of tokens to generate. Default is 500.
      temperature: 1.0                                        # controls the randomness of the output (range from 0 to 1). Default is 0.
      system_prompt: ''                                       # a string prompt used to set the context of a conversation and provide global guidance or rules for the gpt4-vision so that it can  generate responses in the expected way. If `mode` set to `custom`, the parameter will be used
      user_prompt: ''                                         # a string prompt to guide the generation of gpt4-vision for each samples. It's "" in default, which means no prompt provided
      user_prompt_key: null                                   # the key name of fields in samples to store prompts for each sample. It's used for set different prompts for different samples. If it's none, use prompt in parameter "prompt". It's None in default
      keep_original_sample: true                              # whether to keep the original sample. If it's set to False, there will be only generated text in the final datasets and the original text will be removed. It's True in default
      any_or_all: 'any'                                       # keep this sample with 'any' or 'all' strategy of all images. 'any': keep this sample if any images meet the condition. 'all': keep this sample only if all images meet the condition
  - image_captioning_mapper:                                # generate captions for images to augment datasets
      hf_img2seq: 'Salesforce/blip2-opt-2.7b'                 # model name on huggingface to generate caption
      caption_num: 1                                          # how many candidate captions to generate for each image
      keep_candidate_mode: 'random_any'                       # retain strategy for the generated $caption_num$ candidates. should be in ["random_any", "similar_one_simhash", "all"].
      keep_original_sample: true                              # whether to keep the original sample. If it's set to False, there will be only generated captions in the final datasets and the original captions will be removed. It's True in default.
      prompt: null                                            # a string prompt to guide the generation of blip2 model for all samples globally. It's None in default, which means no prompt provided.
      prompt_key: null                                        # the key name of fields in samples to store prompts for each sample. It's used for set different prompts for different samples. If it's none, use prompt in parameter "prompt". It's None in default.
      mem_required: '16GB'                                    # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
  - image_diffusion_mapper:                                 # generate images by diffusion model
      hf_diffusion: 'CompVis/stable-diffusion-v1-4'           # stable diffusion model name on huggingface to generate image
      torch_dtype: 'fp32'                                     # the floating point type used to load the diffusion model. Can be one of ['fp32', 'fp16', 'bf16']
      revision: 'main'                                        # The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier allowed by Git.
      strength: 0.8                                           # parameter of stable diffusion model, indicates extent to transform the reference image. will ignore the input image if it equals to 1
      guidance_scale: 7.5                                     # parameter of stable diffusion model, a higher guidance scale value encourages the model to generate images closely linked to the text prompt at the expense of lower image quality
      aug_num: 1                                              # the number of images to generate
      keep_original_sample: true                              # whether to keep the original sample. If it's set to False, there will be only generated images in the final datasets and the original images will be removed. It's True in default.
      caption_key: null                                       # the key name of fields in samples to store captions for each images, the caption guide the diffusion model to produce what the image is
      hf_img2seq: 'Salesforce/blip2-opt-2.7b'                 # model name on huggingface to generate caption if caption_key is null
      mem_required: '8GB'                                    # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
  - image_face_blur_mapper:                                 # blur faces detected in images
      cv_classifier: ''                                       # OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'.
      blur_type: 'gaussian'                                   # type of blur kernel, including ['mean', 'box', 'gaussian']
      radius: 2                                               # radius of blur kernel
  - image_tagging_mapper:                                   # Mapper to generate image tags.
      tag_field_name: '__dj__image_tags__'                    # the field name to store the tags. It's "__dj__image_tags__" in default.
  - nlpaug_en_mapper:                                       # simply augment texts in English based on the nlpaug library
      sequential: false                                       # whether combine all augmentation methods to a sequence. If it's True, a sample will be augmented by all opened augmentation methods sequentially. If it's False, each opened augmentation method would generate its augmented samples independently.
      aug_num: 1                                              # number of augmented samples to be generated. If `sequential` is True, there will be total aug_num augmented samples generated. If it's False, there will be (aug_num * #opened_aug_method) augmented samples generated.
      keep_original_sample: true                              # whether to keep the original sample. If it's set to False, there will be only generated texts in the final datasets and the original texts will be removed. It's True in default.
      delete_random_word: false                               # whether to open the augmentation method of deleting random words from the original texts. e.g. "I love LLM" --> "I LLM"
      swap_random_word: false                                 # whether to open the augmentation method of swapping random contiguous words in the original texts. e.g. "I love LLM" --> "Love I LLM"
      spelling_error_word: false                              # whether to open the augmentation method of simulating the spelling error for words in the original texts. e.g. "I love LLM" --> "Ai love LLM"
      split_random_word: false                                # whether to open the augmentation method of splitting words randomly with whitespaces in the original texts. e.g. "I love LLM" --> "I love LL M"
      keyboard_error_char: false                              # whether to open the augmentation method of simulating the keyboard error for characters in the original texts. e.g. "I love LLM" --> "I ;ov4 LLM"
      ocr_error_char: false                                   # whether to open the augmentation method of simulating the OCR error for characters in the original texts. e.g. "I love LLM" --> "I 10ve LLM"
      delete_random_char: false                               # whether to open the augmentation method of deleting random characters from the original texts. e.g. "I love LLM" --> "I oe LLM"
      swap_random_char: false                                 # whether to open the augmentation method of swapping random contiguous characters in the original texts. e.g. "I love LLM" --> "I ovle LLM"
      insert_random_char: false                               # whether to open the augmentation method of inserting random characters into the original texts. e.g. "I love LLM" --> "I ^lKove LLM"
  - nlpcda_zh_mapper:                                       # simply augment texts in Chinese based on the nlpaug library
      sequential: false                                       # whether combine all augmentation methods to a sequence. If it's True, a sample will be augmented by all opened augmentation methods sequentially. If it's False, each opened augmentation method would generate its augmented samples independently.
      aug_num: 1                                              # number of augmented samples to be generated. If `sequential` is True, there will be total aug_num augmented samples generated. If it's False, there will be (aug_num * #opened_aug_method) augmented samples generated.
      keep_original_sample: true                              # whether to keep the original sample. If it's set to False, there will be only generated texts in the final datasets and the original texts will be removed. It's True in default.
      replace_similar_word: false                             # whether to open the augmentation method of replacing random words with their similar words in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这边一共有5种不同的数据增强方法"
      replace_homophone_char: false                           # whether to open the augmentation method of replacing random characters with their homophones in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的濖据增强方法"
      delete_random_char: false                               # whether to open the augmentation method of deleting random characters from the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据增强"
      swap_random_char: false                                 # whether to open the augmentation method of swapping random contiguous characters in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据强增方法"
      replace_equivalent_num: false                           # whether to open the augmentation method of replacing random numbers with their equivalent representations in the original texts. **Notice**: Only for numbers for now. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有伍种不同的数据增强方法"
  - optimize_instruction_mapper:                            # optimize instruction.
      hf_model: 'alibaba-pai/Qwen2-7B-Instruct-Refine'        # model name on huggingface to optimize instruction
      enable_vllm: true                                       # whether to use vllm for inference acceleration.
      tensor_parallel_size: null                              # It is only valid when enable_vllm is True. The number of GPUs to use for distributed execution with tensor parallelism.
      max_model_len: null                                     # It is only valid when enable_vllm is True. Model context length. If unspecified, will be automatically derived from the model config.
      max_num_seqs: 256                                       # It is only valid when enable_vllm is True. Maximum number of sequences to be processed in a single iteration.
      sampling_params: {}                                     # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95}
  - punctuation_normalization_mapper:                       # normalize unicode punctuations to English punctuations.
  - remove_bibliography_mapper:                             # remove bibliography from Latex text.
  - remove_comments_mapper:                                 # remove comments from Latex text, code, etc.
      doc_type: tex                                           # comment type you want to remove. Only support 'tex' for now.
      inline: true                                            # whether to remove inline comments
      multiline: true                                         # whether to remove multiline comments
  - remove_header_mapper:                                   # remove header texts from Latex text.
      drop_no_head: true                                      # whether to drop sample texts without headers
  - remove_long_words_mapper:                               # remove much too long words from text.
      min_len: 1                                              # the min word length to keep words.
      max_len: 128                                            # the max word length to keep words.
  - remove_non_chinese_character_mapper:                    # remove non-Chinese character in text samples.
      keep_alphabet: true                                     # whether to keep alphabet
      keep_number: true                                       # whether to keep number
      keep_punc: true                                         # whether to keep punctuation
  - remove_repeat_sentences_mapper:                         # remove repeat sentences in text samples.
      lowercase: false                                        # whether to convert sample text to lower case
      ignore_special_character: true                          # whether to ignore special characters when judging repeated sentences. Special characters are all characters except Chinese characters, letters and numbers
      min_repeat_sentence_length: 2                           # sentences shorter than this length will not be deduplicated. If ignore_special_character is set to True, then special characters are not included in this length
  - remove_specific_chars_mapper:                           # remove characters specified by users
      chars_to_remove: '◆●■►▼▲▴∆▻▷❖♡□'                        # a string or a list including those characters that need to be removed
  - remove_table_text_mapper:                               # remove possible table texts from text.
      min_col: 2                                              # the min num of columns in tables to remove
      max_col: 20                                             # the max num of columns in tables to remove
  - remove_words_with_incorrect_substrings_mapper:          # remove words with incorrect substrings from text.
      lang: en                                                # sample in which language
      tokenization: false                                     # whether to use model to tokenize documents
      substrings: ['http', 'www', '.com', 'href', '//']       # incorrect substrings to remove
  - sentence_split_mapper:                                  # split text to multiple sentences and join them with '\n'
      lang: 'en'                                              # split text in what language
  - video_captioning_from_audio_mapper:                     # caption a video according to its audio streams based on Qwen-Audio model
      keep_original_sample: true                              # whether to keep the original sample. If it's set to False, there will be only captioned sample in the final datasets and the original sample will be removed. It's True in default.
      mem_required: '30GB'                                    # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
  - video_captioning_from_frames_mapper:                    # generate samples whose captions are generated based on an image-to-text model and sampled video frames. Captions from different frames will be concatenated to a single string.
      hf_img2seq: 'Salesforce/blip2-opt-2.7b'                 # image-to-text model name on huggingface to generate caption
      caption_num: 1                                          # how many candidate captions to generate for each video
      keep_candidate_mode: 'random_any'                       # retain strategy for the generated $caption_num$ candidates. should be in ["random_any", "similar_one_simhash", "all"].
      keep_original_sample: true                              # whether to keep the original sample. If it's set to False, there will be only generated captions in the final datasets and the original captions will be removed. It's True in default.
      prompt: null                                            # a string prompt to guide the generation of image-to-text model for all samples globally. It's None in default, which means no prompt provided.
      prompt_key: null                                        # the key name of fields in samples to store prompts for each sample. It's used for set different prompts for different samples. If it's none, use prompt in parameter "prompt". It's None in default.
      frame_sampling_method: 'all_keyframes'                  # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
      frame_num: 3                                            # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
      horizontal_flip: false                                  # flip frame image horizontally (left to right).
      vertical_flip: false                                    # flip frame image vertically (top to bottom).
      mem_required: '20GB'                                    # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
  - video_captioning_from_summarizer_mapper:                # generate video captions by summarizing several kinds of generated texts (captions from video/audio/frames, tags from audio/frames, ...)
      hf_summarizer: 'mrm8488/flan-t5-large-finetuned-openai-summarize_from_feedback'  # the summarizer model used to summarize texts generated by other methods.
      consider_video_caption_from_video: true                 # whether to consider the video caption generated from video directly in the summarization process. Default: True.
      consider_video_caption_from_audio: true                 # whether to consider the video caption generated from audio streams in the video in the summarization process. Default: True.
      consider_video_caption_from_frames: true                # whether to consider the video caption generated from sampled frames from the video in the summarization process. Default: True.
      consider_video_tags_from_audio: true                    # whether to consider the video tags generated from audio streams in the video in the summarization process. Default: True.
      consider_video_tags_from_frames: true                   # whether to consider the video tags generated from sampled frames from the video in the summarization process. Default: True.
      vid_cap_from_vid_args: null                             # the arg dict for video captioning from video directly with keys are the arg names and values are the arg values. Default: None.
      vid_cap_from_frm_args: null                             # the arg dict for video captioning from sampled frames from the video with keys are the arg names and values are the arg values. Default: None.
      vid_tag_from_aud_args: null                             # the arg dict for video tagging from audio streams in the video with keys are the arg names and values are the arg values. Default: None.
      vid_tag_from_frm_args: null                             # the arg dict for video tagging from sampled frames from the video with keys are the arg names and values are the arg values. Default: None.
      keep_tag_num: 5                                         # max number N of tags from sampled frames to keep. Too many tags might bring negative influence to summarized text, so we consider to only keep the N most frequent tags. Default: 5.
      keep_original_sample: true                              # whether to keep the original sample. If it's set to False, there will be only summarized captions in the final datasets and the original captions will be removed. It's True in default.
      mem_required: '40GB'                                    # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
  - video_captioning_from_video_mapper:                     # generate captions by frame images extracted from video to augment datasets
      hf_video_blip: 'kpyu/video-blip-opt-2.7b-ego4d'         # video-blip model name on huggingface to generate caption
      caption_num: 1                                          # how many candidate captions to generate for each video
      keep_candidate_mode: 'random_any'                       # retain strategy for the generated $caption_num$ candidates. should be in ["random_any", "similar_one_simhash", "all"].
      keep_original_sample: true                              # whether to keep the original sample. If it's set to False, there will be only generated captions in the final datasets and the original captions will be removed. It's True in default.
      prompt: null                                            # a string prompt to guide the generation of video-blip model for all samples globally. It's None in default, which means no prompt provided.
      prompt_key: null                                        # the key name of fields in samples to store prompts for each sample. It's used for set different prompts for different samples. If it's none, use prompt in parameter "prompt". It's None in default.
      frame_sampling_method: 'all_keyframes'                  # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
      frame_num: 3                                            # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
      horizontal_flip: false                                  # flip frame image horizontally (left to right).
      vertical_flip: false                                    # flip frame image vertically (top to bottom).
      mem_required: '20GB'                                    # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
  - video_face_blur_mapper:                                 # blur faces detected in videos
      cv_classifier: ''                                       # OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'.
      blur_type: 'gaussian'                                   # type of blur kernel, including ['mean', 'box', 'gaussian']
      radius: 2                                               # radius of blur kernel
  - video_ffmpeg_wrapped_mapper:                            # simple wrapper for FFmpeg video filters
  - video_remove_watermark_mapper:                          # Remove the watermarks in videos given regions
      roi_strings: ['0,0,0.1,0.1']                            # a given list of regions the watermarks locate. The format of each can be "x1, y1, x2, y2", "(x1, y1, x2, y2)", or "[x1, y1, x2, y2]".
      roi_type: ratio                                         # the roi string type. When the type is 'pixel', (x1, y1), (x2, y2) are the locations of pixels in the top left corner and the bottom right corner respectively. If the roi_type is 'ratio', the coordinates are normalized by wights and heights.
      roi_key: null                                           # the key name of fields in samples to store roi_strings for each sample. It's used for set different rois for different samples.
      frame_num: 10                                           # the number of frames to be extracted uniformly from the video to detect the pixels of watermark.
      min_frame_threshold: 7                                  # a coodination is considered as the location of a watermark pixel when it is a watermark pixel in no less min_frame_threshold frames.
      detection_method: pixel_value                           # the method to detect the pixels of watermark. If it is 'pixel_value', we consider the distribution of pixel value in each frame. If it is 'pixel_diversity', we will consider the pixel diversity in different frames.
  - video_resize_aspect_ratio_mapper:                       # resize videos aspect ratios of videos (a fraction of width by height, r=w/h) to a specified range
      min_ratio: 9/21                                         # the minimum aspect ratio to enforce videos with an aspect ratio below `min_ratio` will be resized to match this minimum ratio. The ratio should be provided as a string in the format "9:21" or "9/21".
      max_ratio: 21/9                                         # the maximum aspect ratio to enforce videos with an aspect ratio above `max_ratio` will be resized to match this maximum ratio. The ratio should be provided as a string in the format "21:9" or "21/9".
      strategy:  increase                                     # the resizing strategy to apply when adjusting the video dimensions. It can be either 'decrease' to reduce the dimension or 'increase' to enlarge it. Accepted values are ['decrease', 'increase'].
  - video_resize_resolution_mapper:                         # map videos to ones with given resolution range
      min_width: 640                                          # the min horizontal resolution (unit p), videos with width less than 'min_width' will be mapped to videos with equal or bigger width
      max_width: 1280                                         # the max horizontal resolution (unit p), videos with width more than 'max_width' will be mapped to videos with equal of smaller width
      min_height: 480                                         # the min vertical resolution (unit p), videos with height less than 'min_height' will be mapped to videos with equal or bigger height
      max_height: 1080                                        # the max vertical resolution (unit p), videos with height more than 'max_height' will be mapped to videos with equal or smaller height
      force_original_aspect_ratio: 'increase'                 # Enable decreasing or increasing output video width or height if necessary to keep the original aspect ratio
      force_divisible_by: 4                                   # Ensures that both the output dimensions, width and height, are divisible by the given integer when used together with force_original_aspect_ratio
  - video_split_by_duration_mapper:                         # Mapper to split video by duration.
      split_duration: 10                                      # duration of each video split in seconds.
      min_last_split_duration: 0.1                            # the minimum allowable duration in seconds for the last video split. If the duration of the last split is less than this value, it will be discarded.
      keep_original_sample: true                              # whether to keep the original sample. If it's set to False, there will be only cut sample in the final datasets and the original sample will be removed. It's True in default
  - video_split_by_key_frame_mapper:                        # Mapper to split video by key frame.
      keep_original_sample: true                              # whether to keep the original sample. If it's set to False, there will be only cut sample in the final datasets and the original sample will be removed. It's True in default
  - video_split_by_scene_mapper:                            # split videos into scene clips
      detector: 'ContentDetector'                             # PySceneDetect scene detector. Should be one of ['ContentDetector', 'ThresholdDetector', 'AdaptiveDetector`]
      threshold: 27.0                                         # threshold passed to the detector
      min_scene_len: 15                                       # minimum length of any scene
      show_progress: false                                    # whether to show progress from scenedetect
  - video_tagging_from_audio_mapper:                        # Mapper to generate video tags from audio streams extracted from the video.
      hf_ast: 'MIT/ast-finetuned-audioset-10-10-0.4593'       # Huggingface model name for the audio classification model.
      tag_field_name: '__dj__video_audio_tags__'              # the field name to store the tags. It's "__dj__video_audio_tags__" in default.
      mem_required: '500MB'                                   # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
  - video_tagging_from_frames_mapper:                       # Mapper to generate video tags from frames extracted from the video.
      frame_sampling_method: 'all_keyframes'                  # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
      frame_num: 3                                            # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
      tag_field_name: '__dj__video_frame_tags__'              # the field name to store the tags. It's "__dj__video_frame_tags__" in default.
  - whitespace_normalization_mapper:                        # normalize different kinds of whitespaces to English whitespace.

  # Filter ops
  - alphanumeric_filter:                                    # filter text with alphabet/numeric ratio out of specific range.
      tokenization: false                                     # whether to count the ratio of alphanumeric to the total number of tokens.
      min_ratio: 0.0                                          # the min ratio of filter range
      max_ratio: 0.9                                          # the max ratio of filter range
  - audio_duration_filter:                                  # keep data samples whose audios' durations are within a specified range.
      min_duration: 0                                         # the min audio duration of filter range (in seconds)
      max_duration: 3600                                      # the max audio duration of filter range (in seconds)
      any_or_all: any                                         # keep this sample when any/all audios meet the filter condition
  - audio_nmf_snr_filter:                                   # keep data samples whose audios' SNRs (computed based on NMF) are within a specified range.
      min_snr: 0                                              # the min audio SNR to keep samples in dB. It's 0 by default.
      max_snr: 1000                                           # the max audio SNR to keep samples in dB. It's sys.maxsize by default.
      nmf_iter_num: 500                                       # the max number of iterations to run NMF. It's 500 in default.
      any_or_all: any                                         # keep this sample when any/all audios meet the filter condition
  - audio_size_filter:                                      # keep data samples whose audios' sizes are within a specified range.
      min_duration: "0"                                       # the min audio size of filter range
      max_duration: "1TB"                                     # the max audio size of filter range
      any_or_all: any                                         # keep this sample when any/all audios meet the filter condition
  - average_line_length_filter:                             # filter text with the average length of lines out of specific range.
      min_len: 10                                             # the min length of filter range
      max_len: 10000                                          # the max length of filter range
  - character_repetition_filter:                            # filter text with the character repetition ratio out of specific range
      rep_len: 10                                             # repetition length for char-level n-gram
      min_ratio: 0.0                                          # the min ratio of filter range
      max_ratio: 0.5                                          # the max ratio of filter range
  - flagged_words_filter:                                   # filter text with the flagged-word ratio larger than a specific max value
      lang: en                                                # consider flagged words in what language
      tokenization: false                                     # whether to use model to tokenize documents
      max_ratio: 0.0045                                       # the max ratio to filter text
      flagged_words_dir: ./assets                             # directory to store flagged words dictionaries
      use_words_aug: false                                    # whether to augment words, especially for Chinese and Vietnamese
      words_aug_group_sizes: [2]                              # the group size of words to augment
      words_aug_join_char: ""                                 # the join char between words to augment
  - image_aesthetics_filter:                                # filter samples according to the aesthetics score of images.
      hf_scorer_model: shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE # Huggingface model name for the aesthetics predictor
      min_score: 0.3                                          # the min aesthetics score of filter range
      max_score: 1.0                                          # the max aesthetics score of filter range
      any_or_all: any                                         # keep this sample when any/all images meet the filter condition
      mem_required: '1500MB'                                  # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
  - image_aspect_ratio_filter:                              # filter samples according to the aspect ratios of images (a fraction of width by height, r=w/h) in them
      min_ratio: 0.333                                        # the min aspect ratio of filter range
      max_ratio: 3.0                                          # the max aspect ratio of filter range
      any_or_all: any                                         # keep this sample when any/all images meet the filter condition
  - image_face_count_filter:                                # filter samples according to the face count in images
      cv_classifier: ''                                       # OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'.
      min_face_count: 1                                       # the minimum number of faces required for samples.
      max_face_count: 1                                       # the maximum number of faces required for samples.
  - image_face_ratio_filter:                                # filter samples according to the face area ratios in images (r=face_area/image_area). If multiple faces are available, we use the largest one.
      cv_classifier: ''                                       # OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'.
      min_ratio: 0.0                                          # the min face area ratio of filter range
      max_ratio: 0.4                                          # the max face area ratio of filter range
  - image_nsfw_filter:                                      # filter samples according to the nsfw scores of images in them
      hf_nsfw_model: Falconsai/nsfw_image_detection           # Huggingface model name for nsfw classification
      score_threshold: 0.5                                    # the nsfw score threshold for samples, range from 0 to 1. Samples with nsfw score less than this threshold will be kept.
      any_or_all: any                                         # keep this sample when any/all images meet the filter condition
      mem_required: '1GB'                                     # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
  - image_pair_similarity_filter:                           # filter samples according to the similarity score between the image pair.
      hf_clip: 'openai/clip-vit-base-patch32'                 # model name of the CLIP model on huggingface
      min_score: 0.1                                          # the min similarity score of filter range
      max_score: 1.0                                          # the max similarity score of filter range
      any_or_all: "any"                                       # keep this sample when any/all images meet the filter condition
  - image_shape_filter:                                     # filter samples according to the widths and heights of images in them
      min_width: 200                                          # the min width of width filter range
      max_width: 5000                                         # the max width of width filter range
      min_height: 200                                         # the min height of height filter range
      max_height: 5000                                        # the max height of height filter range
      any_or_all: any                                         # keep this sample when any/all images meet the filter condition
  - image_size_filter:                                      # filter samples according to the size of images (in bytes) within them
      min_size:  "0"                                          # the min size of filter range
      max_size: "1TB"                                         # the max size of filter range
      any_or_all: any                                         # keep this sample when any/all images meet the filter condition
  - image_text_matching_filter:                             # filter samples according to the matching score between image and text.
      hf_blip: Salesforce/blip-itm-base-coco                  # name of used Hugging Face blip
      min_score: 0.003                                        # the min matching score of filter range
      max_score: 1.0                                          # the max matching score of filter range
      horizontal_flip: false                                  # flip image horizontally (left to right).
      vertical_flip: false                                    # flip image vertically (top to bottom).
      reduce_mode: avg                                        # reduce mode when one text corresponds to multiple images in a chunk,  must be one of ['avg','max', 'min'].
      any_or_all: any                                         # keep this sample when any/all images meet the filter condition
      mem_required: '1500MB'                                  # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
  - image_text_similarity_filter:                           # filter samples according to the similarity between image and text.
      hf_clip: openai/clip-vit-base-patch32                   # name of used Hugging Face clip
      min_score: 0.1                                          # the min similarity of filter range
      max_score: 1.0                                          # the max similarity of filter range
      horizontal_flip: false                                  # flip image horizontally (left to right).
      vertical_flip: false                                    # flip image vertically (top to bottom).
      reduce_mode: avg                                        # reduce mode when one text corresponds to multiple images in a chunk,  must be one of ['avg','max', 'min'].
      any_or_all: any                                         # keep this sample when any/all images meet the filter condition
      mem_required: '1500MB'                                  # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
  - image_watermark_filter:                                 # filter samples according to the predicted watermark probabilities of images in them
      hf_watermark_model: amrul-hzz/watermark_detector        # Huggingface model name for watermark classification
      prob_threshold: 0.8                                     # the predicted watermark probability threshold for samples, range from 0 to 1. Samples with watermark probability less than this threshold will be kept.
      any_or_all: any                                         # keep this sample when any/all images meet the filter condition
      mem_required: '500MB'                                   # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
  - language_id_score_filter:                               # filter text in specific language with language scores larger than a specific max value
      lang: en                                                # keep text in what language
      min_score: 0.8                                          # the min language scores to filter text
  - maximum_line_length_filter:                             # filter text with the maximum length of lines out of specific range
      min_len: 10                                             # the min length of filter range
      max_len: 10000                                          # the max length of filter range
  - perplexity_filter:                                      # filter text with perplexity score out of specific range
      lang: en                                                # compute perplexity in what language
      max_ppl: 1500                                           # the max perplexity score to filter text
  - phrase_grounding_recall_filter:                         # filter samples according to the locating recall of phrases extracted from text in the images.
      hf_owlvit: google/owlvit-base-patch32                   # name of used Hugging Face Owl-ViT
      min_recall: 0.1                                         # the min phrase grounding recall of filter range
      max_recall: 1.0                                         # the max phrase grounding recall of filter range
      horizontal_flip: false                                  # flip image horizontally (left to right).
      vertical_flip: false                                    # flip image vertically (top to bottom).
      iou_thr: 0.5                                            # the IoU threshold for NMS-like post-process
      large_area_ratio_thr: 0.95                              # the area ratio threshold for filtering out large predicted bboxes
      conf_thr: 0.0                                           # the confidence score threshold for removing low-confidence bboxes
      reduce_mode: avg                                        # reduce mode when one text corresponds to multiple images in a chunk,  must be one of ['avg','max', 'min'].
      any_or_all: any                                         # keep this sample when any/all images meet the filter condition
      mem_required: '1GB'                                     # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
  - special_characters_filter:                              # filter text with special-char ratio out of specific range
      min_ratio: 0.0                                          # the min ratio of filter range
      max_ratio: 0.25                                         # the max ratio of filter range
  - specified_field_filter:                                 # filter text with the specified field info out of specific range
      field_key: ''                                           # the target key corresponding to multi-level field information need to be separated by '.'
      target_value: []                                        # the range of specified field information corresponding to the samples that need to be retained
  - specified_numeric_field_filter:                         # filter text with the specified numeric field info out of specific range
      field_key: ''                                           # the target key corresponding to multi-level field information need to be separated by '.'
      min_value: 0                                            # the min filter value in SpecifiedNumericField op
      max_value: 10000                                        # the max filter value in SpecifiedNumericField op
  - stopwords_filter:                                       # filter text with stopword ratio smaller than a specific min value
      lang: en                                                # consider stopwords in what language
      tokenization: false                                     # whether to use model to tokenize documents
      min_ratio: 0.3                                          # the min ratio to filter text
      stopwords_dir: ./assets                                 # directory to store stopwords dictionaries
      use_words_aug: false                                    # whether to augment words, especially for Chinese and Vietnamese
      words_aug_group_sizes: [2]                              # the group size of words to augment
      words_aug_join_char: ""                                 # the join char between words to augment
  - suffix_filter:                                          # filter to keep samples with specified suffix.
      suffixes: []                                            # the suffix of text that will be keep. For example: '.txt', 'txt' or ['txt', '.pdf', 'docx']
  - text_action_filter:                                     # filter text according the number of action verb
      lang: en                                                # consider the words in what language
      min_action_num: 1                                       # text will be filtered whose verbs less the min action number
  - text_entity_dependency_filter:                          # filter text without non-independent entity nouns
      lang: en                                                # consider the words in what language
      min_dependency_num: 1                                   # the min number of adjacent edges of a non-independent noun in dependency tree
      any_or_all: any                                         # keep this sample when any/all entity nouns are non-independent
  - text_length_filter:                                     # filter text with length out of specific range
      min_len: 10                                             # the min length of filter range
      max_len: 10000                                          # the max length of filter range
  - token_num_filter:                                       # filter text with total token number out of specific range
      hf_tokenizer: EleutherAI/pythia-6.9b-deduped            # name of used Hugging Face tokenizer
      min_num: 10                                             # the min number of filter range
      max_num: 10000                                          # the max number of filter range
  - video_aesthetics_filter:                                # filter samples according to the aesthetics score of frame images extracted from videos.
      hf_scorer_model: shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE # Huggingface model name for the aesthetics predictor
      min_score: 0.3                                          # the min aesthetics score of filter range
      max_score: 1.0                                          # the max aesthetics score of filter range
      frame_sampling_method: 'uniform'                        # sampling method of extracting frame images from the videos. Should be one of ["all_keyframe", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "uniform" with frame_num=3, considering that the number of keyframes can be large while their difference is usually small in terms of their aesthetics.
      frame_num: 3                                            # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
      reduce_mode: avg                                        # reduce mode to the all frames extracted from videos, must be one of ['avg','max', 'min'].
      any_or_all: any                                         # keep this sample when any/all images meet the filter condition
      mem_required: '1500MB'                                  # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
  - video_aspect_ratio_filter:                              # filter samples according to the aspect ratios of videos (a fraction of width by height, r=w/h) in them
      min_ratio: 9/21                                         # the minimum aspect ratio to keep samples, supported format is a string, such as "9:21" or "9/21".
      max_ratio: 21/9                                         # the maximum aspect ratio to keep samples, supported format is a string, such as "21:9" or "21/9".
      any_or_all: any                                         # keep this sample when any/all videos meet the filter condition
  - video_duration_filter:                                  # Keep data samples whose videos' durations are within a specified range.
      min_duration: 0                                         # the min video duration of filter range (in seconds)
      max_duration: 10                                        # the max video duration of filter range (in seconds)
      any_or_all: any                                         # keep this sample when any/all videos meet the filter condition
  - video_frames_text_similarity_filter:                    # keep samples those similarities between sampled video frame images and text within a specific range.
      hf_clip: openai/clip-vit-base-patch32                   # clip model name on huggingface to compute the similarity between frame image and text. It's kind of language-related. For example, for Chinese datasets, ChineseCLIP might be a better choice.
      min_score: 0.1                                          # the min similarity to keep samples.
      max_score: 1.0                                          # the max similarity to keep samples.
      frame_sampling_method: all_keyframes                    # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
      frame_num: 3                                            # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
      horizontal_flip: false                                  # flip frame image horizontally (left to right).
      vertical_flip: false                                    # flip frame image vertically (top to bottom).
      reduce_mode: avg                                        # reduce mode when one text corresponds to multiple videos in a chunk,  must be one of ['avg','max', 'min'].
      any_or_all: any                                         # keep this sample when any/all videos meet the filter condition
      mem_required: '1GB'                                     # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
  - video_motion_score_filter:                              # Keep samples with video motion scores within a specific range.
      min_score: 0.25                                         # the minimum motion score to keep samples
      max_score: 10000.0                                      # the maximum motion score to keep samples
      sampling_fps: 2                                         # the samplig rate of frames_per_second to compute optical flow
      size: null                                              # resize frames along the smaller edge before computing optical flow, or a sequence like (h, w)
      max_size: null                                          # maximum allowed for the longer edge of resized frames
      relative: false                                         # whether to normalize the optical flow magnitude to [0, 1], relative to the frame's diagonal length
      any_or_all: any                                         # keep this sample when any/all videos meet the filter condition
  - video_nsfw_filter:                                      # filter samples according to the nsfw scores of videos in them
      hf_nsfw_model: Falconsai/nsfw_image_detection           # Huggingface model name for nsfw classification
      score_threshold: 0.5                                    # the nsfw score threshold for samples, range from 0 to 1. Samples with nsfw score less than this threshold will be kept.
      frame_sampling_method: all_keyframes                    # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
      frame_num: 3                                            # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
      reduce_mode: avg                                        # reduce mode for multiple sampled video frames to compute nsfw scores of videos, must be one of ['avg','max', 'min'].
      any_or_all: any                                         # keep this sample when any/all images meet the filter condition
      mem_required: '1GB'                                     # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
  - video_ocr_area_ratio_filter:                            # Keep data samples whose detected text area ratios for specified frames in the video are within a specified range.
      min_area_ratio: 0                                       # the min ocr area ratio to keep samples. It's 0 by default.
      max_area_ratio: 1.0                                     # the max ocr area ratio to keep samples. It's 1.0 by default.
      frame_sample_num: 3                                     # the number of sampled frames to calculate the ocr area ratio. If it's 1, only middle frame will be selected. If it's 2, only the first and the last frames will be selected. If it's larger than 2, in addition to the first and the last frames, other frames will be sampled evenly within the video duration.
      languages_to_detect: ['ch_sim', 'en']                   # texts in which languages should be detected. Default: ['ch_sim', 'en']. Full language list can be found here: https://www.jaided.ai/easyocr/.
      any_or_all: any                                         # keep this sample with 'any' or 'all' strategy of all videos. 'any': keep this sample if any videos meet the condition. 'all': keep this sample only if all videos meet the condition.
  - video_resolution_filter:                                # filter samples according to the resolution of videos in them
      min_width: 1280                                         # the min resolution of horizontal resolution filter range (unit p)
      max_width: 4096                                         # the max resolution of horizontal resolution filter range (unit p)
      min_height: 480                                         # the min resolution of vertical resolution filter range (unit p)
      max_height: 1080                                        # the max resolution of vertical resolution filter range (unit p)
      any_or_all: any                                         # keep this sample when any/all videos meet the filter condition
  - video_watermark_filter:                                 # filter samples according to the predicted watermark probabilities of videos in them
      hf_watermark_model: amrul-hzz/watermark_detector        # Huggingface model name for watermark classification
      prob_threshold: 0.8                                     # the predicted watermark probability threshold for samples, range from 0 to 1. Samples with watermark probability less than this threshold will be kept.
      frame_sampling_method: all_keyframes                    # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
      frame_num: 3                                            # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
      reduce_mode: avg                                        # reduce mode for multiple sampled video frames to compute final predicted watermark probabilities of videos, must be one of ['avg','max', 'min'].
      any_or_all: any                                         # keep this sample when any/all images meet the filter condition
      mem_required: '500MB'                                   # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
  - video_tagging_from_frames_filter:                       # filter samples according to the tags of videos in them
      tags: ['people']                                        # a tag list to shift the videos, total tags can be found in https://github.com/xinyu1205/recognize-anything/blob/main/ram/data/ram_tag_list.txt
      contain: any                                            # require the videos containing 'any' or 'all' given tags. When tags equal to [], 'all' keeps all samples, 'any' keeps no sample.
      frame_sampling_method: all_keyframes                    # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
      frame_num: 3                                            # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
      tag_field_name: '__dj__video_frame_tags__'              # the field name to store the tags. It's "__dj__video_frame_tags__" in default.
      any_or_all: any                                         # keep this sample when any/all videos meet the filter condition
  - words_num_filter:                                       # filter text with number of words out of specific range
      lang: en                                                # sample in which language
      tokenization: false                                     # whether to use model to tokenize documents
      min_num: 10                                             # the min number of filter range
      max_num: 10000                                          # the max number of filter range
  - word_repetition_filter:                                 # filter text with the word repetition ratio out of specific range
      lang: en                                                # sample in which language
      tokenization: false                                     # whether to use model to tokenize documents
      rep_len: 10                                             # repetition length for word-level n-gram
      min_ratio: 0.0                                          # the min ratio of filter range
      max_ratio: 0.5                                          # the max ratio of filter range

  # Deduplicator ops
  - document_deduplicator:                                  # deduplicate text samples using md5 hashing exact matching method
      lowercase: false                                        # whether to convert text to lower case
      ignore_non_character: false                             # whether to ignore non-alphabet characters, including whitespaces, digits, and punctuations
  - document_minhash_deduplicator:                          # deduplicate text samples using MinHash-LSH method
      tokenization: space                                     # tokenization method for text. One of [space, punctuation, character, sentencepiece]
      window_size: 5                                          # window size of shingling
      num_permutations: 256                                   # number of permutations in minhash computing
      jaccard_threshold: 0.7                                  # the min jaccard similarity threshold in near-duplicate detection. When the jaccard similarity of two sample texts is >= this threshold, they are regarded as similar samples and this op will only keep one of them after deduplication
      num_bands: null                                         # number of bands in LSH. Default it's None, and it will be determined by an optimal params computation algorithm by minimize the weighted sum of probs of False Positives and False Negatives
      num_rows_per_band: null                                 # number of rows in each band in LSH. Default it's None, and it will be determined by an optimal params computation algorithm
      lowercase: true                                         # whether to convert text to lower case
      ignore_pattern: null                                    # whether to ignore sub-strings with specific pattern when computing simhash.
      tokenizer_model: null                                   # path for the sentencepiece model, used for sentencepiece tokenization.
  - document_simhash_deduplicator:                          # deduplicate text samples using SimHash-LSH method
      tokenization: space                                     # tokenization method for text. One of [space, punctuation, character]
      window_size: 6                                          # window size of shingling
      num_blocks: 6                                           # number of blocks in SimHash computing
      hamming_distance: 4                                     # the max hamming distance to regard 2 samples as similar enough pair. Should be less than num_blocks always
      lowercase: true                                         # whether to convert text to lower case
      ignore_pattern: null                                    # whether to ignore sub-strings with specific pattern when computing simhash.
  - image_deduplicator:                                     # deduplicator to deduplicate samples at document-level using exact matching of images between documents.
      method: phash                                           # hash method for image. One of [phash, dhash, whash, ahash]
      consider_text: false                                    # whether to consider text hash together with image hash when applying deduplication.
  - video_deduplicator:                                     # deduplicator to deduplicate samples at document-level using exact matching of videos between documents.
      consider_text: false                                    # whether to consider text hash together with video hash when applying deduplication.
  - ray_video_deduplicator:                                 # the simple video deduplicator that can run on multi-nodes using md5 hashing exact matching method
      redis_host: 'redis_host'                              # the host of the redis instance
      redis_port: 6380                                      # the port of redis instance, please note that the default port of redis is 6379 which is the same as default port for ray, so we need to modify the default redis config to use it in other port
  - ray_image_deduplicator:                                 # the simple image deduplicator that can deduplicate samples at document-level using exact matching of images between documents.
      redis_host: 'redis_host'                              # the host of the redis instance
      redis_port: 6380                                      # the port of redis instance, please note that the default port of redis is 6379 which is the same as default port for ray, so we need to modify the default redis config to use it in other port
      method: phash                                         # hash method for image. One of [phash, dhash, whash, ahash]
  - ray_document_deduplicator:                              # the simple document deduplicator that can run on multi-nodes using md5 hashing exact matching method
      redis_host: 'redis_host'                              # the host of the redis instance
      redis_port: 6380                                      # the port of redis instance, please note that the default port of redis is 6379 which is the same as default port for ray, so we need to modify the default redis config to use it in other port
      lowercase: false                                        # whether to convert text to lower case
      ignore_non_character: false                             # whether to ignore non-alphabet characters, including whitespaces, digits, and punctuations

  # Selector ops
  - frequency_specified_field_selector:                     # selector to select samples based on the sorted frequency of specified field value
      field_key: ''                                           # the target keys corresponding to multi-level field information need to be separated by '.'
      top_ratio:                                              # ratio of selected top specified field value
      topk:                                                   # number of selected top specified field value
      reverse: True                                           # determine the sorting rule, if reverse=True, then sort in descending order
  - random_selector:                                        # selector to random select samples
      select_ratio:                                           # the ratio to be sampled
      select_num:                                             # the number to be sampled
  - range_specified_field_selector:                         # selector to select a range of samples based on the sorted specified field value from smallest to largest.
      field_key: ''                                           # the target keys corresponding to multi-level field information need to be separated by '.'
      lower_percentile:                                       # the lower bound of the percentile to be sampled
      upper_percentile:                                       # the upper bound of the percentile to be sampled
      lower_rank:                                             # the lower rank of the percentile to be sampled
      upper_rank:                                             # the upper rank of the percentile to be sampled
  - topk_specified_field_selector:                          # selector to select top samples based on the sorted specified field
      field_key: ''                                           # the target keys corresponding to multi-level field information need to be separated by '.'
      top_ratio:                                              # ratio of selected top samples
      topk:                                                   # number of selected top sample
      reverse: True                                           # determine the sorting rule, if reverse=True, then sort in descending order