In [1]:
# imports and environment setup
import copy
import json
import os
import subprocess
import tqdm
from openai import AzureOpenAI
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so that
# later os.getenv(...) lookups (e.g. FORM_RECOGNIZER_KEY) can resolve them.
load_dotenv()  

True

In [2]:
# Enumerate the dataset folders that live under the index-data root.
INDEX_DATA_ROOT = "../../data/gptassertdata/index_data"
os.listdir(INDEX_DATA_ROOT)

['aks',
 'azure-docs',
 'Mercedes',
 'merge_contact',
 'nyc',
 'pdf-testing',
 'Premera',
 'product-info',
 'test_folder_sophie',
 'test_loranorm',
 'tprompt',
 'yw']

In [4]:
# Load the template configuration. config.json holds a JSON list; the indexing
# loop further down copies its first entry (config[0]) as the base for each
# per-dataset config file.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying on
# the platform's default locale encoding (cp1252 on many Windows setups).
with open("./config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

config

[{'data_path': '<path to data>',
  'location': "<azure region, e.g. 'westus2'>",
  'subscription_id': '<subscription id>',
  'resource_group': '<resource group name>',
  'search_service_name': '<search service name to use or create>',
  'index_name': '<index name to use or create>',
  'chunk_size': 1024,
  'token_overlap': 128,
  'semantic_config_name': 'default',
  'language': "<Language to support for example use 'en' for English. Checked supported languages here under lucene - https://learn.microsoft.com/en-us/azure/search/index-add-language-analyzers"}]

In [4]:
# Form Recognizer API key, taken from the process environment (None when unset).
FORM_RECOGNIZER_KEY = os.environ.get("FORM_RECOGNIZER_KEY")

In [10]:
def _small_512_index(prefix):
    """Return the index name for `prefix` in the embed_003_small_512_512 naming scheme."""
    return f"{prefix}_embed_003_small_512_512_index"


# Maps each data folder name to its run settings. A plain string value is the
# target index name; a dict value carries the index name under "index" plus
# optional "subfolder" and "form-rec-use-layout" overrides that the indexing
# loop reads. Insertion order is the order in which the folders are processed.
run_config_by_data_path_3_small_512_512 = {
    "aks": _small_512_index("aks"),
    "azure-docs": {"index": _small_512_index("azure"), "subfolder": "azure-docs"},
    "Mercedes": _small_512_index("mercedes"),
    "merge_contact": {
        "index": _small_512_index("merge_contact"),
        "form-rec-use-layout": False,
    },
    "nyc": _small_512_index("nyc"),
    "Premera": _small_512_index("premera"),
    "product-info": {
        "index": _small_512_index("product_info"),
        "subfolder": "product-info",
    },
    "test_loranorm": {
        "index": _small_512_index("test_loranorm"),
        "form-rec-use-layout": False,
    },
    "tprompt": {"index": _small_512_index("tprompt"), "subfolder": "source"},
    "yw": _small_512_index("yw"),
}

In [12]:
# Build one search index per data folder: write a per-folder config file, then
# run data_preparation.py against it as a subprocess.

# The key is passed on every command line, so check it once up front instead of
# failing with an opaque TypeError while assembling the first command.
if not FORM_RECOGNIZER_KEY:
    raise RuntimeError(
        "FORM_RECOGNIZER_KEY is not set; define it in the environment or the .env file."
    )

for key, cfg in tqdm.tqdm(run_config_by_data_path_3_small_512_512.items()):
    folder = os.path.join("../../data/gptassertdata/index_data", key)

    # A bare string is shorthand for "index name only, default options".
    if isinstance(cfg, str):
        index = cfg
        form_rec_use_layout = True
    else:
        index = cfg["index"]
        form_rec_use_layout = cfg.get("form-rec-use-layout", True)
        if "subfolder" in cfg:
            folder = os.path.join(folder, cfg["subfolder"])

    # Start from the template entry and override only the per-dataset fields.
    config_key = copy.deepcopy(config[0])
    config_key["data_path"] = os.path.abspath(folder)
    config_key["index_name"] = index

    print(config_key["data_path"])
    config_file = f"config.{key}.json"
    with open(config_file, "w") as f:
        json.dump([config_key], f)

    # Keep the command as an argv list: each element reaches the child process
    # as exactly one argument, so no manual quoting is needed (the URL must NOT
    # carry embedded quote characters) and spaces in values cannot split an
    # argument. A joined string without shell=True does not work on POSIX.
    command = [
        "python",
        "data_preparation.py",
        "--config",
        config_file,
        "--embedding-model-endpoint",
        "https://wed-aiq-aoai-eus.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-02-15-preview",
        "--form-rec-resource",
        "test-tprompt",
        "--form-rec-key",
        FORM_RECOGNIZER_KEY,
    ]
    if form_rec_use_layout:
        command.append("--form-rec-use-layout")
    command.append("--njobs=8")

    # text=True yields readable str output; errors="replace" keeps undecodable
    # bytes in the child's output from raising here.
    proc = subprocess.run(command, capture_output=True, text=True, errors="replace")
    if proc.returncode != 0:
        # Never echo the secret key into the notebook output.
        redacted_command = [
            "<redacted>" if arg == FORM_RECOGNIZER_KEY else arg for arg in command
        ]
        print("Error running", redacted_command)
        print(proc.stderr)
        print(proc.stdout)

  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

C:\Users\sophiechen.REDMOND\OneDrive - Microsoft\Documents\oyd\data\gptassertdata\index_data\aks


 10%|███████▎                                                                 | 1/10 [11:39:42<104:57:24, 41982.78s/it]

C:\Users\sophiechen.REDMOND\OneDrive - Microsoft\Documents\oyd\data\gptassertdata\index_data\azure-docs\azure-docs


 20%|██████████████▊                                                           | 2/10 [20:59:15<82:18:03, 37035.43s/it]

C:\Users\sophiechen.REDMOND\OneDrive - Microsoft\Documents\oyd\data\gptassertdata\index_data\Mercedes


 30%|██████████████████████▏                                                   | 3/10 [21:08:49<39:38:27, 20386.80s/it]

C:\Users\sophiechen.REDMOND\OneDrive - Microsoft\Documents\oyd\data\gptassertdata\index_data\merge_contact


 40%|█████████████████████████████▌                                            | 4/10 [21:13:17<20:44:23, 12443.88s/it]

C:\Users\sophiechen.REDMOND\OneDrive - Microsoft\Documents\oyd\data\gptassertdata\index_data\nyc


 50%|█████████████████████████████████████▌                                     | 5/10 [21:23:05<11:20:42, 8168.58s/it]

C:\Users\sophiechen.REDMOND\OneDrive - Microsoft\Documents\oyd\data\gptassertdata\index_data\Premera


 60%|█████████████████████████████████████████████▌                              | 6/10 [21:37:55<6:19:35, 5693.97s/it]

C:\Users\sophiechen.REDMOND\OneDrive - Microsoft\Documents\oyd\data\gptassertdata\index_data\product-info\product-info


 70%|█████████████████████████████████████████████████████▏                      | 7/10 [21:42:41<3:16:17, 3925.94s/it]

C:\Users\sophiechen.REDMOND\OneDrive - Microsoft\Documents\oyd\data\gptassertdata\index_data\test_loranorm


 80%|████████████████████████████████████████████████████████████▊               | 8/10 [21:47:12<1:32:04, 2762.41s/it]

C:\Users\sophiechen.REDMOND\OneDrive - Microsoft\Documents\oyd\data\gptassertdata\index_data\tprompt\source


 90%|██████████████████████████████████████████████████████████████████████▏       | 9/10 [21:52:17<33:14, 1994.26s/it]

C:\Users\sophiechen.REDMOND\OneDrive - Microsoft\Documents\oyd\data\gptassertdata\index_data\yw


100%|█████████████████████████████████████████████████████████████████████████████| 10/10 [21:57:17<00:00, 7903.76s/it]
