# Data

This notebook is ingesting data into the storage container to be used for the indexers. We're using the OpenPaymentsData dataset which is available @ https://openpaymentsdata.cms.gov/. This data is made locally available as part of this example.

In [7]:
import os
from src.config import PARTITIONS, PARTITIONED_INDEX_NAME, BASELINE_INDEX_NAME
from src.data.ADLSGen2Loader import ADLSGen2Loader
from src.constants import OPENPAYMENTSDATA_FIELD_MAPPINGS, OPENPAYMENTSDATA_FIELDS
from src.data.DataSourceManagement import DataSourceManagement
from src.indexers.IndexerManagement import IndexerManagement
from src.indexes.IndexManagement import IndexManagement

data_dir = "data/openpaymentsdata"

In [8]:
data_dir = os.getcwd() + "/data/openpaymentsdata"
loader = ADLSGen2Loader(data_dir)
loader.upload_files()

Files uploaded successfully.


# Setup

These cells should initialize the setup on behalf of the indexes, data source mappings, and indexers to be created and run.

In [9]:
def create_resources():
    index_manager = IndexManagement()
    data_source_manager = DataSourceManagement()
    indexer_manager = IndexerManagement()

    index_manager.create_index(BASELINE_INDEX_NAME, OPENPAYMENTSDATA_FIELDS)  # this is the baseline source, to compare to.
    index_manager.create_index(PARTITIONED_INDEX_NAME, OPENPAYMENTSDATA_FIELDS)

    for i, partition in enumerate(PARTITIONS, start=1):
        data_source_name = f"{PARTITIONED_INDEX_NAME}-ds-{i}"
        indexer_name = f"{PARTITIONED_INDEX_NAME}-indexer-{i}"
        data_source_manager.create_data_source(data_source_name, partition)
        indexer_manager.create_indexer(indexer_name, data_source_name, OPENPAYMENTSDATA_FIELD_MAPPINGS)

    # create a single index to compare to
    baseline_data_source_name = f"{BASELINE_INDEX_NAME}-ds"
    data_source_manager.create_data_source(baseline_data_source_name, None)
    indexer_manager.create_indexer(f"{BASELINE_INDEX_NAME}-indexer", baseline_data_source_name, OPENPAYMENTSDATA_FIELD_MAPPINGS)

create_resources()

Index openpaymentsdata-baseline created successfully.
Index openpaymentsdata-partitioned created successfully.
Data source openpaymentsdata-partitioned-ds-1 created successfully.
Indexer openpaymentsdata-partitioned-indexer-1 created successfully.
Data source openpaymentsdata-partitioned-ds-2 created successfully.
Indexer openpaymentsdata-partitioned-indexer-2 created successfully.
Data source openpaymentsdata-partitioned-ds-3 created successfully.
Indexer openpaymentsdata-partitioned-indexer-3 created successfully.
Data source openpaymentsdata-partitioned-ds-4 created successfully.
Indexer openpaymentsdata-partitioned-indexer-4 created successfully.
Data source openpaymentsdata-partitioned-ds-5 created successfully.
Indexer openpaymentsdata-partitioned-indexer-5 created successfully.
Data source openpaymentsdata-partitioned-ds-6 created successfully.
Indexer openpaymentsdata-partitioned-indexer-6 created successfully.
Data source openpaymentsdata-partitioned-ds-7 created successfully.

# Reset Indexes & Run Performance

This step will reset the indexes in parallel to suggest the performance time it takes.

In [21]:
def reset_indexers():
    indexer_manager = IndexerManagement()

    for i in range(1, len(PARTITIONS) + 1):
        indexer_name = f"{PARTITIONED_INDEX_NAME}-indexer-{i}"
        indexer_manager.reset_indexer(indexer_name)
        indexer_manager.run_indexer(indexer_name)

reset_indexers()

Failed to reset indexer openpaymentsdata-partitioned-indexer-1: {"error":{"code":"","message":"Indexer 'openpaymentsdata-partitioned-indexer-1' was not found in service 'aisearch-zycz-premium'."}}
Failed to run indexer openpaymentsdata-partitioned-indexer-1: {"error":{"code":"","message":"Indexer 'openpaymentsdata-partitioned-indexer-1' was not found in service 'aisearch-zycz-premium'."}}
Failed to reset indexer openpaymentsdata-partitioned-indexer-2: {"error":{"code":"","message":"Indexer 'openpaymentsdata-partitioned-indexer-2' was not found in service 'aisearch-zycz-premium'."}}
Failed to run indexer openpaymentsdata-partitioned-indexer-2: {"error":{"code":"","message":"Indexer 'openpaymentsdata-partitioned-indexer-2' was not found in service 'aisearch-zycz-premium'."}}
Failed to reset indexer openpaymentsdata-partitioned-indexer-3: {"error":{"code":"","message":"Indexer 'openpaymentsdata-partitioned-indexer-3' was not found in service 'aisearch-zycz-premium'."}}
Failed to run index

If the below step fails, your Indexers might still be running. 

In [18]:
# Calculate total run time
indexer_manager = IndexerManagement()
partitioned_index_time = indexer_manager.calculate_total_run_time()
baseline_index_time = indexer_manager.get_indexer_run_time(f"{BASELINE_INDEX_NAME}-indexer")

print(f"Partitioned index took {partitioned_index_time} seconds to run.")
print(f"Baseline index took {baseline_index_time} seconds to run.")
print(f"Difference in execution, partitioned index took {partitioned_index_time - baseline_index_time} seconds faster to run.")

TypeError: strptime() argument 1 must be str, not None

# Cleanup

Tasks to clean up information from AI Search, as required!

In [6]:
def delete_resources():
    index_manager = IndexManagement()
    data_source_manager = DataSourceManagement()
    indexer_manager = IndexerManagement()

    index_manager.delete_index(PARTITIONED_INDEX_NAME)

    for i in range(1, len(PARTITIONS) + 1):
        data_source_name = f"{PARTITIONED_INDEX_NAME}-ds-{i}"
        indexer_name = f"{PARTITIONED_INDEX_NAME}-indexer-{i}"
        data_source_manager.delete_data_source(data_source_name)
        indexer_manager.delete_indexer(indexer_name)

    baseline_data_source_name = f"{BASELINE_INDEX_NAME}-ds"
    data_source_manager.delete_data_source(baseline_data_source_name)
    indexer_manager.delete_indexer(f"{BASELINE_INDEX_NAME}-indexer")

    loader.delete_partition_folders()

delete_resources()

Failed to delete index openpaymentsdata-partitioned: {"error":{"code":"OperationNotAllowed","message":"No index with the name 'openpaymentsdata-partitioned' was found in a service named 'aisearch-zycz-premium'.","details":[{"code":"IndexNotFoundInService","message":"No index with the name 'openpaymentsdata-partitioned' was found in a service named 'aisearch-zycz-premium'."}]}}
Failed to delete data source openpaymentsdata-partitioned-ds-1: {"error":{"code":"","message":"No datasource with the name 'openpaymentsdata-partitioned-ds-1' was found in a service named 'aisearch-zycz-premium'."}}
Failed to delete indexer openpaymentsdata-partitioned-indexer-1: {"error":{"code":"","message":"Indexer 'openpaymentsdata-partitioned-indexer-1' was not found in service 'aisearch-zycz-premium'."}}
Failed to delete data source openpaymentsdata-partitioned-ds-2: {"error":{"code":"","message":"No datasource with the name 'openpaymentsdata-partitioned-ds-2' was found in a service named 'aisearch-zycz-pre

NameError: name 'loader' is not defined