In [None]:
%pip install azure-ai-ml
%pip install azureml-core
%pip install azure-identity
%pip install azureml-rag
%pip install azureml.fsspec
%pip install pandas
%pip install openai~=0.27.8 # versioning for to allow dataplane deployment inferring

In [None]:
# If `import win32file` fails with a DLL error then run the following and restart kernel:
# %pip uninstall -y pywin32
# %conda install -y --force-reinstall pywin32

# QA Data Generation

QA Data Generation is part of the RAG (Retrieval Augmented Generation) creation process, where the autogenerated QA dataset is used:

1. To get the best prompt for RAG
2. To get evaluation metrics for RAG

This notebook shows you how to create a QA dataset from your data (Git repo). We run just the components needed for QA Data Generation and not for the full RAG creation flow.

## Get client for AzureML Workspace

The workspace is the top-level resource for Azure Machine Learning, providing a centralized place to work with all the artifacts you create when you use Azure Machine Learning. In this section we will connect to the workspace in which the job will be run.

In [None]:
## User Inputs
# Workspace used for the job; these values are only used to build `ml_client`
# when it cannot be created from workspace.json.
subscription_id = ""
resource_group = ""
workspace_name = ""
# Workspace that holds the Azure OpenAI connection (used to build `openai_ws`).
openai_connection_subscription_id = ""
openai_connection_resource_group = ""
openai_connection_workspace_name = ""

# Defaults
# Registry from which the pipeline component is fetched.
registry_name = "azureml-preview"

In [None]:
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient
from azureml.core import Workspace

# Prefer the default credential chain and probe it once, so a broken chain is detected
# here rather than on the first service call.
try:
    credential = DefaultAzureCredential()
    # Check if given credential can get token successfully.
    credential.get_token("https://management.azure.com/.default")
except Exception as ex:
    # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential does not work.
    # Report why, so the fallback is not silent.
    print(f"DefaultAzureCredential failed ({type(ex).__name__}: {ex}); falling back to InteractiveBrowserCredential")
    credential = InteractiveBrowserCredential()

# Prefer a local workspace.json; otherwise use the workspace given in the User Inputs cell.
try:
    ml_client = MLClient.from_config(credential=credential, path='workspace.json')
except Exception as ex:
    print(f"Could not create MLClient from workspace.json ({type(ex).__name__}: {ex}); using the User Inputs values instead")
    ml_client = MLClient(
        credential=credential,
        subscription_id=subscription_id,
        resource_group_name=resource_group,
        workspace_name=workspace_name
    )

# azureml-core Workspace handles: `ws` mirrors the workspace behind `ml_client`,
# `openai_ws` is the workspace that holds the Azure OpenAI connection.
ws = Workspace(subscription_id=ml_client.subscription_id, resource_group=ml_client.resource_group_name, workspace_name=ml_client.workspace_name)
openai_ws = Workspace(subscription_id=openai_connection_subscription_id, resource_group=openai_connection_resource_group, workspace_name=openai_connection_workspace_name)
print(ml_client)

## Azure OpenAI

We recommend using the gpt-35-turbo model to get good quality QAs. [Follow these instructions](https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal) to set up an Azure OpenAI instance and deploy the model. Once you have the model deployed in AOAI, you can specify your model name and deployment name below.

In [None]:
# Completion model and the name of its deployment in Azure OpenAI.
aoai_completion_model_name = 'gpt-35-turbo'
aoai_completion_deployment_name = 'gpt35turbo'
# Embedding model and the name of its deployment in Azure OpenAI.
aoai_embedding_model_name = 'text-embedding-ada-002'
aoai_embedding_deployment_name = 'test'
# Name of the workspace connection that is looked up (or created) in `openai_ws`.
# NOTE(review): 'test' and 'christest123' look like author-specific values — replace with your own.
aoai_connection = "christest123"

In [None]:
from azureml.rag.utils.connections import get_connection_by_name_v2, create_connection_v2

# `aoai_connection` starts out as the connection *name* (a str) and must end up as the
# resolved connection object, because that is what is handed to `infer_deployment` later.
if not isinstance(aoai_connection, str):
    # The cell is being re-run and the connection is already resolved; do not pass the
    # connection object back in as if it were a name.
    aoai_connection_id = aoai_connection['id']
else:
    aoai_connection_name = aoai_connection
    try:
        aoai_connection = get_connection_by_name_v2(openai_ws, aoai_connection_name)
        aoai_connection_id = aoai_connection['id']
    except Exception as ex:
        print(f"Could not get connection '{aoai_connection_name}', creating a new one")

        target = '<target>' # example: 'https://<endpoint>.openai.azure.com/'
        key = '<key>'
        apiVersion = '<api_version>' # 2023-03-15-preview

        # Refuse to create a connection from the untouched placeholders.
        if key == '<key>':
            raise RuntimeError("Please provide a valid key for the Azure OpenAI service")
        if target == '<target>':
            raise RuntimeError("Please provide a valid target for the Azure OpenAI service")
        if apiVersion == '<api_version>':
            raise RuntimeError("Please provide a valid api-version for the Azure OpenAI service")

        # Keep the created connection object (not just its id) so later cells receive a
        # connection rather than the bare name string.
        # NOTE(review): assumes create_connection_v2 returns an object of the same shape as
        # get_connection_by_name_v2 — confirm.
        aoai_connection = create_connection_v2(
            workspace=openai_ws,
            name=aoai_connection_name,
            category='AzureOpenAI',
            target=target,
            auth_type='ApiKey',
            credentials={
                'key': key
            },
            metadata={
                'apiType': 'azure',
                'apiVersion': apiVersion
            }
        )
        aoai_connection_id = aoai_connection['id']

In [None]:
# Uncomment to upgrade azureml-rag if infer_deployment is unrecognized in the package
# %pip install azureml-rag --upgrade

from azureml.rag.utils.deployment import infer_deployment

# Replace the deployment name configured earlier with the one inferred from the
# connection for the chosen completion model.
aoai_completion_deployment_name = infer_deployment(aoai_connection, aoai_completion_model_name)
print(f"Deployment name in AOAI workspace for model '{aoai_completion_model_name}' is '{aoai_completion_deployment_name}'")

### Setup Pipeline

In [None]:
# Client scoped to the registry (rather than a workspace); components are fetched from it.
ml_registry = MLClient(credential=credential, registry_name = registry_name)

# Individual step components, currently disabled; only the single ingest component
# below is fetched and used by the pipeline definition.
# validate_deployments_component = ml_registry.components.get('llm_rag_validate_deployments', label='latest')
# git_clone_component = ml_registry.components.get('llm_rag_git_clone', label='latest')
# crack_and_chunk_component = ml_registry.components.get('llm_rag_crack_and_chunk', label='latest')
# data_generation_component = ml_registry.components.get('llm_rag_qa_data_generation', label='latest')
# Version labelled 'latest' of the Git-to-Faiss ingest component.
git_to_faiss_component = ml_registry.components.get('llm_ingest_git_to_faiss_basic', label='latest')

In [None]:
from azure.ai.ml import Output
from azure.ai.ml.dsl import pipeline

def use_automatic_compute(component, instance_count=1, instance_type='Standard_D2s_v3'):
    """Request automatically provisioned compute for a pipeline step.

    Calls ``component.set_resources`` with the given instance count and instance type,
    and with the ``compute_specification`` property marked as automatic.

    Returns:
        The same ``component`` that was passed in.
    """
    auto_compute_properties = {'compute_specification': {'automatic': True}}
    component.set_resources(
        instance_count=instance_count,
        instance_type=instance_type,
        properties=auto_compute_properties,
    )
    return component

def use_aoai_connection(component, aoai_connection_id, custom_env:str=None):
    """Expose an Azure OpenAI connection id to a pipeline step via environment variables.

    Args:
        component: Pipeline step whose ``environment_variables`` mapping is updated.
        aoai_connection_id: Connection id to expose. When ``None``, nothing is set.
        custom_env: Optional extra environment variable name that should also carry
            the connection id.

    Returns:
        The same ``component``, mirroring ``use_automatic_compute``.
    """
    if aoai_connection_id is None:
        # Nothing to expose; in particular, never store a None value under custom_env.
        return component
    if custom_env is not None:
        component.environment_variables[custom_env] = aoai_connection_id
    component.environment_variables['AZUREML_WORKSPACE_CONNECTION_ID_AOAI'] = aoai_connection_id
    return component

# @pipeline(compute=dedicated_cpu_compute)
@pipeline(default_compute='serverless')
def qa_faiss_index_generation(
    git_url,
    data_source_url,
    llm_completion_config,
    embeddings_model,
    aoai_connection_id=None,
    chunk_size=1024,
    chunk_overlap=0,
    chunk_prepend_summary=False,
):
    # Pipeline with a single active step: `git_to_faiss_component` is invoked with the
    # Git repository, model configuration and chunking options, and its `faiss_index`
    # output is returned as `qa_faiss_index`.
    # The same `aoai_connection_id` is passed as both the LLM and the embedding connection.
    #
    # Plain comments are used here instead of a docstring on purpose.
    # NOTE(review): the `@pipeline` decorator may use a docstring as the pipeline
    # description — confirm before converting this to a docstring.
    #
    # The commented-out steps below are disabled. They reference names (branch_name,
    # data_source_glob, document_path_replacement_regex, asset_name) that are not
    # parameters of this function, so they cannot simply be uncommented as-is.

    # validate_deployments = validate_deployments_component(
    #     llm_config = llm_completion_config,
    #     check_completion = "True",
    #     check_embeddings = "False"
    # )
    # use_automatic_compute(validate_deployments)
    # use_aoai_connection(validate_deployments, aoai_connection_id, custom_env='AZUREML_WORKSPACE_CONNECTION_ID_AOAI_COMPLETION')

    # git_clone = git_clone_component(
    #     git_repository=git_url,
    #     branch_name=branch_name
    # )
    # use_automatic_compute(git_clone)

    # crack_and_chunk = crack_and_chunk_component(
    #     input_data=git_clone.outputs.output_data,
    #     input_glob=data_source_glob,
    #     chunk_size=1024,
    #     data_source_url=data_source_url,
    #     document_path_replacement_regex=document_path_replacement_regex
    # )
    # use_automatic_compute(crack_and_chunk)

    #  # QA Data Generation
    # data_generation = data_generation_component(
    #     input_data = crack_and_chunk.outputs.output_chunks,
    #     deployment_validation = validate_deployments.outputs.output_data,
    #     llm_config = llm_completion_config,
    #     dataset_size = 10,  # Number of QAs to be generated
    #     dataset_name = asset_name,
    #     register_output =  True
    # )
    # use_automatic_compute(data_generation)
    # use_aoai_connection(data_generation, aoai_connection_id)

    # Ingest Git to Faiss Vector Index
    git_to_faiss = git_to_faiss_component(
        git_repository = git_url,
        data_source_url = data_source_url,
        llm_config = llm_completion_config,
        llm_connection = aoai_connection_id,
        embeddings_model = embeddings_model,
        embedding_connection = aoai_connection_id,
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        chunk_prepend_summary = chunk_prepend_summary
    )
    # Request automatically provisioned compute (default instance count/type) for the step.
    use_automatic_compute(git_to_faiss)


    # Pipeline outputs, keyed by output name.
    return {
        'qa_faiss_index': git_to_faiss.outputs.faiss_index,
    }

In [None]:
# User Inputs
# Repository to ingest and the URL recorded as the data source.
git_url = 'https://github.com/microsoft/ml-wrappers'
data_source_url = 'https://github.com/microsoft/ml-wrappers'
# Chunking options forwarded to the ingest component.
chunk_size = "1024"
chunk_overlap = "0"
chunk_prepend_summary = False
#data_source_glob = 'articles/machine-learning/**/*'

## This regex is used to remove the 'articles' folder from the source URL put in each file's metadata in the index.
#document_path_replacement_regex = r'{"match_pattern": "(.*)/articles/(.*)(\\.[^.]+)$", "replacement_pattern": "\\1/\\2"}'
#asset_name = 'qa_data'
experiment_name = 'qa_faiss_index_generation'

# Defaults
# Embeddings model URI in the form azure_open_ai://deployment/<deployment>/model/<model>.
# The URI must start with the scheme itself, so it is not wrapped in literal braces.
embeddings_model = f'azure_open_ai://deployment/{aoai_embedding_deployment_name}/model/{aoai_embedding_model_name}'
# Completion model configuration, serialized as a JSON object string.
llm_completion_config = f'{{"type":"azure_open_ai","model_name":"{aoai_completion_model_name}","deployment_name":"{aoai_completion_deployment_name}","temperature":"0","max_tokens":"2000"}}'

In [None]:
from azure.ai.ml import Input
from azure.ai.ml.entities import UserIdentityConfiguration

# data_source_glob=data_source_glob,
# asset_name=asset_name,
# document_path_replacement_regex=document_path_replacement_regex,
# Instantiate the pipeline with the inputs and defaults defined in the earlier cells.
pipeline_job = qa_faiss_index_generation(
    git_url = git_url,
    data_source_url = data_source_url,
    llm_completion_config = llm_completion_config,
    embeddings_model = embeddings_model,
    aoai_connection_id=aoai_connection_id,
    chunk_size = chunk_size,
    chunk_overlap = chunk_overlap,
    chunk_prepend_summary = chunk_prepend_summary,
)

# Use a user-identity configuration for the job.
pipeline_job.identity = UserIdentityConfiguration()
# Do not continue with the remaining steps once a step has failed.
pipeline_job.settings.continue_on_step_failure = False

# pipeline_job.settings.force_rerun = True # Rerun each time so that git_clone isn't cached, if intent is to ingest latest data.

### Submit Pipeline
Click on the generated link below to access the job details in the studio. Make sure all necessary flights are added to the URL to access these preview features.

**In case of any errors see [TROUBLESHOOT.md](../../TROUBLESHOOT.md).**

In [None]:
# Submit the pipeline job under `experiment_name`; the returned job object is shown
# as the cell output.
running_pipeline_job = ml_client.jobs.create_or_update(
    pipeline_job, experiment_name=experiment_name
)
running_pipeline_job

### Review generated QA data

In [None]:
# import fsspec
# import pandas as pd

# qa_data = ml_client.data.get(f"{asset_name}-test-data", label='latest')
# with fsspec.open(qa_data.path) as f:
#     df = pd.read_json(f, lines=True)
# df.head(2)

In [None]:
# for qa_type in ["TOPIC", "FACTUAL", "BOOLEAN"]:
#     print(f"{qa_type} Question Answers:")
#     for _, row in df[df["qaType"] == qa_type][:2].iterrows():
#         print("Q:", row["question"])
#         print("A:", row["answer"])
#         print()

### Review token usage

In [None]:
# running_pipeline_job = ml_client.jobs.get("<pipeline run id>")
# Submission returns immediately, so wait for the pipeline to reach a terminal state
# before inspecting its child runs; otherwise the list may still be empty.
ml_client.jobs.stream(running_pipeline_job.name)

child_runs = list(ml_client.jobs.list(parent_job_name=running_pipeline_job.name))
if not child_runs:
    # Fail with an actionable message instead of a bare IndexError on child_runs[-1].
    raise RuntimeError(f"Pipeline job '{running_pipeline_job.name}' has no child runs to inspect")
# The last listed child run is the one whose metrics are reviewed below.
data_generation_run = child_runs[-1]

In [None]:
from azureml.core import Run

# Look up the selected child run by name in workspace `ws` and read its metrics.
run = Run.get(ws, data_generation_run.name)
metrics = run.get_metrics()

In [None]:
# The selected child run may not have logged these metrics; report that explicitly
# instead of failing with a KeyError.
for label, metric_key in (("Tokens used", "total_tokens"), ("Model used", "llm_model_name")):
    print(f"{label}: {metrics.get(metric_key, '<not logged by this run>')}")

Given the token usage and the model you can compute cost using the pricing here: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/.