In [7]:
import os
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential

import openai
import pandas as pd

from llama_index.core.llama_dataset import download_llama_dataset
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding

from ragas.metrics import (
    Faithfulness,
    ContextPrecision,
    ContextRecall
)
from ragas.llms import LlamaIndexLLMWrapper
from ragas.embeddings import LlamaIndexEmbeddingsWrapper
from ragas.dataset_schema import SingleTurnSample, EvaluationDataset
from ragas.evaluation import evaluate
from ragas.run_config import RunConfig
from dotenv import load_dotenv

from openai import AzureOpenAI
from common import create_search_index,create_search_datasource,create_search_skillset, create_search_indexer



# Load settings from a local .env file (if present) into the process environment,
# so the os.environ / os.getenv lookups in the cells below can find them.
load_dotenv()

True

#### Prerequisites:
- Azure Subscription
- Azure AI Search Service
- Azure Storage Account with a container
- Azure OpenAI Service

#### Download sample dataset and upload it to the container in blob storage

In [2]:
# Download the "PaulGrahamEssayDataset" LlamaIndex dataset into ./data.
# The call yields the evaluation examples (rag_dataset) and the source documents.
dataset_download_options = {
    "llama_dataset_class": "PaulGrahamEssayDataset",
    "download_dir": "./data",
    "show_progress": True,
}
rag_dataset, documents = download_llama_dataset(**dataset_download_options)

# Plain text of each source document, in download order.
texts = [document.text_resource.text for document in documents]

# Tabular view of the evaluation examples; preview a random handful of rows.
df_ragas = rag_dataset.to_pandas()
df_ragas.sample(5)

100%|██████████| 1/1 [00:00<00:00,  2.30it/s]
Loading files: 100%|██████████| 1/1 [00:00<00:00,  1.39file/s]


Unnamed: 0,query,reference_contexts,reference_answer,reference_answer_by,query_by
36,"In the essay, the author discusses the develop...",[It was missing a lot of things you'd want in ...,The author developed a new programming languag...,ai (gpt-4),ai (gpt-4)
27,Paul Graham mentions a strategy he used for wr...,[It's not that unprestigious types of work are...,Paul Graham found that giving talks was an eff...,ai (gpt-4),ai (gpt-4)
43,Paul Graham mentions his experience of leaving...,[[18] The worst thing about leaving YC was not...,Paul Graham describes his experience of leavin...,ai (gpt-4),ai (gpt-4)
15,Describe the pivotal moment in the essay when ...,[Meanwhile I'd been hearing more and more abou...,The pivotal moment in the essay when the autho...,ai (gpt-4),ai (gpt-4)
2,"In the essay, the author discusses his initial...",[I couldn't have put this into words when I wa...,The two main influences that initially drew th...,ai (gpt-4),ai (gpt-4)


In [3]:
# Dump the first document's text to a local UTF-8 file; a later cell uploads
# this file to blob storage.
texts = [document.text_resource.text for document in documents]
file_path = "PaulGrahamEssayDataset.txt"
with open(file_path, "w", encoding="utf-8") as essay_file:
    essay_file.write(texts[0])

#### Azure Blob Storage Configuration

In [4]:
# All three storage settings are mandatory: indexing os.environ raises
# KeyError as soon as one of them is missing.
blob_container, blob_connection_string, blob_account_url = (
    os.environ[setting_name]
    for setting_name in (
        "AZURE_BLOB_CONTAINER",
        "AZURE_BLOB_CONNECTION_STRING",
        "AZURE_BLOB_ACCOUNT_URL",
    )
)

#### Upload sample data to blob storage

In [None]:
from azure.storage.blob import BlobServiceClient

def open_blob_client():
    """Create a BlobServiceClient for the configured storage account.

    When `blob_connection_string` starts with "ResourceId" the client is built
    from `blob_account_url` with `DefaultAzureCredential`; otherwise it is
    built directly from the connection string. Both paths use 8 MiB for
    `max_block_size` and `max_single_put_size`.
    """
    eight_mib = 1024 * 1024 * 8
    transfer_limits = {
        "max_block_size": eight_mib,
        "max_single_put_size": eight_mib,
    }

    if blob_connection_string.startswith("ResourceId"):
        return BlobServiceClient(
            account_url=blob_account_url,
            credential=DefaultAzureCredential(),
            **transfer_limits,
        )

    return BlobServiceClient.from_connection_string(
        blob_connection_string,
        **transfer_limits,
    )

# Account-level client -> container client; create the container on first use.
blob_service_client = open_blob_client()
container_client = blob_service_client.get_container_client(blob_container)
container_found = container_client.exists()
if not container_found:
    container_client.create_container()

# The blob is named after the local file and uploaded only if it is not
# already present in the container.
blob_name = os.path.basename(file_path)
blob_client = container_client.get_blob_client(blob_name)
blob_found = blob_client.exists()
if not blob_found:
    with open(file_path, "rb") as local_file:
        blob_client.upload_blob(data=local_file, overwrite=True)

#### Azure OpenAI Configuration

In [5]:
# Read the Azure OpenAI settings once; the cells below reuse these names.
azure_openai_embedding_deployment_id = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")
azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_openai_key = os.getenv("OPENAI_API_KEY")

# Module-level client configuration used by `openai.chat.completions.create`.
# openai>=1.0 takes the Azure endpoint from `azure_endpoint`, so set it
# explicitly rather than relying on the SDK's environment-variable fallback.
openai.api_type = "azure"
openai.api_key = azure_openai_key
openai.azure_endpoint = azure_openai_endpoint
openai.api_version = os.getenv("OPENAI_API_VERSION")

# Legacy (openai<1.0) attribute name. Kept because the index-creation cell
# still reads `openai.api_base` to obtain the endpoint.
openai.api_base = azure_openai_endpoint

#### Azure AI Search Configuration

In [None]:
# Settings read with .get() become None when the variable is unset ...
search_endpoint = os.environ.get('AZURE_SEARCH_SERVICE_ENDPOINT')
api_key = os.environ.get('AZURE_SEARCH_API_KEY')
search_index = os.environ.get('AZURE_SEARCH_INDEX_NAME')

# ... whereas these lookups raise KeyError when the variable is missing.
search_datasource = os.environ["AZURE_SEARCH_DATASOURCE"]
search_skillset = os.environ["AZURE_SEARCH_SKILLSET"]
search_indexer = os.environ["AZURE_SEARCH_INDEXER"]

# Echo the data source name as the cell output.
search_datasource

In [None]:
# Key-authenticated client for index management on the search service.
search_index_client = SearchIndexClient(
    endpoint=search_endpoint,
    credential=AzureKeyCredential(api_key),
)

# Build the index definition with the project helper (from `common`), passing
# the index name plus the Azure OpenAI endpoint, embedding deployment and key,
# then create the index or update it in place if it already exists.
index = create_search_index(
    search_index,
    openai.api_base,
    azure_openai_embedding_deployment_id,
    openai.api_key,
)
search_index_client.create_or_update_index(index)


In [8]:
# Client used below to manage the data source, skillset and indexer.
search_indexer_client = SearchIndexerClient(
    endpoint=search_endpoint,
    credential=AzureKeyCredential(api_key),
)

In [None]:
# Data source definition built by the project helper from the data source
# name, the blob connection string and the container name; then registered
# (or updated) on the search service.
data_source = create_search_datasource(search_datasource, blob_connection_string, blob_container)
search_indexer_client.create_or_update_data_source_connection(data_source)

In [None]:
# Text-splitting options forwarded to the skillset helper: page-based
# splitting with a maximum page length of 2000 and an overlap of 500.
chunking_options = {
    "text_split_mode": 'pages',
    "maximum_page_length": 2000,
    "page_overlap_length": 500,
}

# Build the skillset with the project helper and create/update it on the service.
skillset = create_search_skillset(
    search_skillset,
    search_index,
    azure_openai_endpoint,
    azure_openai_embedding_deployment_id,
    azure_openai_key,
    **chunking_options,
)
search_indexer_client.create_or_update_skillset(skillset)

In [11]:
import time  # stdlib; imported here because only this cell needs it

# Wire the data source, skillset and target index together, then start a run.
indexer = create_search_indexer(
    indexer_name=search_indexer,
    index_name=search_index,
    datasource_name=search_datasource,
    skillset_name=search_skillset
)
search_indexer_client.create_or_update_indexer(indexer)
search_indexer_client.run_indexer(search_indexer)

# The indexer run is asynchronous: without waiting, the query cell that
# follows can search an index that is still empty when the notebook is
# executed top to bottom. Poll the indexer status for a bounded time.
INDEXER_POLL_SECONDS = 5
INDEXER_WAIT_TIMEOUT_SECONDS = 600

wait_deadline = time.monotonic() + INDEXER_WAIT_TIMEOUT_SECONDS
while True:
    # Sleep before the first read so the service can register the new run;
    # otherwise the status of a previous run could be picked up.
    time.sleep(INDEXER_POLL_SECONDS)
    last_run = search_indexer_client.get_indexer_status(search_indexer).last_result
    if last_run is not None and last_run.status != "inProgress":
        print(f"Indexer '{search_indexer}' last run status: {last_run.status}")
        break
    if time.monotonic() >= wait_deadline:
        # Best effort: report and carry on instead of failing the notebook.
        print(
            f"Indexer '{search_indexer}' still running after "
            f"{INDEXER_WAIT_TIMEOUT_SECONDS}s; search results may be incomplete."
        )
        break

In [12]:
# Query client bound to the index named by `search_index`; used by the
# retrieval loop below.
search_client = search_index_client.get_search_client(search_index)

#### Query the documents using Azure OpenAI

In [None]:
def extract_answer(response_content):
    """Return the answer text from a model reply.

    The prompt asks the model to reply as "Query: ... / Answer: ...". When the
    "Answer:" marker is present, the text that follows its first occurrence
    (up to the next marker, if any) is returned. When the model ignores the
    format, the whole reply is returned instead of raising IndexError. A
    missing reply (None) yields an empty string.
    """
    reply_text = response_content or ""
    parts = reply_text.split("Answer:")
    if len(parts) > 1:
        return parts[1].strip()
    return reply_text.strip()


results = []

queries = df_ragas['query'].tolist()

for query in queries:
    # Retrieve the top 5 chunks for this query from the search index.
    search_results = search_client.search(
        search_text=query,
        search_fields=["chunk"],  # Adjust to match your index schema
        select=["chunk"],  # Fields you want to retrieve
        top=5  # Limit the number of results
    )

    # Collect relevant contexts
    relevant_contexts = [result["chunk"] for result in search_results]

    combined_context = "\n".join(relevant_contexts)

    if not combined_context:
        # Nothing retrieved: fall back to the full text of the first document.
        print("No relevant contexts found.")
        combined_context = texts[0]

    prompt = (
        f"Sources:\n{combined_context}\n\n"
        "There is a list of queries and one source document. "
        "Answer each query using only the sources provided above. "
        "Structure your response for each query as follows:\n"
        "Query: [The query]\n"
        "Answer: [Your answer to the query].\n\n"
    )

    prompt += f"Query: {query}\n"

    # Send the query to Azure OpenAI
    response = openai.chat.completions.create(
        model="gpt-4o",  # Use your Azure OpenAI deployment name
        messages=[
            {"role": "user", "content": prompt}
        ],
        max_tokens=200  # Adjust as needed for large responses
    )

    answer = extract_answer(response.choices[0].message.content)

    results.append({
        "user_input": query,
        # Stored as one newline-joined string; the evaluation cell wraps it in a list.
        "retrieved_contexts": combined_context,
        "response": answer
    })

df = pd.DataFrame(results)
# Rows were appended in the same order as df_ragas['query'], so assign the
# reference answers by position rather than relying on index alignment.
df['reference'] = df_ragas['reference_answer'].tolist()

# Preview only; avoid dumping the whole frame into the notebook output.
df.head()

#### Evaluation

In [17]:
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding

# Connection settings shared by the embedding model and the LLM. All three
# are required: os.environ[...] raises KeyError if one is missing.
azure_connection_settings = {
    "api_key": os.environ['OPENAI_API_KEY'],
    "api_version": os.environ['OPENAI_API_VERSION'],
    "azure_endpoint": os.environ['AZURE_OPENAI_ENDPOINT'],
}

# Embedding model handed to ragas in the evaluation cells below.
embed_model = AzureOpenAIEmbedding(
    model='text-embedding-3-small',
    **azure_connection_settings,
)

# Evaluator LLM; temperature 0.0 is passed through unchanged.
# NOTE(review): `engine` is presumably the Azure deployment name - confirm.
llm = AzureOpenAI(
    engine="gpt-4o",
    model="gpt-4o",
    temperature=0.0,
    **azure_connection_settings,
)

In [None]:
# One SingleTurnSample per DataFrame row. `retrieved_contexts` holds a single
# string per row, so it is wrapped in a one-element list.
list_of_samples = [
    SingleTurnSample(
        user_input=record["user_input"],
        reference=record["reference"],
        response=record["response"],
        retrieved_contexts=[record["retrieved_contexts"]],
    )
    for _, record in df.iterrows()
]

# Bundle the samples into the dataset type consumed by `evaluate`.
ragas_evaluation_dataset = EvaluationDataset(list_of_samples)

# Preview five random samples as a DataFrame.
ragas_evaluation_dataset.to_pandas().sample(5)

In [19]:
# Wrap the LlamaIndex LLM and embedding model in ragas' LlamaIndex wrappers
# so they can be passed to the metrics and to `evaluate`.
evaluator_embeddings = LlamaIndexEmbeddingsWrapper(embed_model)
evaluator_llm = LlamaIndexLLMWrapper(llm)

In [None]:
# Metrics to compute, each bound to the evaluator LLM.
metrics = [
    metric_class(llm=evaluator_llm)
    for metric_class in (Faithfulness, ContextPrecision, ContextRecall)
]

# Timeout / retry settings handed to ragas for the evaluation run.
evaluation_run_config = RunConfig(timeout=1800, max_wait=180, max_retries=20)

ragas_evaluation_result = evaluate(
    dataset=ragas_evaluation_dataset,
    metrics=metrics,
    llm=evaluator_llm,
    embeddings=evaluator_embeddings,
    run_config=evaluation_run_config,
    show_progress=True,
    batch_size=20,
)

In [None]:
# Keep only the rows where all three metric scores are present, then
# renumber the index from zero.
metric_columns = ['faithfulness', 'context_precision', 'context_recall']
df_ragas_result = (
    ragas_evaluation_result
    .to_pandas()
    .dropna(subset=metric_columns)
    .reset_index(drop=True)
)

df_ragas_result

In [22]:
# Save the filtered scores as a JSON array of row objects, indented by 4.
df_ragas_result.to_json(
    './test-dataset-azure.json',
    orient='records',
    indent=4,
)