<a target="_blank" href="https://colab.research.google.com/github/okareo-ai/okareo-python-sdk/blob/main/examples/retrieval_eval.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

## Generate a retrieval scenario from your data!

Get your API token from [https://app.okareo.com/](https://app.okareo.com/) and set it in the cell below. 👇
   (Note: You will need to register first.)





In [None]:
import os

# Prefer the OKAREO_API_KEY environment variable so the token never has to be
# saved inside the notebook; otherwise replace the placeholder with your key.
OKAREO_API_KEY = os.environ.get("OKAREO_API_KEY", "<YOUR-API-KEY>")

In [None]:
%pip install okareo
%pip install chromadb
%pip install pandas

**Load documents and relevant metadata into ChromaDB**

In [None]:
import os
from io import StringIO 

import chromadb
import pandas as pd 

# Webbizz is an example web business.
# We load 10 short summaries about different business aspects to the vector database.
import urllib.request

WEBBIZZ_ARTICLES_URL = "https://raw.githubusercontent.com/okareo-ai/okareo-python-sdk/main/examples/webbizz_10_articles.jsonl"

chroma_client = chromadb.Client()

# Download with the standard library rather than shelling out to curl: the
# connection is closed by the `with` block, no external binary is required, and
# a failed request raises instead of silently producing an empty string.
with urllib.request.urlopen(WEBBIZZ_ARTICLES_URL, timeout=30) as response:
    webbizz_articles = response.read().decode("utf-8")

# The collection is configured for cosine distance ("hnsw:space": "cosine").
collection = chroma_client.create_collection(name="retrieval_test", metadata={"hnsw:space": "cosine"})

# The file is JSON Lines: each line becomes one DataFrame row.
jsonObj = pd.read_json(path_or_buf=StringIO(webbizz_articles), lines=True)

def article_type_for(text):
    """Pick the ``article_type`` metadata label for one article by keyword.

    Checks are case-sensitive substring matches evaluated in order; the first
    match wins and anything unmatched is labelled "Miscellaneous".
    """
    # Match against the whole text. Wrapping the text in list(...) would split
    # it into single characters, so a multi-character keyword could never match.
    if "sustainability" in text or "security" in text:
        return "Safety and sustainability"
    if "support" in text or "help" in text:
        return "Support"
    if "return" in text:
        return "Return and exchange"
    return "Miscellaneous"


input_list = list(jsonObj.input)
metadata_list = [{"article_type": article_type_for(text)} for text in input_list]

# Store each article text under the id taken from the `result` column, together
# with its keyword-derived metadata.
collection.add(
    documents=input_list,
    ids=list(jsonObj.result),
    metadatas=metadata_list
)

# chromadb specific handling
def query_results_to_score(results):
    """Flatten the first query's hits in ``results`` into a list of scored dicts.

    ``results`` is indexed as ``results[key][0][i]`` for the keys
    ``'distances'``, ``'ids'`` and ``'metadatas'``. Each hit becomes a dict with
    the keys ``id``, ``score``, ``metadata`` and ``label``.
    """
    def _hit_to_entry(position, distance):
        # Rescale the distance so that 0 maps to a score of 1.0 and 2 maps to 0.0
        # (used here to turn a cosine distance into a similarity-style score).
        similarity = (2 - distance) / 2
        doc_id = results['ids'][0][position]
        doc_metadata = results['metadatas'][0][position]
        return {
            "id": doc_id,
            "score": similarity,
            "metadata": doc_metadata,
            "label": f"{doc_metadata['article_type']} WebBizz Article w/ ID: {doc_id}",
        }

    return [
        _hit_to_entry(position, distance)
        for position, distance in enumerate(results['distances'][0])
    ]

**Generate retrieval questions from documents using our Text Reverse Question Generator**

In [None]:
import tempfile
from datetime import datetime

from okareo import Okareo
from okareo_api_client.models import TestRunType
from okareo.model_under_test import CustomModel
from okareo_api_client.models.generation_tone import GenerationTone
from okareo_api_client.models.scenario_set_create import ScenarioSetCreate
from okareo_api_client.models.scenario_set_generate import ScenarioSetGenerate
from okareo_api_client.models.scenario_type import ScenarioType
from okareo_api_client.models.seed_data import SeedData

okareo = Okareo(OKAREO_API_KEY)

# Create a scenario set of the documents from the chromaDB collection
# Fetch the collection once so ids and documents come from the same result,
# and avoid a loop variable named `id`, which would shadow the builtin for the
# rest of the notebook.
stored_articles = collection.get()
seed_data = [
    SeedData(input_=document, result=[article_id])
    for article_id, document in zip(stored_articles['ids'], stored_articles['documents'])
]

# Upload the seed documents as their own scenario set; the name carries a
# timestamp taken at creation time.
document_scenario_request = ScenarioSetCreate(
    name=f"WebBizz Documents {datetime.now().strftime('%m-%d %H:%M:%S')}",
    seed_data=seed_data,
)
document_scenario = okareo.create_scenario_set(document_scenario_request)

# Use the scenario set of documents to generate a scenario of questions
question_scenario_request = ScenarioSetGenerate(
    name=f"Retrieval - Generated Scenario - {datetime.now().strftime('%m-%d %H:%M:%S')}",
    source_scenario_id=document_scenario.scenario_id,
    number_examples=1,
    generation_type=ScenarioType.TEXT_REVERSE_QUESTION,
    generation_tone=GenerationTone.INFORMAL,
)
generated_scenario = okareo.generate_scenario_set(question_scenario_request)

# Print the generated questions for inspection
# for datapoint in okareo.get_scenario_data_points(generated_scenario.scenario_id):
#     print(f"Generated question: {datapoint.input}\n")

# Define a custom retrieval model
class RetrievalModel(CustomModel):
    """Custom model that answers a query with the top matches from ``collection``."""

    def invoke(self, input: str):
        """Query ``collection`` with ``input`` and return the scored matches.

        Returns a tuple: the list built by ``query_results_to_score`` and a
        context dict that echoes the query text under ``'model_data'``.
        """
        # Ask the collection for the five closest documents to the query text.
        top_matches = collection.query(n_results=5, query_texts=[input])
        scored_matches = query_results_to_score(top_matches)
        response_context = {'model_data': input}
        return scored_matches, response_context

# Registering returns the existing model when the name is already known and
# creates a new one otherwise.
model_under_test = okareo.register_model(
    name="vectordb_retrieval_test",
    model=RetrievalModel(name="custom retrieval"),
)

# Cut-offs (k) at which every retrieval metric below is reported.
at_k_intervals = list(range(1, 6))

# Perform a test run using the generated scenario set
test_run_item = model_under_test.run_test(
    scenario=generated_scenario,  # the generated question scenario from the previous step
    name=f"Retrieval Test Run {datetime.now().strftime('%m-%d %H:%M:%S')}",  # name for test run
    test_run_type=TestRunType.INFORMATION_RETRIEVAL,
    calculate_metrics=True,
    # Every metric is evaluated at the same cut-offs.
    metrics_kwargs={
        metric_name: at_k_intervals
        for metric_name in (
            "accuracy_at_k",
            "precision_recall_at_k",
            "ndcg_at_k",
            "mrr_at_k",
            "map_at_k",
        )
    },
)

# link back to Okareo site for evaluation visualization
model_results = test_run_item.model_metrics.to_dict()
app_link = test_run_item.app_link
print(f"See results in Okareo: {app_link}")