<a target="_blank" href="https://colab.research.google.com/github/okareo-ai/okareo-python-sdk/blob/main/examples/retrieval_eval.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

## Welcome to Okareo!

Get your API token from [https://app.okareo.com/](https://app.okareo.com/) and set it in the cell below. 👇
   (Note: You will need to register first.)



In [None]:
import os

# Prefer the OKAREO_API_KEY environment variable so the key does not have to be
# saved inside the notebook. If it is unset or empty, fall back to the
# placeholder below (replace it with your own key).
OKAREO_API_KEY = os.environ.get("OKAREO_API_KEY") or "<YOUR-API-KEY>"

In [None]:
# Install the Okareo SDK plus the libraries imported by the cells below
# (chromadb for the vector store, pandas for reading the JSONL data).
# %pip installs into the environment of the running kernel.
%pip install okareo
%pip install chromadb
%pip install pandas

In [None]:
import os
import urllib.request
from io import StringIO

import chromadb
import pandas as pd

# Webbizz is an example web business.
# We load 10 short summaries about different business aspects into the vector database.

WEBBIZZ_ARTICLES_URL = "https://raw.githubusercontent.com/okareo-ai/okareo-python-sdk/main/examples/webbizz_10_articles.jsonl"

chroma_client = chromadb.Client()

# Download with the standard library instead of shelling out to curl: this does
# not need a curl binary and raises on HTTP/network errors, whereas
# os.popen('curl ...') hands back an empty string when the download fails.
with urllib.request.urlopen(WEBBIZZ_ARTICLES_URL, timeout=30) as response:
    webbizz_articles = response.read().decode("utf-8")

# get_or_create_collection keeps this cell re-runnable: create_collection raises
# when a collection named "retrieval_test" already exists from an earlier run.
collection = chroma_client.get_or_create_collection(name="retrieval_test", metadata={"hnsw:space": "cosine"})

# One JSON object per line; the "input" column supplies the documents and the
# "result" column supplies the ids they are stored under.
jsonObj = pd.read_json(path_or_buf=StringIO(webbizz_articles), lines=True)
collection.add(
    documents=list(jsonObj.input),
    ids=list(jsonObj.result)
)

# chromadb specific handling
def query_results_to_score(results):
    """Pair each returned id of the first query with a similarity-style score.

    Only the first entry of ``results['ids']`` and ``results['distances']`` is
    read. Each distance ``d`` is rescaled linearly as ``(2 - d) / 2``, so a
    distance of 0 becomes 1.0 and a distance of 2 becomes 0.0.

    Returns:
        A list of ``(id, score)`` tuples, in the order of the distances.
    """
    first_query_ids = results['ids'][0]
    first_query_distances = results['distances'][0]
    return [
        (first_query_ids[position], (2 - distance) / 2)
        for position, distance in enumerate(first_query_distances)
    ]

In [None]:
import tempfile
from datetime import datetime

from okareo import Okareo
from okareo_api_client.models import TestRunType
from okareo.model_under_test import CustomModel

# We have a set of 20 questions, each asking about a different area of Webbizz.
# Each question has a specific summary that is its target answer, linked by UUID.
# We upload these questions as a scenario set so they can be used in the retrieval evaluation.

WEBBIZZ_QUESTIONS_URL = "https://raw.githubusercontent.com/okareo-ai/okareo-python-sdk/main/examples/webbizz_retrieval_questions.jsonl"

okareo = Okareo(OKAREO_API_KEY)

# Download with the standard library instead of shelling out to curl: this does
# not need a curl binary and raises on HTTP/network errors, whereas
# os.popen('curl ...') hands back an empty string when the download fails.
with urllib.request.urlopen(WEBBIZZ_QUESTIONS_URL, timeout=30) as response:
    webbizz_retrieval_questions = response.read().decode("utf-8")

# upload_scenario_set takes a file path, so stage the questions in a temp file
temp_dir = tempfile.gettempdir()
file_path = os.path.join(temp_dir, "webbizz_retrieval_questions.jsonl")
try:
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(webbizz_retrieval_questions)
    scenario = okareo.upload_scenario_set(file_path=file_path, scenario_name="Retrieval Articles Scenario")
finally:
    # make sure to clean up the tmp file, even if writing or uploading failed
    if os.path.exists(file_path):
        os.remove(file_path)

class RetrievalModel(CustomModel):
    """Custom model that answers a query by searching the Chroma ``collection``."""

    def invoke(self, input: str):
        """Look up the 5 closest documents for ``input``.

        Returns:
            A tuple ``(ids_with_scores, context)``: the ``(id, score)`` pairs
            produced by ``query_results_to_score`` and a dict that echoes the
            query text under ``'model_data'`` as the overall response context.
        """
        top_matches = collection.query(query_texts=[input], n_results=5)
        ids_with_scores = query_results_to_score(top_matches)
        response_context = {'model_data': input}
        return ids_with_scores, response_context

# Register the retrieval model with Okareo: an already existing model of this
# name is returned, otherwise a new one is created.
model_under_test = okareo.register_model(
    name="vectordb_retrieval_test",
    model=RetrievalModel(name="custom retrieval"),
)

# Run an information-retrieval test against the scenario set uploaded in the
# previous step. The run name carries a month-day / time stamp.
test_run_item = model_under_test.run_test(
    scenario=scenario,
    name=f"Retrieval Test Run {datetime.now().strftime('%m-%d %H:%M:%S')}",
    test_run_type=TestRunType.INFORMATION_RETRIEVAL,
    calculate_metrics=True,
)

# Keep the metrics as a plain dict, then print the link to the Okareo app
# where the evaluation can be viewed.
model_results = test_run_item.model_metrics.to_dict()
app_link = test_run_item.app_link
print(f"See results in Okareo: {app_link}")