<a target="_blank" href="https://colab.research.google.com/github/okareo-ai/okareo-python-sdk/blob/main/examples/retrieval_eval.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

## Welcome to Okareo!

Get your API token from [https://app.okareo.com/](https://app.okareo.com/) and expose it as the `OKAREO_API_KEY` environment variable, which the cell below reads. 👇
(Note: You will need to register first.)



In [1]:
import os
# Read the Okareo API token from the OKAREO_API_KEY environment variable.
# Indexing os.environ (rather than .get) raises KeyError right here if the
# variable is unset.
OKAREO_API_KEY = os.environ["OKAREO_API_KEY"]

In [None]:
%pip install okareo
%pip install cohere
%pip install pinecone
%pip install pandas

In [None]:
import os
import cohere
import pinecone
import pandas as pd
from io import StringIO

# Cohere model used to embed the documents. Its output size determines the
# dimension the Pinecone index must be created with.
EMBED_MODEL = "embed-english-light-v3.0"
# embed-english-light-v3.0 returns 384-dimensional vectors. (4096 is the size of
# the older embed-english-v2.0 model; an index created with it rejects every
# upsert from this notebook with a dimension mismatch.)
EMBED_DIMENSION = 384

WEBBIZZ_ARTICLES_URL = "https://raw.githubusercontent.com/okareo-ai/okareo-python-sdk/main/examples/webbizz_10_articles.jsonl"


def classify_article(text):
    """Return the `article_type` label for an article based on keyword matches.

    The checks are plain substring tests on the article text, evaluated in
    priority order; the first matching group wins.
    """
    if "sustainability" in text or "security" in text:
        return "Safety and sustainability"
    if "support" in text or "help" in text:
        return "Support"
    if "return" in text:
        return "Return and exchange"
    return "Miscellaneous"


# Initialize Pinecone
pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment='<YOUR-ENVIRONMENT>')
index_name = '<YOUR-INDEX>'

# Create index if it doesn't exist.
# NOTE: an index that already exists under this name with a different dimension
# is reused as-is, so delete it first if it was created with another size.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=EMBED_DIMENSION, metric='cosine')

index = pinecone.Index(index_name)

# Load Webbizz articles (JSON Lines; the `input` column is the article text and
# the `result` column is used as the vector id below).
jsonObj = pd.read_json(WEBBIZZ_ARTICLES_URL, lines=True)

# One metadata dict per article, in the same order as the articles.
input_list = list(jsonObj.input)
metadata_list = [{"article_type": classify_article(text)} for text in input_list]

# Embed with Cohere and Add documents to Pinecone.
# The client gets its own name so the `cohere` module is not shadowed, and the
# key is read from the same COHERE_API_KEY variable the evaluation cell uses.
cohere_client = cohere.Client(api_key=os.environ["COHERE_API_KEY"])
data = []
for doc_id, doc, metadata in zip(jsonObj.result, input_list, metadata_list):
    response = cohere_client.embed(
        texts=[doc],
        input_type="search_document",
        model=EMBED_MODEL,
    )
    dense_vector = response.embeddings[0]

    # Construct the data for upsert
    data.append({"id": doc_id, "values": dense_vector, "metadata": metadata})

# Upsert the data
index.upsert(vectors=data)

In [None]:
import tempfile
import random
import string
from datetime import datetime

from okareo import Okareo
from okareo_api_client.models import TestRunType
from okareo.model_under_test import CohereModel, PineconeDb


# We have a set of 20 questions, each asking about different areas in Webbizz.
# Each question has a specific summary that is the target answer for the question, which is linked by UUID.
# We upload these questions into a scenario set, so that they can be used in the retrieval evaluation

# Random 5-letter suffix so each run registers a distinctly named model.
rnd = ''.join(random.sample(string.ascii_letters, 5))
okareo = Okareo(OKAREO_API_KEY)

# Download the questions. `-sSf` silences the progress meter but makes curl fail
# (and print the error) on HTTP errors instead of returning an error page as
# content. The `with` block closes the pipe once the output has been read.
with os.popen('curl -sSf https://raw.githubusercontent.com/okareo-ai/okareo-python-sdk/main/examples/webbizz_retrieval_questions.jsonl') as pipe:
    webbizz_retrieval_questions = pipe.read()
if not webbizz_retrieval_questions.strip():
    raise RuntimeError("Failed to download webbizz_retrieval_questions.jsonl")

# upload_scenario_set takes a file path, so stage the content in a private
# temporary directory. The directory (and the file in it) is removed when the
# `with` block exits, even if the upload raises.
with tempfile.TemporaryDirectory() as temp_dir:
    file_path = os.path.join(temp_dir, "webbizz_retrieval_questions.jsonl")
    with open(file_path, "w") as file:
        file.write(webbizz_retrieval_questions)
    scenario = okareo.upload_scenario_set(file_path=file_path, scenario_name="Retrieval Articles Scenario")


# this will return a model if it already exists or create a new one if it doesn't
model_under_test = okareo.register_model(
    name=f"pinecone_retrieval_test-{rnd}",
    model=[
        # Query-side embedding: same Cohere model the documents were indexed with,
        # but with input_type="search_query".
        CohereModel(
            model_id="embed-english-light-v3.0",
            model_type="embed",
            input_type="search_query",
        ),
        # Vector store to query; fill in the same index/environment used when indexing.
        PineconeDb(
            index_name="<YOUR-INDEX>",
            region="<YOUR-ENVIRONMENT>",
            project_id="<YOUR-PROJECT_ID>",
            top_k=5,
        ),
    ],
)

# k values at which every retrieval metric is reported (up to top_k above).
at_k_intervals = [1, 2, 3, 4, 5]


# Perform a test run using the uploaded scenario set
test_run_item = model_under_test.run_test(
    scenario=scenario, # use the scenario from the scenario set uploaded in the previous step
    name=f"Pinecone Retrieval Test Run {datetime.now().strftime('%m-%d %H:%M:%S')}", # name for test run
    test_run_type=TestRunType.INFORMATION_RETRIEVAL,
    calculate_metrics=True,
    api_keys={
        "cohere": os.environ["COHERE_API_KEY"],
        "pinecone": os.environ["PINECONE_API_KEY"],
    },
    metrics_kwargs={
        "accuracy_at_k": at_k_intervals,
        "precision_recall_at_k": at_k_intervals,
        "ndcg_at_k": at_k_intervals,
        "mrr_at_k": at_k_intervals,
        "map_at_k": at_k_intervals,
    }
)

# Metrics as a plain dict, kept for inspection in later cells.
model_results = test_run_item.model_metrics.to_dict()
# link back to Okareo site for evaluation visualization
app_link = test_run_item.app_link
print(f"See results in Okareo: {app_link}")