## Getting Started from Scratch:

1. Install Okareo's Python SDK: &nbsp;&nbsp;  `pip install okareo`  &nbsp;&nbsp;  (just run the cell below)

2. Get your API token from [https://app.okareo.com/](https://app.okareo.com/).  
   (Note: You will need to register first.)

3. Go directly to the **"2. Create your API Token"** link on the landing page. You can skip all other steps.

4. Set the environment variable `OKAREO_API_KEY` to your generated API token.

In [None]:
%pip install okareo
%pip install chromadb
%pip install pandas

In [2]:
# Open a persistent Chroma collection whose embeddings come from the intfloat/e5-small-v2 model,
# then load query/passage pairs from valid.csv and derive an MD5-based id for every query.
import chromadb
import hashlib
import pandas as pd    
from chromadb import Documents, EmbeddingFunction, Embeddings
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
class MyEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function backed by the ``intfloat/e5-small-v2`` model.

    Each text is prefixed with ``'query: '``, tokenized (truncated to 512
    tokens) and embedded by averaging the model's last hidden state over the
    positions where the attention mask is set.
    """

    _MODEL_NAME = 'intfloat/e5-small-v2'

    # Tokenizer and model are loaded on first use and then shared by every
    # instance, so repeated calls (one per query during a test run) do not
    # reload the weights each time.
    _tokenizer = None
    _model = None

    @classmethod
    def _load_model(cls):
        """Return the cached ``(tokenizer, model)`` pair, loading it on first use."""
        if cls._tokenizer is None:
            cls._tokenizer = AutoTokenizer.from_pretrained(cls._MODEL_NAME)
        if cls._model is None:
            cls._model = AutoModel.from_pretrained(cls._MODEL_NAME)
        return cls._tokenizer, cls._model

    @staticmethod
    def _average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
        """Average the hidden states along dim 1, ignoring masked-out positions."""
        # Zero out positions whose attention mask is 0 so they do not
        # contribute to the sum, then divide by the number of kept positions.
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

    def __call__(self, input: Documents) -> Embeddings:
        """Embed ``input`` and return one embedding (list of floats) per text.

        Args:
            input: The texts to embed. Entries that are not ``str`` are
                embedded as the bare ``'query: '`` prefix.

        Returns:
            The pooled embeddings converted to nested Python lists.
        """
        # Local import: the file only imports `Tensor` from torch.
        import torch

        # Build a new list instead of rewriting `input` in place, so the
        # caller's documents are left untouched.
        texts = ['query: ' + (doc if isinstance(doc, str) else '') for doc in input]

        tokenizer, model = self._load_model()

        # Tokenize the input texts
        batch_dict = tokenizer(texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = model(**batch_dict)
        embeddings = self._average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
        return embeddings.tolist()
# Open the on-disk Chroma store and fetch the collection, embedding with the e5 function above.
chroma_client = chromadb.PersistentClient(path="./e5/")
collection = chroma_client.get_collection(
    name="retrieval_testaa",
    embedding_function=MyEmbeddingFunction(),
)

# Load the evaluation data; the 'query' column is materialised once and reused below.
df = pd.read_csv('valid.csv')
all_queries = df['query'].tolist()
print(all_queries)
print(len(all_queries))

# First 100 passages and their queries (note: `ids` holds the query texts themselves).
passages = df['finalpassage'].tolist()[:100]
ids = all_queries[:100]

# One MD5 hex digest per query (for every row, not just the first 100).
y = [hashlib.md5(query.encode()).hexdigest() for query in all_queries]

# Disabled ingestion step, kept for reference: adds the first 100 passages
# to the collection in two batches of 50, using the MD5 digests as ids.
# for i in range(0, 2):
#     collection.add(
#         documents=df['finalpassage'].tolist()[i*50:(i + 1)*50],
#         ids=y[i*50:(i + 1)*50]
#     )
#     print(i)


['how popular is the name conrad', 'disney hakuna matata meaning', 'what does emr stand for', "what is a dog's ruff", 'what is the part on your arm where they draw blood called', 'definition and explanation of singer songwriter', 'where can chikungunya be found', 'what is waterboarding', 'what is an organism that captures energy and stores it in food as chemical energy', 'whats the 3rd book in warroirs power of thre', 'how many hours in ohio before lunch break?', 'where is barker reservoir', "which of the heart's chambers is responsible for sending blood to the lungs to pick up oxygen?", 'definition of object code', 'how much money does a bank teller make', 'address of tube processing corp indianapolis, in', 'how do i copy a youtube video to my desktop', 'does caitlyn jenner date men or women', "obama care's cost", 'what is oceania considered', 'how many sq feet are in an acre', 'who is larry and why is he so happy', 'what is the gas called that they give you at the dentist', 'what age

In [5]:
# We take the first 20 queries loaded in the previous cell as seed data.
# Each query's expected result is the MD5 hex digest of the query text (presumably the id its passage was stored under in the collection).
# We upload these questions into a scenario set, so that they can be used in a test run in the next cell
import os
from okareo import Okareo
from okareo_api_client.models import ScenarioSetCreate, ScenarioSetResponse, SeedData, ScenarioType

# The API token comes from the environment; a missing variable raises KeyError here.
OKAREO_API_KEY = os.environ["OKAREO_API_KEY"]
okareo = Okareo(OKAREO_API_KEY)

# Pair each of the first 20 queries with its MD5 digest as the expected result.
seed_data = [SeedData(input_=ids[row], result=y[row]) for row in range(20)]

scenario_set_create = ScenarioSetCreate(
    name="e5",
    number_examples=1,
    generation_type=ScenarioType.SEED,
    seed_data=seed_data,
)

# Upload the scenario set; the printed response contains the scenario set id
# that is used in the next step.
scenario = okareo.create_scenario_set(scenario_set_create)
print(scenario)

ScenarioSetResponse(scenario_id='db711d5a-88dd-4cea-bb40-39446a541d2a', project_id='f7bfcff0-2b8d-4fec-ab20-51669cd3e732', time_created=datetime.datetime(2024, 1, 19, 18, 34, 38, 323849), type='SEED', tags=None, name='e5', seed_data=[SeedData(input_='how popular is the name conrad', result='b1a00f1cd425c3179d00f1b38856ec1b', additional_properties={}), SeedData(input_='disney hakuna matata meaning', result='0e3761ee3a325776555d9e7a8b99d44e', additional_properties={}), SeedData(input_='what does emr stand for', result='bd9feab3a7be3791be76ea4fd5fef655', additional_properties={}), SeedData(input_="what is a dog's ruff", result='8699d72efca687ca9b09f71dcaaa79b1', additional_properties={}), SeedData(input_='what is the part on your arm where they draw blood called', result='aad78e64fb1c4b5f76b0200c9f5727be', additional_properties={}), SeedData(input_='definition and explanation of singer songwriter', result='f1a815e1a5c53c379776f5ddb671e903', additional_properties={}), SeedData(input_='wher

In [7]:
# Perform a test run using a scenario set loaded in the previous cell 
from datetime import datetime
from okareo_api_client.models import TestRunType
from okareo.model_under_test import CustomModel
print('a')
def query_results_to_score(results):
    """Convert a query result into a list of ``(id, score)`` tuples.

    Only the first entry of ``results['distances']`` and ``results['ids']``
    is read. Each distance ``d`` is mapped to the score ``(2 - d) / 2``.
    """
    first_query_distances = results['distances'][0]
    # chromadb specific handling, this turns cosine distance into a 0 to 1 cosine similarity score
    return [
        (results['ids'][0][position], (2 - distance) / 2)
        for position, distance in enumerate(first_query_distances)
    ]
print('a')

# Callable to be applied to each scenario in the scenario set
def call_model(input: str):
    """Query the collection for ``input`` and return scored ids plus context.

    Asks the module-level ``collection`` for the 5 nearest results, prints
    the raw query result, and returns a tuple of
    ``(parsed_ids_with_scores, overall model response context)``.
    """
    results = collection.query(query_texts=[input], n_results=5)
    print(results)

    scored_ids = query_results_to_score(results)
    response_context = {'model_data': input}
    return scored_ids, response_context
print('a')

class RetrievalModel(CustomModel):
    """Custom model that answers each input by querying the Chroma collection."""

    def invoke(self, input: str):
        """Return ``(parsed_ids_with_scores, context)`` for one input.

        The module-level ``collection`` is queried for the 5 nearest results,
        which are converted to ``(id, score)`` tuples; the context dict echoes
        the input under ``'model_data'``.
        """
        nearest = collection.query(query_texts=[input], n_results=5)
        scored_ids = query_results_to_score(nearest)
        response_context = {'model_data': input}
        return scored_ids, response_context

# this will return a model if it already exists or create a new one if it doesn't
retrieval_model = RetrievalModel(name="e5 model")
model_under_test = okareo.register_model(name="e5", model=retrieval_model)
print('a')

# Run an information-retrieval test against the scenario uploaded in the
# previous step; the run name carries a timestamp.
run_name = f"Retrieval Test Run e5 {datetime.now().strftime('%m-%d %H:%M:%S')}"
test_run_item = model_under_test.run_test(
    scenario=scenario,
    name=run_name,
    test_run_type=TestRunType.INFORMATION_RETRIEVAL,
    calculate_metrics=True,
)

# display model level metrics for the test run
print(f"See test run results: https://app.okareo.com/project/{test_run_item.project_id}/eval/{test_run_item.id}")

a
a
a
a
See test run results: https://app.okareo.com/project/f7bfcff0-2b8d-4fec-ab20-51669cd3e732/eval/6da6d8cb-c79f-4e99-a2ce-14c7cdc901ec
