## Sparse vs dense vector comparison for retrieval:

1. Install Okareo's Python SDK: &nbsp;&nbsp;  `pip install okareo`  &nbsp;&nbsp; 

2. Get your API token from [https://app.okareo.com/](https://app.okareo.com/).  
   (Note: You will need to register first.)

3. Go directly to the API settings by clicking the button under **"1. Create API Token"**. You can skip all other steps.

4. Add your generated API token to the cell below. 👇

In [None]:
import os

# Prefer the OKAREO_API_KEY environment variable so the token does not have to
# be saved inside the notebook; otherwise replace the placeholder with your token.
OKAREO_API_KEY = os.environ.get("OKAREO_API_KEY", "<YOUR-API-KEY>")

In [None]:
# Install the packages imported by the cells below (IPython %pip magic, one package per line).
%pip install okareo
%pip install chromadb
%pip install pandas
%pip install torch
%pip install transformers

**Create vector database with SPLADE embeddings. This can take around 3 to 7 minutes depending on the device**

In [None]:
import math
import chromadb
import pandas as pd
from transformers import AutoModelForMaskedLM, AutoTokenizer
from chromadb import Documents, EmbeddingFunction, Embeddings
import torch
from torch import Tensor
import hashlib
from transformers import AutoTokenizer, AutoModel

class MyEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that emits SPLADE term weights as fixed-length lists.

    Each document is passed through the masked-LM checkpoint named by
    ``MODEL_ID``. The weight of a vocabulary term is the maximum over the
    document's tokens of ``log(1 + relu(logit))`` (masked by the attention
    mask); non-zero weights are written into a list of ``EMBED_DIM`` numbers
    at the position of the term's vocabulary id.
    """

    MODEL_ID = 'naver/splade-cocondenser-ensembledistil'
    # Length of every returned vector. Kept at 30000 so the vectors stay
    # compatible with collections that were already embedded at this size.
    # NOTE(review): the checkpoint's vocabulary may be larger than this
    # (BERT-base uses 30522 entries) -- confirm; ids at or beyond EMBED_DIM
    # are dropped below instead of raising IndexError.
    EMBED_DIM = 30000

    # Populated on first use so the weights are loaded once per instance
    # rather than on every call.
    _tokenizer = None
    _model = None

    def _load(self):
        """Return the (tokenizer, model) pair, loading it on first use."""
        if self._model is None:
            self._tokenizer = AutoTokenizer.from_pretrained(self.MODEL_ID)
            self._model = AutoModelForMaskedLM.from_pretrained(self.MODEL_ID)
        return self._tokenizer, self._model

    def __call__(self, input: Documents) -> Embeddings:
        """Embed every document in ``input``.

        Args:
            input: the documents (strings) to embed.

        Returns:
            One list of ``EMBED_DIM`` numbers per document, in input order.
        """
        tokenizer, model = self._load()
        all_embeds = []
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            for doc in input:
                # Truncate so long passages do not exceed the 512-token limit
                # used elsewhere in this notebook.
                tokens = tokenizer(doc, return_tensors='pt', truncation=True, max_length=512)
                output = model(**tokens)
                vec = torch.max(
                    torch.log(1 + torch.relu(output.logits))
                    * tokens.attention_mask.unsqueeze(-1),
                    dim=1,
                )[0].squeeze(0)
                # as_tuple keeps the index tensor 1-D even when exactly one
                # term is active (a bare squeeze() would collapse it to a scalar).
                cols = vec.nonzero(as_tuple=True)[0]
                weights = vec[cols].cpu().tolist()

                embed_arr = [0] * self.EMBED_DIM
                for col, weight in zip(cols.cpu().tolist(), weights):
                    if col < self.EMBED_DIM:
                        embed_arr[col] = weight
                all_embeds.append(embed_arr)
        return all_embeds

# Persist the SPLADE index under ./splade/ so it can be reused between runs.
chroma_client = chromadb.PersistentClient(path="./splade/")

collection = chroma_client.create_collection(name="chromadb", metadata={"hnsw:space": "cosine"}, embedding_function=MyEmbeddingFunction())
# To run this cell again after embedding documents, comment out the line above and uncomment the line below.
# collection = chroma_client.get_collection(name="chromadb", embedding_function=MyEmbeddingFunction())
# If you would like to re-embed the documents, delete the splade folder and restart the notebook.
df = pd.read_csv('ms_marco_dev.csv')

rowsToEncode = 2000
passages = df['finalpassage'].tolist()[:rowsToEncode]
queries = df['query'].tolist()[:rowsToEncode]
# Each row is keyed by the MD5 hex digest of its query text.
ids = [hashlib.md5(query.encode()).hexdigest() for query in df['query'].tolist()]

# Comment out the batching loop below when you have already embedded the documents.
# Documents are added in batches of 50; stepping over len(passages) also covers a
# trailing partial batch and never sends an empty batch when the CSV has fewer
# than rowsToEncode rows.
batch_size = 50
for start in range(0, len(passages), batch_size):
    batch = passages[start:start + batch_size]
    collection.add(
        documents=batch,
        ids=ids[start:start + len(batch)],
    )

**Create scenario to test retrieval from the vector database**

In [None]:
from okareo import Okareo
from okareo_api_client.models import ScenarioSetCreate, ScenarioSetResponse, SeedData, ScenarioType

# Client used for every Okareo call in the rest of the notebook.
okareo = Okareo(OKAREO_API_KEY)

# Take every 10th query as a seed; its expected result is the id stored for
# the same row.
seed_data = [
    SeedData(input_=queries[row], result=[ids[row]])
    for row in range(0, rowsToEncode, 10)
]
scenario_set_create = ScenarioSetCreate(name="Embedding scenarios", seed_data=seed_data)
scenario = okareo.create_scenario_set(scenario_set_create)

**Run the scenario against the SPLADE model and calculate retrieval metrics**

In [None]:
# Perform a test run using a scenario set loaded in the previous cell 
from datetime import datetime
from okareo_api_client.models import TestRunType
from okareo.model_under_test import CustomModel
def query_results_to_score(results):
    """Pair each returned id of the first query with a similarity score.

    Args:
        results: mapping with ``'ids'`` and ``'distances'`` entries; only the
            first element (index 0) of each is read.

    Returns:
        A list of ``(id, score)`` tuples in the order of the distances, where
        ``score = (2 - distance) / 2`` (maps a cosine distance onto 0..1).
    """
    first_query_distances = results['distances'][0]
    return [
        (results['ids'][0][position], (2 - distance) / 2)
        for position, distance in enumerate(first_query_distances)
    ]

class RetrievalModel(CustomModel):
    """Custom model that answers a query with the top 5 hits of ``collection``."""

    def invoke(self, input: str):
        """Query the collection and return ``(scored ids, response context)``."""
        hits = collection.query(query_texts=[input], n_results=5)
        scored_ids = query_results_to_score(hits)
        # The context simply echoes the query text that produced the hits.
        context = {'model_data': input}
        return scored_ids, context
# register_model returns the model registered under this name, creating it when
# it does not exist yet; the name carries a timestamp of the current run.
splade_model_name = f"splade {datetime.now().strftime('%m-%d %H:%M:%S')}"
model_under_test = okareo.register_model(
    name=splade_model_name,
    model=RetrievalModel(name="splade"),
)

# Evaluate the scenario uploaded in the previous step as an information
# retrieval test run and ask for metrics to be calculated.
splade_run_name = f"Retrieval Test Run splade {datetime.now().strftime('%m-%d %H:%M:%S')}"
test_run_item = model_under_test.run_test(
    scenario=scenario,
    name=splade_run_name,
    test_run_type=TestRunType.INFORMATION_RETRIEVAL,
    calculate_metrics=True,
)

# Print a link to the test run results.
print(f"See test run results: https://app.okareo.com/project/{test_run_item.project_id}/eval/{test_run_item.id}")

**Load vector database with e5 embeddings**

In [None]:
class MyEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function backed by the ``intfloat/e5-small-v2`` encoder.

    Every text is prefixed with ``'query: '`` (non-string entries become the
    bare prefix), tokenized with truncation at 512 tokens, and embedded as the
    attention-masked mean of the model's last hidden states.
    """

    MODEL_ID = 'intfloat/e5-small-v2'

    # Populated on first use so the weights are loaded once per instance
    # rather than on every call.
    _tokenizer = None
    _model = None

    @staticmethod
    def _average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
        """Mean of the hidden states over dim 1, ignoring masked-out positions."""
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

    def __call__(self, input: Documents) -> Embeddings:
        """Embed every text in ``input``.

        Args:
            input: the texts to embed; the list itself is left unmodified.

        Returns:
            One embedding (list of floats) per input text, in input order.
        """
        # Build a prefixed copy instead of rewriting the caller's list in place,
        # so the prefix cannot leak into whatever the caller does with it next.
        texts = ['query: ' + (text if isinstance(text, str) else '') for text in input]
        if not texts:
            return []

        if self._model is None:
            self._tokenizer = AutoTokenizer.from_pretrained(self.MODEL_ID)
            self._model = AutoModel.from_pretrained(self.MODEL_ID)

        # Tokenize the input texts
        batch_dict = self._tokenizer(texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = self._model(**batch_dict)
            embeddings = self._average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
        return embeddings.tolist()
# Switch the notebook's client and collection over to the store under ./e5/.
chroma_client = chromadb.PersistentClient(path="./e5/")

# get_collection fetches the collection named "chromadb" from that store and
# attaches the embedding function defined in this cell to it.
collection = chroma_client.get_collection(
    name="chromadb",
    embedding_function=MyEmbeddingFunction(),
)


**Run the scenario against the e5 model and calculate retrieval metrics**

In [None]:
class RetrievalModel(CustomModel):
    """Custom model that answers a query with the top 5 hits of ``collection``."""

    def invoke(self, input: str):
        """Query the collection and return ``(scored ids, response context)``."""
        hits = collection.query(query_texts=[input], n_results=5)
        scored_ids = query_results_to_score(hits)
        # The context simply echoes the query text that produced the hits.
        context = {'model_data': input}
        return scored_ids, context

# register_model returns the model registered as "e5", creating it when it does
# not exist yet.
model_under_test = okareo.register_model(
    name="e5",
    model=RetrievalModel(name="e5 model"),
)

# Evaluate the scenario uploaded earlier as an information retrieval test run
# and ask for metrics to be calculated.
e5_run_name = f"Retrieval Test Run e5 {datetime.now().strftime('%m-%d %H:%M:%S')}"
test_run_item = model_under_test.run_test(
    scenario=scenario,
    name=e5_run_name,
    test_run_type=TestRunType.INFORMATION_RETRIEVAL,
    calculate_metrics=True,
)

# Print a link to the test run results.
print(f"See test run results: {test_run_item.app_link}")