# Tutorial on using multiple models for evaluation
- This tutorial is an example of how to test multiple models on the openml data to see which one performs the best.
- The evaluation is still a bit basic, but it is a good starting point for future research.

In [1]:
from __future__ import annotations
from langchain_community.cache import SQLiteCache
import os
import sys
import chromadb
from pathlib import Path
from tqdm import tqdm

import pandas as pd
# change the path to the backend directory
sys.path.append(os.path.join(os.path.dirname("."), '../../backend/'))

In [2]:
from modules.utils import load_config_and_device
from modules.llm import setup_vector_db_and_qa
from modules.results_gen import aggregate_multiple_queries_and_count

## Setting the config

In [3]:
new_path = Path("../../backend/")

config = load_config_and_device(str(new_path / "config.json"), training = True)

config["type_of_data"] = "dataset"
config["training"] = True

[INFO] Finding device.
[INFO] Device found: mps


In [4]:
config["device"] = "cpu" # for testing

## Defining the models used
- Embedding models are any from Huggingface hub
- LLM models are any from Ollama library

In [9]:
list_of_embedding_models = ["BAAI/bge-small-en-v1.5"]
list_of_llm_models = ["qwen2:1.5b", "phi3"]

In [6]:
def process_embedding_model_name_hf(name : str) -> str:
    """
    Description: This function processes the name of the embedding model from Hugging Face to use as experiment name.
    
    Input: name (str) - name of the embedding model from Hugging Face.
    
    Returns: name (str) - processed name of the embedding model.
    """
    return name.replace("/", "_")

def process_llm_model_name_ollama(name : str) -> str:
    """
    Description: This function processes the name of the llm model from Ollama to use as experiment name.
    
    Input: name (str) - name of the llm model from Ollama.

    Returns: name (str) - processed name of the llm model.
    """
    return name.replace(":", "_")

## Defining the evaluation queries
- replace this with a proper dataframe for a more comprehensive evaluation

In [7]:
queries = ["Find datasets related to COVID-19", "Find datasets related to COVID-19 and India", "COVID-19 dataset", "COVID-19 dataset India", "Mexico historical covid"]

## Downloading the models
- PLEASE MAKE SURE YOU HAVE DOWNLOADED OLLAMA (```curl -fsSL https://ollama.com/install.sh | sh```)

In [7]:
# download the ollama llm models

# os.system("curl -fsSL https://ollama.com/install.sh | sh")
os.system("ollama serve&")
print("Waiting for Ollama server to be active...")  
while os.system("ollama list | grep 'NAME'") == "":
    pass

for llm_model in list_of_llm_models:
    os.system(f"ollama pull {llm_model}")

Waiting for Ollama server to be active...
NAME         	ID          	SIZE  	MODIFIED       


Error: listen tcp 127.0.0.1:11434: bind: address already in use
[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest 
pulling 405b56374e02... 100% ▕████████████████▏ 934 MB                         
pulling 62fbfd9ed093... 100% ▕████████████████▏  182 B                         
pulling c156170b718e... 100% ▕████████████████▏  11 KB                         
pulling f02dd72bb242... 100% ▕████████████████▏   59 B                         
pulling c9f5e9ffbc5f... 100% ▕████████████████▏  485 B                         
verifying sha256 digest 
writing manifest 
removing any unused layers 
success [?25h
[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25

## Running the steps
- Create an experiment directory
- Save a config file with the models and the queries in the experiment directory
- Download openml data for each dataset and format into a string
- Create vectorb and embed the data
- Get the predictions for each model for a list of queries and evaluate the performance
- (note) At the moment, this runs for a very small subset of the entire data. To disable this behavior and run on the entire data, set ```config["test_subset_2000"] = False```

In [10]:
# use a tiny subset of the data for testing
config["test_subset_2000"] = True

In [11]:
for embedding_model in tqdm(list_of_embedding_models, desc="Embedding Models", total=len(list_of_embedding_models)):
    for llm_model in tqdm(list_of_llm_models, desc="LLM Models", total=len(list_of_llm_models)):
        # update the config with the new embedding and llm models
        config["embedding_model"] = embedding_model
        config["llm_model"] = llm_model

        # create a new experiment directory using a combination of the embedding model and llm model names
        experiment_name = f"{process_embedding_model_name_hf(embedding_model)}_{process_llm_model_name_ollama(llm_model)}"
        experiment_path = new_path/Path(f"data/experiments/{experiment_name}")

        # create the experiment directory if it does not exist
        os.makedirs(experiment_path, exist_ok=True)
       
        # update the config with the new experiment directories
        config["data_dir"] = str(experiment_path)
        config["persist_dir"] = str(experiment_path / "chroma_db")

        # save training details and config in a dataframe
        config_df = pd.DataFrame.from_dict(config, orient='index').reset_index()
        config_df.columns = ['Hyperparameter', 'Value']
        config_df.to_csv(experiment_path / "config.csv", index=False)

        # load the persistent database using ChromaDB
        client = chromadb.PersistentClient(path=config["persist_dir"])

        # Run "training"
        qa_dataset = setup_vector_db_and_qa(
            config=config, data_type=config["type_of_data"], client=client
        )
        
        # Run an evaluation by aggregating multiple queries and counting the results
        # TODO : Replace this evaluation with a more meaningful one
        combined_df = aggregate_multiple_queries_and_count(queries,qa_dataset=qa_dataset, config=config, group_cols = ["id", "name"], sort_by="query", count = True)

        combined_df.to_csv(experiment_path / "results.csv")

Embedding Models:   0%|          | 0/1 [00:00<?, ?it/s]

[INFO] Training is set to True.
[INFO] Subsetting the data to 100 rows.
[INFO] Initializing cache.
[INFO] Getting dataset metadata from OpenML.


QUEUEING TASKS | :   0%|          | 0/100 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/100 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/100 [00:00<?, ?it/s]

[INFO] Saving metadata to file.
[INFO] Loading model...




[INFO] Model loaded.
[INFO] Generating unique documents. Total documents: 992
Number of unique documents: 967 vs Total documents: 992



[A

Batches:   0%|          | 0/16 [00:00<?, ?it/s]


[A

Batches:   0%|          | 0/15 [00:00<?, ?it/s]


100%|██████████| 2/2 [03:06<00:00, 93.04s/it]

[A

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[A

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[A

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


100%|██████████| 5/5 [00:00<00:00,  8.16it/s]


[INFO] Training is set to True.
[INFO] Subsetting the data to 100 rows.
[INFO] Initializing cache.
[INFO] Getting dataset metadata from OpenML.


QUEUEING TASKS | :   0%|          | 0/100 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/100 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/100 [00:00<?, ?it/s]

[INFO] Saving metadata to file.
[INFO] Loading model...




[INFO] Model loaded.
[INFO] Generating unique documents. Total documents: 992
Number of unique documents: 967 vs Total documents: 992



[A

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/2 [00:43<?, ?it/s]
LLM Models:  50%|█████     | 1/2 [04:37<04:37, 277.59s/it]
Embedding Models:   0%|          | 0/1 [04:37<?, ?it/s]


KeyboardInterrupt: 