# Tutorial on using multiple models for evaluation
- This tutorial uses the file `multiple_model_utils.py` (in the same directory as this one) which contains the functions for training and evaluating multiple models. Please refer to the file for the implementation details!

!!!! IMPORTANT !!!!
- Run `./start_llm_service.sh` (located in the root of this repository) before running this notebook. This starts the Language Model service; without it, parts of the code will not work.

In [1]:
from __future__ import annotations

import glob
import json
import os
import sys
from pathlib import Path

import pandas as pd

# change the path to the backend directory
sys.path.append(os.path.join(os.path.dirname("."), "../../backend/"))
sys.path.append(os.path.join(os.path.dirname("."), "../../llm_service/"))
from modules.utils import load_config_and_device
from multiple_model_utils import *
from llm_service_utils import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Backend directory: it holds config.json and is also handed to
# run_experiments further down.
new_path = Path("../../backend/")
config_file = new_path / "config.json"
config = load_config_and_device(str(config_file), training=True)

# Settings overridden on the loaded configuration for this evaluation run.
config["type_of_data"] = "dataset"
config["training"] = True

[INFO] Finding device.
[INFO] Device found: mps


## Defining the models used
- Embedding models can be any model from the Hugging Face Hub
- LLMs can be any model from the Ollama library

In [3]:
# Embedding models to compare; the list is passed to run_experiments below.
list_of_embedding_models = [
    "BAAI/bge-large-en-v1.5",
    "BAAI/bge-base-en-v1.5",
    "Snowflake/snowflake-arctic-embed-l",
]
# LLMs to use; the same list is handed to ollama_setup and run_experiments.
# Other possible selections: ["qwen2:1.5b", "llama3"] or ["llama3"].
list_of_llm_models = ["qwen2:1.5b"]

## Downloading the LLM models
- PLEASE MAKE SURE YOU HAVE DOWNLOADED OLLAMA
- Linux/Unix : ```curl -fsSL https://ollama.com/install.sh | sh```

In [4]:
# Prepare Ollama for every model in list_of_llm_models.
# NOTE(review): judging by the captured output this starts the Ollama server and
# pulls each model -- confirm against the helper's implementation.
ollama_setup(list_of_llm_models)

Waiting for Ollama server to be active...
NAME         	ID          	SIZE  	MODIFIED      


Error: listen tcp 127.0.0.1:11434: bind: address already in use
[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest 
pulling 405b56374e02... 100% ▕████████████████▏ 934 MB                         
pulling 62fbfd9ed093... 100% ▕████████████████▏  182 B                         
pulling c156170b718e... 100% ▕████████████████▏  11 KB                         
pulling f02dd72bb242... 100% ▕████████████████▏   59 B                         
pulling c9f5e9ffbc5f... 100% ▕████████████████▏  485 B                         
verifying sha256 digest 
writing manifest 
removing any unused layers 
success [?25h


## Setup evaluation data
### If you used tools/app.py to generate evaluation data
- You can ignore this and use the data generated by the tool
### If you did not
- Create evaluation data in the format `{"id": ["tag1", "tag2"]}`, where each key is a dataset id and each value is the list of tags for that dataset, and save it as a JSON file
- eg: ```{"43843": ["Climate change"], "43365": ["COVID-19"], "43684": ["COVID-19"]}```

In [5]:
# Alternative phrasings for a dataset search request. The list is passed to
# get_dataset_queries together with the labels.
# NOTE(review): presumably each template is combined with a tag to form one
# query -- confirm in multiple_model_utils.py.
query_templates = [
    "Find me a dataset about",
    "Dataset on",
    "list datasets about",
    "Can you locate a dataset on",
    "I'm looking for a dataset related to",
    "Please help me find a dataset concerning",
    "Is there a dataset available for",
    "Could you provide a dataset on",
    "I need a dataset regarding",
    "Can you source a dataset about",
    "I'd like to get a dataset about",
    "Can you identify a dataset related to",
    "Do you have access to a dataset on",
]

In [6]:
# Ground-truth labels, keyed by dataset id.
labels_file = Path("../../data/evaluation/merged_labels.json")
merged_labels = json.loads(labels_file.read_text())

# The dataset ids we want our evaluation to be based on (these are dataset ids
# for the OpenML datasets).
subset_ids = [dataset_id for dataset_id in merged_labels]

In [7]:
# Build the evaluation queries from the dataset ids, the templates and the labels.
df_queries = get_dataset_queries(subset_ids, query_templates, merged_labels)
# Values of the "query" column as an array; this is what run_experiments receives.
queries = df_queries["query"].values

## Test accuracy with different embeddings

In [8]:
# Baseline run over all embedding models; the LLM-related keyword options that
# the later "LLM after RAG" run sets are not passed here.
# NOTE(review): use_cached_experiment=True presumably reuses results saved by a
# previous run -- confirm in multiple_model_utils.py.
run_experiments(
    config,
    new_path,
    queries,
    list_of_embedding_models,
    list_of_llm_models,
    subset_ids,
    use_cached_experiment=True,
)

Embedding Models:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Training is set to True.
[INFO] Initializing cache.
[INFO] Getting dataset metadata from OpenML.



[A
QUEUEING TASKS | : 100%|██████████| 5703/5703 [00:00<00:00, 43557.01it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
PROCESSING TASKS | : 100%|██████████| 5703/5703 [00:07<00:00, 724.08it/s] 

COLLECTING RESULTS | : 100%|██████████| 5703/5703 [00:00<00:00, 1333042.56it/s]


[INFO] Saving metadata to file.
[INFO] Loading model...
[INFO] Model loaded.
[INFO] Generating unique documents. Total documents: 960
Number of unique documents: 0 vs Total documents: 960
No new documents to add.



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 240/240 [05:25<00:00, 

[INFO] Training is set to True.
[INFO] Initializing cache.
[INFO] Getting dataset metadata from OpenML.



QUEUEING TASKS | : 100%|██████████| 5703/5703 [00:00<00:00, 211740.53it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
PROCESSING TASKS | : 100%|██████████| 5703/5703 [00:12<00:00, 469.35it/s]

COLLECTING RESULTS | : 100%|██████████| 5703/5703 [00:00<00:00, 1351495.32it/s]


[INFO] Saving metadata to file.
[INFO] Loading model...
[INFO] Model loaded.
[INFO] Generating unique documents. Total documents: 960
Number of unique documents: 0 vs Total documents: 960
No new documents to add.



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 240/240 [05:20<00:00, 

[INFO] Training is set to True.
[INFO] Initializing cache.
[INFO] Getting dataset metadata from OpenML.



QUEUEING TASKS | : 100%|██████████| 5703/5703 [00:00<00:00, 181203.39it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
PROCESSING TASKS | : 100%|██████████| 5703/5703 [00:14<00:00, 385.16it/s] 

COLLECTING RESULTS | : 100%|██████████| 5703/5703 [00:00<00:00, 633393.77it/s]


[INFO] Saving metadata to file.
[INFO] Loading model...







[INFO] Model loaded.
[INFO] Generating unique documents. Total documents: 960
Number of unique documents: 0 vs Total documents: 960
No new documents to add.



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 240/240 [05:26<00:00, 

## Experiment with the LLM applied after RAG

In [10]:
# Same experiment grid as the baseline run, but with LLM results switched on
# and apply_llm_before_rag turned off.
llm_after_rag_options = {
    "use_cached_experiment": True,
    "enable_llm_results": True,
    "apply_llm_before_rag": False,
}
run_experiments(
    config,
    new_path,
    queries,
    list_of_embedding_models,
    list_of_llm_models,
    subset_ids,
    **llm_after_rag_options,
)

Embedding Models:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Training is set to True.
[INFO] Initializing cache.
[INFO] Getting dataset metadata from OpenML.



QUEUEING TASKS | : 100%|██████████| 5703/5703 [00:00<00:00, 164006.03it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
PROCESSING TASKS | : 100%|██████████| 5703/5703 [00:13<00:00, 417.71it/s]

COLLECTING RESULTS | : 100%|██████████| 5703/5703 [00:00<00:00, 1037434.00it/s]


[INFO] Saving metadata to file.
[INFO] Loading model...
[INFO] Model loaded.
[INFO] Generating unique documents. Total documents: 960
Number of unique documents: 0 vs Total documents: 960
No new documents to add.



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
 88%|████████▊ | 212/240 [05:09<00:40,  1.46s/it]
LLM Models:   0%|          | 0/1 [05:43<?, ?it/s]
Embedding Models:   0%|          | 0/3 [05:43<?, ?i

KeyboardInterrupt: 

## Load the results and evaluate

In [11]:
# Collect every results.csv that sits exactly one folder below the
# experiments directory.
experiment_dir = Path("../../data/experiments/")
results_pattern = experiment_dir / "*" / "results.csv"
csv_files = glob.glob(str(results_pattern))

In [13]:

def create_results_dict(csv_files, df_queries):
    """Count correctly and wrongly answered queries for each experiment.

    Every file in ``csv_files`` is read with ``pd.read_csv`` and must provide a
    ``query`` and a ``did`` column. A query counts as *correct* when at least
    one of the ``did`` values returned for it equals the true label of that
    query, otherwise as *wrong*. Both sides are compared as strings, so
    integer and string ids match each other.

    Args:
        csv_files: Iterable of paths to result CSV files. The name of the
            folder containing each file is used as the key of the result.
        df_queries: DataFrame with a ``query`` column whose second column
            (by position) holds the true label of the query. If a query
            occurs more than once, the first occurrence is used.

    Returns:
        dict mapping folder name -> ``{"correct": int, "wrong": int}``.

    Raises:
        IndexError: If a result file contains a query that does not occur in
            ``df_queries``.
    """
    # Build the query -> true-label lookup once instead of scanning df_queries
    # for every single result row.
    # NOTE(review): only the first label of a duplicated query is kept. If
    # several datasets can share one query text, predictions for the other
    # datasets are counted as wrong -- confirm this is intended.
    first_occurrences = df_queries.drop_duplicates(subset="query", keep="first")
    true_label_by_query = {
        query: str(label)
        for query, label in zip(
            first_occurrences["query"], first_occurrences.iloc[:, 1]
        )
    }
    known_queries = list(true_label_by_query)

    results_dict = {}
    for exp_path in csv_files:
        folder_name = Path(exp_path).parent.name
        exp = pd.read_csv(exp_path)

        # Fail with a message that names the offending file and queries.
        is_known = exp["query"].isin(known_queries)
        if not is_known.all():
            unknown = exp.loc[~is_known, "query"].unique().tolist()
            raise IndexError(
                f"{exp_path}: queries not found in df_queries: {unknown[:5]}"
            )

        # Predictions are compared as strings, like the true labels above.
        predicted_ids = exp["did"].astype(str)

        correct, wrong = 0, 0
        # A single pass over the groups replaces filtering the frame per query.
        for query, predictions in predicted_ids.groupby(exp["query"], sort=False):
            if true_label_by_query[query] in set(predictions):
                correct += 1
            else:
                wrong += 1
        results_dict[folder_name] = {"correct": correct, "wrong": wrong}
    return results_dict

In [14]:
# Per-experiment counts of correct / wrong queries, keyed by experiment folder name.
results_dict = create_results_dict(csv_files, df_queries)

In [15]:
# One row per experiment folder, with its "correct" and "wrong" counts as columns.
pd.DataFrame.from_dict(results_dict, orient="index")

Unnamed: 0,correct,wrong
Snowflake_snowflake-arctic-embed-l_llm_none,149,51
BAAI_bge-large-en-v1.5_llm_none,172,28
BAAI_bge-base-en-v1.5_llm_none,172,28


In [None]:
# Write the per-experiment summary table (one row per experiment folder) to disk.
results_summary = pd.DataFrame.from_dict(results_dict, orient="index")
results_summary.to_csv("../../data/experiments/results.csv")