# Tutorial on using multiple models for evaluation
- This tutorial uses the file `multiple_model_utils.py` (in the same directory as this one) which contains the functions for training and evaluating multiple models. Please refer to the file for the implementation details!

In [8]:
from __future__ import annotations
from langchain_community.cache import SQLiteCache
import os
import sys
import chromadb
from pathlib import Path
from tqdm import tqdm

import pandas as pd
# change the path to the backend directory
sys.path.append(os.path.join(os.path.dirname("."), '../../backend/'))
from modules.utils import load_config_and_device
from modules.results_gen import get_result_from_query

In [None]:
from multiple_model_utils import setup_vector_db_and_qa, ollama_setup, process_embedding_model_name_hf, process_llm_model_name_ollama

In [4]:
# Location of the backend package, relative to this notebook
new_path = Path("../../backend/")

# Load the backend configuration in training mode (this also reports the device found)
config = load_config_and_device(str(new_path / "config.json"), training=True)

# Work on datasets and keep the training flag switched on for the experiments below
config["type_of_data"] = "dataset"
config["training"] = True

[INFO] Finding device.
[INFO] Device found: mps


## Defining the models used
- Embedding models can be any model from the Hugging Face Hub
- LLMs can be any model from the Ollama library

In [5]:
# Embedding models to evaluate
list_of_embedding_models = [
    "BAAI/bge-large-en-v1.5",
]
# LLMs to evaluate; every embedding model is paired with every LLM
list_of_llm_models = [
    "qwen2:1.5b",
    "phi3",
    "llama3",
]

## Downloading the LLM models
- Please make sure you have downloaded and installed Ollama before running the next cell
- Linux/Unix : ```curl -fsSL https://ollama.com/install.sh | sh```

In [None]:
# Set up every LLM in `list_of_llm_models` through Ollama before running the experiments
# (the helper is imported from multiple_model_utils.py; see that file for details)
ollama_setup(list_of_llm_models)

## Aggregate multiple queries, count and save the results
- This is part of the library already but is repeated here for clarity

In [None]:
def aggregate_multiple_queries_and_count(
    queries, qa_dataset, config, group_cols=None, sort_by="query", count=True
) -> pd.DataFrame:
    """
    Description: Aggregate the results of multiple queries into a single dataframe and,
    optionally, count the number of times a dataset appears in the results

    Input:
        queries: List of queries; each one is passed to `get_result_from_query`
        qa_dataset: Retrieval object forwarded as `qa` to `get_result_from_query`
        config: Configuration mapping; the "llm_model" and "embedding_model" entries
            are copied into every result row
        group_cols: List of columns to keep from each result and to group by
            (defaults to ["id", "name"])
        sort_by: Column used to sort the counted results in descending order
            (only used when `count` is True)
        count: If True, group by `group_cols` and count the rows per group;
            if False, return the concatenated per-query rows unchanged

    Returns: Combined dataframe with the results of all queries
    """
    # A list literal as a default argument would be shared between calls, so the
    # default is resolved here instead
    if group_cols is None:
        group_cols = ["id", "name"]

    per_query_frames = []
    for query in tqdm(queries, total=len(queries)):
        result_data_frame, _ = get_result_from_query(
            query=query, qa=qa_dataset, type_of_query="dataset", config=config
        )
        # `assign` returns a new frame, so the provenance columns are never written
        # onto a slice of the original result (avoids SettingWithCopyWarning)
        per_query_frames.append(
            result_data_frame[group_cols].assign(
                query=query,
                llm_model=config["llm_model"],
                embedding_model=config["embedding_model"],
            )
        )

    if per_query_frames:
        # Concatenate once instead of growing a dataframe inside the loop
        combined_df = pd.concat(per_query_frames)
    else:
        # No queries: return an empty frame that still has the expected columns so
        # the grouping below does not fail on missing keys
        expected_cols = [*group_cols, "query", "llm_model", "embedding_model"]
        combined_df = pd.DataFrame(columns=list(dict.fromkeys(expected_cols)))

    if count:
        combined_df = (
            combined_df.groupby(group_cols)
            .count()
            .reset_index()
            .sort_values(by=sort_by, ascending=False)
        )

    return combined_df

## Override the original script to take a list of IDs as input
- The original script creates a subset of the dataset for testing, but here we want to give it a list of dataset IDs to test on. 
- So, we also disable the test_subset behavior and use a modified `setup_vector_db_and_qa` function

In [None]:
def run_experiments_and_save_results(
    config, new_path, list_of_embedding_models, list_of_llm_models, queries=None
):
    """
    Description: Run one experiment per (embedding model, LLM) combination and save
    the configuration and the aggregated query results of each experiment to disk

    For every combination this function:
        - creates `<new_path>/../data/experiments/<embedding>_<llm>/`
        - writes the configuration used to `config.csv` in that directory
        - builds the vector store / QA object with `setup_vector_db_and_qa`
        - runs all queries and writes the combined rows to `results.csv`

    Input:
        config: Configuration mapping. It is modified IN PLACE: "embedding_model",
            "llm_model", "data_dir" and "persist_dir" are overwritten for every
            combination; "type_of_data" is read
        new_path: Path of the backend directory; experiment directories are created
            relative to it
        list_of_embedding_models: Embedding model names to evaluate
        list_of_llm_models: LLM names to evaluate
        queries: List of queries to evaluate. If None, a module-level variable named
            `queries` is used, which is what earlier versions of this function
            relied on implicitly

    Returns: None

    Raises:
        NameError: if `queries` is not passed and no module-level `queries` exists
    """
    if queries is None:
        # Resolve the fallback up front so that a missing `queries` fails before any
        # model is set up, rather than after the first (expensive) vector DB build
        try:
            queries = globals()["queries"]
        except KeyError:
            raise NameError(
                "name 'queries' is not defined: pass `queries=` or define a "
                "module-level `queries` list before calling "
                "run_experiments_and_save_results"
            ) from None

    for embedding_model in tqdm(list_of_embedding_models, desc="Embedding Models", total=len(list_of_embedding_models)):
        for llm_model in tqdm(list_of_llm_models, desc="LLM Models", total=len(list_of_llm_models)):
            # update the config with the new embedding and llm models
            config["embedding_model"] = embedding_model
            config["llm_model"] = llm_model

            # create a new experiment directory using a combination of the embedding model and llm model names
            experiment_name = f"{process_embedding_model_name_hf(embedding_model)}_{process_llm_model_name_ollama(llm_model)}"
            experiment_path = new_path / Path(f"../data/experiments/{experiment_name}")

            # create the experiment directory if it does not exist
            experiment_path.mkdir(parents=True, exist_ok=True)

            # update the config with the new experiment directories
            config["data_dir"] = str(experiment_path)
            config["persist_dir"] = str(experiment_path / "chroma_db")

            # save training details and config in a dataframe
            config_df = pd.DataFrame.from_dict(config, orient="index").reset_index()
            config_df.columns = ["Hyperparameter", "Value"]
            config_df.to_csv(experiment_path / "config.csv", index=False)

            # load the persistent database using ChromaDB
            client = chromadb.PersistentClient(path=config["persist_dir"])

            # Run "training"
            qa_dataset = setup_vector_db_and_qa(
                config=config, data_type=config["type_of_data"], client=client
            )

            # Run an evaluation by aggregating the results of multiple queries
            # (count=False keeps one row per retrieved result instead of counts)
            # TODO : Replace this evaluation with a more meaningful one
            combined_df = aggregate_multiple_queries_and_count(
                queries,
                qa_dataset=qa_dataset,
                config=config,
                group_cols=["id", "name"],
                sort_by="query",
                count=False,
            )

            # TODO : ADD LLM evaluation here when the function is ready

            combined_df.to_csv(experiment_path / "results.csv")


## Setup evaluation data