## Environment

**Check if docker compose is installed and the docker daemon is running**

In [None]:
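# Both commands discard stdout, so they print nothing on success; any error shown means docker compose is missing or the docker daemon is not running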
!docker compose version >/dev/null
!docker info >/dev/null

**Install Fondant**

In [None]:
!pip install -r ../requirements.txt

**Start the Weaviate vector database**

In [None]:
# If you are using a MacBook with an M1 (Apple Silicon) processor, the docker default
# platform has to be linux/amd64. The variable is set unconditionally here, for this
# kernel process and the commands it launches.
import os
os.environ["DOCKER_DEFAULT_PLATFORM"]="linux/amd64"

In [None]:
# Run Weaviate with Docker compose.
# `--detach` starts the containers in the background so this cell returns immediately.
!docker compose -f weaviate/docker-compose.yaml up --detach

In [None]:
# Make sure the vectorDB is running and accessible.
# Fetching the schema doubles as a connectivity check against localhost:8080;
# the returned schema is displayed as the cell output.
import weaviate
local_weaviate_client = weaviate.Client("http://localhost:8080")
local_weaviate_client.schema.get()

## Grid-Search (work in progress)

**Import the pipelines creator and the pipeline runner**

In [None]:
from fondant.pipeline.runner import DockerRunner
import pipeline_index, pipeline_eval

**Create the functions to run the different pipelines and output the results**

In [None]:
# get Host IP address
import socket

def get_host_ip():
    """Return the IP address of the local interface used for outbound traffic.

    A UDP socket is "connected" to a public address (8.8.8.8:80) and the local
    address chosen for that route is read back with ``getsockname()``.

    Returns:
        The host IP address as a string, or ``None`` if it could not be
        determined (the error is printed, not raised).
    """
    try:
        # The `with` block closes the socket on every path. It also avoids touching
        # an unbound variable when `socket.socket()` itself fails, which previously
        # turned a handled error into an UnboundLocalError in the `finally` clause.
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
            s.connect(("8.8.8.8", 80))
            return s.getsockname()[0]
    except Exception as e:
        print(f"Error while retrieving host IP address: {e}")
        return None

# Resolve the host IP once; later cells use `host_ip` to build the Weaviate URL
# passed to the pipelines (it is None if the lookup failed).
host_ip = get_host_ip()
print(f"Host IP address: {host_ip}")

In [None]:
# index pipeline runner
def run_indexing_pipeline(runner, index_pipeline, host_ip, weaviate_class_name):
    """Run the indexing pipeline and return the schema of the Weaviate class it wrote to.

    Args:
        runner: object exposing ``run(pipeline)``.
        index_pipeline: the pipeline handed to ``runner.run``.
        host_ip: address under which Weaviate is reachable on port 8080.
        weaviate_class_name: name of the Weaviate class to look up afterwards.

    Returns:
        Whatever ``schema.get(weaviate_class_name)`` returns for a client connected
        to ``http://{host_ip}:8080``.
    """
    runner.run(index_pipeline)
    weaviate_url = f"http://{host_ip}:8080"
    schema_api = weaviate.Client(weaviate_url).schema
    return schema_api.get(weaviate_class_name)

In [None]:
# eval pipeline runner
def run_evaluation_pipeline(runner, eval_pipeline, extra_volumes):
    """Run the evaluation pipeline with additional volumes mounted.

    Args:
        runner: object exposing ``run(input=..., extra_volumes=...)``.
        eval_pipeline: the pipeline passed as ``input``.
        extra_volumes: volume specifications forwarded unchanged to the runner.
    """
    run_kwargs = {"input": eval_pipeline, "extra_volumes": extra_volumes}
    runner.run(**run_kwargs)

In [None]:
# Read latest chosen component
import os
from datetime import datetime

import pandas as pd


def read_latest_data(base_path: str, pipeline_name: str, component_name: str):
    """Load the output of one component from the most recent run of a pipeline.

    Run folders are looked up in ``{base_path}/{pipeline_name}``. A folder is a
    candidate when its name starts with ``pipeline_name`` and its
    ``component_name`` sub-folder contains at least one ``.parquet`` file. The
    newest candidate is chosen by the timestamp at the end of the folder name
    (see ``extract_timestamp``).

    Args:
        base_path: directory that contains one sub-directory per pipeline.
        pipeline_name: name of the pipeline (and prefix of its run folders).
        component_name: name of the component folder inside a run folder.

    Returns:
        A DataFrame with all Parquet files of the component concatenated
        (index reset), or ``None`` when the pipeline directory does not exist
        or no run has Parquet output for the component.

    Raises:
        ValueError: if a candidate folder name does not end in a
            ``%Y%m%d%H%M%S`` timestamp.
    """
    data_directory = f"{base_path}/{pipeline_name}"

    # Nothing has been written for this pipeline yet.
    if not os.path.isdir(data_directory):
        return None

    # Keep run folders of this pipeline that have Parquet output for the component.
    candidate_runs = [
        entry
        for entry in os.listdir(data_directory)
        if entry.startswith(pipeline_name)
        and has_parquet_file(data_directory, entry, component_name)
    ]
    # Previously an empty list raised IndexError on `sorted(...)[0]`.
    if not candidate_runs:
        return None

    latest_folder = max(candidate_runs, key=extract_timestamp)
    component_folder = os.path.join(data_directory, latest_folder, component_name)

    frames = [
        pd.read_parquet(os.path.join(component_folder, file_name))
        for file_name in _list_parquet_files(component_folder)
    ]
    return pd.concat(frames, ignore_index=True)


def _list_parquet_files(folder):
    """Return the names of the ``.parquet`` files directly inside `folder`, sorted.

    Sorting makes the row order of the concatenated result deterministic;
    ``os.listdir`` returns entries in arbitrary order.
    """
    return sorted(name for name in os.listdir(folder) if name.endswith(".parquet"))


def has_parquet_file(data_directory, entry, component_name):
    """Tell whether ``data_directory/entry/component_name`` is a folder holding a Parquet file."""
    component_folder = os.path.join(data_directory, entry, component_name)
    # isdir() is also False when the path does not exist at all.
    if not os.path.isdir(component_folder):
        return False
    return bool(_list_parquet_files(component_folder))


def extract_timestamp(folder_name):
    """Parse the trailing ``-%Y%m%d%H%M%S`` part of a run folder name into a datetime.

    Raises:
        ValueError: if the text after the last ``-`` is not in that format.
    """
    timestamp_str = folder_name.split("-")[-1]
    return datetime.strptime(timestamp_str, "%Y%m%d%H%M%S")

**Run the Grid-Search**

In [None]:
# grid-search/iterative loop
import itertools
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)

# Define evaluation dataset to load (csv file with a "question" column).
# The folder is mounted into the pipeline containers at /data.
local_folder_absolute_path = "fondant-usecase-RAG/src/local_file" #TODO Replace with absolute path
extra_volumes = [f"{local_folder_absolute_path}:/data"]

# Define the values for grid search
chunk_sizes = [256, 512]
chunk_overlaps = [10, 50]
# Each embedding model is a (provider, model name) pair
embed_models = [("huggingface","all-MiniLM-L6-v2"), ("huggingface", "BAAI/bge-small-en")]
top_ks = [2, 5]

# Fixed parameters shared by the indexing and the evaluation pipelines
fixed_args = {
    "pipeline_dir":"./data-dir",
    "weaviate_url":f"http://{host_ip}:8080", # IP address
}
# Fixed parameters of the indexing pipeline
fixed_index_args = {
    "hf_dataset_name":"wikitext@~parquet",
    "data_column_name":"text",
    "n_rows_to_load":1000,
}
# Fixed parameters of the evaluation pipeline
fixed_eval_args = {
    "csv_dataset_uri":"/data/wikitext_1000_q.csv", #make sure it is the same as mounted file
    "csv_column_separator":";",
    "question_column_name":"question",
    "module": "langchain.llms",
    "llm_name":"OpenAI",
    # Read the key from the environment instead of pasting it into the notebook.
    # Falls back to an empty string when OPENAI_API_KEY is not set.
    # `os` is imported in an earlier cell.
    "llm_kwargs":{"openai_api_key": os.environ.get("OPENAI_API_KEY", "")}, #TODO Set OPENAI_API_KEY if you're using OpenAI
    "metrics":["context_precision", "context_relevancy"]
}

# Define pipeline runner
runner = DockerRunner()

# Placeholder; the evaluation loop builds a fresh results dictionary per RAG configuration
results_dict = {}

# Perform grid search: one indexing run per (chunk_size, chunk_overlap, embed_model)
# combination, each written to its own Weaviate class.
indexes = []
index_grid = itertools.product(chunk_sizes, chunk_overlaps, embed_models)
for i, (chunk_size, chunk_overlap, embed_model) in enumerate(index_grid, start=1):
    index_config_class_name = f"IndexConfig{i}"
    logging.info(f"Running indexing for {index_config_class_name} with chunk_size={chunk_size}, chunk_overlap={chunk_overlap}, embed_model={embed_model}")

    # Remember the configuration so the evaluation loop can reuse it
    index_dict = {
        "index_name": index_config_class_name,
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "embed_model": embed_model,
    }
    indexes.append(index_dict)

    # Build the indexing pipeline for this configuration ...
    index_specific_args = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "embed_model_provider": embed_model[0],
        "embed_model": embed_model[1],
        "weaviate_class_name": index_config_class_name,
    }
    indexing_pipeline = pipeline_index.create_pipeline(
        **fixed_args,
        **fixed_index_args,
        **index_specific_args,
    )
    # ... and run it
    run_indexing_pipeline(
        runner=runner,
        index_pipeline=indexing_pipeline,
        host_ip=host_ip,
        weaviate_class_name=index_config_class_name,
    )

    
# Evaluate every index with every top_k value; one entry per RAG configuration
grid_search_results = []
rag_grid = itertools.product(indexes, top_ks)
for i, (index_dict, top_k) in enumerate(rag_grid, start=1):
    rag_config_name = f"RAGConfig{i}"
    logging.info(f"Running evaluation for {rag_config_name} with {index_dict['index_name']} and {top_k} retrieved chunks")

    # Store RAG pipeline configuration (name, index settings, top_k)
    results_dict = {"rag_config_name": rag_config_name, **index_dict, "top_k": top_k}

    # Create and run the evaluation pipeline against the index's Weaviate class
    provider, model_name = index_dict["embed_model"][0], index_dict["embed_model"][1]
    evaluation_pipeline = pipeline_eval.create_pipeline(
        **fixed_args,
        **fixed_eval_args,
        embed_model_provider=provider,
        embed_model=model_name,
        weaviate_class_name=index_dict["index_name"],
        top_k=top_k,
    )
    run_evaluation_pipeline(
        runner=runner,
        eval_pipeline=evaluation_pipeline,
        extra_volumes=extra_volumes,
    )

    # Attach the aggregated evaluation output of the run that just finished
    aggregated_results = read_latest_data(
        base_path="./data-dir",
        pipeline_name="evaluation-pipeline",
        component_name="aggregate_eval_results",
    )
    results_dict[f"agg_results_{rag_config_name}"] = aggregated_results
    grid_search_results.append(results_dict)

# Print the results of every evaluated RAG configuration.
# (Iterating `results_dict` only showed the last configuration, and "/n" printed
# the two characters literally instead of a blank line.)
for results in grid_search_results:
    for config, result in results.items():
        print(config)
        print(result)
    print("\n")

## Exploring the dataset

You can explore your results using the Fondant explorer, which lets you visualize your output dataset at each component step. It might take a while to start the first time, as it needs to download the explorer docker image first.

Enjoy the exploration! 🍫 

In [None]:
from fondant.explore import run_explorer_app

# Point the explorer at the same directory that was passed to the pipelines as `pipeline_dir`
run_explorer_app(base_path=fixed_args["pipeline_dir"])

**Read Latest Evaluated Pipeline Score**

You can also read the results for each RAG configuration that was run.

In [None]:
# One "Results:" section per RAG configuration, separated by a ruler line
separator = "\n" + "=" * 30 + "\n"
for results in grid_search_results:
    print("Results:")
    for key, value in results.items():
        if not isinstance(value, pd.DataFrame):
            print(f"  {key}: {value}")
            continue
        # DataFrames are printed on their own lines below the key
        print(f"  {key}:")
        print(value)
    print(separator)

## Clean up your environment

After your pipelines have run successfully, you should clean up your environment and stop the Weaviate database.

In [None]:
!docker compose -f weaviate/docker-compose.yaml down