In [None]:
# installation
# Sanity checks: normal output is discarded, so any message printed here is an
# error from Docker (compose plugin missing, or the daemon is not running).
!docker compose version >/dev/null
!docker info >/dev/null
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install -r ../requirements.txt

In [None]:
# Pin Docker's default platform to linux/amd64; this is required on MacBooks
# with an M1 processor. Note that the variable is set unconditionally.
import os

os.environ.update({"DOCKER_DEFAULT_PLATFORM": "linux/amd64"})

In [None]:
# imports
from fondant.pipeline import ComponentOp, Pipeline
from pathlib import Path

from fondant.pipeline.runner import DockerRunner
import pipeline_index, pipeline_eval

In [None]:
# Make sure to run the weaviate instance
# The client below targets http://localhost:8080, so the instance has to be
# reachable at that address from the machine running this notebook.
import weaviate
local_weaviate_client = weaviate.Client("http://localhost:8080")

In [None]:
# Keyword arguments shared by the indexing and the evaluation pipeline.
fixed_args = dict(
    pipeline_dir="./data-dir",
    embed_model_provider="huggingface",
    embed_model="all-MiniLM-L6-v2",
    # Placeholder: only the port is given; the IP address still has to be
    # filled in front of it (TODO confirm the exact URL format expected).
    weaviate_url=":8080",
    weaviate_class_name="Pipeline_2",
)

In [None]:
# set-up the pipeline
# Settings that only apply to indexing; the shared ones come from fixed_args.
indexing_args = dict(
    hf_dataset_name="wikitext@~parquet",
    data_column_name="text",
    n_rows_to_load=1000,
    chunk_size=256,
    chunk_overlap=16,
)

# Both dictionaries are expanded into one set of keyword arguments.
indexing_pipeline = pipeline_index.create_pipeline(
    **fixed_args,
    **indexing_args,
)

# indexing_pipeline = pipeline_index.create_pipeline(
#     pipeline_dir="./data-dir",
#     embed_model_provider="huggingface",
#     embed_model="all-MiniLM-L6-v2",
#     weaviate_url=":8080", # IP address 
#     weaviate_class_name="Pipeline_1",
#     hf_dataset_name="wikitext@~parquet",
#     data_column_name="text",
#     n_rows_to_load=1000,
#     chunk_size=512,
#     chunk_overlap=32
# )

In [None]:
# run the pipeline
def run_indexing_pipeline(runner, index_pipeline, docker_weaviate_client, weaviate_class_name):
    """Execute the indexing pipeline, then look up a Weaviate class schema.

    Args:
        runner: object whose ``run`` method executes a pipeline.
        index_pipeline: pipeline handed to ``runner.run``.
        docker_weaviate_client: client whose ``schema.get`` is queried afterwards.
        weaviate_class_name: name of the class to look up.

    Returns:
        Whatever ``docker_weaviate_client.schema.get(weaviate_class_name)`` returns.
    """
    runner.run(index_pipeline)
    class_schema = docker_weaviate_client.schema.get(weaviate_class_name)
    return class_schema

runner = DockerRunner()
# NOTE(review): hard-coded address of the Weaviate instance; adjust it to the
# host that is actually running Weaviate on your machine.
docker_weaviate_client = weaviate.Client("http://192.168.64.1:8080")
# Look up the class the indexing pipeline was configured with (fixed_args)
# instead of a separate hard-coded name that can drift out of sync with it.
weaviate_class_name = fixed_args["weaviate_class_name"]

run_indexing_pipeline(
    runner=runner,
    index_pipeline=indexing_pipeline,
    docker_weaviate_client=docker_weaviate_client,
    weaviate_class_name=weaviate_class_name,
)

In [None]:
# set-up the pipeline
evaluation_args = {
    # Path as seen inside the containers: must match the file in the mounted volume.
    "csv_dataset_uri": "/data/wikitext_1000_q.csv",
    "csv_column_separator": ";",
    "question_column_name": "question",
    "top_k": 3,
    "llm_name": "OpenAI",
    # Read the key from the environment so it never has to be pasted into the
    # notebook; falls back to the previous empty string when the variable is unset.
    "llm_kwargs": {"openai_api_key": os.environ.get("OPENAI_API_KEY", "")},
    "metrics": ["context_precision", "context_relevancy"],
}

# Shared settings (fixed_args) and evaluation settings form one keyword set.
evaluation_pipeline = pipeline_eval.create_pipeline(**fixed_args, **evaluation_args)

# evaluation_pipeline = pipeline_eval.create_pipeline(
#     pipeline_dir="./data-dir",
#     embed_model_provider="huggingface",
#     embed_model="all-MiniLM-L6-v2",
#     weaviate_url=":8080", # IP address 
#     weaviate_class_name="Pipeline_1",
#     csv_dataset_uri="/data/wikitext_1000_q.csv", #make sure it is the same as mounted file
#     csv_column_separator=";",
#     question_column_name="question",
#     top_k=3,
#     llm_name="OpenAI",
#     llm_kwargs={"openai_api_key": ""},
#     metrics=["context_precision", "context_relevancy"]
# )


In [None]:
# run the pipeline
def run_evaluation_pipeline(runner, eval_pipeline, extra_volumes):
    """Execute the evaluation pipeline with additional volumes.

    Args:
        runner: object whose ``run`` method executes a pipeline.
        eval_pipeline: pipeline passed to ``runner.run`` as ``input``.
        extra_volumes: value forwarded to ``runner.run`` as ``extra_volumes``.

    Returns:
        None; the result of ``runner.run`` is not propagated.
    """
    run_options = {"input": eval_pipeline, "extra_volumes": extra_volumes}
    runner.run(**run_options)

runner = DockerRunner()
# Mount the folder holding the evaluation CSV at /data inside the containers.
# The host path is derived from the working directory instead of a
# user-specific absolute path.
# NOTE(review): assumes the notebook runs from the directory that contains
# `local_file` — confirm, or point this at the right folder.
local_data_dir = Path("local_file").resolve()
extra_volumes = [f"{local_data_dir}:/data"]
run_evaluation_pipeline(runner, evaluation_pipeline, extra_volumes=extra_volumes)

In [None]:
# Read latest chosen component
import os
from datetime import datetime

import pandas as pd


def read_latest_data(base_path: str, pipeline_name: str, component_name: str):
    """Load the output of one component from the most recent pipeline run.

    Run folders are searched in ``<base_path>/<pipeline_name>``. A folder
    qualifies when its name starts with ``pipeline_name`` and its
    ``component_name`` sub-folder holds at least one ``.parquet`` file. Among
    the qualifying folders, the one with the newest timestamp (as parsed by
    ``extract_timestamp``) is read.

    Args:
        base_path: directory that contains the per-pipeline folders.
        pipeline_name: pipeline folder name, also the prefix of its run folders.
        component_name: component sub-folder whose Parquet files are read.

    Returns:
        One DataFrame with all Parquet files of the component concatenated
        (index reset), or ``None`` when the pipeline directory does not exist
        or no run contains Parquet output for the component.
    """
    data_directory = f"{base_path}/{pipeline_name}"

    # Nothing has been written yet (e.g. the pipeline never ran): no data.
    if not os.path.isdir(data_directory):
        return None

    # Keep run folders of this pipeline that hold Parquet output for the component.
    run_folders = [
        entry
        for entry in os.listdir(data_directory)
        if entry.startswith(pipeline_name)
        and os.path.isdir(os.path.join(data_directory, entry))
        and has_parquet_file(data_directory, entry, component_name)
    ]

    # Guard before selecting: picking the latest of an empty list would raise.
    if not run_folders:
        return None

    latest_folder = max(run_folders, key=extract_timestamp)
    component_folder = os.path.join(data_directory, latest_folder, component_name)

    # Sorted so the row order of the result does not depend on listing order.
    parquet_files = sorted(
        name for name in os.listdir(component_folder) if name.endswith(".parquet")
    )
    if not parquet_files:
        return None

    frames = [
        pd.read_parquet(os.path.join(component_folder, name))
        for name in parquet_files
    ]
    return pd.concat(frames, ignore_index=True)


def has_parquet_file(data_directory, entry, component_name):
    """Tell whether a component folder of a run contains Parquet output.

    Returns ``True`` only when ``<data_directory>/<entry>/<component_name>`` is
    a directory holding at least one file whose name ends with ``.parquet``.
    """
    component_folder = os.path.join(data_directory, entry, component_name)
    # isdir() is False for missing paths too, so it covers the existence check.
    if not os.path.isdir(component_folder):
        return False
    return any(name.endswith(".parquet") for name in os.listdir(component_folder))


def extract_timestamp(folder_name):
    """Parse the trailing ``YYYYmmddHHMMSS`` stamp of a folder name.

    The text after the last ``-`` (or the whole name when there is no ``-``)
    is parsed with ``datetime.strptime``, which raises ``ValueError`` when it
    does not match the format.
    """
    _, _, timestamp_str = folder_name.rpartition("-")
    return datetime.strptime(timestamp_str, "%Y%m%d%H%M%S")

In [None]:
# Load the `aggregate_eval_results` output of the most recent
# `evaluation-pipeline` run; as the last expression it is displayed by the cell.
pipeline_dir = "./data-dir"
read_latest_data(
    base_path=pipeline_dir,
    pipeline_name="evaluation-pipeline",
    component_name="aggregate_eval_results",
)

## Grid search (experimental — work in progress)

In [None]:
# grid-search/iterative loop
# Relies on objects created in earlier cells: `runner`, `docker_weaviate_client`,
# `extra_volumes`, `run_indexing_pipeline`, `run_evaluation_pipeline`,
# `read_latest_data`, and the `pipeline_index` / `pipeline_eval` modules.
import itertools
import logging
import os

# Configure logging
logging.basicConfig(level=logging.INFO)

# Define the values for grid search
chunk_sizes = [128, 256, 512]
chunk_overlaps = [10, 25, 50]
# Each entry is an (embed_model_provider, embed_model) pair.
embed_models = [
    ("huggingface", "all-MiniLM-L6-v2"),
    ("huggingface", "BAAI/bge-small-en"),
    ("huggingface", "BAAI/bge-large-zh-v1.5"),
]
top_k = [2, 3, 5]

# Fixed parameters
pipeline_dir = "./data-dir"
hf_dataset_name = "wikitext@~parquet"
data_column_name = "text"
n_rows_to_load = 1000
weaviate_url = "http://host.docker.internal:8080"
csv_dataset_uri = "/data/wikitext_1000_q.csv"
csv_column_separator = ";"
question_column_name = "question"
llm_name = "OpenAI"
# Key comes from the environment; empty string when OPENAI_API_KEY is unset.
llm_kwargs = {"openai_api_key": os.environ.get("OPENAI_API_KEY", "")}
metrics = ["context_precision", "context_relevancy"]

# Results dictionary to store results for each iteration, keyed by
# (chunk_size, chunk_overlap, (provider, model), top_k).
results_dict = {}

# Perform grid search. Indexing only depends on chunking and embedding settings,
# so each index is built once and then evaluated for every top_k value.
index_grid = itertools.product(chunk_sizes, chunk_overlaps, embed_models)
for i, (chunk_size, chunk_overlap, embed_model) in enumerate(index_grid, start=1):
    embed_model_provider, embed_model_name = embed_model
    indexing_class_name = f"Index_{i}"
    logging.info(
        "Running indexing for iteration %s with chunk_size=%s, chunk_overlap=%s, embed_model=%s",
        i, chunk_size, chunk_overlap, embed_model,
    )

    grid_indexing_pipeline = pipeline_index.create_pipeline(
        pipeline_dir=pipeline_dir,
        embed_model_provider=embed_model_provider,
        embed_model=embed_model_name,
        weaviate_url=weaviate_url,
        weaviate_class_name=indexing_class_name,
        hf_dataset_name=hf_dataset_name,
        data_column_name=data_column_name,
        n_rows_to_load=n_rows_to_load,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    run_indexing_pipeline(
        runner=runner,
        index_pipeline=grid_indexing_pipeline,
        docker_weaviate_client=docker_weaviate_client,
        weaviate_class_name=indexing_class_name,
    )

    # Run evaluation pipeline against the index that was just built
    for k in top_k:
        logging.info("Running evaluation for iteration %s with top_k=%s", i, k)

        grid_evaluation_pipeline = pipeline_eval.create_pipeline(
            pipeline_dir=pipeline_dir,
            embed_model_provider=embed_model_provider,
            embed_model=embed_model_name,
            weaviate_url=weaviate_url,
            weaviate_class_name=indexing_class_name,
            csv_dataset_uri=csv_dataset_uri,
            csv_column_separator=csv_column_separator,
            question_column_name=question_column_name,
            top_k=k,
            llm_name=llm_name,
            llm_kwargs=llm_kwargs,
            metrics=metrics,
        )
        run_evaluation_pipeline(runner, grid_evaluation_pipeline, extra_volumes=extra_volumes)

        # run_evaluation_pipeline returns nothing, so the results are read back
        # from the latest run on disk (a DataFrame, or None if nothing was written).
        results = read_latest_data(
            base_path=pipeline_dir,
            pipeline_name="evaluation-pipeline",
            component_name="aggregate_eval_results",
        )

        # Save the results in the dictionary
        results_dict[(chunk_size, chunk_overlap, embed_model, k)] = results

# Print or use the results as needed
print(results_dict)
