# Parsing Evaluation with Chroma

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from chunking_evaluation import GeneralEvaluation
from dotenv import load_dotenv
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

from lib.evaluation.chunking.chroma.chroma_adapter import ChromaChunker
from lib.chunking.methods.implementations.fixed_sized import FixedSizeChunker
from lib.chunking.methods.implementations.recursive import RecursiveChunker
from lib.chunking.methods.implementations.hierarchical import HierarchicalChunker
from lib.chunking.methods.implementations.semantic import SemanticChunker
from lib.parsing.methods.parsers import Parsers

In [None]:
# Experiment grid: every chunker family is evaluated at the same four token
# budgets. The mapping pairs each budget with the overlap used for the
# overlapping Fixed/Recursive variant (the other variant uses no overlap).
_overlap_by_budget = {128: 26, 256: 52, 512: 104, 1024: 208}

chunkers = []

# Fixed-size and recursive chunkers: per budget, first without overlap,
# then with the paired overlap.
for _chunker_cls in (FixedSizeChunker, RecursiveChunker):
    for _budget, _overlap in _overlap_by_budget.items():
        chunkers.append(_chunker_cls(max_tokens=_budget, overlap=0))
        chunkers.append(_chunker_cls(max_tokens=_budget, overlap=_overlap))

# Semantic chunkers: two similarity thresholds per budget; min_tokens is
# half of max_tokens.
for _budget in _overlap_by_budget:
    for _threshold in (90, 70):
        chunkers.append(
            SemanticChunker(
                max_tokens=_budget,
                similarity_threshold=_threshold,
                min_tokens=_budget // 2,
            )
        )

# Hierarchical chunkers: one per budget, constant parent token ratio.
chunkers.extend(
    HierarchicalChunker(max_tokens=_budget, max_parent_token_ratio=0.2)
    for _budget in _overlap_by_budget
)

In [None]:
# Wrap every chunker in ChromaChunker (imported from chroma_adapter).
# This cell rebinds `chunkers` to its own wrapped version, so executing it
# twice would wrap the adapters again; entries that are already
# ChromaChunker instances are therefore passed through unchanged, which
# makes the cell safe to re-run.
chunkers = [
    c if isinstance(c, ChromaChunker) else ChromaChunker(c)
    for c in chunkers
]

In [None]:
# Read environment variables from a .env file before building the embedding
# function; the file must define "OPENAI_API_KEY".
load_dotenv()

# Embedding function handed to every evaluation run below.
embedding_function = OpenAIEmbeddingFunction(
    model_name="text-embedding-3-small",
)

## General Evaluation (predefined data sets)

In [None]:
# Run each chunker through the predefined evaluation and collect two tables:
#   results        - the scores returned by the run (without the per-corpus
#                    breakdown) merged with the chunker's info
#   pubmed_results - mean/std of every pubmed metric merged with the same info
evaluation = GeneralEvaluation()

results, pubmed_results = [], []

for chunker in chunkers:
    c_info = chunker.get_info()
    print(c_info)

    c_res = evaluation.run(chunker, embedding_function)

    # Detach the per-corpus breakdown from the run result and keep only the
    # pubmed part of it for the second table.
    pubmed_scores = c_res.pop("corpora_scores")["pubmed"]

    # Add method info to the remaining scores
    c_res.update(c_info)
    results.append(c_res)

    # Summarise each pubmed metric as mean and standard deviation; the
    # "_scores" part of the metric key is dropped from the column names.
    c_row = {}
    for metric, scores in pubmed_scores.items():
        stat_prefix = metric.replace("_scores", "")
        c_row.update(
            {
                f"{stat_prefix}_mean": np.mean(scores),
                f"{stat_prefix}_std": np.std(scores),
            }
        )

    c_row.update(c_info)
    pubmed_results.append(c_row)

In [None]:
from lib.utils.chroma_export import export_results

# Tabulate the per-chunker rows collected in `results` and hand the frame to
# export_results under the name "chroma_results_general".
# NOTE(review): export_results is defined outside this notebook; presumably it
# persists the frame and returns a combined table - confirm.
df = pd.DataFrame(results)
combined_df = export_results(df, "chroma_results_general")

In [None]:
# Tabulate the pubmed-only mean/std rows and hand the frame to export_results
# under the name "chroma_results_pubmed".
# NOTE(review): what export_results returns is not visible here; presumably a
# combined table - confirm.
pubmed_df = pd.DataFrame(pubmed_results)
combined_pubmed_df = export_results(pubmed_df, "chroma_results_pubmed")

## Synthetic Evaluation

In [None]:
from lib.evaluation.chunking.chroma.chroma_setup import (
    get_db_path,
    setup_evaluation_from_medical_qas,
)

# Parser used for the synthetic evaluation; both the chunk database path and
# the evaluation object are derived from it.
parser = Parsers.DOCLING

db_path = get_db_path(parser)

# NOTE(review): the *_exist_ok flags presumably allow reusing already parsed
# documents / generated questions instead of failing - confirm in chroma_setup.
synth_eval = setup_evaluation_from_medical_qas(
    parser,
    "awmf",
    parse_exist_ok=True,
    question_exist_ok=True,
)

In [None]:
# Run every chunker through the synthetic evaluation. `results` is rebound
# here, so it no longer holds the rows of the general evaluation.
results = []

for chunker in chunkers:
    c_info = chunker.get_info()
    print(c_info)

    c_res = synth_eval.run(chunker, embedding_function, retrieve=-1, db_to_save_chunks=db_path)

    # Drop the per-corpus breakdown, then merge the chunker's info into the
    # remaining scores to form one result row.
    c_res.pop("corpora_scores")
    c_res.update(c_info)
    results.append(c_res)

In [None]:
# Display the collected synthetic-evaluation rows as a table.
pd.DataFrame(results)

In [None]:
# Re-imported so this cell can run without the earlier export cells.
from lib.utils.chroma_export import export_results

# Tabulate the synthetic-evaluation rows and hand the frame to export_results.
# NOTE(review): "concology" in the export name looks like a typo for
# "oncology"; confirm before renaming, since anything that reads results by
# this name would have to change with it.
df = pd.DataFrame(results)
combined_df = export_results(df, "chroma_results_concology")

In [None]:
# Display the value returned by export_results for the synthetic evaluation.
combined_df