In [None]:
import ast
import json
import os
import re
from configparser import ConfigParser, ExtendedInterpolation

import boto3
import pandas as pd
from botocore.exceptions import ClientError

In [None]:
def calculate_accuracy(row):
    """Return the fraction of the expected documents that were retrieved.

    Args:
        row: Mapping-like record (for example a DataFrame row) with a
            ``document_id`` entry for the expected document ids and a
            ``retrieved_document_ids`` entry for the retrieved ones. Each
            value may be a Python-literal string such as ``"[1, 2]"`` or an
            already-parsed collection.

    Returns:
        ``len(expected & retrieved) / len(expected)`` as a float, or ``0.0``
        when the expected collection is empty.

    Raises:
        ValueError, SyntaxError: If a string value is not a valid Python literal.
    """

    def _parse_ids(value):
        # literal_eval only accepts literals, so a spreadsheet cell cannot run
        # arbitrary code the way eval() would allow.
        return ast.literal_eval(value) if isinstance(value, str) else value

    actual_docs = _parse_ids(row["document_id"])
    retrieved_docs = _parse_ids(row["retrieved_document_ids"])

    # Nothing was expected, so there is nothing to divide by.
    if len(actual_docs) == 0:
        return 0.0

    overlap = len(set(actual_docs) & set(retrieved_docs))
    return overlap / len(actual_docs)

def process_file(filepath):
    """Compute the mean per-row retrieval accuracy of one evaluation workbook.

    Args:
        filepath: Path of the Excel file to read.

    Returns:
        The mean of ``calculate_accuracy`` applied to every row.
    """
    frame = pd.read_excel(filepath)
    # Score each row and keep the scores as a column of the loaded frame.
    frame["accuracy"] = frame.apply(calculate_accuracy, axis=1)
    return frame["accuracy"].mean()

def extract_components_from_filename(filename):
    """Extract the embedding, LLM and dataset names from a results file name.

    The name is expected to look like
    ``...embedder_<embedding>_llm_<llm>_dataset_<dataset>.xlsx``.

    Args:
        filename: File name (not a full path is required) to parse.

    Returns:
        A ``(embedding, llm, dataset)`` tuple; any component whose pattern
        does not match is ``None``.
    """
    # One non-greedy capture per component, in the order they are returned.
    component_patterns = (
        r"embedder_(.*?)_llm_",
        r"_llm_(.*?)_dataset_",
        r"_dataset_(.*?)\.xlsx",
    )

    components = []
    for pattern in component_patterns:
        match = re.search(pattern, filename)
        components.append(match.group(1) if match else None)

    embedding, llm, dataset = components
    return embedding, llm, dataset

def process_directory(directory):
    """Score every reranked evaluation workbook found in ``directory``.

    Only files whose name starts with ``rag_evaluations_with_reank`` and ends
    with ``.xlsx`` are considered.

    Args:
        directory: Folder to scan (not recursive).

    Returns:
        A list with one dict per matching file, holding the file name, the
        embedding/LLM/dataset names parsed from it, and its average accuracy.
    """
    prefix = "rag_evaluations_with_reank"
    summaries = []
    for name in os.listdir(directory):
        is_target = name.startswith(prefix) and name.endswith(".xlsx")
        if not is_target:
            continue

        embedding, llm, dataset = extract_components_from_filename(name)
        avg_accuracy = process_file(os.path.join(directory, name))
        summaries.append(
            dict(
                filename=name,
                embedding_model=embedding,
                llm_model=llm,
                dataset=dataset,
                avg_accuracy=avg_accuracy,
            )
        )
    return summaries

# Score every reranked evaluation workbook in the results folder.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
directory_path = "/home/ubuntu/Multi-Agent-LLM-System-with-LangGraph-RAG-and-LangChain/src/my_rag/evaluations/results"
# One dict per matching file (file name, parsed model/dataset names, average accuracy).
results = process_directory(directory_path)
results_df = pd.DataFrame(results)
# Relative path: the CSV is written to the current working directory.
results_df.to_csv("accuracy_results.csv", index=False)

In [None]:
# Load the AWS credentials used for Bedrock from the project's INI file.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
config_path = "/home/ubuntu/Multi-Agent-LLM-System-with-LangGraph-RAG-and-LangChain/config/config.ini"
config = ConfigParser(interpolation=ExtendedInterpolation())
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it did parse, so fail here with a clear message instead of hitting
# an opaque KeyError on the section lookup below.
if not config.read(config_path):
    raise FileNotFoundError(f"Could not read config file: {config_path}")

# Credentials come from the [BedRock_LLM_API] section of the config file.
session = boto3.Session(
    aws_access_key_id=config["BedRock_LLM_API"]["aws_access_key_id"],
    aws_secret_access_key=config["BedRock_LLM_API"]["aws_secret_access_key"],
    aws_session_token=config["BedRock_LLM_API"]["aws_session_token"],
)
# Module-level client and model id; evaluate_answer() reads both as globals.
bedrock_client = session.client("bedrock-runtime", region_name="us-east-1")
model_id = "anthropic.claude-3-5-sonnet-20240620-v1:0"

In [None]:
def evaluate_answer(question, ideal_answer, llm_generated_answer, max_tokens=2):
    """Ask the Bedrock-hosted Claude model to grade one generated answer.

    Uses the module-level ``bedrock_client`` and ``model_id``.

    Args:
        question: The question that was asked.
        ideal_answer: The reference answer.
        llm_generated_answer: The answer to grade.
        max_tokens: Cap on the length of the model's reply. Defaults to 2.
            NOTE(review): a 2-token cap may cut off 'partially correct' —
            confirm against real responses and raise it if needed.

    Returns:
        The text of the first content item of the model's reply, or ``None``
        when the call fails with ``ClientError`` or the reply does not have
        the expected shape.
    """
    content = f"""
    {question}
    Evaluate the following answer for correctness:
    Ideal Answer: {ideal_answer}
    LLM Generated Answer: {llm_generated_answer}

    Classify the LLM Generated Answer as 'correct', 'partially correct' or 'incorrect' based on these criteria:
    - Correct: Contains all the specific information that answers the things directly asked in the question .
    - Partially Correct: if contains only partial information but contains only correct information that is if ideal answer has more content than ideal answer that is directly asked in the question
    - Incorrect: Fails to address the entire question or Misses specific details asked in the question or contains incorrect information. Ideal answer may contain more irrelavant information that does not make the llm generated answer incorrect if not asked in the question

    Provide only the classification.
    """
    body_content = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": max_tokens,
        "temperature": 0.1,
        "top_p": 0.9,
        "messages": [{"role": "user", "content": content}],
    }

    try:
        response = bedrock_client.invoke_model(
            modelId=model_id, body=json.dumps(body_content)
        )
    except ClientError as e:
        print(f"Error invoking Claude API: {e}")
        return None

    # Callers already treat None as "error", so a reply with an unexpected
    # shape is reported the same way instead of aborting a long batch run.
    try:
        response_body = response["body"].read().decode()
        return json.loads(response_body)["content"][0]["text"]
    except (KeyError, IndexError, TypeError, ValueError) as e:
        print(f"Unexpected response from Claude API: {e}")
        return None

def classify_response(text):
    """Map the grader's free-text reply onto a fixed label.

    Args:
        text: Reply text; matched case-insensitively.

    Returns:
        ``"incorrect"``, ``"partially_correct"``, ``"correct"`` or
        ``"unknown"`` when none of the phrases occurs.
    """
    lowered = text.lower()
    # Order matters: "incorrect" contains "correct", so it is tested first.
    phrase_to_label = (
        ("incorrect", "incorrect"),
        ("partially correct", "partially_correct"),
        ("correct", "correct"),
    )
    for phrase, label in phrase_to_label:
        if phrase in lowered:
            return label
    return "unknown"

# def process_files(directory):
#     """Process all files, evaluate answers, and save results."""
#     results = []
#     for file in os.listdir(directory):
#         if file.startswith(
#             "rag_evaluations"
#         ) and file.endswith(".xlsx"):
#             filepath = os.path.join(directory, file)
#             data = pd.read_excel(filepath)

#             embedding, llm, dataset = extract_components_from_filename(file)
#             evaluations = []

#             for _, row in data.iterrows():
#                 question = row["question"]
#                 ideal_answer = row["ideal_answer"]
#                 llm_generated_answer = row["llm_generated_answer"]
#                 response = evaluate_answer(question, ideal_answer, llm_generated_answer)
#                 classification = classify_response(response) if response else "error"
#                 evaluations.append(
#                     {
#                         "embedding_model": embedding,
#                         "reranking_model": "colbert" if "_with_reank" in file else "",
#                         "llm_model": llm,
#                         "dataset": dataset,
#                         "question": question,
#                         "ideal_answer": ideal_answer,
#                         "llm_answer": llm_generated_answer,
#                         "classification_and_explanation_text_by_claude": response,
#                         "classification": classification,
#                     }
#                 )

#             # Append evaluations to results
#             results.extend(evaluations)

#     # Save results to CSV
#     output_file = os.path.join(directory, "evaluated_results.csv")
#     results_df = pd.DataFrame(results)
#     results_df.to_csv(output_file, index=False)
#     print(f"Results saved to {output_file}")

In [28]:
def _load_processed_models(output_file):
    """Return the (embedding, llm, reranking_model, dataset) tuples already in the results CSV."""
    if not os.path.exists(output_file):
        # First run: nothing has been evaluated yet.
        return set()
    # Empty reranking_model cells are read back as NaN; normalise them to "".
    existing_df = pd.read_csv(output_file).fillna("")
    key_columns = ["embedding_model", "llm_model", "reranking_model", "dataset"]
    return set(
        existing_df[key_columns].drop_duplicates().itertuples(index=False, name=None)
    )


def process_files(directory, skip_processed=False):
    """Grade every answer in the evaluation workbooks and persist the results.

    Every file in ``directory`` whose name starts with ``rag_evaluations`` and
    ends with ``.xlsx`` is read, each row is graded with ``evaluate_answer``
    and labelled with ``classify_response``, and the rows are merged into
    ``evaluated_results.csv`` in the same directory after each file, so an
    interrupted run keeps the files that were already finished.

    Args:
        directory: Folder holding the workbooks; also receives the CSV.
        skip_processed: When True, files whose (embedding, llm,
            reranking_model, dataset) combination is already in the CSV are
            skipped without being read or re-graded. Defaults to False, which
            re-grades every file.

    Returns:
        None. Progress and the output path are printed.
    """
    output_file = os.path.join(directory, "evaluated_results.csv")
    processed_models = _load_processed_models(output_file)
    dedup_columns = [
        "embedding_model",
        "llm_model",
        "reranking_model",
        "dataset",
        "question",
        "ideal_answer",
        "llm_answer",
    ]

    # Sorted so that the processing order is the same on every run.
    for file in sorted(os.listdir(directory)):
        if not (file.startswith("rag_evaluations") and file.endswith(".xlsx")):
            continue
        print(file)

        embedding, llm, dataset = extract_components_from_filename(file)
        reranking_model = "colbert" if "_with_reank" in file else ""
        model_key = (embedding, llm, reranking_model, dataset)
        if skip_processed and model_key in processed_models:
            continue

        data = pd.read_excel(os.path.join(directory, file))
        print("processing")
        print(model_key)

        evaluations = []
        for _, row in data.iterrows():
            question = row["question"]
            ideal_answer = row["ideal_answer"]
            llm_generated_answer = row["llm_generated_answer"]
            response = evaluate_answer(question, ideal_answer, llm_generated_answer)
            # evaluate_answer returns None on failure.
            classification = classify_response(response) if response else "error"
            evaluations.append(
                {
                    "embedding_model": embedding,
                    "reranking_model": reranking_model,
                    "llm_model": llm,
                    "dataset": dataset,
                    "question": question,
                    "ideal_answer": ideal_answer,
                    "llm_answer": llm_generated_answer,
                    "classification_and_explanation_text_by_claude": response,
                    "classification": classification,
                }
            )

        if not evaluations:
            # Writing a column-less frame would produce a CSV that read_csv
            # cannot parse on the next run.
            continue

        new_results_df = pd.DataFrame(evaluations).fillna("")
        if os.path.exists(output_file):
            existing_df = pd.read_csv(output_file).fillna("")
            # Existing rows come first, so for a repeated question the row
            # already on disk is the one drop_duplicates keeps.
            combined_df = pd.concat(
                [existing_df, new_results_df], ignore_index=True
            ).drop_duplicates(subset=dedup_columns)
        else:
            combined_df = new_results_df
        combined_df.to_csv(output_file, index=False)
        processed_models.add(model_key)

    print(f"Results saved to {output_file}")

In [29]:
# Grade every matching workbook in the results folder. process_files() calls
# evaluate_answer() once per row, i.e. one Bedrock request per answer.
# NOTE(review): absolute, machine-specific path.
directory_path = "/home/ubuntu/Multi-Agent-LLM-System-with-LangGraph-RAG-and-LangChain/src/my_rag/evaluations/results"  # Replace with your directory path
process_files(directory_path)

rag_evaluations_embedder_mixedbread-ai_mxbai-embed-large-v1_llm_anthropic.claude-3-5-sonnet-20240620-v1_0_dataset_PubMed filtered Dataset.xlsx
processing
('mixedbread-ai_mxbai-embed-large-v1', 'anthropic.claude-3-5-sonnet-20240620-v1_0', '', 'PubMed filtered Dataset')
rag_evaluations_with_reank_embedder_sentence-transformers_all-MiniLM-L6-v2_llm_meta-llama_Meta-Llama-3-8B-Instruct_dataset_PubMed filtered Dataset.xlsx
processing
('sentence-transformers_all-MiniLM-L6-v2', 'meta-llama_Meta-Llama-3-8B-Instruct', 'colbert', 'PubMed filtered Dataset')


KeyboardInterrupt: 

In [24]:
def calculate_classification_percentages(input_csv, output_csv):
    """Summarise grading results as per-class percentages for each model setup.

    Rows are grouped by (embedding_model, llm_model, reranking_model, dataset)
    and, for every group, the share of each ``classification`` value is
    written as a ``<value>_percentage`` column next to the group's row count.

    Args:
        input_csv: Path of the evaluated-results CSV to read.
        output_csv: Path of the summary CSV to write.

    Returns:
        None. The summary is written to ``output_csv`` and a message printed.
    """
    key_columns = ["embedding_model", "llm_model", "reranking_model", "dataset"]

    # Load the evaluated results
    data = pd.read_csv(input_csv)
    # An empty reranking_model is read back as NaN and groupby drops NaN keys
    # by default, which would silently exclude every non-reranked run.
    data[key_columns] = data[key_columns].fillna("")

    # Calculate percentages
    results = []
    for (embedding, llm, reranking_model, dataset), group in data.groupby(key_columns):
        total = len(group)
        counts = group["classification"].value_counts().to_dict()

        result = {
            "embedding_model": embedding,
            "llm_model": llm,
            "reranking_model": reranking_model,
            "dataset": dataset,
            "total": total,
        }
        result.update(
            {f"{key}_percentage": (value / total) * 100 for key, value in counts.items()}
        )
        results.append(result)

    # Convert results to a DataFrame and save to CSV
    results_df = pd.DataFrame(results)
    # A class that never occurs in a group is 0 %, not a missing value.
    percentage_columns = [
        column for column in results_df.columns if str(column).endswith("_percentage")
    ]
    if percentage_columns:
        results_df[percentage_columns] = results_df[percentage_columns].fillna(0.0)
    results_df.to_csv(output_csv, index=False)
    print(f"Classification percentages saved to {output_csv}")


# Example usage: summarise the CSV produced by process_files() into per-class percentages.
# NOTE(review): the input path is absolute and machine-specific.
input_csv = "/home/ubuntu/Multi-Agent-LLM-System-with-LangGraph-RAG-and-LangChain/src/my_rag/evaluations/results/evaluated_results.csv"  # Replace with the path to your evaluated results CSV
# Relative path: the summary is written to the current working directory.
output_csv = (
    "classification_percentages.csv"  # Replace with the desired output file path
)
calculate_classification_percentages(input_csv, output_csv)

Classification percentages saved to classification_percentages.csv
