# Overview
* This notebook was used to review input data, design, build and test components for use in the main tool.

# Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import pandas as pd
import chromadb
from chromadb.config import Settings

import time
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

from utils import safe_load_json

from dotenv import load_dotenv
from claude import call_claude

# Load environment variables from the .env file
load_dotenv()

from base import BASE_DIR

# Data

## Input

In [4]:
from utils import load_data
data = load_data()

In [5]:
db_client = chromadb.PersistentClient(path=f"{BASE_DIR}/embeddings/voiyage-2")

from embeddings import call_vo_embeddings, count_tokens, chunk_text

## Output

In [71]:
test_main = pd.read_parquet(f"{BASE_DIR}/output/main/method-002/data.parquet")
print(test_main.shape)
test_main.head(1).T

(246, 12)


Unnamed: 0,0
document_title,CerenceInc_20191002_8-K_EX-10.4_11827494_EX-10...
question,Highlight the parts (if any) of this contract ...
context,/2/2019\n\n\n\n\n\nparty agrees that such part...
context_embedding,[[-0.01682035 0.04582816 0.06161021 ... 0.0...
answer,specific performance
gt_contexts,[exhibit 10.4\n\nintellectual property agreeme...
gt_context_embedding,"[-0.019412100315093994, 0.014370781369507313, ..."
gt_answer_starts,[3012]
gt_answers,"[INTELLECTUAL PROPERTY AGREEMENT, d]"
gt_is_impossible,False


In [73]:
test_summary = pd.read_parquet(f"{BASE_DIR}/output/chunk_engineering/method-002/summary.parquet")
test_summary.head().T

Unnamed: 0,0,1,2
collection_name,legal_docs_voiyage2_1024,legal_docs_voiyage2_2048,legal_docs_voiyage2_512
Mean Reciprocal Rank (MRR),0.083333,0.458333,0.041667
Precision at 5 (P@5),0.083333,0.458333,0.041667
llm_gt_containment_proportion,0.0,0.041667,0.0
llm_unigram_match,0.458333,0.6875,0.5
llm_bigram_match,0.286458,0.473958,0.348958
llm_trigram_match,0.09375,0.145833,0.145833
llm_precision,0.306744,0.382743,0.258761
llm_recall,0.153882,0.255137,0.162771
llm_f1_score,0.145472,0.255232,0.182372


# RAG

In [None]:
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import json

def get_long_keywords():
    with open(f'{BASE_DIR}/steer/question_keywords_long_answers.json', 'r') as f:
        return json.load(f)

def preprocess_question(text):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stop_words]
    return tokens

def contains_keywords(question, keywords):
    question_tokens = set(preprocess_question(question)) 
    return any(keyword in question_tokens for keyword in keywords)

def retrieve_answer(question, n_results, document_title, collection):
    question_embedding = call_vo_embeddings([question])
    
    results = collection.query(
        query_embeddings=question_embedding.embeddings,
        n_results=n_results,  # Return the closest match based on the number of results
        where={"doc_id": document_title}  # Filter by document title
    )
    
    context = results["documents"][0][0] if results["documents"] else None
    context_embedding = collection.get(ids=results['ids'][0], include=['embeddings'])['embeddings']

    if context:
        print(f"Context tokens: {count_tokens(context)}")
        prompt = f"Your job is to answer legal questions in a succinct manner, providing only the essential information without any unnecessary preamble or additional commentary.\nIf you cannot find the answer or you lack information to answer the question write 'Answer is impossible'.\nHere is the question you need to answer:\n<question>\n{question}\n</question>\n\nContext:\n{context}\n\nPlease provide a direct and concise answer to this question. Focus solely on the most relevant legal information or response.\n\nFormat your response as a JSON object with a single key \"answer\" whose value is an array containing your response as a string. For example:\n\n{{\n\"answer\": ['INTELLECTUAL PROPERTY AGREEMENT']\n}}\Or:\n{{\n\"answer\": ['Answer is impossible.']}}\nOr:\n{{\n\"answer\": ['CERENCE INC.', 'SpinCo', 'Nuance', 'NUANCE COMMUNICATIONS, INC.']}}\nOr:\n{{\n\"answer\": ['September 30, 2019']}}"

        print(f"Prompt + context tokens: {count_tokens(prompt)}")
        answer = call_claude(prompt)
        answer = answer[0].text
        
        return answer, context, context_embedding
    else:
        print(f"No relevant chunk found for document {document_title}.")
        return None, None, None
    
def find_gt_chunk(chunks, answer_starts):
    current_pos = 0
    valid_chunks = []  # To collect all valid chunks for different answer_start

    # Loop through all answer starts
    for answer_start in answer_starts:
        for chunk in chunks:
            chunk_length = len(chunk)

            if answer_start <= chunk_length:
                valid_chunks.append(chunk)  # If answer_start is within this chunk, keep it
                break  # No need to check further chunks for this answer_start
            else:
                current_pos += chunk_length

                if answer_start <= current_pos:
                    valid_chunks.append(chunk)  # Found the chunk with the answer_start
                    break

    return list(set(valid_chunks))  # Return all valid chunks that could match

def run_faq_rag(data, collection_name):
    collection = db_client.get_or_create_collection(name=collection_name)
    print(f"Created ChromaDB collection: {collection_name}")

    dfs = []

    for document in data[:1]:
        document_title = document["title"]  # Store document title for relevant retrieval
        for paragraph in document["paragraphs"][:1]:
            context = paragraph["context"]
            
            token_count = int(collection_name.split('_')[-1])
            chunks = chunk_text(context, max_tokens=token_count)

            for qa in paragraph["qas"][:2]:
                question = qa["question"]
                gt_answers = [answer["text"] for answer in qa["answers"]]
                gt_answer_starts = [answer["answer_start"] for answer in qa["answers"]] 
                gt_is_impossible = qa.get("is_impossible", False)

                if not gt_is_impossible:  # Should we filter for impossible to answer for all
                    gt_contexts = find_gt_chunk(chunks, gt_answer_starts)
                    gt_context_embedding = call_vo_embeddings(gt_contexts).embeddings[0]
                else:
                    gt_contexts = []
                    gt_context_embedding = []

                if contains_keywords(question, get_long_keywords()):
                    n_results = 5  # Retrieve more documents for long-answer questions
                else:
                    n_results = 1  # Retrieve only one document for short-answer questions

                start_time = time.time()
                answer, best_chunk, best_chunk_embedding = retrieve_answer(question, n_results, document_title, collection)
                end_time = time.time()
                time_taken = end_time - start_time

                dfs.append({
                    "document_title": document_title,
                    "question": question,
                    "context": best_chunk,
                    "context_embedding": best_chunk_embedding,
                    "answer": answer,
                    "gt_contexts": gt_contexts,
                    "gt_context_embedding": gt_context_embedding,
                    "gt_answer_starts": gt_answer_starts,
                    "gt_answers": gt_answers,
                    "gt_is_impossible": gt_is_impossible,
                    "time_taken": time_taken
                })
    df = pd.DataFrame(dfs)
    df["collection_name"] = collection_name
    return df

client = chromadb.PersistentClient(path=f"{BASE_DIR}/embeddings/voiyage-2")
collections = client.list_collections()
available_collections = [c.name for c in collections]
if 'legal_docs_voiyage2' in available_collections:
    available_collections.remove('legal_docs_voiyage2')

dfs = []

for collection_name in available_collections:
    df = run_faq_rag(data, collection_name)
    dfs.append(df)

# Output / Eval

# Eval Final

In [None]:
from eval import evaluate_llm_response, fuzzmatch_llm_scoring, evaluate_context_response

def evaluate_rag(df):
    is_possible = df[df.gt_is_impossible == False]
    print(is_possible.shape)
    impossible = df[df.gt_is_impossible == True]
    print(impossible.shape)

    apply_llm_similarity_evaluation(is_possible, 'llm', 'gt_answers', 'answer')
    is_possible = fuzzmatch_llm_scoring(is_possible, threshold=80)
    apply_context_similarity_evaluation(is_possible, 'embeddings', 'gt_contexts', 'context')

    # Group by 'collection_name' and calculate mean for most metrics, but proportion for llm_gt_containment
    summary_df = is_possible.groupby('collection_name').agg({
        'llm_gt_containment': 'mean',  # This will give the proportion of True values
        'llm_unigram_match': 'mean',
        'llm_bigram_match': 'mean',
        'llm_trigram_match': 'mean',
        'llm_precision': 'mean',
        'llm_recall': 'mean',
        'llm_f1_score': 'mean',
        'llm_r1_recall': 'mean',
        'llm_r1_precision': 'mean',
        'llm_r1_f1': 'mean',
        'embeddings_bleu_score': 'mean',
        'embeddings_r2_recall': 'mean',
        'embeddings_r2_precision': 'mean',
        'embeddings_r2_f1': 'mean'
    }).reset_index()

    impossible_score = impossible.groupby('collection_name').agg({
        'answer': lambda x: (x == 'Answer is impossible.').mean()
    }).reset_index().rename(columns={'answer': 'impossible_score'})

    summary_df = pd.merge(summary_df, impossible_score, on='collection_name', how='left')

    summary_df = summary_df.rename(columns={'llm_gt_containment': 'llm_gt_containment_proportion'})

    return summary_df
evaluate_rag(out)

In [34]:
test_main = pd.read_parquet(f"{BASE_DIR}/output/main/test-001/data.parquet")
print(test_main.shape)
# test_main.head(2)

test_summary = pd.read_parquet(f"{BASE_DIR}/output/chunk_engineering/test-001/summary.parquet")
test_summary

(15, 12)


# Chunking Strategy

# Ranking accuracy

In [36]:
from eval import apply_llm_similarity_evaluation, apply_gt_containment_evaluation, fuzzmatch_llm_scoring, apply_context_similarity_evaluation

def mean_reciprocal_rank(df, k=5):
    """
    Calculate Mean Reciprocal Rank (MRR) for the top k retrieved results.
    :param df: DataFrame containing the relevant data (e.g., relevance scores, context).
    :param k: Number of top results to consider.
    :return: MRR score.
    """
    mrr_scores = []
    
    # Ensure that 'gt_relevance' is in the DataFrame passed to this function
    if 'gt_relevance' not in df.columns:
        raise KeyError("'gt_relevance' column is missing from the DataFrame.")
    
    for idx, row in df.iterrows():
        # Since 'gt_relevance' is expected to be an integer, just check its value directly
        relevance = row['gt_relevance']
        
        if relevance == 1:  # If the relevance score is 1, consider it relevant
            mrr_scores.append(1)  # Reciprocal rank for relevant document
        else:
            mrr_scores.append(0)  # If not relevant, reciprocal rank is 0.
    
    return sum(mrr_scores) / len(mrr_scores)

def precision_at_k(df, k=5):
    """
    Calculate Precision at k (P@k) for the top k retrieved results.
    :param df: DataFrame containing the relevant data (e.g., relevance scores, context).
    :param k: Number of top results to consider.
    :return: Precision at k score.
    """
    precision_scores = []
    
    # Ensure that 'gt_relevance' is in the DataFrame passed to this function
    if 'gt_relevance' not in df.columns:
        raise KeyError("'gt_relevance' column is missing from the DataFrame.")
    
    for idx, row in df.iterrows():
        # Get the top k relevance scores for each query (no need to iterate if it's just a binary score)
        top_k_relevance = row['gt_relevance']  # We expect a single value per row, not a list
        precision = top_k_relevance  # Just using the value directly since it's binary
        precision_scores.append(precision)
    
    return sum(precision_scores) / len(precision_scores)

def calculate_gt_relevance(df, threshold):
    """
    Calculate a relevance score based on the BLEU score or other similarity metrics.
    - If the BLEU score or other metric exceeds a threshold, it is marked as relevant (1).
    - Otherwise, it is marked as non-relevant (0).
    """
    # Ensure the 'embeddings_bleu_score' column exists
    if 'embeddings_bleu_score' not in df.columns:
        raise KeyError("'embeddings_bleu_score' column is missing from the DataFrame.")
    
    df['gt_relevance'] = df['embeddings_bleu_score'].apply(lambda x: 1 if x >= threshold else 0)
    return df

def evaluate_rag_with_ranking(df, k=5, threshold=0.1):  # Lower threshold
    is_possible = df[df.gt_is_impossible == False]
    impossible = df[df.gt_is_impossible == True]
    
    # Apply evaluations
    apply_llm_similarity_evaluation(is_possible, 'llm', 'gt_answers', 'answer')
    apply_gt_containment_evaluation(is_possible, 'gt_answers', 'answer')
    is_possible = fuzzmatch_llm_scoring(is_possible, threshold=80)
    apply_context_similarity_evaluation(is_possible, 'embeddings', 'gt_contexts', 'context')

    # Calculate gt_relevance based on the similarity score
    is_possible = calculate_gt_relevance(is_possible, threshold)

    # Initialize an empty list to collect results for each collection
    results = []

    # Group by collection_name to calculate per-collection metrics
    for collection, group in is_possible.groupby('collection_name'):
        # Calculate MRR and Precision at k for this collection
        mrr_score = mean_reciprocal_rank(group, k)
        precision_score = precision_at_k(group, k)

        # Create a dictionary to store the results for this collection
        collection_results = {
            'collection_name': collection,
            'Mean Reciprocal Rank (MRR)': mrr_score,
            'Precision at 5 (P@5)': precision_score,
            'llm_gt_containment_proportion': group['llm_gt_containment'].mean(),
            'llm_unigram_match': group['llm_unigram_match'].mean(),
            'llm_bigram_match': group['llm_bigram_match'].mean(),
            'llm_trigram_match': group['llm_trigram_match'].mean(),
            'llm_precision': group['llm_precision'].mean(),
            'llm_recall': group['llm_recall'].mean(),
            'llm_f1_score': group['llm_f1_score'].mean(),
            'llm_r1_recall': group['llm_r1_recall'].mean(),
            'llm_r1_precision': group['llm_r1_precision'].mean(),
            'llm_r1_f1': group['llm_r1_f1'].mean(),
            'embeddings_bleu_score': group['embeddings_bleu_score'].mean(),
            'embeddings_r2_recall': group['embeddings_r2_recall'].mean(),
            'embeddings_r2_precision': group['embeddings_r2_precision'].mean(),
            'embeddings_r2_f1': group['embeddings_r2_f1'].mean(),
            'time_taken': group['time_taken'].mean(),  # Include time taken for inference
        }

        # Calculate the impossible score for this collection
        impossible_subset = impossible[impossible['collection_name'] == collection]
        
        if not impossible_subset.empty:
            impossible_score = (impossible_subset['answer'] == 'Answer is impossible.').mean()
        else:
            impossible_score = 0.0  # If no impossible answers for this collection
        
        collection_results['impossible_score'] = impossible_score

        # Add this collection's results to the list
        results.append(collection_results)

    # Convert the list of results into a DataFrame
    summary_df = pd.DataFrame(results)

    return summary_df

out = evaluate_rag_with_ranking(test_main)

Average Unigram Match: 40%
Average Bigram Match: 31%
Average Trigram Match: 29%


In [40]:
# out.to_parquet(f"{BASE_DIR}/output/chunk_engineering/test-001/summary.parquet")

## Ground Truth Chunk 

In [65]:
import statistics
from embeddings import chunk_text

def calculate_max_distance(answer_starts):
    # Sort the answer start positions
    sorted_starts = sorted(answer_starts)
    
    # Calculate the differences between consecutive answer starts
    differences = [sorted_starts[i+1] - sorted_starts[i] for i in range(len(sorted_starts) - 1)]
    
    # Return the maximum distance
    return max(differences) if differences else 0  # Handle the case of only one answer

def find_gt_chunk(chunks, answer_starts):
    current_pos = 0
    valid_chunks = []  # To collect all valid chunks for different answer_start

    # Loop through all answer starts
    for answer_start in answer_starts:
        for chunk in chunks:
            chunk_length = len(chunk)

            if answer_start <= chunk_length:
                valid_chunks.append(chunk)  # If answer_start is within this chunk, keep it
                break  # No need to check further chunks for this answer_start
            else:
                current_pos += chunk_length

                if answer_start <= current_pos:
                    valid_chunks.append(chunk)  # Found the chunk with the answer_start
                    break

    return list(set(valid_chunks))  # Return all valid chunks that could match

question_count = 0
answer_count = 0
none_answer_count = 0
answer_lengths = []
max_distances_starts = []
question_token_lengths = []
answer_start_distances = []
question_answer_dict = {}

for document in data:
    document_title = document["title"]  
    for paragraph in document["paragraphs"]:
        context = paragraph["context"]

        token_count = 4096
        chunks = chunk_text(context, max_tokens=token_count)

        for qa in paragraph["qas"]:
            question = qa["question"]
            if len(question) > 3:
                question_count += 1
                question_token_lengths.append(len(question.split()))  # Token length of the question
            else:
                print(question)

            gt_is_impossible = qa.get("is_impossible", False)
            if not gt_is_impossible: 
                # Ensure gt_answers is always a list of strings, even if only one answer exists
                gt_answers = [answer["text"] for answer in qa["answers"]]
                gt_answer_starts = [answer["answer_start"] for answer in qa["answers"]] 
                answer_lengths.append(len(gt_answers))

                if len(gt_answers) > 10:
                    max_distances = calculate_max_distance(gt_answer_starts)
                    max_distances_starts.append(max_distances)

                # Store answers as a list of strings for each question
                question_answer_dict[question] = gt_answers
            
            try:
                valid_chunks = find_gt_chunk(chunks, gt_answer_starts)  
                if valid_chunks:
                    answer_count += 1
                    # Calculate distance from the start of the context to the answer start
                    answer_start_distance = statistics.mean(gt_answer_starts) if gt_answer_starts else 0
                    answer_start_distances.append(answer_start_distance)
                else:
                    if qa['is_impossible'] == False:
                        none_answer_count += 1
            except Exception as e:
                print(f"Error processing answer_start {gt_answer_starts}: {e}")
                none_answer_count += 1

# Statistical Analysis
print("Statistics:")
print(f"Question Count: {question_count}")
print(f"Answer Count: {answer_count}")
print(f"None Answer Count: {none_answer_count}")
print(f"Average Question Token Length: {statistics.mean(question_token_lengths)}")
print(f"Median Answer Length: {statistics.median(answer_lengths)}")
print(f"Max Distances (if > 10 answers): {max_distances_starts}")
print(f"Average Answer Start Distance from Context Start: {statistics.mean(answer_start_distances)}")

Statistics:
Question Count: 820
Answer Count: 820
None Answer Count: 0
Average Question Token Length: 42.02439024390244
Median Answer Length: 1
Max Distances (if > 10 answers): [1661, 2328, 47334, 814]
Average Answer Start Distance from Context Start: 21914.410053258147


## Word Frequency Analysis in Long Answer/Questions

In [120]:
import pandas as pd
pd.options.mode.chained_assignment = None
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import json

questions = list(question_answer_dict.keys())
answers = [' '.join(answer) for answer in question_answer_dict.values()]
qa_df = pd.DataFrame({'question': questions, 'answer': answers})

In [122]:
questions = list(question_answer_dict.keys())
answers = list(question_answer_dict.values())  # Keep answers as lists

# Create a DataFrame for easier manipulation
qa_df = pd.DataFrame({'question': questions, 'answer': answers})
qa_df['answer_length'] = qa_df['answer'].apply(lambda x: len(x))

In [136]:
# Identify top 25% longest answers
long_answers = qa_df['answer_length'].quantile(0.75)
long_answer_df = qa_df[qa_df['answer_length'] > long_answers]
single_answer_df = qa_df[qa_df['answer_length'] == 1]

# Text preprocessing function
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stop_words]
    return tokens

# Apply preprocessing to questions
qa_df['preprocessed_question'] = qa_df['question'].apply(preprocess_text)
long_answer_df['preprocessed_question'] = long_answer_df['question'].apply(preprocess_text)
single_answer_df['preprocessed_question'] = single_answer_df['question'].apply(preprocess_text)

# Word frequency analysis
all_question_words = [word for words in qa_df['preprocessed_question'] for word in words]
long_answer_words = [word for words in long_answer_df['preprocessed_question'] for word in words]
single_answer_words = [word for words in single_answer_df['preprocessed_question'] for word in words]

all_question_freq = Counter(all_question_words)
long_answer_freq = Counter(long_answer_words)
single_answer_freq = Counter(single_answer_words)

# Get unique keywords for questions with long answers not in short answers
unique_question_keywords_long_answers = {word: count for word, count in long_answer_freq.items() if word not in single_answer_freq}
# Get unique keywords for questions with short answers not in long answers
unique_question_keywords_short_answers = {word: count for word, count in single_answer_freq.items() if word not in long_answer_freq}

# Sort the keywords by frequency in descending order
unique_question_keywords_long_answers = sorted(unique_question_keywords_long_answers.items(), key=lambda x: x[1], reverse=True)
unique_question_keywords_long_answers = [w[0] for w in unique_question_keywords_long_answers]
unique_question_keywords_short_answers = sorted(unique_question_keywords_short_answers.items(), key=lambda x: x[1], reverse=True)
unique_question_keywords_short_answers = [w[0] for w in unique_question_keywords_short_answers]

# Write the sorted keywords to JSON files
with open(f'{BASE_DIR}/steer/question_keywords_short_answers.json', 'w') as f:
    json.dump(unique_question_keywords_short_answers, f, indent=4)

with open(f'{BASE_DIR}/steer/question_keywords_long_answers.json', 'w') as f:
    json.dump(unique_question_keywords_long_answers, f, indent=4)

In [None]:
import numpy as np

max_answers = max(answer_lengths)  
avg_answers = np.mean(answer_lengths)  
median_answers = np.median(answer_lengths) 
num_questions = len(answer_lengths)  

# Answer Length Distribution (Histogram)
plt.hist(answer_lengths, bins=range(0, max_answers + 2), edgecolor='black')
plt.title('Distribution of Answer Lengths')
plt.xlabel('Number of Answers')
plt.ylabel('Frequency')
plt.show()

# Eval Discovery

## Text Similarity

In [None]:
from eval import evaluate_response

# IS POSSIBLE

def apply_text_similarity_evaluation(df,  prefix, expected_col, generated_col):
    df[[f'{prefix}_gt_containment', f'{prefix}_precision', f'{prefix}_recall', f'{prefix}_f1_score', f'{prefix}_bleu_score', f'{prefix}_r1_recall', f'{prefix}_r1_precision', f'{prefix}_r1_f1', f'{prefix}_r2_recall', f'{prefix}_r2_precision', f'{prefix}_r2_f1', f'{prefix}_rl_recall', f'{prefix}_rl_precision', f'{prefix}_rl_f1']] = df.apply(
        lambda x: pd.Series(evaluate_response(x[expected_col], x[generated_col])), axis=1
    )
    return df

"""
Where answer is_possible evaluate ground truth answers with answer
"""
apply_text_similarity_evaluation(is_possible, 'llm', 'gt_answers', 'answer')

"""
Where answer is_possible compare similarity of ground truth context with used context
"""
apply_text_similarity_evaluation(is_possible, 'embeddings', 'gt_contexts', 'context')

In [None]:
test = pd.read_parquet(f"{BASE_DIR}/output/chunk_engineering/method-001/summary.parquet")
test

In [None]:
qa_count = 0

for document in data[:1]:
    document_title = document["title"]  # Store document title for relevant retrieval
    for paragraph in document["paragraphs"][:2]:
        context = paragraph["context"]
        
        token_count = int(collection_name.split('_')[-1])
        chunks = chunk_text(context, max_tokens=token_count)

        for qa in paragraph["qas"][:5]:
            print(qa.get("is_impossible", False))
            qa_count+=1

print(qa_count)

## Fuzz match

In [None]:
"""
Where answer is_possible check if the answer is in the context 
"""

from rapidfuzz import fuzz
import re

def match_proportion(gt_answers, generated_answer, threshold=80):
    """Calculate the proportion of ground truth tokens found in the generated answer with fuzzy matching."""
    matches = 0
    
    # Tokenize and clean up gt_answers for partial match flexibility
    tokenized_gt_answers = [re.sub(r"[^\w\s]", "", answer).lower().split() for answer in gt_answers]
    generated_tokens = re.sub(r"[^\w\s]", "", generated_answer).lower().split()

    # For each set of tokens in gt_answers
    for answer_tokens in tokenized_gt_answers:
        token_match_count = 0
        
        for token in answer_tokens:
            # Check if each token from the ground truth answer has a fuzzy match in the generated answer
            match_found = False
            for gen_token in generated_tokens:
                similarity = fuzz.partial_ratio(token, gen_token)
                if similarity >= threshold:
                    token_match_count += 1
                    match_found = True
                    break  # Token matched, no need to check other generated tokens for this token
                
            if not match_found:
                print(f"Debug: '{token}' from '{' '.join(answer_tokens)}' not found in generated answer: {generated_answer}.")
        
        # If all tokens in the ground truth answer have fuzzy matches, count it as a match
        if token_match_count == len(answer_tokens):
            matches += 1
    
    # Calculate the proportion of gt_answers that met the similarity threshold
    return matches / len(gt_answers) if gt_answers else 0

# Calculate the percentage of ground truth answers present in the generated context and answer
is_possible['context_match_perc'] = is_possible.apply(lambda row: match_proportion(row['gt_answers'], row['context']), axis=1)
is_possible['answer_match_perc'] = is_possible.apply(lambda row: match_proportion(row['gt_answers'], row['answer']), axis=1)

# Calculate the overall percentage by averaging across rows
context_match_average = is_possible['context_match_perc'].mean() * 100
answer_match_average = is_possible['answer_match_perc'].mean() * 100

print(f"Average Ground Truth Answer in Generated Context: {context_match_average:.0f}%")
print(f"Average Ground Truth Answer in Generated Answer: {answer_match_average:.0f}%")

## Unigram, Bigram, Trigram Answer FuzzMatching

In [None]:
import re
import numpy as np
from rapidfuzz import fuzz
from sklearn.feature_extraction.text import CountVectorizer

def generate_ngrams(text, n=1):
    """Generate n-grams (unigrams, bigrams, trigrams) from text."""
    text = re.sub(r"[^\w\s]", "", text).lower()
    if not text or text == 'answer is impossible.' or not text.strip():  # Handle empty or invalid text
        return []  # Return empty if text is invalid
    # Create n-grams using CountVectorizer
    vectorizer = CountVectorizer(ngram_range=(n, n), analyzer='word', stop_words='english')
    try:
        # Fit the vectorizer to the text and transform it
        ngrams_matrix = vectorizer.fit_transform([text])
        # Get the list of n-grams (features)
        ngram_list = vectorizer.get_feature_names_out()
        return ngram_list
    except ValueError:
        return []  # Handle empty vocabulary error
    
def match_fuzzy_ngram(gt_answers, generated_answer, ngram_type='unigram', threshold=80):
    """Match fuzzy n-grams from generated_answer against ground truth answers."""
    matches = 0

    # Ensure gt_answers is a list or series and handle accordingly
    if isinstance(gt_answers, str):
        gt_answers = [gt_answers]  # Convert single string to a list
    elif isinstance(gt_answers, np.ndarray) or isinstance(gt_answers, pd.Series):
        gt_answers = gt_answers.tolist()  # Convert array or series to a list

    # Generate n-grams for the generated answer (unigrams, bigrams, trigrams)
    if ngram_type == 'unigram':
        ngrams = generate_ngrams(generated_answer, 1)
    elif ngram_type == 'bigram':
        ngrams = generate_ngrams(generated_answer, 2)
    elif ngram_type == 'trigram':
        ngrams = generate_ngrams(generated_answer, 3)

    # Check if ngrams is empty (for both list and numpy array types)
    if isinstance(ngrams, (list, np.ndarray)) and len(ngrams) == 0:
        return 0  # No matches if no n-grams are found

    for gt_answer in gt_answers:
        match_found = False
        gt_clean = re.sub(r"[^\w\s]", "", gt_answer).lower()

        for ngram in ngrams:
            similarity = fuzz.partial_ratio(gt_clean, ngram)
            if similarity >= threshold:
                match_found = True
                break  # We only need one fuzzy match for this gt_answer

        if match_found:
            matches += 1
        # else: # uncomment for debugging
        #     print(f"The term {gt_answer} was not found in generated {generated_answer}")

    # Check if there are any gt_answers
    if len(gt_answers) > 0:
        return matches / len(gt_answers)
    else:
        return 0

def fuzzmatch_llm_scoring(is_possible, threshold=80):
    """Evaluate match proportions for unigrams, bigrams, and trigrams."""
    # Evaluate for unigrams, bigrams, and trigrams
    is_possible['llm_unigram_match'] = is_possible.apply(lambda row: match_fuzzy_ngram(row['gt_answers'], row['answer'], ngram_type='unigram', threshold=threshold), axis=1)
    is_possible['llm_bigram_match'] = is_possible.apply(lambda row: match_fuzzy_ngram(row['gt_answers'], row['answer'], ngram_type='bigram', threshold=threshold), axis=1)
    is_possible['llm_trigram_match'] = is_possible.apply(lambda row: match_fuzzy_ngram(row['gt_answers'], row['answer'], ngram_type='trigram', threshold=threshold), axis=1)

    # Calculate average match rates
    unigram_match_avg = is_possible['llm_unigram_match'].mean() * 100
    bigram_match_avg = is_possible['llm_bigram_match'].mean() * 100
    trigram_match_avg = is_possible['llm_trigram_match'].mean() * 100

    print(f"Average Unigram Match: {unigram_match_avg:.0f}%")
    print(f"Average Bigram Match: {bigram_match_avg:.0f}%")
    print(f"Average Trigram Match: {trigram_match_avg:.0f}%")
    
    return is_possible

out = pd.read_parquet(f"{BASE_DIR}/output/main/method-001/data.parquet")
is_possible = out[out.gt_is_impossible == False]
fuzzmatch_llm_scoring(is_possible, threshold=80)

In [None]:
from rapidfuzz import fuzz
from sklearn.feature_extraction.text import CountVectorizer
import re

def generate_ngrams(text, n=1):
    """Generate n-grams (unigrams, bigrams, trigrams) from text."""
    # Clean up text
    text = re.sub(r"[^\w\s]", "", text).lower()
    # Create n-grams using CountVectorizer
    vectorizer = CountVectorizer(ngram_range=(n, n), analyzer='word')
    ngrams = vectorizer.fit_transform([text])
    ngram_list = vectorizer.get_feature_names_out()
    return ngram_list

def match_fuzzy_ngram(gt_answers, generated_answer, ngram_type='unigram', threshold=80):
    """Match fuzzy n-grams from generated_answer against ground truth answers."""
    matches = 0

    # Generate n-grams for the generated answer (unigrams, bigrams, trigrams)
    if ngram_type == 'unigram':
        ngrams = generate_ngrams(generated_answer, 1)
    elif ngram_type == 'bigram':
        ngrams = generate_ngrams(generated_answer, 2)
    elif ngram_type == 'trigram':
        ngrams = generate_ngrams(generated_answer, 3)

    for gt_answer in gt_answers:
        match_found = False
        # Clean and process each gt_answer for comparison
        gt_clean = re.sub(r"[^\w\s]", "", gt_answer).lower()

        # Check each n-gram in generated answer
        for ngram in ngrams:
            similarity = fuzz.partial_ratio(gt_clean, ngram)
            if similarity >= threshold:
                match_found = True
                break  # We only need one fuzzy match for this gt_answer

        if match_found:
            matches += 1
        else:
            print(f"Debug: '{gt_answer}' did not match any n-gram in generated answer: {generated_answer}.")
    
    return matches / len(gt_answers) if gt_answers else 0


def evaluate_matches(is_possible, threshold=80):
    """Evaluate match proportions for unigrams, bigrams, and trigrams."""
    # Evaluate for unigrams, bigrams, and trigrams
    is_possible['answer_unigram_match'] = is_possible.apply(lambda row: match_fuzzy_ngram(row['gt_answers'], row['answer'], ngram_type='unigram', threshold=threshold), axis=1)
    is_possible['answer_bigram_match'] = is_possible.apply(lambda row: match_fuzzy_ngram(row['gt_answers'], row['answer'], ngram_type='bigram', threshold=threshold), axis=1)
    is_possible['answer_trigram_match'] = is_possible.apply(lambda row: match_fuzzy_ngram(row['gt_answers'], row['answer'], ngram_type='trigram', threshold=threshold), axis=1)

    # Calculate average match rates
    unigram_match_avg = is_possible['answer_unigram_match'].mean() * 100
    bigram_match_avg = is_possible['answer_bigram_match'].mean() * 100
    trigram_match_avg = is_possible['answer_trigram_match'].mean() * 100

    print(f"Average Unigram Match: {unigram_match_avg:.0f}%")
    print(f"Average Bigram Match: {bigram_match_avg:.0f}%")
    print(f"Average Trigram Match: {trigram_match_avg:.0f}%")
    
    return is_possible


# Assuming 'is_possible' is your DataFrame
is_possible = evaluate_matches(is_possible, threshold=80)

# Impossible

In [None]:
# IMPOSSIBLE
"""
Where answer impossible evaluate that responses negate providing an answer
"""

impossible_accuracy = impossible[impossible.answer=='Answer is impossible.'].answer.count()/impossible.shape[0]
print(f"Accuracy in not answering impossible questions: {impossible_accuracy*100:.0f}%")

## Eval Summary

In [314]:
# Group by 'collection_name' and calculate the metrics
grouped = is_possible.groupby('collection_name')

# Initialize a dictionary to store results
additional_results = {
    'collection_name': [],
    'gt_answer_in_generated_context': [],
    'gt_answer_in_generated_answer': [],
    'accuracy_impossible': []
}

# Iterate through the grouped data
for collection_name, group in grouped:
    # Calculate ground truth in context
    group['correct_context'] = group.apply(lambda row: check_in_truth(row['gt_answers'], row['context']), axis=1)
    context_accuracy = group[group.correct_context == True].question.count() / group.shape[0]
    
    # Calculate ground truth in generated answer
    group['correct_answer'] = group.apply(lambda row: check_in_truth(row['gt_answers'], row['answer']), axis=1)
    answer_accuracy = group[group.correct_answer == True].question.count() / group.shape[0]
    
    # Calculate impossible accuracy
    impossible_group = group[group['answer'] == 'Answer is impossible.']
    impossible_accuracy = impossible_group.shape[0] / group.shape[0]
    
    # Append results to the dictionary
    additional_results['collection_name'].append(collection_name)
    additional_results['gt_answer_in_generated_context'].append(context_accuracy)
    additional_results['gt_answer_in_generated_answer'].append(answer_accuracy)
    additional_results['accuracy_impossible'].append(impossible_accuracy)

# Convert the results into a DataFrame
additional_df = pd.DataFrame(additional_results)

- ROUGE-1 (Unigrams): Evaluating keyword extraction or basic content overlap (e.g., detecting whether important terms are captured).
- ROUGE-2 (Bigrams): More sensitive to phrase overlaps and used in tasks like summarization where a model should understand multi-word concepts.
- ROUGE-L (LCS): Used when the preservation of order and structure is important, like in machine translation, sentence generation, or document summarization.
- ROUGE-W (Weighted LCS): When certain sections of the reference text (such as key information or longer subsequences) are more important than others, like in specialized summarization tasks.

In [None]:
# Group by 'collection_name' and calculate mean for most metrics, but proportion for llm_gt_containment
summary_df = is_possible.groupby('collection_name').agg({
    # 'llm_gt_containment': 'mean',  # This will give the proportion of True values
    'llm_precision': 'mean',
    'llm_recall': 'mean',
    'llm_f1_score': 'mean',
    'llm_r1_recall': 'mean',
    'llm_r1_precision': 'mean',
    'llm_r1_f1': 'mean',
    # 'embeddings_bleu_score': 'mean',
    # 'embeddings_r2_recall': 'mean',
    # 'embeddings_r2_precision': 'mean',
    # 'embeddings_r2_f1': 'mean'
}).reset_index()

# Rename llm_gt_containment to indicate it’s a proportion
summary_df = summary_df.rename(columns={'llm_gt_containment': 'llm_gt_containment_proportion'})

# Merge the additional_df with the existing summary_df (assuming summary_df is defined)
merged_df = pd.merge(summary_df, additional_df, on='collection_name')

merged_df

In [None]:
# Review answers

example =3

marked = is_possible.correct_answer.values[example]
generated = is_possible.answer.values[example]
expected = is_possible.gt_answers.values[example]

print("Marked answer:")
print(marked)
print("Generated Answer:")
print(generated)
print("Ground truth Answer:")
print(expected)


In [None]:
len(is_possible.context.values[0])

# Validating docs

In [None]:
temp = out[out.question.str.contains("Document Name")]
for idx, row in temp.iterrows():
    # Print the last 100 characters of each context for clarity
    print(f"Collection: {row['collection_name']}")
    print(f"GT Context: {row['gt_contexts'][-100:]}... -> Context: {row['context'][-100:]}")


In [None]:
out[['collection_name','llm_gt_containment']]

In [112]:
# import numpy as np

# def generate_fake_question_embedding(dim=1024):
#     return np.random.rand(dim)  # Fake random embedding

# results = collection.query(
#     query_embeddings=generate_fake_question_embedding(),
#     n_results=1,  # Return the closest match
#     where={"doc_id": document_title}  # Filter by document title
# )

# # Extract the best chunk text and its stored embedding from ChromaDB results
# best_chunk = results["documents"][0][0] if results["documents"] else None
# best_chunk_embeddings = collection.get(ids=results['ids'][0], include=['embeddings'])['embeddings']

In [None]:
# view examples

example = 1
# print("ground truth")
# print(final_df['gt_context'].values[example][:100])
# print("best chunk")
# print(final_df['context'].values[example][:100])

print("ground truth")
print(out['gt_answers'].values[example][:100])
print("answer")
print(out['answer'].values[example][:100])

In [21]:
# # view examples

# example = 0
# # print("ground truth")
# # print(final_df['gt_context'].values[example][:100])
# # print("best chunk")
# # print(final_df['context'].values[example][:100])

# print("ground truth")
# print(final_df['gt_answers'].values[example][:100])
# print("answer")
# print(final_df['answer'].values[example][:100])

In [66]:
test_summary

Unnamed: 0,collection_name,llm_gt_containment_proportion,llm_unigram_match,llm_bigram_match,llm_trigram_match,llm_precision,llm_recall,llm_f1_score,llm_r1_recall,llm_r1_precision,llm_r1_f1,embeddings_bleu_score,embeddings_r2_recall,embeddings_r2_precision,embeddings_r2_f1,impossible_score
0,legal_docs_voiyage2_1024,0.25,0.4375,0.1875,0.1875,0.352823,0.341398,0.24852,0.006944,0.041667,0.011905,0.083991,0.058279,0.058675,0.057936,1.0
1,legal_docs_voiyage2_2048,0.5,0.4375,0.4375,0.375,0.423077,0.528898,0.39003,0.177083,0.166667,0.159524,0.108295,0.137127,0.085142,0.104801,1.0
2,legal_docs_voiyage2_512,0.25,0.3125,0.3125,0.3125,0.381757,0.362231,0.323533,0.166667,0.125,0.142857,0.094424,0.052529,0.04927,0.050125,1.0


In [None]:
from eval import evaluate_response, evaluate_bleu

"""
ROUGE Variants:

You have three ROUGE variants: ROUGE-1, ROUGE-2, and ROUGE-L. Each focuses on different aspects of text similarity.

ROUGE-1 (Unigram-based):
Evaluates the overlap of individual words between the generated and reference texts.
ROUGE-2 (Bigram-based):
Assesses the overlap of pairs of consecutive words (bigrams).
ROUGE-L (Longest Common Subsequence):
Measures the longest common subsequence of words, emphasizing fluency and word order.

Metrics within Each ROUGE Variant:

R (Recall): Measures the proportion of reference text's units (unigrams, bigrams, or longest common subsequence) found in the generated text.
Higher is Better (up to 1.0)
P (Precision): Calculates the proportion of generated text's units that are also found in the reference text.
Higher is Better (up to 1.0)
F (F1 Score): The harmonic mean of Precision and Recall, offering a balanced measure.
Higher is Better (up to 1.0)
"""

evaluate_response(final_df['gt_context'].values[0],  final_df['context'].values[0])
# evaluate_bleu(final_df['gt_context'].values[0],  final_df['context'].values[0])


In [None]:
final_df.gt_context

In [None]:
final_df.context.values

In [None]:
# Filter for columns that contain scored data (e.g., metrics or embeddings)
score_cols = [c for c in evaluated_embeddings.columns if c.startswith('embeddings')]
mean_scores = evaluated_embeddings[score_cols].mean()

mean_scores

In [None]:
# Filter for columns that contain scored data (e.g., metrics or embeddings)
score_cols = [c for c in evaluated_embeddings.columns if c.startswith('llm')]
mean_scores = evaluated_answers[score_cols]
# .mean()

mean_scores

In [None]:
final_df

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np

# Load a sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings
final_df['context_embedding'] = final_df['context_used'].apply(lambda x: model.encode(x))
final_df['gt_context_embedding'] = final_df['gt_context'].apply(lambda x: model.encode(x))

# Calculate cosine similarity
final_df['cosine_similarity'] = final_df.apply(
    lambda x: cosine_similarity(
        np.array(x['context_used_embedding']).reshape(1, -1),
        np.array(x['gt_context_embedding']).reshape(1, -1)
    )[0][0], axis=1
)
cosine_similarity_mean = final_df['cosine_similarity'].mean()
print(f"Mean Cosine Similarity: {cosine_similarity_mean}")

In [None]:
final_df[['question','answer','gt_answers']].values

In [None]:
# Function to retrieve answer using ChromaDB
def retrieve_answer(question, document_title):
    # Generate embedding for the question
    question_embedding = call_vo_embeddings([question])
    
    # Query ChromaDB for the most similar chunk, filtering by document title
    results = collection.query(
        query_embeddings=question_embedding.embeddings,
        n_results=1,  # Return the closest match
       where={"doc_id": document_title}  # Filter by documents by title
    )
    
    # Extract the best chunk text from ChromaDB results
    best_chunk = results["documents"][0][0] if results["documents"] else None

    if best_chunk:
        print(f"Current tokens: {count_tokens(best_chunk)}")
        
        # Construct the prompt for the LLM, including the question and the most relevant chunk
        prompt = f"Question: {question}\nContext: {best_chunk}\nAnswer:"
        
        temperature = 0.1
        max_tokens = 7000

        # Generate the answer from the LLM using the constructed prompt
        answer = call_claude(prompt, max_tokens, temperature)
        answer = answer[0].text
        
        return answer, best_chunk
    else:
        print(f"No relevant chunk found for document {document_title}.")
        return None, None

dfs = []

for document in data[:1]:
    for paragraph in document["paragraphs"][:1]:
        for qa in paragraph["qas"][:2]:
            question = qa["question"]
            document_title = document["title"]  # Pass document title to ensure you query the correct paper

            print(f"Document Title: {document_title}")

            answer, best_chunk = retrieve_answer(question, document_title)
            
            if answer and best_chunk:
                print(f"Question: {question}")
                print(f"Context: {best_chunk}")
                print("Answer:", answer)
            
                # Optionally, store the question and answer (e.g., in a dataframe or list)
                dfs.append({"document_title": document_title,"question": question, "context": best_chunk, "answer": answer, ground_truth})


In [None]:
for document in data[:2]:
    for paragraph in document["paragraphs"][:2]:
        for qa in paragraph["qas"][:2]:
            print(qa)

In [None]:
for document in data[:2]:
    for paragraph in document["paragraphs"][:2]:
        print(paragraph.keys())

In [39]:
df = pd.DataFrame(dfs)

In [None]:
df.values

In [None]:
document_title = "CerenceInc_20191002_8-K_EX-10.4_11827494_EX-10.4_Intellectual Property Agreement"
dummy_embedding = [0.0] * 1024  # This should match the dimension of your embeddings

# Query ChromaDB, filtering by document title (metadata.doc_id)
results = collection.query(
    query_embeddings=[dummy_embedding],  # You can use a dummy embedding here
    n_results=2,  # Limit to one result for testing
    where={"doc_id": document_title}
)

# Print the results for debugging
if results['documents']:
    print(f"Found relevant chunk: {results['documents'][0]}")
else:
    print("No matching chunks found.")

In [None]:
results

In [None]:
from transformers import AutoTokenizer
import chromadb

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Define a preprocessing function
def preprocess_text(text):
    # Tokenize and convert to lowercase
    tokens = tokenizer.tokenize(text.lower())
    return tokens

# Define a function to chunk text based on token length
def chunk_text(text, max_tokens=512):
    # Tokenize text
    tokens = preprocess_text(text)
    chunks = []
    
    # Ensure we split tokens into chunks of max_tokens size
    for i in range(0, len(tokens), max_tokens):
        chunk = tokens[i:i + max_tokens-5]
        
        # Reconstruct the chunked text from tokens
        chunk_text = tokenizer.convert_tokens_to_string(chunk)

        chunked_token_length = len(tokenizer.tokenize(chunk_text))
        if chunked_token_length > max_tokens:
            print(chunked_token_length)
        
        chunks.append(chunk_text)
    
    return chunks

# Initialize ChromaDB client
client = chromadb.Client(Settings())

# Create a collection (like a table in a database)
collection_name = "legal_docs"
try:
    collection = client.create_collection(name=collection_name)
except Exception as e:
    collection = client.get_collection(name=collection_name)
    print(f"Collection '{collection_name}' already exists. Using the existing collection.")

MAX_TOKENS = 512 - max_question_length

# Populate ChromaDB with document chunks and embeddings
for doc_id, document in enumerate(data):
    for paragraph in document["paragraphs"]:
        context = paragraph["context"]
        chunked_text = chunk_text(context, MAX_TOKENS)  # Tokenize and chunk context text
        
        for i, chunk in enumerate(chunked_text):
            # Ensure the chunk ID is unique for each chunk
            if i < len(paragraph["qas"]):
                qa_id = paragraph["qas"][i]["id"]
            else:
                qa_id = f"{document['title']}_chunk_{i}"  # Ensure uniqueness per chunk

            # Get embeddings for the chunk
            embedding = embedding_model.encode(chunk)

            # Add the chunk and its embedding to ChromaDB
            print(f"Adding {qa_id} to collection.")
            collection.add(
                ids=[qa_id],  # Use the provided ID for each chunk
                documents=[chunk],
                metadatas=[{"doc_id": document["title"], "chunk_id": qa_id}],
                embeddings=[embedding]
            )

In [None]:
# Function to retrieve answer using ChromaDB
def retrieve_answer(question):
    # Generate embedding for the question
    question_embedding = call_vo_embeddings([question])
    
    # Query ChromaDB for the most similar chunk
    results = collection.query(
        query_embeddings=question_embedding.embeddings,
        n_results=1  # Return the closest match
    )
    
    # Extract the best chunk text from ChromaDB results
    best_chunk = results["documents"][0][0]

    print(f"Current tokens: {count_tokens(best_chunk)}")
    
    # Construct the prompt for the LLM, including the question and the most relevant chunk
    prompt = f"Question: {question}\nContext: {best_chunk}\nAnswer:"
    
    temperature = 0.1
    max_tokens = 7000

    # Generate the answer from the LLM using the constructed prompt
    answer = call_claude(prompt, max_tokens, temperature)
    
    return answer, best_chunk

# List to store answers (optional, for later use)
dfs = []

# Answer questions from JSON
for document in data[:1]:
    for paragraph in document["paragraphs"][:1]:
        for qa in paragraph["qas"][:1]:
            question = qa["question"]
            answer, best_chunk = retrieve_answer(question)
            print(f"Question: {question}")
            print(f"Context: {best_chunk}")
            print("Answer:", answer)
            
            # Optionally, store the question and answer (e.g., in a dataframe or list)
            dfs.append({"question": question, "context": best_chunk, "answer": answer})

# df = pd.concat()

In [None]:
for document in data[:1]:
    for paragraph in document["paragraphs"][:1]:
        for qa in paragraph["qas"][:10]:
            print(qa)

In [None]:
for document in data[:2]:
    print( document["paragraphs"])