In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import animation
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from umap import UMAP

def create_animated_umap(embeddings, labels, n_neighbors=15, min_dist=0.1, n_components=3, metric='cosine', n_frames=100):
    # Normalize the embeddings
    scaler = StandardScaler()
    scaled_embeddings = scaler.fit_transform(embeddings)

    # Create UMAP reducer
    reducer = UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, metric=metric)

    # Fit UMAP
    umap_embeddings = reducer.fit_transform(scaled_embeddings)

    # Create interpolation between initial state and final UMAP embedding
    initial_state = scaled_embeddings[:, :n_components]  # Use first n_components of scaled embeddings
    embedding_list = [
        initial_state + (umap_embeddings - initial_state) * (i / (n_frames - 1))
        for i in range(n_frames)
    ]

    # Set up the figure and 3D axis
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')

    # Set style and color palette
    REMOVED_SECRET("dark_background")
    color_palette = sns.color_palette("plasma", n_colors=len(np.unique(labels)))

    # Create scatter plot
    scatter = ax.scatter(
        embedding_list[0][:, 0],
        embedding_list[0][:, 1],
        embedding_list[0][:, 2],
        c=labels,
        cmap=REMOVED_SECRET("plasma"),
        s=20
    )
    
    # Add title and labels
    title = ax.set_title("UMAP Embedding Progress (Frame 0)", fontsize=16, color='cyan')
    ax.set_xlabel("UMAP1", fontsize=12, color='magenta')
    ax.set_ylabel("UMAP2", fontsize=12, color='magenta')
    ax.set_zlabel("UMAP3", fontsize=12, color='magenta')

    # Add a color bar
    cbar = fig.colorbar(scatter, ax=ax, pad=0.1)
    cbar.set_label("Document Clusters", fontsize=12, color='yellow')

    # Set consistent axis limits
    all_embeddings = np.vstack(embedding_list)
    ax.set_xlim(all_embeddings[:, 0].min(), all_embeddings[:, 0].max())
    ax.set_ylim(all_embeddings[:, 1].min(), all_embeddings[:, 1].max())
    ax.set_zlim(all_embeddings[:, 2].min(), all_embeddings[:, 2].max())

    # Animation update function
    def update(frame):
        title.set_text(f"UMAP Embedding Progress (Frame {frame})")
        scatter._offsets3d = (embedding_list[frame][:, 0], 
                              embedding_list[frame][:, 1], 
                              embedding_list[frame][:, 2])
        return scatter, title

    # Create animation
    anim = animation.FuncAnimation(fig, update, frames=n_frames, interval=50, blit=False)

    # Save animation
    anim.save('umap_progress_animation.gif', writer='pillow', fps=30)

    plt.close(fig)

    print("UMAP progress animation saved as 'umap_progress_animation.gif'")



In [4]:

import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from rouge_score import rouge_scorer
from REMOVED_SECRET import sentence_bleu, SmoothingFunction
from bert_score import BERTScorer
import os
import re
import fitz  # PyMuPDF
from collections import Counter
from difflib import SequenceMatcher
import numpy as np
from sklearn.cluster import KMeans
from collections import defaultdict
from REMOVED_SECRET import cosine_similarity
from RAG_UTILS import RAGSystem, EMBEDDING_MODEL_NAME, MODEL_ID, RERANKER_MODEL

MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
RERANKER_MODEL = None
NUM_RETRIEVED_DOCS = 5

def clean_text(text: str) -> str:
    """Clean and normalize text."""
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove non-alphanumeric characters
    return text.lower().strip()

def is_valid_answer(answer: str) -> bool:
    """Check if an answer is valid."""
    cleaned = clean_text(answer)
    return len(cleaned) > 1 and not cleaned.isdigit()  # Adjust criteria as needed

def extract_data_from_pdf(pdf_path):
    """
    Extract question, context, and answer from a PDF file using PyMuPDF.
    """
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    
    # Regex patterns to extract information
    question_pattern = r"Question: (.*?)\n"
    context_pattern = r"Context:(.*?)(?=Question:|Answer:|$)"
    answer_pattern = r"Answer: (.*?)(?=Context:|$)"

    questions = re.findall(question_pattern, text)
    contexts = re.findall(context_pattern, text, re.DOTALL)
    answers = re.findall(answer_pattern, text, re.DOTALL)

    # Clean and pair the extracted data
    evaluation_data = []
    for q, c, a in zip(questions, contexts, answers):
        evaluation_data.append({
            "question": q.strip(),
            "context": c.strip(),
            "ground_truth": a.strip()
        })

    return evaluation_data

def calculate_perplexity(logits, input_ids):
    # Ensure logits and input_ids have the same sequence length
    seq_len = min(logits.size(1), input_ids.size(1))
    logits = logits[:, :seq_len, :]
    input_ids = input_ids[:, :seq_len]
    
    # Calculate loss
    loss_fct = REMOVED_SECRET(ignore_index=-100, reduction='none')
    loss = loss_fct(logits.view(-1, logits.size(-1)), input_ids.view(-1))
    
    # Calculate perplexity
    return torch.exp(loss.mean())

def calculate_retrieval_accuracy(retrieved_docs, ground_truth_context, k=1):
    def preprocess_text(text):
        return ' '.join(text.lower().split())

    ground_truth_context = preprocess_text(ground_truth_context)
    
    relevant_docs = 0
    for doc in retrieved_docs[:k]:
        # Check if doc is a string or an object with page_content attribute
        if isinstance(doc, str):
            doc_text = preprocess_text(doc)
        else:
            doc_text = preprocess_text(doc.page_content)
        
        # Check for significant overlap
        similarity = SequenceMatcher(None, ground_truth_context, doc_text).ratio()
        if similarity > 0.5:  
            relevant_docs += 1
            break  # Stop after finding the first relevant document

    return relevant_docs / k

def calculate_bleu_score(reference, hypothesis):
    smoothie = SmoothingFunction().method4
    return sentence_bleu([reference.split()], hypothesis.split(), 
                         weights=(0.25, 0.25, 0.25, 0.25), 
                         smoothing_function=smoothie)

def normalize_answer(text):
    """
    Normalize answer text while preserving important punctuation and structure.
    """
    # Convert to lowercase
    text = text.lower()
    
    # Replace multiple whitespace characters with a single space
    text = re.sub(r'\s+', ' ', text)
    
    # Remove leading/trailing whitespace
    text = text.strip()
    
    # Preserve common punctuation that might be important for context
    text = re.sub(r'[^a-z0-9\s.,;:()"-]', '', text)
    
    # Normalize some common variations
    text = text.replace(' , ', ', ').replace(' . ', '. ')
    text = text.replace('( ', '(').replace(' )', ')')
    
    # Remove spaces before punctuation
    text = re.sub(r'\s([.,;:])', r'\1', text)
    
    return text


def exact_match_score(prediction, ground_truth):
    return int(normalize_answer(prediction) == normalize_answer(ground_truth))

def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return f1

def evaluate_rag_system(rag_system, evaluation_data, pdf_folder_path):
    results = []
    rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    
    # Load and process documents
    raw_documents = REMOVED_SECRET(pdf_folder_path)
    processed_documents = REMOVED_SECRET(raw_documents)

    # Build vector database
    knowledge_index = rag_system.build_vector_database(processed_documents)
    
    for sample in tqdm(evaluation_data, desc="Evaluating samples"):
        question = sample['question']
        ground_truth = normalize_answer(sample['ground_truth'])
        context = sample['context']
        
        print(f"\n\nQuestion: {question}")
        print(f"Ground Truth: {ground_truth}")
        print(f"Original Context: {context[:200]}...")  # Print first 200 characters of context
        
        if not is_valid_answer(ground_truth):
            print(f"Warning: Invalid ground truth for question: {question}")
            continue
        
        # Get RAG system's answer and relevant documents
        answer, relevant_docs, logits = rag_system.answer_with_rag(question, knowledge_index)
        answer = normalize_answer(answer)

        # Calculate similarity scores
        question_embedding = REMOVED_SECRET(question)
        doc_embeddings = REMOVED_SECRET([doc.page_content if hasattr(doc, 'page_content') else str(doc) for doc in relevant_docs])
        similarity_scores = cosine_similarity([question_embedding], doc_embeddings)[0]
        
        print(f"Generated Answer: {answer}")
        print("Retrieved Documents:")
        for i, doc in enumerate(relevant_docs[:3], 1):  # Print top 3 retrieved documents
            print(f"Doc {i}: {doc[:200]}...")  # Print first 200 characters of each document
        
        # Calculate metrics
        bleu_score = calculate_bleu_score(ground_truth, answer)
        rouge_scores = rouge_scorer_instance.score(ground_truth, answer)
        retrieval_accuracy = calculate_retrieval_accuracy(relevant_docs, context)
        
        # BERT Score
        _, _, bert_f1 = bert_scorer.score([answer], [ground_truth])
        
        # Perplexity calculation
        input_ids = REMOVED_SECRET.encode(question + answer, return_tensors="pt").to(logits.device)
        perplexity = calculate_perplexity(logits, input_ids)
        
        # Additional metrics
        exact_match = exact_match_score(answer, ground_truth)
        f1 = f1_score(answer, ground_truth)
        
        print(f"BLEU Score: {bleu_score:.4f}")
        print(f"ROUGE-L F1: {rouge_scores['rougeL'].fmeasure:.4f}")
        print(f"Retrieval Accuracy: {retrieval_accuracy:.4f}")
        print(f"BERT Score: {bert_f1.item():.4f}")
        print(f"Exact Match: {exact_match}")
        print(f"F1 Score: {f1:.4f}")
        
        results.append({
            'question': question,
            'ground_truth': ground_truth,
            'generated_answer': answer,
            'bleu_score': bleu_score,
            'rouge1': rouge_scores['rouge1'].fmeasure,
            'rouge2': rouge_scores['rouge2'].fmeasure,
            'rougeL': rouge_scores['rougeL'].fmeasure,
            'retrieval_accuracy': retrieval_accuracy,
            'bert_score': bert_f1.item(),
            'perplexity': perplexity.item(),
            'exact_match': exact_match,
            'f1_score': f1,
            'relevant_docs': relevant_docs,
            'similarity_scores': similarity_scores.tolist()
        })
    
    return pd.DataFrame(results)
# Function to plot distribution of scores
def plot_score_distribution(data, score_name):
    plt.figure(figsize=(10, 6))
    sns.histplot(data[score_name], kde=True)
    plt.title(f'Distribution of {score_name}')
    plt.xlabel(score_name)
    plt.ylabel('Frequency')
    plt.show()

# Function for error analysis
def error_analysis(results):
    # Calculate absolute difference between BLEU score and 1
    results['bleu_error'] = 1 - results['bleu_score']
    
    # Sort by error and get top 10 worst predictions
    worst_predictions = results.sort_values('bleu_error', ascending=False).head(10)
    
    print("Top 10 Worst Predictions:")
    for _, row in worst_predictions.iterrows():
        print(f"Question: {row['question']}")
        print(f"Ground Truth: {row['ground_truth']}")
        print(f"Generated Answer: {row['generated_answer']}")
        print(f"BLEU Score: {row['bleu_score']}")
        print(f"F1 Score: {row['f1_score']}")
        print(f"Exact Match: {row['exact_match']}")
        print("--------------------")

    # Analyze error patterns
    error_patterns = {
        'short_answer': (results['generated_answer'].str.split().str.len() < 5).sum(),
        'long_answer': (results['generated_answer'].str.split().str.len() > 50).sum(),
        'low_bleu': (results['bleu_score'] < 0.1).sum(),
        'low_f1': (results['f1_score'] < 0.5).sum(),
        'no_exact_match': (results['exact_match'] == 0).sum()
    }
    
    print("\nError Patterns:")
    for pattern, count in error_patterns.items():
        print(f"{pattern}: {count}")


def prepare_umap_data(rag_system, pdf_directory):
    # Load and process documents
    raw_documents = REMOVED_SECRET(pdf_directory)
    processed_documents = REMOVED_SECRET(raw_documents)

    # Generate embeddings
    embeddings = REMOVED_SECRET([doc.page_content for doc in processed_documents])

    # Convert embeddings to numpy array
    embeddings_array = np.array(embeddings)

    # Create labels based on document sources
    source_to_label = defaultdict(lambda: len(source_to_label))
    labels = np.array([source_to_label[REMOVED_SECRET('source', 'unknown')] for doc in processed_documents])


    return embeddings_array, labels, processed_documents



# Main execution
if __name__ == "__main__":
    # Initialize RAG system
    rag_system = RAGSystem(
        embedding_model_name=EMBEDDING_MODEL_NAME,
        model_id=MODEL_ID,
        reranker_model=RERANKER_MODEL,
    )

    # Extract data from all PDFs in a directory
    pdf_directory = "local_database"  

    # Load and process documents
    raw_documents = REMOVED_SECRET(pdf_directory)
    processed_documents = REMOVED_SECRET(raw_documents)
 
    # Prepare data for UMAP visualization
    embeddings, labels, processed_documents = prepare_umap_data(rag_system, pdf_directory)
    

    # Create UMAP visualization
    create_animated_umap(embeddings, labels)
 
    all_evaluation_data = []
    for filename in os.listdir(pdf_directory):
        if filename.endswith(".pdf"):
            pdf_path = REMOVED_SECRET(pdf_directory, filename)
            all_evaluation_data.extend(extract_data_from_pdf(pdf_path))

    # Run evaluation
    evaluation_results = evaluate_rag_system(rag_system, all_evaluation_data, pdf_directory)

    # After evaluation, create visualizations for each result
    for i, result in enumerate(evaluation_results.itertuples()):
        question = result.question
        answer = result.generated_answer
        relevant_docs = result.relevant_docs  
        similarity_scores = result.similarity_scores 



    # Display some information about the UMAP visualization
    print(f"UMAP visualization created with {len(embeddings)} document chunks.")
    print(f"Number of unique labels: {len(np.unique(labels))}")
    print("Label distribution:")
    for label, count in zip(*np.unique(labels, return_counts=True)):
        print(f"  Label {label}: {count} chunks")
        
    # Display results
    print(evaluation_results.describe())

    # Calculate average scores for numeric columns only
    numeric_columns = evaluation_results.select_dtypes(include=[np.number]).columns
    average_scores = evaluation_results[numeric_columns].mean()
    print("\nAverage Scores:")
    print(average_scores)

    # Plot distributions for numeric columns
    for metric in numeric_columns:
        plot_score_distribution(evaluation_results, metric)

    # Correlation heatmap for numeric columns
    plt.figure(figsize=(12, 10))
    sns.heatmap(evaluation_results[numeric_columns].corr(), annot=True, cmap='coolwarm')
    plt.title('Correlation Heatmap of Evaluation Metrics')
    plt.show()

    # Error analysis
    error_analysis(evaluation_results)

    # Display generated answers
    print("\nGenerated Answers:")
    for _, row in evaluation_results.iterrows():
        print(f"Question: {row['question']}")
        print(f"Ground Truth: {row['ground_truth']}")
        print(f"Generated Answer: {row['generated_answer']}")
        print(f"BLEU Score: {row['bleu_score']:.4f}")
        print(f"F1 Score: {row['f1_score']:.4f}")
        print(f"Exact Match: {row['exact_match']}")
        print("--------------------")

    # Clear memory
    rag_system.clear_memory()

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <REMOVED_SECRET object at 0x7fd5c2b838b0>>
Traceback (most recent call last):
  File "/home/obb/codes/langers/.venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device device because they were offloaded to the cpu.
Loading PDFs: 100%|██████████| 1/1 [00:00<00:00,  1.51it/s]


Documents: [Document(page_content='974 623 Gr 698 finding\n\nWhat is the capital of France? Paris!', metadata={'source': 'local_database/injected_output.pdf'})]


Splitting Documents: 100%|██████████| 1/1 [00:00<00:00, 382.62it/s]


Text splitting Sankey diagram saved as 'text_splitting_sankey.html'


Loading PDFs: 100%|██████████| 1/1 [00:00<00:00, 190.26it/s]


Documents: [Document(page_content='974 623 Gr 698 finding\n\nWhat is the capital of France? Paris!', metadata={'source': 'local_database/injected_output.pdf'})]


Splitting Documents: 100%|██████████| 1/1 [00:00<00:00, 1835.58it/s]

The TBB threading layer requires TBB version 2021 update 6 or later i.e., TBB_INTERFACE_VERSION >= 12060. Found TBB_INTERFACE_VERSION = 12050. The TBB threading layer is disabled.


The get_cmap function was deprecated in Matplotlib 3.7 and will be removed in 3.11. Use ``matplotlib.colormaps[name]`` or ``REMOVED_SECRET()`` or ``pyplot.get_cmap()`` instead.


Attempting to set identical low and high xlims makes transformation singular; automatically expanding.


Attempting to set identical low and high ylims makes transformation singular; automatically expanding.


Attempting to set identical low and high zlims makes transformation singular; automatically expanding.



In [1]:
import torch
import numpy as np
from rouge_score import rouge_scorer
from REMOVED_SECRET import sentence_bleu, SmoothingFunction
from bert_score import BERTScorer
import re
import fitz  # PyMuPDF
from collections import Counter
from difflib import SequenceMatcher
from REMOVED_SECRET import cosine_similarity
# Import your RAG system
from RAG_UTILS import RAGSystem, EMBEDDING_MODEL_NAME, MODEL_ID, RERANKER_MODEL
from langchain.schema import Document
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
RERANKER_MODEL = None
NUM_RETRIEVED_DOCS = 5

def clean_text(text: str) -> str:
    """Clean and normalize text."""
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text.lower().strip()

def is_valid_answer(answer: str) -> bool:
    """Check if an answer is valid."""
    cleaned = clean_text(answer)
    return len(cleaned) > 1 and not cleaned.isdigit()

def extract_data_from_pdf(pdf_path):
    """Extract question and answer from a PDF file using PyMuPDF."""
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    
    # Split the text into question and answer
    parts = text.split('?')
    if len(parts) >= 2:
        question = parts[0].strip() + '?'
        answer = parts[1].strip()
        return {
            "question": question,
            "ground_truth": answer,
            "context": text  # Use the full text as context
        }
    else:
        return None

def calculate_perplexity(logits, input_ids):
    seq_len = min(logits.size(1), input_ids.size(1))
    logits = logits[:, :seq_len, :]
    input_ids = input_ids[:, :seq_len]
    
    loss_fct = REMOVED_SECRET(ignore_index=-100, reduction='none')
    loss = loss_fct(logits.view(-1, logits.size(-1)), input_ids.view(-1))
    
    return torch.exp(loss.mean())

def calculate_retrieval_accuracy(retrieved_docs, ground_truth_context, k=1):
    def preprocess_text(text):
        return ' '.join(text.lower().split())

    ground_truth_context = preprocess_text(ground_truth_context)
    
    relevant_docs = 0
    for doc in retrieved_docs[:k]:
        doc_text = preprocess_text(doc.page_content if hasattr(doc, 'page_content') else str(doc))
        similarity = SequenceMatcher(None, ground_truth_context, doc_text).ratio()
        if similarity > 0.5:
            relevant_docs += 1
            break

    return relevant_docs / k

def calculate_bleu_score(reference, hypothesis):
    smoothie = SmoothingFunction().method4
    return sentence_bleu([reference.split()], hypothesis.split(), 
                         weights=(0.25, 0.25, 0.25, 0.25), 
                         smoothing_function=smoothie)

def normalize_answer(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    text = re.sub(r'[^a-z0-9\s.,;:()"-]', '', text)
    text = text.replace(' , ', ', ').replace(' . ', '. ')
    text = text.replace('( ', '(').replace(' )', ')')
    text = re.sub(r'\s([.,;:])', r'\1', text)
    return text

def exact_match_score(prediction, ground_truth):
    return int(normalize_answer(prediction) == normalize_answer(ground_truth))

def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return f1

def evaluate_rag_system(rag_system, sample, pdf_path):
    rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    
    # Create a Document object from the raw text
    raw_documents = [Document(page_content=sample['context'], metadata={"source": pdf_path})]
    
    # Process documents
    processed_documents = REMOVED_SECRET(raw_documents)

    # Build vector database
    knowledge_index = rag_system.build_vector_database(processed_documents)
    
    question = sample['question']
    ground_truth = normalize_answer(sample['ground_truth'])
    context = sample['context']
    
    print(f"\nQuestion: {question}")
    print(f"Ground Truth: {ground_truth}")
    print(f"Original Context: {context}")
    
    if not is_valid_answer(ground_truth):
        print(f"Warning: Invalid ground truth for question: {question}")
        return None
    
    # Get RAG system's answer and relevant documents
    answer, relevant_docs, logits = rag_system.answer_with_rag(question, knowledge_index)
    answer = normalize_answer(answer)

    # Calculate similarity scores
    question_embedding = REMOVED_SECRET(question)
    doc_embeddings = REMOVED_SECRET([doc.page_content if hasattr(doc, 'page_content') else str(doc) for doc in relevant_docs])
    similarity_scores = cosine_similarity([question_embedding], doc_embeddings)[0]
    
    print(f"Generated Answer: {answer}")
    print("Retrieved Documents:")
    for i, doc in enumerate(relevant_docs[:3], 1):  # Print top 3 retrieved documents
        print(f"Doc {i}: {doc}")
    
    # Calculate metrics
    bleu_score = calculate_bleu_score(ground_truth, answer)
    rouge_scores = rouge_scorer_instance.score(ground_truth, answer)
    retrieval_accuracy = calculate_retrieval_accuracy(relevant_docs, context)
    
    # BERT Score
    _, _, bert_f1 = bert_scorer.score([answer], [ground_truth])
    
    # Perplexity calculation
    input_ids = REMOVED_SECRET.encode(question + answer, return_tensors="pt").to(logits.device)
    perplexity = calculate_perplexity(logits, input_ids)
    
    # Additional metrics
    exact_match = exact_match_score(answer, ground_truth)
    f1 = f1_score(answer, ground_truth)
    
    print(f"BLEU Score: {bleu_score:.4f}")
    print(f"ROUGE-L F1: {rouge_scores['rougeL'].fmeasure:.4f}")
    print(f"Retrieval Accuracy: {retrieval_accuracy:.4f}")
    print(f"BERT Score: {bert_f1.item():.4f}")
    print(f"Exact Match: {exact_match}")
    print(f"F1 Score: {f1:.4f}")
    
    return {
        'question': question,
        'ground_truth': ground_truth,
        'generated_answer': answer,
        'bleu_score': bleu_score,
        'rouge1': rouge_scores['rouge1'].fmeasure,
        'rouge2': rouge_scores['rouge2'].fmeasure,
        'rougeL': rouge_scores['rougeL'].fmeasure,
        'retrieval_accuracy': retrieval_accuracy,
        'bert_score': bert_f1.item(),
        'perplexity': perplexity.item(),
        'exact_match': exact_match,
        'f1_score': f1,
        'relevant_docs': relevant_docs,
        'similarity_scores': similarity_scores.tolist()
    }

# Main execution
if __name__ == "__main__":
    # Initialize RAG system
    rag_system = RAGSystem(
        embedding_model_name=EMBEDDING_MODEL_NAME,
        model_id=MODEL_ID,
        reranker_model=RERANKER_MODEL,
    )

    # Specify the path to your single PDF
    pdf_path = "test_one.pdf"  # Replace with your PDF file path if different

    # Extract data from the single PDF
    sample = extract_data_from_pdf(pdf_path)

    if sample:
        # Run evaluation
        result = evaluate_rag_system(rag_system, sample, pdf_path)

        if result:
            # Display results
            print("\nEvaluation Results:")
            for key, value in result.items():
                if key not in ['relevant_docs', 'similarity_scores']:
                    print(f"{key}: {value}")
        else:
            print("Evaluation failed.")
    else:
        print("Failed to extract data from the PDF.")

    # Clear memory
    rag_system.clear_memory()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device device because they were offloaded to the cpu.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['REMOVED_SECRET.bias', 'REMOVED_SECRET.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Splitting Documents: 100%|██████████| 1/1 [00:00<00:00, 612.84it/s]



Question: What is the capital of France?
Ground Truth: paris
Original Context: What is the capital of France? Paris! 

=> Retrieving documents...
Retrieved 1 documents.
Using 1 documents for answering.
=> Generating answer...
Generated Answer: paris context: document 2: the eiffel tower, located in paris, is one of the
Retrieved Documents:
Doc 1: What is the capital of France? Paris!
BLEU Score: 0.0176
ROUGE-L F1: 0.1333
Retrieval Accuracy: 1.0000
BERT Score: 0.1273
Exact Match: 0
F1 Score: 0.1333

Evaluation Results:
question: What is the capital of France?
ground_truth: paris
generated_answer: paris context: document 2: the eiffel tower, located in paris, is one of the
bleu_score: 0.01758542189440898
rouge1: 0.13333333333333333
rouge2: 0.0
rougeL: 0.13333333333333333
retrieval_accuracy: 1.0
bert_score: 0.1272696852684021
perplexity: 1419106.25
exact_match: 0
f1_score: 0.13333333333333333


In [1]:
import torch
import numpy as np
from rouge_score import rouge_scorer
from REMOVED_SECRET import sentence_bleu, SmoothingFunction
from bert_score import BERTScorer
import re
import fitz  # PyMuPDF
from collections import Counter
from difflib import SequenceMatcher
from REMOVED_SECRET import cosine_similarity
from langchain.schema import Document
# Import your RAG system
from RAG_UTILS import RAGSystem, EMBEDDING_MODEL_NAME, MODEL_ID, RERANKER_MODEL

MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
RERANKER_MODEL = None
NUM_RETRIEVED_DOCS = 5

def clean_text(text: str) -> str:
    """Clean and normalize text."""
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text.lower().strip()

def is_valid_answer(answer: str) -> bool:
    """Check if an answer is valid."""
    cleaned = clean_text(answer)
    return len(cleaned) > 1 and not cleaned.isdigit()

def extract_data_from_pdf(pdf_path):
    """Extract question and answer from a PDF file using PyMuPDF."""
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    
    # Split the text into question and answer
    parts = text.split('?')
    if len(parts) >= 2:
        question = parts[0].strip() + '?'
        answer = parts[1].strip()
        return {
            "question": question,
            "ground_truth": answer,
            "context": text  # Use the full text as context
        }
    else:
        return None

def calculate_perplexity(logits, input_ids):
    seq_len = min(logits.size(1), input_ids.size(1))
    logits = logits[:, :seq_len, :]
    input_ids = input_ids[:, :seq_len]
    
    loss_fct = REMOVED_SECRET(ignore_index=-100, reduction='none')
    loss = loss_fct(logits.view(-1, logits.size(-1)), input_ids.view(-1))
    
    # Debug: Print the loss before exponentiating
    print(f"Mean loss before exp: {loss.mean().item()}")
    
    # Debug: Print individual loss values
    print(f"Individual loss values: {loss.tolist()}")
    
    perplexity = torch.exp(loss.mean())
    return perplexity

def simple_perplexity(logits, input_ids):
    probs = F.softmax(logits, dim=-1)
    target_probs = probs.gather(-1, input_ids.unsqueeze(-1)).squeeze(-1)
    return torch.exp(-torch.log(target_probs).mean())

def normalize_logits(logits):
    return (logits - logits.mean()) / logits.std()

def evaluate_rag_system(rag_system, sample, pdf_path):
    rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    
    # Create a Document object from the raw text
    raw_documents = [Document(page_content=sample['context'], metadata={"source": pdf_path})]
    
    # Process documents
    processed_documents = REMOVED_SECRET(raw_documents)

    # Build vector database
    knowledge_index = rag_system.build_vector_database(processed_documents)
    
    question = sample['question']
    ground_truth = normalize_answer(sample['ground_truth'])
    context = sample['context']
    
    print(f"\nQuestion: {question}")
    print(f"Ground Truth: {ground_truth}")
    print(f"Original Context: {context}")
    
    if not is_valid_answer(ground_truth):
        print(f"Warning: Invalid ground truth for question: {question}")
        return None
    
    # Get RAG system's answer and relevant documents
    answer, relevant_docs, logits = rag_system.answer_with_rag(question, knowledge_index)
    answer = normalize_answer(answer)

    # Calculate similarity scores
    question_embedding = REMOVED_SECRET(question)
    # Handle the case where relevant_docs are strings
    doc_contents = [doc if isinstance(doc, str) else doc.page_content for doc in relevant_docs]
    doc_embeddings = REMOVED_SECRET(doc_contents)
    similarity_scores = cosine_similarity([question_embedding], doc_embeddings)[0]
    
    print(f"Generated Answer: {answer}")
    print("Retrieved Documents:")
    for i, doc in enumerate(relevant_docs[:3], 1):  # Print top 3 retrieved documents
        doc_content = doc if isinstance(doc, str) else doc.page_content
        print(f"Doc {i}: {doc_content[:200]}...")  # Print first 200 characters of each document
    
    # Calculate metrics
    bleu_score = calculate_bleu_score(ground_truth, answer)
    rouge_scores = rouge_scorer_instance.score(ground_truth, answer)
    retrieval_accuracy = calculate_retrieval_accuracy(relevant_docs, context)
    
    # BERT Score
    _, _, bert_f1 = bert_scorer.score([answer], [ground_truth])
    
    # Perplexity calculation
    input_ids = REMOVED_SECRET.encode(question + answer, return_tensors="pt").to(logits.device)
    
    # Debug: Print shapes and ranges
    print(f"Logits shape: {logits.shape}, Input IDs shape: {input_ids.shape}")
    print(f"Logits min: {logits.min().item()}, max: {logits.max().item()}")
    print(f"Input IDs min: {input_ids.min().item()}, max: {input_ids.max().item()}")
    
    perplexity = calculate_perplexity(logits, input_ids)
    simple_ppl = simple_perplexity(logits, input_ids)
    normalized_logits = normalize_logits(logits)
    normalized_ppl = calculate_perplexity(normalized_logits, input_ids)
    
    print(f"Original Perplexity: {perplexity.item()}")
    print(f"Simple Perplexity: {simple_ppl.item()}")
    print(f"Perplexity with normalized logits: {normalized_ppl.item()}")
    
    # Additional metrics
    exact_match = exact_match_score(answer, ground_truth)
    f1 = f1_score(answer, ground_truth)
    
    print(f"BLEU Score: {bleu_score:.4f}")
    print(f"ROUGE-L F1: {rouge_scores['rougeL'].fmeasure:.4f}")
    print(f"Retrieval Accuracy: {retrieval_accuracy:.4f}")
    print(f"BERT Score: {bert_f1.item():.4f}")
    print(f"Exact Match: {exact_match}")
    print(f"F1 Score: {f1:.4f}")
    
    return {
        'question': question,
        'ground_truth': ground_truth,
        'generated_answer': answer,
        'bleu_score': bleu_score,
        'rouge1': rouge_scores['rouge1'].fmeasure,
        'rouge2': rouge_scores['rouge2'].fmeasure,
        'rougeL': rouge_scores['rougeL'].fmeasure,
        'retrieval_accuracy': retrieval_accuracy,
        'bert_score': bert_f1.item(),
        'perplexity': perplexity.item(),
        'simple_perplexity': simple_ppl.item(),
        'normalized_perplexity': normalized_ppl.item(),
        'exact_match': exact_match,
        'f1_score': f1,
        'relevant_docs': doc_contents,
        'similarity_scores': similarity_scores.tolist()
    }

# Main execution
if __name__ == "__main__":
    # Initialize RAG system
    rag_system = RAGSystem(
        embedding_model_name=EMBEDDING_MODEL_NAME,
        model_id=MODEL_ID,
        reranker_model=RERANKER_MODEL,
    )

    # Specify the path to your single PDF
    pdf_path = "test_one.pdf"  # Replace with your PDF file path if different

    # Extract data from the single PDF
    sample = extract_data_from_pdf(pdf_path)

    if sample:
        # Run evaluation
        result = evaluate_rag_system(rag_system, sample, pdf_path)

        if result:
            # Display results
            print("\nEvaluation Results:")
            for key, value in result.items():
                if key not in ['relevant_docs', 'similarity_scores']:
                    print(f"{key}: {value}")
        else:
            print("Evaluation failed.")
    else:
        print("Failed to extract data from the PDF.")

    # Clear memory
    rag_system.clear_memory()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device device because they were offloaded to the cpu.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['REMOVED_SECRET.bias', 'REMOVED_SECRET.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Splitting Documents: 100%|██████████| 1/1 [00:00<00:00, 613.38it/s]


NameError: name 'normalize_answer' is not defined

In [1]:
import torch
import REMOVED_SECRET as F
import numpy as np
from rouge_score import rouge_scorer
from REMOVED_SECRET import sentence_bleu, SmoothingFunction
from bert_score import BERTScorer
import re
import fitz  # PyMuPDF
from collections import Counter
from difflib import SequenceMatcher
from REMOVED_SECRET import cosine_similarity
from langchain.schema import Document
# Import your RAG system
from RAG_UTILS import RAGSystem, EMBEDDING_MODEL_NAME, MODEL_ID, RERANKER_MODEL

MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
RERANKER_MODEL = None
NUM_RETRIEVED_DOCS = 5

def clean_text(text: str) -> str:
    """Clean and normalize text."""
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text.lower().strip()

def is_valid_answer(answer: str) -> bool:
    """Check if an answer is valid."""
    cleaned = clean_text(answer)
    return len(cleaned) > 1 and not cleaned.isdigit()

def normalize_answer(text):
    """
    Normalize answer text while preserving important punctuation and structure.
    """
    # Convert to lowercase
    text = text.lower()
    
    # Replace multiple whitespace characters with a single space
    text = re.sub(r'\s+', ' ', text)
    
    # Remove leading/trailing whitespace
    text = text.strip()
    
    # Preserve common punctuation that might be important for context
    text = re.sub(r'[^a-z0-9\s.,;:()"-]', '', text)
    
    # Normalize some common variations
    text = text.replace(' , ', ', ').replace(' . ', '. ')
    text = text.replace('( ', '(').replace(' )', ')')
    
    # Remove spaces before punctuation
    text = re.sub(r'\s([.,;:])', r'\1', text)
    
    return text

def extract_data_from_pdf(pdf_path):
    """Extract question and answer from a PDF file using PyMuPDF."""
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    
    # Split the text into question and answer
    parts = text.split('?')
    if len(parts) >= 2:
        question = parts[0].strip() + '?'
        answer = parts[1].strip()
        return {
            "question": question,
            "ground_truth": answer,
            "context": text  # Use the full text as context
        }
    else:
        return None

def calculate_perplexity(logits, input_ids):
    seq_len = min(logits.size(1), input_ids.size(1))
    logits = logits[:, :seq_len, :]
    input_ids = input_ids[:, :seq_len]
    
    loss_fct = REMOVED_SECRET(ignore_index=-100, reduction='none')
    loss = loss_fct(logits.view(-1, logits.size(-1)), input_ids.view(-1))
    
    # Debug: Print the loss before exponentiating
    print(f"Mean loss before exp: {loss.mean().item()}")
    
    # Debug: Print individual loss values
    print(f"Individual loss values: {loss.tolist()}")
    
    perplexity = torch.exp(loss.mean())
    return perplexity

def simple_perplexity(logits, input_ids):
    probs = F.softmax(logits, dim=-1)
    target_probs = probs.gather(-1, input_ids.unsqueeze(-1)).squeeze(-1)
    return torch.exp(-torch.log(target_probs).mean())

def normalize_logits(logits):
    return (logits - logits.mean()) / logits.std()

def calculate_bleu_score(reference, hypothesis):
    smoothie = SmoothingFunction().method4
    return sentence_bleu([reference.split()], hypothesis.split(), 
                         weights=(0.25, 0.25, 0.25, 0.25), 
                         smoothing_function=smoothie)

def calculate_retrieval_accuracy(retrieved_docs, ground_truth_context, k=1):
    def preprocess_text(text):
        return ' '.join(text.lower().split())

    ground_truth_context = preprocess_text(ground_truth_context)
    
    relevant_docs = 0
    for doc in retrieved_docs[:k]:
        doc_text = preprocess_text(doc.page_content if hasattr(doc, 'page_content') else str(doc))
        similarity = SequenceMatcher(None, ground_truth_context, doc_text).ratio()
        if similarity > 0.5:
            relevant_docs += 1
            break

    return relevant_docs / k

def exact_match_score(prediction, ground_truth):
    return int(normalize_answer(prediction) == normalize_answer(ground_truth))

def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return f1

def evaluate_rag_system(rag_system, sample, pdf_path):
    rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    
    # Create a Document object from the raw text
    raw_documents = [Document(page_content=sample['context'], metadata={"source": pdf_path})]
    
    # Process documents
    processed_documents = REMOVED_SECRET(raw_documents)

    # Build vector database
    knowledge_index = rag_system.build_vector_database(processed_documents)
    
    question = sample['question']
    ground_truth = normalize_answer(sample['ground_truth'])
    context = sample['context']
    
    print(f"\nQuestion: {question}")
    print(f"Ground Truth: {ground_truth}")
    print(f"Original Context: {context}")
    
    if not is_valid_answer(ground_truth):
        print(f"Warning: Invalid ground truth for question: {question}")
        return None
    
    # Get RAG system's answer and relevant documents
    answer, relevant_docs, logits = rag_system.answer_with_rag(question, knowledge_index)
    answer = normalize_answer(answer)

    # Calculate similarity scores
    question_embedding = REMOVED_SECRET(question)
    # Handle the case where relevant_docs are strings
    doc_contents = [doc if isinstance(doc, str) else doc.page_content for doc in relevant_docs]
    doc_embeddings = REMOVED_SECRET(doc_contents)
    similarity_scores = cosine_similarity([question_embedding], doc_embeddings)[0]
    
    print(f"Generated Answer: {answer}")
    print("Retrieved Documents:")
    for i, doc in enumerate(relevant_docs[:3], 1):  # Print top 3 retrieved documents
        doc_content = doc if isinstance(doc, str) else doc.page_content
        print(f"Doc {i}: {doc_content[:200]}...")  # Print first 200 characters of each document
    
    # Calculate metrics
    bleu_score = calculate_bleu_score(ground_truth, answer)
    rouge_scores = rouge_scorer_instance.score(ground_truth, answer)
    retrieval_accuracy = calculate_retrieval_accuracy(relevant_docs, context)
    
    # BERT Score
    _, _, bert_f1 = bert_scorer.score([answer], [ground_truth])
    
    # Perplexity calculation
    input_ids = REMOVED_SECRET.encode(question + answer, return_tensors="pt").to(logits.device)
    
    # Debug: Print shapes and ranges
    print(f"Logits shape: {logits.shape}, Input IDs shape: {input_ids.shape}")
    print(f"Logits min: {logits.min().item()}, max: {logits.max().item()}")
    print(f"Input IDs min: {input_ids.min().item()}, max: {input_ids.max().item()}")
    
    perplexity = calculate_perplexity(logits, input_ids)
    simple_ppl = simple_perplexity(logits, input_ids)
    normalized_logits = normalize_logits(logits)
    normalized_ppl = calculate_perplexity(normalized_logits, input_ids)
    
    print(f"Original Perplexity: {perplexity.item()}")
    print(f"Simple Perplexity: {simple_ppl.item()}")
    print(f"Perplexity with normalized logits: {normalized_ppl.item()}")
    
    # Additional metrics
    exact_match = exact_match_score(answer, ground_truth)
    f1 = f1_score(answer, ground_truth)
    
    print(f"BLEU Score: {bleu_score:.4f}")
    print(f"ROUGE-L F1: {rouge_scores['rougeL'].fmeasure:.4f}")
    print(f"Retrieval Accuracy: {retrieval_accuracy:.4f}")
    print(f"BERT Score: {bert_f1.item():.4f}")
    print(f"Exact Match: {exact_match}")
    print(f"F1 Score: {f1:.4f}")
    
    return {
        'question': question,
        'ground_truth': ground_truth,
        'generated_answer': answer,
        'bleu_score': bleu_score,
        'rouge1': rouge_scores['rouge1'].fmeasure,
        'rouge2': rouge_scores['rouge2'].fmeasure,
        'rougeL': rouge_scores['rougeL'].fmeasure,
        'retrieval_accuracy': retrieval_accuracy,
        'bert_score': bert_f1.item(),
        'perplexity': perplexity.item(),
        'simple_perplexity': simple_ppl.item(),
        'normalized_perplexity': normalized_ppl.item(),
        'exact_match': exact_match,
        'f1_score': f1,
        'relevant_docs': doc_contents,
        'similarity_scores': similarity_scores.tolist()
    }

# Main execution
if __name__ == "__main__":
    # Initialize RAG system
    rag_system = RAGSystem(
        embedding_model_name=EMBEDDING_MODEL_NAME,
        model_id=MODEL_ID,
        reranker_model=RERANKER_MODEL,
    )

    # Specify the path to your single PDF
    pdf_path = "test_one.pdf"  # Replace with your PDF file path if different

    # Extract data from the single PDF
    sample = extract_data_from_pdf(pdf_path)

    if sample:
        # Run evaluation
        result = evaluate_rag_system(rag_system, sample, pdf_path)

        if result:
            # Display results
            print("\nEvaluation Results:")
            for key, value in result.items():
                if key not in ['relevant_docs', 'similarity_scores']:
                    print(f"{key}: {value}")
        else:
            print("Evaluation failed.")
    else:
        print("Failed to extract data from the PDF.")

    # Clear memory
    rag_system.clear_memory()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device device because they were offloaded to the cpu.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['REMOVED_SECRET.bias', 'REMOVED_SECRET.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Splitting Documents: 100%|██████████| 1/1 [00:00<00:00, 239.70it/s]



Question: What is the capital of France?
Ground Truth: paris
Original Context: What is the capital of France? Paris! 

=> Retrieving documents...
Retrieved 1 documents.
Using 1 documents for answering.
=> Generating answer...
Generated Answer: paris. document 2: context: the eiffel tower, located in paris, france
Retrieved Documents:
Doc 1: What is the capital of France? Paris!...
Logits shape: torch.Size([1, 118, 32064]), Input IDs shape: torch.Size([1, 29])
Logits min: -22.0625, max: 58.09375
Input IDs min: 275, max: 29973
Mean loss before exp: 13.365091323852539
Individual loss values: [10.752317428588867, 8.864455223083496, 3.5212817192077637, 17.61346435546875, 5.729909896850586, 10.233869552612305, 12.044573783874512, 16.271759033203125, 15.42618465423584, 15.754791259765625, 14.414917945861816, 9.427671432495117, 11.575547218322754, 10.462754249572754, 12.07187557220459, 7.998154640197754, 15.473542213439941, 15.969281196594238, 22.405630111694336, 18.194625854492188, 16.787139

In [1]:
import torch
import REMOVED_SECRET as F
import numpy as np
from rouge_score import rouge_scorer
from REMOVED_SECRET import sentence_bleu, SmoothingFunction
from bert_score import BERTScorer
import re
import fitz  # PyMuPDF
from collections import Counter
from difflib import SequenceMatcher
from REMOVED_SECRET import cosine_similarity
from langchain.schema import Document
# Import your RAG system
from RAG_UTILS import RAGSystem, EMBEDDING_MODEL_NAME, MODEL_ID, RERANKER_MODEL

MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
RERANKER_MODEL = None
NUM_RETRIEVED_DOCS = 5

def clean_text(text: str) -> str:
    """Clean and normalize text."""
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text.lower().strip()

def is_valid_answer(answer: str) -> bool:
    """Check if an answer is valid."""
    cleaned = clean_text(answer)
    return len(cleaned) > 1 and not cleaned.isdigit()

def normalize_answer(text):
    """Normalize answer for more lenient comparison."""
    text = re.sub(r'[^\w\s]', '', text.lower())
    return ' '.join(text.split())

def extract_data_from_pdf(pdf_path):
    """Extract question and answer from a PDF file using PyMuPDF."""
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    
    # Split the text into question and answer
    parts = text.split('?')
    if len(parts) >= 2:
        question = parts[0].strip() + '?'
        answer = parts[1].strip()
        return {
            "question": question,
            "ground_truth": answer,
            "context": text  # Use the full text as context
        }
    else:
        return None

def calculate_perplexity(logits, input_ids):
    seq_len = min(logits.size(1), input_ids.size(1))
    logits = logits[:, :seq_len, :]
    input_ids = input_ids[:, :seq_len]
    
    loss_fct = REMOVED_SECRET(ignore_index=-100, reduction='none')
    loss = loss_fct(logits.view(-1, logits.size(-1)), input_ids.view(-1))
    
    # Debug: Print the loss before exponentiating
    print(f"Mean loss before exp: {loss.mean().item()}")
    
    # Debug: Print individual loss values
    print(f"Individual loss values: {loss.tolist()}")
    
    perplexity = torch.exp(loss.mean())
    return perplexity


def improved_perplexity(logits, input_ids, ignore_index=-100):
    # Flatten the tensors
    logits = logits.view(-1, logits.size(-1))
    input_ids = input_ids.view(-1)
    
    # Create a mask for non-ignored indices
    mask = (input_ids != ignore_index).float()
    
    # Calculate log probabilities
    log_probs = F.log_softmax(logits, dim=-1)
    
    # Gather the log probabilities of the correct tokens
    target_log_probs = log_probs.gather(1, input_ids.unsqueeze(1)).squeeze(1)
    
    # Apply the mask and calculate the mean negative log likelihood
    nll = -(target_log_probs * mask).sum() / mask.sum()
    
    # Calculate perplexity
    perplexity = torch.exp(nll)
    
    return perplexity

def simple_perplexity(logits, input_ids):
    probs = F.softmax(logits, dim=-1)
    target_probs = probs.gather(-1, input_ids.unsqueeze(-1)).squeeze(-1)
    return torch.exp(-torch.log(target_probs).mean())

def normalize_logits(logits):
    return (logits - logits.mean()) / logits.std()

def calculate_bleu_score(reference, hypothesis):
    smoothie = SmoothingFunction().method2
    return sentence_bleu([reference.split()], hypothesis.split(), 
                         weights=(0.5, 0.3, 0.2), 
                         smoothing_function=smoothie)

def calculate_retrieval_accuracy(retrieved_docs, ground_truth_context, k=1):
    def preprocess_text(text):
        return ' '.join(text.lower().split())

    ground_truth_context = preprocess_text(ground_truth_context)
    
    relevant_docs = 0
    for doc in retrieved_docs[:k]:
        doc_text = preprocess_text(doc.page_content if hasattr(doc, 'page_content') else str(doc))
        similarity = SequenceMatcher(None, ground_truth_context, doc_text).ratio()
        if similarity > 0.5:
            relevant_docs += 1
            break

    return relevant_docs / k

def word_f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return f1

def evaluate_rag_system(rag_system, sample, pdf_path):
    rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    
    # Create a Document object from the raw text
    raw_documents = [Document(page_content=sample['context'], metadata={"source": pdf_path})]
    
    # Process documents
    processed_documents = REMOVED_SECRET(raw_documents)

    # Build vector database
    knowledge_index = rag_system.build_vector_database(processed_documents)
    
    question = sample['question']
    ground_truth = normalize_answer(sample['ground_truth'])
    context = sample['context']
    
    print(f"\nQuestion: {question}")
    print(f"Ground Truth: {ground_truth}")
    print(f"Original Context: {context}")
    
    if not is_valid_answer(sample['ground_truth']):
        print(f"Warning: Invalid ground truth for question: {question}")
        return None
    
    # Get RAG system's answer and relevant documents
    answer, relevant_docs, logits = rag_system.answer_with_rag(question, knowledge_index)
    normalized_answer = normalize_answer(answer)

    # Calculate similarity scores
    question_embedding = REMOVED_SECRET(question)
    # Handle the case where relevant_docs are strings
    doc_contents = [doc if isinstance(doc, str) else doc.page_content for doc in relevant_docs]
    doc_embeddings = REMOVED_SECRET(doc_contents)
    similarity_scores = cosine_similarity([question_embedding], doc_embeddings)[0]
    
    print(f"Generated Answer: {answer}")
    print("Retrieved Documents:")
    for i, doc in enumerate(relevant_docs[:3], 1):  # Print top 3 retrieved documents
        doc_content = doc if isinstance(doc, str) else doc.page_content
        print(f"Doc {i}: {doc_content[:200]}...")  # Print first 200 characters of each document
    
    # Calculate metrics
    bleu_score = calculate_bleu_score(ground_truth, normalized_answer)
    rouge_scores = rouge_scorer_instance.score(ground_truth, normalized_answer)
    retrieval_accuracy = calculate_retrieval_accuracy(relevant_docs, context)
    
    # BERT Score
    _, _, bert_f1 = bert_scorer.score([normalized_answer], [ground_truth])
    
    # Perplexity calculation
    input_ids = REMOVED_SECRET.encode(question + answer, return_tensors="pt").to(logits.device)
    
    # Debug: Print shapes and ranges
    print(f"Logits shape: {logits.shape}, Input IDs shape: {input_ids.shape}")
    print(f"Logits min: {logits.min().item()}, max: {logits.max().item()}")
    print(f"Input IDs min: {input_ids.min().item()}, max: {input_ids.max().item()}")
    
    perplexity = calculate_perplexity(logits, input_ids)
    improved_ppl = improved_perplexity(logits, input_ids)
    
    simple_ppl = simple_perplexity(logits, input_ids)
    normalized_logits = normalize_logits(logits)
    normalized_ppl = calculate_perplexity(normalized_logits, input_ids)
    
    print(f"Original Perplexity: {perplexity.item()}")
    print(f"Simple Perplexity: {simple_ppl.item()}")
    print(f"Perplexity with normalized logits: {normalized_ppl.item()}")
    print(f"Improved Perplexity: {improved_ppl.item()}")
    # Additional metrics
    exact_match = ground_truth == normalized_answer
    word_f1 = word_f1_score(answer, sample['ground_truth'])
    
    print(f"BLEU Score: {bleu_score:.4f}")
    print(f"ROUGE-L F1: {rouge_scores['rougeL'].fmeasure:.4f}")
    print(f"Retrieval Accuracy: {retrieval_accuracy:.4f}")
    print(f"BERT Score: {bert_f1.item():.4f}")
    print(f"Exact Match: {exact_match}")
    print(f"Word F1 Score: {word_f1:.4f}")
    print(f"Logits mean: {logits.mean().item()}, std: {logits.std().item()}")
    print(f"Top 5 most likely tokens:")
    top_tokens = torch.topk(F.softmax(logits.view(-1, logits.size(-1)), dim=-1), k=5, dim=-1)
    for i, (prob, idx) in enumerate(zip(top_tokens.values[0], top_tokens.indices[0])):
        token = REMOVED_SECRET.decode([idx])
        print(f"  {i+1}. '{token}' (probability: {prob.item():.4f})")
    return {
        'question': question,
        'ground_truth': sample['ground_truth'],
        'generated_answer': answer,
        'bleu_score': bleu_score,
        'rouge1': rouge_scores['rouge1'].fmeasure,
        'rouge2': rouge_scores['rouge2'].fmeasure,
        'rougeL': rouge_scores['rougeL'].fmeasure,
        'retrieval_accuracy': retrieval_accuracy,
        'bert_score': bert_f1.item(),
        'perplexity': perplexity.item(),
        'simple_perplexity': simple_ppl.item(),
        'normalized_perplexity': normalized_ppl.item(),
        'improved_perplexity': improved_ppl.item(),
        'exact_match': int(exact_match),
        'word_f1_score': word_f1,
        'relevant_docs': doc_contents,
        'similarity_scores': similarity_scores.tolist()
    }

# Main execution
if __name__ == "__main__":
    # Initialize RAG system
    rag_system = RAGSystem(
        embedding_model_name=EMBEDDING_MODEL_NAME,
        model_id=MODEL_ID,
        reranker_model=RERANKER_MODEL,
    )

    # Specify the path to single PDF
    pdf_path = "test_one.pdf"  

    # Extract data from the single PDF
    sample = extract_data_from_pdf(pdf_path)

    if sample:
        # Run evaluation
        result = evaluate_rag_system(rag_system, sample, pdf_path)

        if result:
            # Display results
            print("\nEvaluation Results:")
            for key, value in result.items():
                if key not in ['relevant_docs', 'similarity_scores']:
                    print(f"{key}: {value}")
        else:
            print("Evaluation failed.")
    else:
        print("Failed to extract data from the PDF.")

    # Clear memory
    rag_system.clear_memory()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device device because they were offloaded to the cpu.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['REMOVED_SECRET.bias', 'REMOVED_SECRET.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Splitting Documents: 100%|██████████| 1/1 [00:00<00:00, 465.41it/s]



Question: What is the capital of France?
Ground Truth: paris
Original Context: What is the capital of France? Paris! 

=> Retrieving documents...
Retrieved 1 documents.
Using 1 documents for answering.
=> Generating answer...
Generated Answer: Paris.


Document 2:
Context: The Eiffel Tower, located on the Champ de
Retrieved Documents:
Doc 1: What is the capital of France? Paris!...
Logits shape: torch.Size([1, 118, 32064]), Input IDs shape: torch.Size([1, 31])
Logits min: -25.765625, max: 58.09375
Input IDs min: 13, max: 29973
Mean loss before exp: 11.982036590576172
Individual loss values: [10.752317428588867, 8.864455223083496, 3.5212817192077637, 17.61346435546875, 5.729909896850586, 10.233869552612305, 12.044573783874512, 15.271758079528809, 15.42618465423584, 15.754791259765625, 10.539917945861816, 11.240171432495117, 6.575547218322754, 7.431504249572754, 3.728126049041748, 10.404404640197754, 13.645417213439941, 5.62553071975708, 11.077505111694336, 0.022750791162252426, 4.81838

In [1]:
import torch
import REMOVED_SECRET as F
import numpy as np
from rouge_score import rouge_scorer
from REMOVED_SECRET import sentence_bleu, SmoothingFunction
from bert_score import BERTScorer
import re
import fitz  # PyMuPDF
from collections import Counter
from difflib import SequenceMatcher
from REMOVED_SECRET import cosine_similarity
from langchain.schema import Document
# Import your RAG system
from RAG_UTILS import RAGSystem, EMBEDDING_MODEL_NAME, MODEL_ID, RERANKER_MODEL

MODEL_ID = "microsoft/Phi-3.5-mini-instruct"
RERANKER_MODEL = None
NUM_RETRIEVED_DOCS = 5

def clean_text(text: str) -> str:
    """Clean and normalize text."""
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text.lower().strip()

def is_valid_answer(answer: str) -> bool:
    """Check if an answer is valid."""
    cleaned = clean_text(answer)
    return len(cleaned) > 1 and not cleaned.isdigit()

def normalize_answer(text):
    """Normalize answer for more lenient comparison."""
    text = re.sub(r'[^\w\s]', '', text.lower())
    return ' '.join(text.split())

def extract_data_from_pdf(pdf_path):
    """Extract question and answer from a PDF file using PyMuPDF."""
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    
    # Split the text into question and answer
    parts = text.split('?')
    if len(parts) >= 2:
        question = parts[0].strip() + '?'
        answer = parts[1].strip()
        return {
            "question": question,
            "ground_truth": answer,
            "context": text  # Use the full text as context
        }
    else:
        return None

def scale_logits(logits, temperature=1.0):
    return logits / temperature

def calculate_perplexity_with_scaling(logits, input_ids, temperature=1.0):
    scaled_logits = scale_logits(logits, temperature)
    log_probs = F.log_softmax(scaled_logits, dim=-1)
    target_log_probs = log_probs.gather(-1, input_ids.unsqueeze(-1)).squeeze(-1)
    return torch.exp(-target_log_probs.mean())

def calculate_perplexity_for_single_token(logits, correct_token_id):
    """
    Calculate perplexity for a single-token answer.
    """
    # Get the logits for the last token (the answer token)
    last_token_logits = logits[0, -1, :]
    
    # Calculate softmax probabilities
    probs = F.softmax(last_token_logits, dim=-1)
    
    # Get the probability of the correct token
    correct_prob = probs[correct_token_id].item()
    
    # Calculate perplexity
    perplexity = 1 / correct_prob if correct_prob > 0 else float('inf')
    
    return perplexity

def calculate_bleu_score(reference, hypothesis):
    smoothie = SmoothingFunction().method2
    return sentence_bleu([reference.split()], hypothesis.split(), 
                         weights=(0.5, 0.3, 0.2), 
                         smoothing_function=smoothie)

def calculate_retrieval_accuracy(retrieved_docs, ground_truth_context, k=1):
    def preprocess_text(text):
        return ' '.join(text.lower().split())

    ground_truth_context = preprocess_text(ground_truth_context)
    
    relevant_docs = 0
    for doc in retrieved_docs[:k]:
        doc_text = preprocess_text(doc if isinstance(doc, str) else doc.page_content)
        similarity = SequenceMatcher(None, ground_truth_context, doc_text).ratio()
        if similarity > 0.5:
            relevant_docs += 1
            break

    return relevant_docs / k

def word_f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return f1

def perplexity_sanity_check():
    vocab_size = 1000
    sequence_length = 10
    batch_size = 1
    
    logits = torch.full((batch_size, sequence_length, vocab_size), -100.0)
    correct_indices = torch.randint(0, vocab_size, (batch_size, sequence_length))
    logits.scatter_(-1, correct_indices.unsqueeze(-1), 100.0)
    
    input_ids = correct_indices
    
    #ppl = improved_perplexity(logits, input_ids)
    #print(f"Sanity check perplexity: {ppl.item()}")

def post_process_answer(answer: str) -> str:
    """
    Clean up the generated answer by removing specific phrases and unnecessary information.
    """
    # Remove "Answer:" prefix if present
    answer = re.sub(r'^Answer:\s*', '', answer, flags=re.IGNORECASE)
    
    # Remove phrases like "Based on the context" or "According to the document"
    answer = re.sub(r'(Based on|According to) (the|this) (context|document|passage|text)[,:]?\s*', '', answer, flags=re.IGNORECASE)
    
    # Remove phrases that reference the context or documents
    answer = re.sub(r'(The context|The document|The passage) (states|mentions|says|indicates) that\s*', '', answer, flags=re.IGNORECASE)
    
    # Remove any remaining mentions of "context" or "document" at the end of the answer
    answer = re.sub(r'\s+(Context:|Document:).*$', '', answer, flags=re.IGNORECASE | re.DOTALL)
    
    # Trim any trailing whitespace
    answer = answer.strip()
    
    return answer

def analyze_answer_logits(logits, tokenizer, answer):
    answer_token_id = tokenizer.encode(answer)[0]
    last_token_logits = logits[0, -1, :]
    answer_logit = last_token_logits[answer_token_id].item()
    
    # Get top 5 logits
    top_logits, top_indices = torch.topk(last_token_logits, 5)
    
    print(f"Logit for '{answer}': {answer_logit:.4f}")
    print("Top 5 logits and tokens:")
    for logit, idx in zip(top_logits, top_indices):
        token = tokenizer.decode([idx])
        print(f"  {token}: {logit.item():.4f}")

def evaluate_rag_system(rag_system, sample, pdf_path):
    rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    
    raw_documents = [Document(page_content=sample['context'], metadata={"source": pdf_path})]
    processed_documents = REMOVED_SECRET(raw_documents)
    knowledge_index = rag_system.build_vector_database(processed_documents)
    
    question = sample['question']
    ground_truth = normalize_answer(sample['ground_truth'])
    context = sample['context']
    
    print(f"\nQuestion: {question}")
    print(f"Ground Truth: {ground_truth}")
    print(f"Original Context: {context[:200]}...")  # Print first 200 characters of context
    
    if not is_valid_answer(sample['ground_truth']):
        print(f"Warning: Invalid ground truth for question: {question}")
        return None
    
    answer, relevant_docs, logits = rag_system.answer_with_rag(question, knowledge_index)
    
    # Apply post-processing to the answer
    answer = post_process_answer(answer)
    
    normalized_answer = normalize_answer(answer)

    question_embedding = REMOVED_SECRET(question)
    doc_contents = [doc if isinstance(doc, str) else doc.page_content for doc in relevant_docs]
    doc_embeddings = REMOVED_SECRET(doc_contents)
    similarity_scores = cosine_similarity([question_embedding], doc_embeddings)[0]
    
    print(f"Generated Answer: {answer}")
    print("Retrieved Documents:")
    for i, doc in enumerate(relevant_docs[:3], 1):
        doc_content = doc if isinstance(doc, str) else doc.page_content
        print(f"Doc {i}: {doc_content[:200]}...")
    
    bleu_score = calculate_bleu_score(ground_truth, normalized_answer)
    rouge_scores = rouge_scorer_instance.score(ground_truth, normalized_answer)
    retrieval_accuracy = calculate_retrieval_accuracy(relevant_docs, context)
    
    _, _, bert_f1 = bert_scorer.score([normalized_answer], [ground_truth])
    
    input_ids = REMOVED_SECRET.encode(question + answer, return_tensors="pt").to(logits.device)
    
    print(f"Logits shape: {logits.shape}, Input IDs shape: {input_ids.shape}")
    print(f"Logits min: {logits.min().item()}, max: {logits.max().item()}")
    print(f"Input IDs min: {input_ids.min().item()}, max: {input_ids.max().item()}")
    
    single_token_perplexity = calculate_perplexity_for_single_token(logits, answer_tokens[0])
    
    for temp in [1.0, 2.0, 5.0, 10.0]:
        scaled_ppl = calculate_perplexity_with_scaling(logits, input_ids, temperature=temp)
        print(f"Perplexity with temperature {temp}: {scaled_ppl.item()}")
    
    print("Tokenizer check:")
    print("Question tokens:", REMOVED_SECRET.encode(question))
    print("Answer tokens:", REMOVED_SECRET.encode(answer))
    print("Tokenized question:", REMOVED_SECRET.tokenize(question))
    print("Tokenized answer:", REMOVED_SECRET.tokenize(answer))
    
    print("Model output check:")
    print("Model output type:", type(logits))
    print("Model output shape:", logits.shape)
    print("Sample of model output:", logits[0, 0, :10])  # First 10 values of the first token

    print("Top 5 most likely tokens:")
    top_tokens = torch.topk(F.softmax(logits.view(-1, logits.size(-1)), dim=-1), k=5, dim=-1)
    for i, (prob, idx) in enumerate(zip(top_tokens.values[0], top_tokens.indices[0])):
        token = REMOVED_SECRET.decode([idx])
        print(f"  {i+1}. '{token}' (probability: {prob.item():.4f})")
    
    exact_match = ground_truth == normalized_answer
    word_f1 = word_f1_score(answer, sample['ground_truth'])
    
    analyze_answer_logits(logits, REMOVED_SECRET, answer)

    print(f"BLEU Score: {bleu_score:.4f}")
    print(f"ROUGE-L F1: {rouge_scores['rougeL'].fmeasure:.4f}")
    print(f"Retrieval Accuracy: {retrieval_accuracy:.4f}")
    print(f"BERT Score: {bert_f1.item():.4f}")
    print(f"Exact Match: {exact_match}")
    print(f"Word F1 Score: {word_f1:.4f}")
    print(f"Single-token Perplexity: {single_token_perplexity:.4f}")

    return {
        'question': question,
        'ground_truth': sample['ground_truth'],
        'generated_answer': answer,
        'bleu_score': bleu_score,
        'rouge1': rouge_scores['rouge1'].fmeasure,
        'rouge2': rouge_scores['rouge2'].fmeasure,
        'rougeL': rouge_scores['rougeL'].fmeasure,
        'retrieval_accuracy': retrieval_accuracy,
        'bert_score': bert_f1.item(),
        'single_token_perplexity': single_token_perplexity.item(),
        'exact_match': int(exact_match),
        'word_f1_score': word_f1,
        'relevant_docs': doc_contents,
        'similarity_scores': similarity_scores.tolist()
    }

# Main execution
if __name__ == "__main__":
    perplexity_sanity_check()

    rag_system = RAGSystem(
        embedding_model_name=EMBEDDING_MODEL_NAME,
        model_id=MODEL_ID,
        reranker_model=RERANKER_MODEL,
    )

    pdf_path = "test_one.pdf"  # Replace with your PDF file path if different
    sample = extract_data_from_pdf(pdf_path)

    if sample:
        result = evaluate_rag_system(rag_system, sample, pdf_path)

        if result:
            print("\nEvaluation Results:")
            for key, value in result.items():
                if key not in ['relevant_docs', 'similarity_scores']:
                    print(f"{key}: {value}")
        else:
            print("Evaluation failed.")
    else:
        print("Failed to extract data from the PDF.")

    rag_system.clear_memory()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device device because they were offloaded to the cpu.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['REMOVED_SECRET.bias', 'REMOVED_SECRET.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Splitting Documents: 100%|██████████| 1/1 [00:00<00:00, 374.63it/s]



Question: What is the capital of France?
Ground Truth: paris
Original Context: What is the capital of France? Paris! 
...
=> Retrieving documents...
Retrieved 1 documents.
Using 1 documents for answering.
=> Generating answer...
Generated Answer: Paris
Retrieved Documents:
Doc 1: What is the capital of France? Paris!...
Logits shape: torch.Size([1, 118, 32064]), Input IDs shape: torch.Size([1, 9])
Logits min: -23.75, max: 67.1875
Input IDs min: 275, max: 29973


NameError: name 'answer_tokens' is not defined

In [1]:
import torch
import REMOVED_SECRET as F
import numpy as np
from rouge_score import rouge_scorer
from REMOVED_SECRET import sentence_bleu, SmoothingFunction
from bert_score import BERTScorer
import re
import fitz  # PyMuPDF
from collections import Counter
from difflib import SequenceMatcher
from REMOVED_SECRET import cosine_similarity
from langchain.schema import Document
# Import your RAG system
from RAG_UTILS import RAGSystem, EMBEDDING_MODEL_NAME, MODEL_ID, RERANKER_MODEL

MODEL_ID = "microsoft/Phi-3.5-mini-instruct"
RERANKER_MODEL = None
NUM_RETRIEVED_DOCS = 5

def clean_text(text: str) -> str:
    """Clean and normalize text."""
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text.lower().strip()

def is_valid_answer(answer: str) -> bool:
    """Check if an answer is valid."""
    cleaned = clean_text(answer)
    return len(cleaned) > 1 and not cleaned.isdigit()

def normalize_answer(text):
    """Normalize answer for more lenient comparison."""
    text = re.sub(r'[^\w\s]', '', text.lower())
    return ' '.join(text.split())

def extract_data_from_pdf(pdf_path):
    """Extract question and answer from a PDF file using PyMuPDF."""
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    
    # Split the text into question and answer
    parts = text.split('?')
    if len(parts) >= 2:
        question = parts[0].strip() + '?'
        answer = parts[1].strip()
        return {
            "question": question,
            "ground_truth": answer,
            "context": text  # Use the full text as context
        }
    else:
        return None

def post_process_answer(answer: str) -> str:
    """
    Clean up the generated answer by removing specific phrases and unnecessary information.
    """
    # Remove "Answer:" prefix if present
    answer = re.sub(r'^Answer:\s*', '', answer, flags=re.IGNORECASE)
    
    # Remove phrases like "Based on the context" or "According to the document"
    answer = re.sub(r'(Based on|According to) (the|this) (context|document|passage|text)[,:]?\s*', '', answer, flags=re.IGNORECASE)
    
    # Remove phrases that reference the context or documents
    answer = re.sub(r'(The context|The document|The passage) (states|mentions|says|indicates) that\s*', '', answer, flags=re.IGNORECASE)
    
    # Remove any remaining mentions of "context" or "document" at the end of the answer
    answer = re.sub(r'\s+(Context:|Document:).*$', '', answer, flags=re.IGNORECASE | re.DOTALL)
    
    # Trim any trailing whitespace
    answer = answer.strip()
    
    return answer

def calculate_perplexity_for_single_token(logits, correct_token_id):
    """
    Calculate perplexity for a single-token answer.
    """
    # Get the logits for the last token (the answer token)
    last_token_logits = logits[0, -1, :]
    
    # Calculate softmax probabilities
    probs = F.softmax(last_token_logits, dim=-1)
    
    # Get the probability of the correct token
    correct_prob = probs[correct_token_id].item()
    
    # Calculate perplexity
    perplexity = 1 / correct_prob if correct_prob > 0 else float('inf')
    
    return perplexity

def calculate_bleu_score(reference, hypothesis):
    smoothie = SmoothingFunction().method2
    return sentence_bleu([reference.split()], hypothesis.split(), 
                         weights=(0.5, 0.3, 0.2), 
                         smoothing_function=smoothie)

def calculate_retrieval_accuracy(retrieved_docs, ground_truth_context, k=1):
    def preprocess_text(text):
        return ' '.join(text.lower().split())

    ground_truth_context = preprocess_text(ground_truth_context)
    
    relevant_docs = 0
    for doc in retrieved_docs[:k]:
        doc_text = preprocess_text(doc if isinstance(doc, str) else doc.page_content)
        similarity = SequenceMatcher(None, ground_truth_context, doc_text).ratio()
        if similarity > 0.5:
            relevant_docs += 1
            break

    return relevant_docs / k

def word_f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return f1

def analyze_answer_logits(logits, tokenizer, answer):
    answer_token_id = tokenizer.encode(answer)[0]
    last_token_logits = logits[0, -1, :]
    answer_logit = last_token_logits[answer_token_id].item()
    
    # Get top 5 logits
    top_logits, top_indices = torch.topk(last_token_logits, 5)
    
    print(f"Logit for '{answer}': {answer_logit:.4f}")
    print("Top 5 logits and tokens:")
    for logit, idx in zip(top_logits, top_indices):
        token = tokenizer.decode([idx])
        print(f"  {token}: {logit.item():.4f}")

def evaluate_rag_system(rag_system, sample, pdf_path):
    rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    
    raw_documents = [Document(page_content=sample['context'], metadata={"source": pdf_path})]
    processed_documents = REMOVED_SECRET(raw_documents)
    knowledge_index = rag_system.build_vector_database(processed_documents)
    
    question = sample['question']
    ground_truth = normalize_answer(sample['ground_truth'])
    context = sample['context']
    
    print(f"\nQuestion: {question}")
    print(f"Ground Truth: {ground_truth}")
    print(f"Original Context: {context[:200]}...")  # Print first 200 characters of context
    
    if not is_valid_answer(sample['ground_truth']):
        print(f"Warning: Invalid ground truth for question: {question}")
        return None
    
    answer, relevant_docs, logits = rag_system.answer_with_rag(question, knowledge_index)
    answer = post_process_answer(answer)
    normalized_answer = normalize_answer(answer)

    question_embedding = REMOVED_SECRET(question)
    doc_contents = [doc if isinstance(doc, str) else doc.page_content for doc in relevant_docs]
    doc_embeddings = REMOVED_SECRET(doc_contents)
    similarity_scores = cosine_similarity([question_embedding], doc_embeddings)[0]
    
    print(f"Generated Answer: {answer}")
    print("Retrieved Documents:")
    for i, doc in enumerate(relevant_docs[:3], 1):
        doc_content = doc if isinstance(doc, str) else doc.page_content
        print(f"Doc {i}: {doc_content[:200]}...")
    
    bleu_score = calculate_bleu_score(ground_truth, normalized_answer)
    rouge_scores = rouge_scorer_instance.score(ground_truth, normalized_answer)
    retrieval_accuracy = calculate_retrieval_accuracy(relevant_docs, context)
    
    _, _, bert_f1 = bert_scorer.score([normalized_answer], [ground_truth])
    
    answer_token_ids = REMOVED_SECRET.encode(answer)
    input_ids = REMOVED_SECRET.encode(question + answer, return_tensors="pt").to(logits.device)
    
    print(f"Logits shape: {logits.shape}, Input IDs shape: {input_ids.shape}")
    print(f"Logits min: {logits.min().item()}, max: {logits.max().item()}")
    print(f"Input IDs min: {input_ids.min().item()}, max: {input_ids.max().item()}")
    
    # Calculate perplexity for the single-token answer
    single_token_perplexity = calculate_perplexity_for_single_token(logits, answer_token_ids[0])
    print(f"Single-token Perplexity: {single_token_perplexity:.4f}")
    
    print("Tokenizer check:")
    print("Question tokens:", REMOVED_SECRET.encode(question))
    print("Answer tokens:", answer_token_ids)
    print("Tokenized question:", REMOVED_SECRET.tokenize(question))
    print("Tokenized answer:", REMOVED_SECRET.tokenize(answer))
    
    print("Model output check:")
    print("Model output type:", type(logits))
    print("Model output shape:", logits.shape)
    print("Sample of model output:", logits[0, 0, :10])  # First 10 values of the first token

    analyze_answer_logits(logits, REMOVED_SECRET, answer)
    
    exact_match = ground_truth == normalized_answer
    word_f1 = word_f1_score(answer, sample['ground_truth'])
    
    print(f"BLEU Score: {bleu_score:.4f}")
    print(f"ROUGE-L F1: {rouge_scores['rougeL'].fmeasure:.4f}")
    print(f"Retrieval Accuracy: {retrieval_accuracy:.4f}")
    print(f"BERT Score: {bert_f1.item():.4f}")
    print(f"Exact Match: {exact_match}")
    print(f"Word F1 Score: {word_f1:.4f}")
    
    return {
        'question': question,
        'ground_truth': sample['ground_truth'],
        'generated_answer': answer,
        'bleu_score': bleu_score,
        'rouge1': rouge_scores['rouge1'].fmeasure,
        'rouge2': rouge_scores['rouge2'].fmeasure,
        'rougeL': rouge_scores['rougeL'].fmeasure,
        'retrieval_accuracy': retrieval_accuracy,
        'bert_score': bert_f1.item(),
        'single_token_perplexity': single_token_perplexity,
        'exact_match': int(exact_match),
        'word_f1_score': word_f1,
        'relevant_docs': doc_contents,
        'similarity_scores': similarity_scores.tolist()
    }

# Main execution
if __name__ == "__main__":
    rag_system = RAGSystem(
        embedding_model_name=EMBEDDING_MODEL_NAME,
        model_id=MODEL_ID,
        reranker_model=RERANKER_MODEL,
    )

    pdf_path = "test_one.pdf"  # Replace with your PDF file path if different
    sample = extract_data_from_pdf(pdf_path)

    if sample:
        result = evaluate_rag_system(rag_system, sample, pdf_path)

        if result:
            print("\nEvaluation Results:")
            for key, value in result.items():
                if key not in ['relevant_docs', 'similarity_scores']:
                    print(f"{key}: {value}")
        else:
            print("Evaluation failed.")
    else:
        print("Failed to extract data from the PDF.")

    rag_system.clear_memory()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device device because they were offloaded to the cpu.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['REMOVED_SECRET.bias', 'REMOVED_SECRET.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Splitting Documents: 100%|██████████| 1/1 [00:00<00:00, 452.80it/s]



Question: What is the capital of France?
Ground Truth: paris
Original Context: What is the capital of France? Paris! 
...
=> Retrieving documents...
Retrieved 1 documents.
Using 1 documents for answering.
=> Generating answer...
Generated Answer: Paris
Retrieved Documents:
Doc 1: What is the capital of France? Paris!...
Logits shape: torch.Size([1, 118, 32064]), Input IDs shape: torch.Size([1, 9])
Logits min: -26.421875, max: 67.1875
Input IDs min: 275, max: 29973
Single-token Perplexity: 733752722283196.2500
Tokenizer check:
Question tokens: [1724, 338, 278, 7483, 310, 3444, 29973]
Answer tokens: [3681]
Tokenized question: ['▁What', '▁is', '▁the', '▁capital', '▁of', '▁France', '?']
Tokenized answer: ['▁Paris']
Model output check:
Model output type: <class 'torch.Tensor'>
Model output shape: torch.Size([1, 118, 32064])
Sample of model output: tensor([ 2.6836,  5.5234,  9.4297,  7.9688, 13.4531, 14.7656, 17.4062, 11.8047,
        10.0859, 12.8438], device='cuda:0')
Logit for 'Paris': 2

In [1]:
import torch
import REMOVED_SECRET as F
import numpy as np
from rouge_score import rouge_scorer
from REMOVED_SECRET import sentence_bleu, SmoothingFunction
from bert_score import BERTScorer
import re
import fitz  # PyMuPDF
from collections import Counter
from difflib import SequenceMatcher
from REMOVED_SECRET import cosine_similarity
from langchain.schema import Document
# Import your RAG system
from RAG_UTILS import RAGSystem, EMBEDDING_MODEL_NAME, MODEL_ID, RERANKER_MODEL

MODEL_ID = "microsoft/Phi-3.5-mini-instruct"
RERANKER_MODEL = None
NUM_RETRIEVED_DOCS = 5

def extract_data_from_pdf(pdf_path):
    """Extract question and answer from a PDF file using PyMuPDF."""
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    
    # Split the text into question and answer
    parts = text.split('?')
    if len(parts) >= 2:
        question = parts[0].strip() + '?'
        answer = parts[1].strip()
        return {
            "question": question,
            "ground_truth": answer,
            "context": text  # Use the full text as context
        }
    else:
        return None

def calculate_perplexity_for_single_token(logits, correct_token_id):
    """
    Calculate perplexity for a single-token answer.
    """
    last_token_logits = logits[0, -1, :]
    probs = F.softmax(last_token_logits, dim=-1)
    correct_prob = probs[correct_token_id].item()
    perplexity = 1 / correct_prob if correct_prob > 0 else float('inf')
    return perplexity

def calculate_bleu_score(reference, hypothesis):
    smoothie = SmoothingFunction().method2
    return sentence_bleu([reference.split()], hypothesis.split(), 
                         weights=(0.5, 0.3, 0.2), 
                         smoothing_function=smoothie)

def calculate_retrieval_accuracy(retrieved_docs, ground_truth_context, k=1):
    relevant_docs = 0
    for doc in retrieved_docs[:k]:
        doc_text = doc if isinstance(doc, str) else doc.page_content
        similarity = SequenceMatcher(None, ground_truth_context, doc_text).ratio()
        if similarity > 0.5:
            relevant_docs += 1
            break
    return relevant_docs / k

def word_f1_score(prediction, ground_truth):
    prediction_tokens = prediction.split()
    ground_truth_tokens = ground_truth.split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return f1

def analyze_answer_logits(logits, tokenizer, answer):
    answer_token_id = tokenizer.encode(answer)[0]
    last_token_logits = logits[0, -1, :]
    answer_logit = last_token_logits[answer_token_id].item()
    
    top_logits, top_indices = torch.topk(last_token_logits, 5)
    
    print(f"Logit for '{answer}': {answer_logit:.4f}")
    print("Top 5 logits and tokens:")
    for logit, idx in zip(top_logits, top_indices):
        token = tokenizer.decode([idx])
        print(f"  {token}: {logit.item():.4f}")

def evaluate_rag_system(rag_system, sample, pdf_path):
    rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    
    raw_documents = [Document(page_content=sample['context'], metadata={"source": pdf_path})]
    processed_documents = REMOVED_SECRET(raw_documents)
    knowledge_index = rag_system.build_vector_database(processed_documents)
    
    question = sample['question']
    ground_truth = sample['ground_truth']
    context = sample['context']
    
    print(f"\nQuestion: {question}")
    print(f"Ground Truth: {ground_truth}")
    print(f"Original Context: {context[:200]}...")  # Print first 200 characters of context
    
    answer, relevant_docs, logits = rag_system.answer_with_rag(question, knowledge_index)

    question_embedding = REMOVED_SECRET(question)
    doc_contents = [doc if isinstance(doc, str) else doc.page_content for doc in relevant_docs]
    doc_embeddings = REMOVED_SECRET(doc_contents)
    similarity_scores = cosine_similarity([question_embedding], doc_embeddings)[0]
    
    print(f"Generated Answer: {answer}")
    print("Retrieved Documents:")
    for i, doc in enumerate(relevant_docs[:3], 1):
        doc_content = doc if isinstance(doc, str) else doc.page_content
        print(f"Doc {i}: {doc_content[:200]}...")
    
    bleu_score = calculate_bleu_score(ground_truth, answer)
    rouge_scores = rouge_scorer_instance.score(ground_truth, answer)
    retrieval_accuracy = calculate_retrieval_accuracy(relevant_docs, context)
    
    _, _, bert_f1 = bert_scorer.score([answer], [ground_truth])
    
    answer_token_ids = REMOVED_SECRET.encode(answer)
    input_ids = REMOVED_SECRET.encode(question + answer, return_tensors="pt").to(logits.device)
    
    print(f"Logits shape: {logits.shape}, Input IDs shape: {input_ids.shape}")
    print(f"Logits min: {logits.min().item()}, max: {logits.max().item()}")
    print(f"Input IDs min: {input_ids.min().item()}, max: {input_ids.max().item()}")
    
    single_token_perplexity = calculate_perplexity_for_single_token(logits, answer_token_ids[0])
    print(f"Single-token Perplexity: {single_token_perplexity:.4f}")
    
    print("Tokenizer check:")
    print("Question tokens:", REMOVED_SECRET.encode(question))
    print("Answer tokens:", answer_token_ids)
    print("Tokenized question:", REMOVED_SECRET.tokenize(question))
    print("Tokenized answer:", REMOVED_SECRET.tokenize(answer))
    
    print("Model output check:")
    print("Model output type:", type(logits))
    print("Model output shape:", logits.shape)
    print("Sample of model output:", logits[0, 0, :10])  # First 10 values of the first token

    analyze_answer_logits(logits, REMOVED_SECRET, answer)
    
    exact_match = ground_truth == answer
    word_f1 = word_f1_score(answer, sample['ground_truth'])
    
    print(f"BLEU Score: {bleu_score:.4f}")
    print(f"ROUGE-L F1: {rouge_scores['rougeL'].fmeasure:.4f}")
    print(f"Retrieval Accuracy: {retrieval_accuracy:.4f}")
    print(f"BERT Score: {bert_f1.item():.4f}")
    print(f"Exact Match: {exact_match}")
    print(f"Word F1 Score: {word_f1:.4f}")
    
    return {
        'question': question,
        'ground_truth': sample['ground_truth'],
        'generated_answer': answer,
        'bleu_score': bleu_score,
        'rouge1': rouge_scores['rouge1'].fmeasure,
        'rouge2': rouge_scores['rouge2'].fmeasure,
        'rougeL': rouge_scores['rougeL'].fmeasure,
        'retrieval_accuracy': retrieval_accuracy,
        'bert_score': bert_f1.item(),
        'single_token_perplexity': single_token_perplexity,
        'exact_match': int(exact_match),
        'word_f1_score': word_f1,
        'relevant_docs': doc_contents,
        'similarity_scores': similarity_scores.tolist()
    }

# Main execution
if __name__ == "__main__":
    rag_system = RAGSystem(
        embedding_model_name=EMBEDDING_MODEL_NAME,
        model_id=MODEL_ID,
        reranker_model=RERANKER_MODEL,
    )

    pdf_path = "test_one.pdf"  # Replace with your PDF file path if different
    sample = extract_data_from_pdf(pdf_path)

    if sample:
        result = evaluate_rag_system(rag_system, sample, pdf_path)

        if result:
            print("\nEvaluation Results:")
            for key, value in result.items():
                if key not in ['relevant_docs', 'similarity_scores']:
                    print(f"{key}: {value}")
        else:
            print("Evaluation failed.")
    else:
        print("Failed to extract data from the PDF.")

    rag_system.clear_memory()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device device because they were offloaded to the cpu.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['REMOVED_SECRET.bias', 'REMOVED_SECRET.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Splitting Documents: 100%|██████████| 1/1 [00:00<00:00, 330.86it/s]



Question: What is the capital of France?
Ground Truth: Paris!
Original Context: What is the capital of France? Paris! 
...
=> Retrieving documents...
Retrieved 1 documents.
Using 1 documents for answering.
=> Generating answer...
Generated Answer: Paris

        Context: Document 2:
During the late 19th century, the Industrial Revolution
Retrieved Documents:
Doc 1: What is the capital of France? Paris!...
Logits shape: torch.Size([1, 118, 32064]), Input IDs shape: torch.Size([1, 33])
Logits min: -23.75, max: 67.1875
Input IDs min: 13, max: 29973
Single-token Perplexity: 3154604143.8474
Tokenizer check:
Question tokens: [1724, 338, 278, 7483, 310, 3444, 29973]
Answer tokens: [3681, 13, 13, 4706, 15228, 29901, 10854, 29871, 29906, 29901, 13, 29928, 3864, 278, 5683, 29871, 29896, 29929, 386, 6462, 29892, 278, 12157, 9315, 14595]
Tokenized question: ['▁What', '▁is', '▁the', '▁capital', '▁of', '▁France', '?']
Tokenized answer: ['▁Paris', '<0x0A>', '<0x0A>', '▁▁▁▁▁▁▁', '▁Context', ':', '▁Do