This section has two goals. The first is to generate a comparison between a friendly chatbot and a formal chatbot (an A/B test). The second is to validate that the response generated by the LLM is non-empty and grounded in the retrieved context, i.e. that the model is not hallucinating.

In [1]:
import csv 
import uuid
import time
import random
import os
from datetime import datetime
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

Create the logger used to compare the models' outputs

In [4]:
class ExperimentLogger:
    """Append-only CSV logger for A/B test interactions.

    One row is written per call to `log`. The header row is written once,
    when the logger is constructed against a file that does not exist yet
    or that exists but is empty.
    """

    # Column order of the CSV; `log` writes its values in this same order.
    HEADER = [
        "timestamp", "session_id", "variant",
        "user_query", "sentiment", "intent",
        "retrieved_context", "llm_response",
        "response_time_ms", "validation_score",
    ]

    def __init__(self, filepath="ab_test_logs.csv"):
        """Remember the target path and make sure the file starts with a header.

        Args:
            filepath: Path of the CSV log file. Existing non-empty files are
                left untouched so previous rows are preserved.
        """
        self.filepath = filepath
        # A file that exists but is empty (e.g. created by an interrupted run)
        # would otherwise never receive a header, so treat it like a missing file.
        if not os.path.exists(filepath) or os.path.getsize(filepath) == 0:
            with open(filepath, mode='w', newline='', encoding='utf-8') as f:
                csv.writer(f).writerow(self.HEADER)

    def log(self, session_id, variant, query, sentiment, intent, context, response, latency, score):
        """Append one interaction to the CSV file.

        Args:
            session_id: Identifier of the session the interaction belongs to.
            variant: Name of the A/B variant that served the response.
            query: The user's query text.
            sentiment: Sentiment label attached to the query.
            intent: Intent label attached to the query.
            context: Retrieved context the response was based on.
            response: The response text returned to the user.
            latency: Elapsed time in seconds; stored as milliseconds
                rounded to two decimals.
            score: Validation score of the response.
        """
        with open(self.filepath, mode='a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow([
                datetime.now(), session_id, variant,
                query, sentiment, intent,
                context, response,
                round(latency * 1000, 2), score
            ])

Create the quality validator for the response generated by the LLM

In [5]:
class QualityValidator:
    """Scores how closely an LLM response matches the retrieved context.

    The score is the cosine similarity between sentence embeddings of the
    response and of the context. A higher score means the response is
    semantically closer to the context; a low score suggests the response
    may not be grounded in it.
    """

    # Responses with fewer non-whitespace-trimmed characters than this are rejected.
    MIN_RESPONSE_CHARS = 10

    def __init__(self):
        """Load the sentence-embedding model used for the similarity check."""
        # Per the original author's note, this is the same model the RAG step uses.
        print("Loading Validator Model...")
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def validate(self, llm_response, retrieved_context):
        """Score a response against the context it should be based on.

        Args:
            llm_response: Text produced by the LLM (may be None or empty).
            retrieved_context: Context text the response should agree with
                (may be None or empty).

        Returns:
            A `(score, reason)` tuple. `score` is a float: 0.0 when a sanity
            check fails, otherwise the cosine similarity of the two embeddings
            (mathematically in [-1, 1]; higher means more similar). `reason`
            is "Response too short", "No context provided", or "Valid".
        """
        # 1. Sanity check: missing, whitespace-only, or very short responses
        #    carry no content worth scoring.
        if len((llm_response or "").strip()) < self.MIN_RESPONSE_CHARS:
            return 0.0, "Response too short"

        # 2. Without a context there is nothing to compare against; embedding
        #    an empty string would yield a meaningless score.
        if not (retrieved_context or "").strip():
            return 0.0, "No context provided"

        # 3. Semantic consistency check: embed both texts and measure the
        #    angle between the two vectors (cosine similarity).
        embeddings = self.model.encode([llm_response, retrieved_context])
        score = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

        return float(score), "Valid"

Test the usability of these two classes

In [7]:
# Shared instances used by the pipeline step defined below.
# The log path is spelled out so it is obvious where the rows end up.
logger = ExperimentLogger(filepath="ab_test_logs.csv")
validator = QualityValidator()

def run_pipeline_step(user_query, context, sentiment, intent):
    """Run one A/B-tested interaction: pick a variant, respond, validate, log.

    Args:
        user_query: The user's query text (logged; the mock response does
            not depend on it).
        context: Retrieved context the response is validated against.
        sentiment: Sentiment label for the query (logged only).
        intent: Intent label for the query (logged only).

    Returns:
        A `(response, score)` tuple: the mock response text and the
        validation score returned by the module-level `validator`.
    """
    session_id = str(uuid.uuid4())

    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    # --- A/B TEST LOGIC ---
    # Randomly assign user to Group A (Friendly) or Group B (Formal)
    variant = "A_Friendly" if random.random() > 0.5 else "B_Formal"

    # (In a real app, you would change the LLM prompt here based on the variant)
    # For this test, we simulate an LLM response
    if variant == "A_Friendly":
        mock_response = "Sure thing! Per our policy, refunds take 5-7 days."
    else:
        mock_response = "Refunds are processed within 5-7 business days according to policy."

    # Stop the clock before validation: the logged value is the response time,
    # and including the validator's embedding pass would distort the
    # latency comparison between variants.
    duration = time.perf_counter() - start_time

    # --- VALIDATION STEP ---
    # Check if the answer actually matches the context.
    # The textual reason is not logged, so it is deliberately ignored.
    score, _reason = validator.validate(mock_response, context)

    # --- LOGGING STEP ---
    logger.log(
        session_id=session_id,
        variant=variant,
        query=user_query,
        sentiment=sentiment,
        intent=intent,
        context=context,
        response=mock_response,
        latency=duration,
        score=score
    )

    return mock_response, score

Loading Validator Model...


Test the functionality

In [8]:
# Sample refund question together with the policy text it should be answered from.
q = "When do I get my money?"
ctx = "Refund Policy: Refunds are processed in 5-7 business days."
sent, inte = "neutral", "refund"

# Run a single pipeline step; keyword arguments make the argument roles explicit.
response, quality_score = run_pipeline_step(
    user_query=q,
    context=ctx,
    sentiment=sent,
    intent=inte,
)

# One print call, one line per item; the emitted text matches three separate prints.
print(
    f"\nFinal Response: {response}",
    f"Quality Score: {quality_score:.4f} (1.0 is perfect match)",
    "Check 'ab_test_logs.csv' to see the data.",
    sep="\n",
)


Final Response: Sure thing! Per our policy, refunds take 5-7 days.
Quality Score: 0.7290 (1.0 is perfect match)
Check 'ab_test_logs.csv' to see the data.
