# MIT Pluralism Engine - Distributional RAG

This notebook implements **distributional pluralism** with proper document indexing:
- Each confession is indexed as its own document (no arbitrary chunking)
- Retrieval returns whole confessions
- Random sampling selects ONE authentic voice

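Uses confessions #70964+ for evaluation against OpinionQA-style ground truth. (Note: the load cell below reads `data/all_confessions_cleaned.json`, which reports 65,225 records — confirm that this file is actually restricted to #70964+ before comparing against ground truth.)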

In [2]:
# =============================================================================
# Setup & Imports
# =============================================================================

import os
import json
import re
import random
from tqdm import tqdm

from llama_index.core import Document, VectorStoreIndex, load_index_from_storage
from llama_index.core.storage.storage_context import StorageContext
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.openai import OpenAI

# Sanity marker: this line is only reached if every import above succeeded.
print("✓ Imports loaded")


✓ Imports loaded


In [None]:
# =============================================================================
# Load Evaluation Confessions (#70964+)
# =============================================================================

EVAL_DATA_FILE = "data/all_confessions_cleaned.json"

print(f"Loading confessions from {EVAL_DATA_FILE}...")
# JSON is UTF-8 by specification and the confessions contain non-ASCII
# punctuation (e.g. curly apostrophes), so the encoding is stated explicitly
# instead of relying on the platform's default text encoding.
with open(EVAL_DATA_FILE, 'r', encoding='utf-8') as f:
    eval_confessions = json.load(f)

print(f"✓ Loaded {len(eval_confessions)} confessions")

# Preview the first record. Guarded so an empty file does not raise
# IndexError, and the ellipsis is only appended when text was actually cut.
print(f"\nSample confession:")
if eval_confessions:
    preview_text = eval_confessions[0]['text']
    print(preview_text[:200] + "..." if len(preview_text) > 200 else preview_text)
else:
    print("(no confessions loaded)")

Loading confessions from data/all_confessions_cleaned.json...
✓ Loaded 65225 confessions

Sample confession:
#76277: i can’t get a single internship and the pressure is piling up idk what i’m doing wrong :(...


In [4]:
# =============================================================================
# Create Individual Documents (One Per Confession)
# =============================================================================

def extract_confession_number(text):
    """Extract confession number from text like '#70964: some text...'

    Args:
        text: Raw confession text. Leading whitespace before the '#' is
            tolerated. Non-string values (e.g. None coming from a null JSON
            field) are treated as carrying no number instead of raising.

    Returns:
        The confession number as an int, or None when the text does not
        begin with '#<digits>'.
    """
    if not isinstance(text, str):
        return None
    # re.match anchors at the start of the string; \s* lets padded text through.
    match = re.match(r'\s*#(\d+)', text)
    return int(match.group(1)) if match else None

print("Creating documents (one per confession)...")
documents = []

# One Document per confession: the full text is the body, and the engagement
# counters plus the confession number travel along as metadata.
for entry in tqdm(eval_confessions):
    body = entry.get('text', '')

    # Prefer the number stored on the record; fall back to parsing the text
    # when it is missing or falsy.
    number = entry.get('confession_num')
    if not number:
        number = extract_confession_number(body)

    meta = {'confession_num': number}
    for field in ('likes', 'shares', 'timestamp'):
        meta[field] = entry.get(field, 0)

    documents.append(Document(text=body, metadata=meta))

print(f"\n✓ Created {len(documents)} documents")
print("  Each confession is now its own searchable document")


Creating documents (one per confession)...


100%|██████████| 65225/65225 [00:06<00:00, 10052.04it/s]


✓ Created 65225 documents
  Each confession is now its own searchable document





In [5]:
# =============================================================================
# Build or Load Vector Index
# =============================================================================

STORAGE_DIR = "./storage_improved"

# Set to True to ignore any persisted index and re-embed every document.
# A stored index is NOT re-validated against the current `documents`, so flip
# this whenever the input data changes.
FORCE_REBUILD_INDEX = False

# Reuse the persisted index when one exists: rebuilding sends every document
# to the OpenAI embeddings API, which is slow and costs money on each run.
if not FORCE_REBUILD_INDEX and os.path.isdir(STORAGE_DIR) and os.listdir(STORAGE_DIR):
    print(f"Loading existing index from {STORAGE_DIR}...")
    storage_context = StorageContext.from_defaults(persist_dir=STORAGE_DIR)
    index = load_index_from_storage(storage_context)
    print("✓ Index loaded")
else:
    print(f"Building new index (this will call OpenAI embeddings API)...")
    print(f"  Embedding {len(documents)} documents...")

    index = VectorStoreIndex.from_documents(documents, show_progress=True)

    # Persist for reuse
    os.makedirs(STORAGE_DIR, exist_ok=True)
    index.storage_context.persist(persist_dir=STORAGE_DIR)
    print(f"✓ Index built and saved to {STORAGE_DIR}")


Building new index (this will call OpenAI embeddings API)...
  Embedding 65225 documents...


  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 65225/65225 [00:36<00:00, 1781.46it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:22<00:00, 92.07it/s] 
Generating embeddings: 100%|██████████| 2048/2048 [00:13<00:00, 153.68it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:14<00:00, 144.14it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:14<00:00, 144.07it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:11<00:00, 176.18it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:12<00:00, 159.21it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:12<00:00, 159.78it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:12<00:00, 167.01it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:13<00:00, 154.50it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:16<00:00, 126.36it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:12<00:00, 158.57it/s]
Generating embeddings: 100%|██████████| 204

✓ Index built and saved to ./storage_improved


In [9]:
# =============================================================================
# Distributional Query Engine
# =============================================================================

class DistributionalQueryEngine:
    """
    Query engine implementing distributional pluralism.

    1. Retrieves top_k relevant confessions (whole documents)
    2. Randomly samples ONE confession (uniformly among the retrieved set)
    3. Generates response embodying that single perspective
    """

    SINGLE_VOICE_PROMPT = PromptTemplate(
        """You are an MIT student. The confession below reflects your genuine experience and perspective.

CONFESSION:
{confession}

USER'S QUESTION: {query}

INSTRUCTIONS:
- Answer the question AS this student would, from first-person perspective
- Use the tone, emotions, and specific details from the confession
- Do NOT hedge or generalize ("some students think...", "it varies...")
- Do NOT mention that other perspectives exist
- Speak with conviction as if this is YOUR lived experience
- Keep the response concise and authentic (2-4 sentences typical)

YOUR RESPONSE:"""
    )

    # Response text used when the retriever returns nothing.
    NO_RESULTS_MESSAGE = "No relevant confessions found."

    def __init__(self, index, top_k: int = 30, llm_model: str = "gpt-4o-mini", rng=None):
        """Wire up the retriever and the LLM.

        Args:
            index: Vector index to retrieve confessions from.
            top_k: Number of confessions fetched per query (the sampling pool).
            llm_model: OpenAI model name used for generation.
            rng: Optional object exposing ``choice(seq)`` (e.g. a seeded
                ``random.Random``) for reproducible sampling. Defaults to the
                global ``random`` module, so ``random.seed(...)`` still works.
        """
        self.index = index
        self.top_k = top_k
        self.llm = OpenAI(model=llm_model, temperature=0.7)
        self.retriever = VectorIndexRetriever(index=index, similarity_top_k=top_k)
        self._rng = rng if rng is not None else random

    def query(self, query_str: str, return_metadata: bool = False):
        """Query with distributional sampling.

        Args:
            query_str: The user's question.
            return_metadata: When True, return a dict describing the sampled
                confession alongside the response instead of a bare string.

        Returns:
            ``str`` response, or (with ``return_metadata=True``) a dict with
            keys ``response``, ``sampled_confession``, ``confession_num``,
            ``total_retrieved`` and ``similarity_score``. When nothing is
            retrieved the same shape is kept: the response is
            ``NO_RESULTS_MESSAGE``, ``total_retrieved`` is 0 and the other
            fields are None.
        """
        # Stage 1: Retrieve relevant confessions
        retrieved_nodes = self.retriever.retrieve(query_str)

        if not retrieved_nodes:
            # Keep the return type consistent with return_metadata so callers
            # that index into the result do not fail on a plain string.
            if return_metadata:
                return {
                    "response": self.NO_RESULTS_MESSAGE,
                    "sampled_confession": None,
                    "confession_num": None,
                    "total_retrieved": 0,
                    "similarity_score": None,
                }
            return self.NO_RESULTS_MESSAGE

        # Stage 2: Random sampling - pick ONE
        sampled_node = self._rng.choice(retrieved_nodes)
        sampled_confession = sampled_node.get_content()
        sampled_metadata = sampled_node.metadata

        # Stage 3: Generate single-voice response
        prompt = self.SINGLE_VOICE_PROMPT.format(
            confession=sampled_confession,
            query=query_str
        )
        response = self.llm.complete(prompt)

        if return_metadata:
            return {
                "response": str(response),
                "sampled_confession": sampled_confession,
                "confession_num": sampled_metadata.get('confession_num'),
                "total_retrieved": len(retrieved_nodes),
                "similarity_score": sampled_node.score
            }

        return str(response)


# Build the engine shared by the cells below, then report its configuration
engine = DistributionalQueryEngine(index, top_k=30)
print(
    "✓ Distributional Query Engine initialized",
    f"  - Retrieval: top_k={engine.top_k}",
    f"  - LLM: {engine.llm.model}",
    sep="\n",
)


✓ Distributional Query Engine initialized
  - Retrieval: top_k=30
  - LLM: gpt-4o-mini


In [10]:
# =============================================================================
# Test: Verify Retrieval Quality (Compare to old chunked approach!)
# =============================================================================

test_query = "Should students be able to use AI assistance in their assignments?"

# Retrieval only (no LLM call): inspect what the sampling pool looks like.
retrieved = engine.retriever.retrieve(test_query)

print(f"Query: '{test_query}'")
print(f"Retrieved {len(retrieved)} WHOLE confessions\n")

divider = '=' * 70
for rank, hit in enumerate(retrieved[:5], start=1):
    number = hit.metadata.get('confession_num', '?')
    print(divider)
    print(f"CONFESSION {rank} (#{number}) - similarity: {hit.score:.3f}")
    print(divider)

    # Show at most 500 characters, marking the cut with an ellipsis.
    body = hit.get_content()
    if len(body) > 500:
        print(body[:500] + "...")
    else:
        print(body)
    print()


Query: 'Should students be able to use AI assistance in their assignments?'
Retrieved 30 WHOLE confessions

CONFESSION 1 (#75454) - similarity: 0.833
#75454: omg could MIT/Harvard students please STOP TRYING TO MAKE THE WORLD MORE LIKE A BLACK MIRROR EPISODE??? Seriously yall if I see one more AI powered tech to ""increase social connection"" but it's just having AI text your friends and family for you... and no I will not be wearing a necklace that allows people to see my social profile in real time. If you are the kind of person who thinks that's a good idea maybe you should sit back and think about ""why"" for a second. Is this really what...

CONFESSION 2 (#75242) - similarity: 0.815
#75242: MIT Premed society is having a back and forth email list argument about the uses of AI for songwriting as a "clinical" way to help patients with deteriorating memory preserve their memories in the form of song, using an AI platform that is currently being sued for their data practices and explo

In [1]:
# =============================================================================
# Test: Single Query
# =============================================================================

query = "Should students be able to use AI assistance in their assignments?"

# One retrieval + one random draw + one LLM call.
result = engine.query(query, return_metadata=True)

print(f"Query: {query}\n")
print(f"Response: {result['response']}")
print()
print(f"[Confession #{result['confession_num']} | Retrieved {result['total_retrieved']} | Score: {result['similarity_score']:.3f}]")
print(f"\nSource confession:")
# Append the ellipsis only when the confession was actually truncated, matching
# how the other preview cells in this notebook behave.
source_text = result['sampled_confession']
print(source_text[:400] + "..." if len(source_text) > 400 else source_text)


NameError: name 'engine' is not defined

In [12]:
# =============================================================================
# Test: Multiple Samples (See the Distribution)
# =============================================================================

query = "Should students be able to use AI assistance in their assignments?"
n_samples = 5

print(f"Query: '{query}'")
print(f"Generating {n_samples} different perspectives...\n")

# Each pass re-runs retrieval and draws a fresh confession, so the voices
# printed below can differ from run to run.
banner = '=' * 70
for voice_number in range(1, n_samples + 1):
    result = engine.query(query, return_metadata=True)
    print(banner)
    print(f"VOICE {voice_number} (Confession #{result['confession_num']})")
    print(banner)
    print(result['response'])
    print()


Query: 'Should students be able to use AI assistance in their assignments?'
Generating 5 different perspectives...

VOICE 1 (Confession #75958)
Absolutely, I think students should be able to use AI assistance in their assignments. It’s like having a supercharged study buddy that can help clarify concepts and spark new ideas. But sometimes, I worry that we’re getting too obsessed with tech, like those wild discussions I hear about AI doing our most basic tasks. It makes me nostalgic for simpler times, like just farming apples and enjoying life without all this noise.

VOICE 2 (Confession #75837)
Absolutely, students should be able to use AI assistance in their assignments. It's a tool that can enhance our learning and help us tackle complex problems more efficiently. Just like we use calculators or programming software, AI can be an extension of our capabilities, not a replacement for our own thinking. Embracing these technologies prepares us for the real-world challenges we'll face aft

# Evaluation

Run N samples to generate output distributions for comparison against ground truth.


In [None]:
# =============================================================================
# Evaluation Functions
# =============================================================================

from collections import Counter

def run_evaluation(engine, query: str, n_samples: int = 50):
    """
    Issue the same query ``n_samples`` times and gather what was sampled.

    Every call goes through ``engine.query(query, return_metadata=True)``.

    Returns a dict with:
    - query: the query string that was asked
    - n_samples: the number of samples requested
    - responses: generated responses, in call order
    - confession_nums: confession number sampled on each call
    - confessions: source confession text sampled on each call
    """
    # Output list name -> key to read from each engine result.
    field_map = (
        ('responses', 'response'),
        ('confession_nums', 'confession_num'),
        ('confessions', 'sampled_confession'),
    )
    collected = {list_name: [] for list_name, _ in field_map}

    print(f"Running {n_samples} samples...")
    for _ in tqdm(range(n_samples)):
        outcome = engine.query(query, return_metadata=True)
        for list_name, result_key in field_map:
            collected[list_name].append(outcome[result_key])

    return {'query': query, 'n_samples': n_samples, **collected}

def print_eval_summary(results):
    """Print a human-readable digest of a ``run_evaluation`` result dict.

    Shows the query, the sample count, how many distinct confessions were
    drawn, the ten most frequently drawn confessions with their share of
    samples, and the first three responses (each capped at 300 characters).
    """
    tally = Counter(results['confession_nums'])

    print(f"\nQuery: '{results['query']}'")
    total = results['n_samples']
    print(f"Total samples: {total}")
    print(f"Unique confessions sampled: {len(tally)}")

    print("\nTop 10 most frequently sampled:")
    for number, hits in tally.most_common(10):
        share = hits / total * 100
        print(f"  #{number}: {hits}x ({share:.1f}%)")

    print("\nSample responses:")
    for position, text in enumerate(results['responses'][:3], 1):
        print(f"\n--- Response {position} ---")
        # Long responses are cut at 300 characters and marked with an ellipsis.
        if len(text) > 300:
            print(text[:300] + "...")
        else:
            print(text)

# Confirms this cell ran to completion, i.e. both helpers above are now defined.
print("✓ Evaluation functions defined")


In [None]:
# =============================================================================
# Run Evaluation (50 samples)
# =============================================================================
# ⚠️ This will make 50 LLM API calls

EVAL_QUERY = "Should students be able to use AI assistance in their assignments?"
N_SAMPLES = 50

# Explicit opt-in switch: leaving this False keeps "Run All" from spending
# N_SAMPLES LLM calls; set it to True to run the evaluation and save results.
RUN_EVALUATION = False
EVAL_RESULTS_FILE = 'eval_results_distributional.pkl'

if RUN_EVALUATION:
    eval_results = run_evaluation(engine, EVAL_QUERY, n_samples=N_SAMPLES)
    print_eval_summary(eval_results)

    # Save results. NOTE: pickle files should only ever be loaded back from a
    # trusted source, since unpickling can execute arbitrary code.
    import pickle
    with open(EVAL_RESULTS_FILE, 'wb') as f:
        pickle.dump(eval_results, f)
    print(f"\n✓ Results saved to {EVAL_RESULTS_FILE}")
else:
    print("Ready to run evaluation. Set RUN_EVALUATION = True above when ready.")
