# Model Testing for Paper & CUDA RAG System

This notebook focuses on testing and evaluating our RAG system's response generation for both research paper understanding and CUDA implementation guidance.

In [1]:
# Import required libraries
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import pickle
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from typing import List, Dict

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


  from tqdm.autonotebook import tqdm, trange


In [None]:
## Load Existing Knowledge Base and Models
# Artifact location and model checkpoints, kept together so they are easy to change.
EMBEDDINGS_PATH = 'cuda_embeddings.pkl'
# Used to embed queries; presumably the stored embeddings were produced with this
# same model -- TODO confirm against the notebook that wrote the pickle.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
LLM_NAME = "google/flan-t5-small"

# Load embeddings and data.
# SECURITY: pickle.load can execute arbitrary code from the file; only load
# pickles that you generated yourself.
with open(EMBEDDINGS_PATH, 'rb') as f:
    df, embeddings = pickle.load(f)

# Retrieval maps an embedding position straight to df.iloc[position], so the two
# must line up one-to-one; fail loudly here rather than return wrong chunks later.
if len(df) != embeddings.shape[0]:
    raise ValueError(
        f"Knowledge base is inconsistent: {len(df)} documents but "
        f"{embeddings.shape[0]} embeddings"
    )

# Initialize models
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(LLM_NAME)
llm = AutoModelForSeq2SeqLM.from_pretrained(LLM_NAME)

print(f"Loaded {len(df)} documents")
print(f"Embedding shape: {embeddings.shape}")

In [None]:
# Define Test Scenarios
# Create a set of test queries that cover both paper comprehension and CUDA implementation aspects.
def evaluate_response(query: str, model, tokenizer, llm, df, embeddings, top_k=3, index=None) -> dict:
    """
    Test the full RAG pipeline (retrieve -> prompt -> generate) with a single query.

    Args:
        query: Natural-language question to answer.
        model: Embedding model exposing ``encode(list_of_str)``; embeds the query.
        tokenizer: Tokenizer paired with ``llm``; encodes the prompt and decodes
            the generated ids.
        llm: Generation model exposing ``generate(...)``.
        df: DataFrame with ``text`` and ``source`` columns. Row ``i`` must
            describe the vector stored at position ``i`` of the index.
        embeddings: 2-D array of document embeddings. Only read when ``index``
            is None, in which case a flat L2 FAISS index is built from it.
        top_k: Maximum number of chunks to retrieve. Values larger than
            ``len(df)`` are clamped; values <= 0 retrieve nothing.
        index: Optional prebuilt index exposing ``search(vectors, k)`` and
            returning ``(distances, indices)``. Pass one to avoid rebuilding
            the index on every call.

    Returns:
        dict with keys ``query``, ``response`` (generated text) and
        ``relevant_chunks`` (list of dicts with ``text``, ``source`` and
        ``similarity``), ordered as returned by the index.
    """
    # Setup FAISS index (only when the caller did not hand one in)
    if index is None:
        doc_vectors = np.ascontiguousarray(embeddings, dtype='float32')
        index = faiss.IndexFlatL2(doc_vectors.shape[1])
        index.add(doc_vectors)

    relevant_chunks = []
    # Never ask for more neighbours than there are documents.
    k = min(int(top_k), len(df))
    if k > 0:
        # Get query embedding
        query_embedding = model.encode([query])[0]
        query_matrix = np.array([query_embedding]).astype('float32')

        # Search
        distances, indices = index.search(query_matrix, k)

        # Get relevant chunks
        for dist, idx in zip(distances[0], indices[0]):
            # FAISS pads missing results with -1; df.iloc[-1] would silently
            # return the last row, so skip those slots.
            if idx < 0:
                continue
            row = df.iloc[int(idx)]
            relevant_chunks.append({
                'text': row['text'],
                'source': row['source'],
                # IndexFlatL2 reports squared L2 distance, so 1 - d/2 is the
                # cosine similarity when both vectors are unit-length --
                # assumed here; TODO confirm the stored embeddings are normalised.
                'similarity': 1 - dist / 2
            })

    # Prepare prompt (text is identical to the single-string version when nothing is cut)
    context = "\n\n".join([chunk['text'] for chunk in relevant_chunks])
    prompt_head = (
        "Use the following information to answer the question.\n"
        "    If you cannot answer based on the provided context, say so.\n"
        "    \n"
        "    Context:\n"
        "    "
    )
    prompt_tail = f"\n\n    Question: {query}\n\n    Answer:"

    # The encoder input is capped at max_input_tokens and truncation drops the
    # END of the text. Since the question sits after the context, a long context
    # used to cut the question off entirely. Shrink the context instead.
    max_input_tokens = 512
    token_slack = 4  # sub-word token counts are not exactly additive across pieces
    scaffold_len = len(tokenizer(prompt_head + prompt_tail)["input_ids"])
    context_budget = max_input_tokens - scaffold_len - token_slack
    if context_budget <= 0:
        # The instructions plus question already fill the window.
        context = ""
    elif context:
        context_ids = tokenizer(
            context,
            add_special_tokens=False,
            truncation=True,
            max_length=context_budget,
        )["input_ids"]
        # Only round-trip through decode when the budget was actually reached,
        # so short contexts are passed through untouched.
        if len(context_ids) >= context_budget:
            context = tokenizer.decode(context_ids, skip_special_tokens=True)
    prompt = prompt_head + context + prompt_tail

    # Generate response. truncation=True stays as a backstop for the cap above.
    inputs = tokenizer(prompt, return_tensors="pt", max_length=max_input_tokens, truncation=True)
    # Beam search without sampling: `temperature` has no effect unless
    # do_sample=True, so it is not passed.
    outputs = llm.generate(**inputs, max_length=200, num_beams=4)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return {
        'query': query,
        'response': response,
        'relevant_chunks': relevant_chunks
    }

# Queries exercised end-to-end against the RAG pipeline
test_queries = [
    "Explain how CUDA thread blocks work",
    "What are the main memory considerations in CUDA?",
    "How can I optimize matrix multiplication in CUDA?",
]

# For each query: show the generated answer, then the chunks that backed it
for query in test_queries:
    print(f"\nTesting query: {query}", "-" * 80, sep="\n")

    result = evaluate_response(query, model, tokenizer, llm, df, embeddings)

    print("\nResponse:", result['response'], sep="\n")

    print("\nRelevant chunks used:")
    for i, chunk in enumerate(result['relevant_chunks'], start=1):
        # Only the first 200 characters of each chunk are shown
        preview = chunk['text'][:200]
        print(f"\n{i}. Similarity: {chunk['similarity']:.3f}\nText: {preview}...")