## Write your name etc.

## CSC 380 HW\#4: LLM and RAG

Fall 2025 HW\#4 starter code

In [None]:
# -*- coding: utf-8 -*-
# Setup
# ----------------------------
!pip install -qU requests==2.32.4 chromadb langchain langchain-chroma langchain-huggingface langchain_openai langchain_community sentence-transformers tiktoken openai pypdf

In [None]:
# Imports
# ----------------------------
import os, re, shutil
import requests
import chromadb
from typing import List, Dict, Tuple
from IPython.display import Markdown, display

# LangChain imports
from langchain.schema import Document
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI, OpenAI
from langchain.prompts import PromptTemplate

In [None]:
# Load API keys from CoLab secret keys
from google.colab import userdata

# Read the three secrets by name from the Colab `userdata` store.
langchain_api = userdata.get('langchain_api')
openai_api = userdata.get('openai_api')
hf_token = userdata.get('HF_TOKEN')

# Export the secrets as environment variables for the rest of the notebook.
# NOTE(review): presumably the OpenAI / LangChain / Hugging Face clients pick
# these variables up automatically -- confirm the expected variable names.
os.environ["OPENAI_API_KEY"] = openai_api
os.environ["LANGCHAIN_API_KEY"] = langchain_api
os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token

## Part 0: Define Core Functions

In [None]:
# 1-1: Load and preprocess documents
# ----------------------------
def load_documents() -> List[Document]:
    """
    Load the three text documents named in `document_filenames`.

    Starter stub: until the TODO below is implemented, the returned list
    is empty.

    Returns:
        A list of Document objects (one or more per source file, depending
        on the loader used).
    """
    # (1) TODO: Implement this function
    # Load every listed document using appropriate LangChain document loaders
    # Return a list of Document objects
    # Base URL of the data folder; the sample run output downloads each file
    # from "<url>/<filename>".
    url = "https://raw.githubusercontent.com/ntomuro/CSC380/main/LLM_RAG/data"
    document_filenames = [
        "Human-Nutrition-2020-Edition-1598491699.txt",
        "dci190009_pdf.txt",
        "dci190014_pdf.txt"
    ]
    documents = []
    ## Continue the TODO: collect the raw documents in the variable 'documents'
    ## and return it.  You'll use LangChain's PyPDFLoader / TextLoader to do so.
    ## NOTE(review): all three file names end in .txt, so TextLoader is
    ## presumably the loader needed here -- confirm.






    return documents

def preprocess_documents(documents: List[Document]) -> List[Document]:
    """
    Split documents into chunks for vector storage.

    Starter stub: until the TODO below is implemented, this returns an
    empty list regardless of the input.

    Args:
        documents: Raw documents as returned by load_documents().

    Returns:
        The list of document chunks.
    """
    # (2) TODO: Implement text splitting
    # Use RecursiveCharacterTextSplitter with appropriate parameters
    # Consider what chunk_size and chunk_overlap work best for multi-document retrieval
    # Return list of document chunks







    # Placeholder for now; change to something appropriate based on your code.
    return []

# 1-2: Vector Store Population
# ----------------------------
def create_vector_store(documents: List[Document], embedding_model_name: str, client):
    """
    Create and return a retriever from documents using the specified embedding model.

    Args:
        documents: Document chunks to index.
        embedding_model_name: Name of the embedding model to use.
        client: Chroma client supplied by the caller; main() passes a
            chromadb.PersistentClient with a per-model path.

    Returns:
        A retriever over the populated vector store.
    """
    # (3) TODO: Implement vector store creation
    # Initialize embedding function with the specified model
    # Create Chroma vector store from documents
    # Return a retriever with search_kwargs={"k": 3}





    # NOTE: nothing in this starter assigns 'retriever'; the TODO above must
    # define it before this line can run.
    return retriever

# 1-3: Query & Evaluation Set
# ----------------------------
def create_test_queries() -> List[str]:
    """
    Create a diverse set of 6 test queries as specified in the assignment.

    Starter stub: the list is empty until the TODO below is filled in.

    Returns:
        The list of query strings.
    """
    # (4) TODO: Implement this function to return 6 well-designed test queries
    # You can hard-code your own queries, or automatically generate queries in some way.
    queries = [






    ]
    return queries

# 1-4: The RAG Chain
# ----------------------------

def run_rag_query(query: str, retriever) -> Tuple[str, List[Document]]:
    """
    Execute a RAG query: retrieve relevant context and generate an answer.

    Args:
        query: The user question.
        retriever: Retriever object as returned by create_vector_store().

    Returns:
        A tuple (answer, retrieved_docs): the generated answer and the
        retrieved documents, which are kept for evaluation.
    """
    # (5) TODO: Implement the RAG query execution
    # Retrieve relevant chunks using the retriever
    # Construct a high-quality prompt that includes context and query
    # Use an LLM chain ("gpt-3.5-turbo-instruct", with 'temperature=0') to
    # generate the answer.  Assign the answer and retrieved documents to
    # variables 'answer' and 'retrieved_docs', and return them.










    # NOTE: nothing in this starter assigns 'answer' or 'retrieved_docs';
    # the TODO above must define both before this line can run.
    return answer, retrieved_docs

# 1-5: Evaluation
# ----------------------------
def evaluate_results(query: str, retrieved_docs: List[Document], answer: str) -> Tuple[int, int]:
    """
    Manually evaluate the retrieval quality and answer quality.

    Prints the query, the number of retrieved documents and the answer,
    then asks the user for two scores on standard input. Each prompt is
    repeated until an integer from 1 to 3 is entered, so a typo does not
    abort the whole experiment run.

    Args:
        query: The query that was run.
        retrieved_docs: Documents retrieved for the query (only their count
            is displayed here).
        answer: The generated answer.

    Returns:
        (retrieval_score, answer_score), each an int in the range 1-3.
    """
    # (6) TODO: Manually evaluate and return scores
    # Code is written.  You only enter your manual evaluation scores during execution.

    def _ask_score(prompt: str) -> int:
        """Prompt until the user enters an integer between 1 and 3."""
        while True:
            raw = input(prompt)
            try:
                score = int(raw)
            except ValueError:
                print(f"Invalid score {raw!r}: please enter 1, 2 or 3.")
                continue
            if 1 <= score <= 3:
                return score
            print(f"Score {score} is out of range: please enter 1, 2 or 3.")

    print(f"Query: {query}")
    print(f"Retrieved {len(retrieved_docs)} documents")
    print(f"Answer: {answer}")
    print("---")

    retrieval_score = _ask_score("Retrieval score (1-3): ")
    answer_score = _ask_score("Answer score (1-3): ")

    return retrieval_score, answer_score

## **Experiment 1**: Sentence Transformers

#### This cell is mostly complete. You can make slight modifications to do more experiments, but do not make large changes.



In [None]:
# 2: The Main Experiment 1
# ----------------------------
def main():
    """
    Main function to run the complete RAG experiment.

    Loads and chunks the documents, then for each embedding model builds a
    retriever, runs every test query through the RAG chain, and records the
    manually entered retrieval and answer scores.

    Returns:
        A tuple (results, processed_documents, test_queries). `results` maps
        each model name to a dict with the keys 'retrieval_scores',
        'answer_scores', 'details', 'avg_retrieval' and 'avg_answer'.
    """
    def safe_name(s):
        """
        Convert a string to a valid file name by replacing every character
        other than letters, digits, '.', '_' and '-' with an underscore.
        """
        return re.sub(r'[^0-9A-Za-z._-]', '_', s)

    def average(scores):
        """
        Return the mean of scores, or NaN when the list is empty.

        An empty list occurs when no test queries are defined; dividing by
        its length would otherwise raise ZeroDivisionError.
        """
        return sum(scores) / len(scores) if scores else float("nan")

    # Define embedding models to test
    embedding_models = [
        "all-MiniLM-L6-v2",  # 384
        "multi-qa-MiniLM-L6-cos-v1" # 384
    ]

    # Load and preprocess documents
    print("Loading documents...")
    raw_documents = load_documents()
    print(f"Loaded {len(raw_documents)} raw documents")

    print("Preprocessing documents...")
    processed_documents = preprocess_documents(raw_documents)
    print(f"Created {len(processed_documents)} document chunks")

    # Create test queries
    test_queries = create_test_queries()
    print(f"Created {len(test_queries)} test queries")

    # Store results for analysis
    results = {}

    # Run experiment for each embedding model
    for model_name in embedding_models:
        print(f"\n{'='*50}")
        print(f"Testing model: {model_name}")
        print(f"{'='*50}")

        # Create a unique DB for the embedding model so that vectors produced
        # by different models never share a store.
        # NOTE(review): the directory persists between runs; whether a re-run
        # adds duplicate chunks depends on create_vector_store -- confirm.
        model_key = safe_name(model_name)
        db_path = f"./chroma_db_{model_key}"   # unique per model

        chroma_client = chromadb.PersistentClient(path=db_path)

        # Create vector store with current model
        retriever = create_vector_store(processed_documents, model_name, chroma_client)

        model_results = {
            'retrieval_scores': [],
            'answer_scores': [],
            'details': []
        }

        # Test each query (numbered from 1 for display)
        for query_number, query in enumerate(test_queries, start=1):
            print(f"\nQuery {query_number}: {query}")

            # Run RAG query
            answer, retrieved_docs = run_rag_query(query, retriever)

            # Evaluate results
            retrieval_score, answer_score = evaluate_results(query, retrieved_docs, answer)

            # Store scores
            model_results['retrieval_scores'].append(retrieval_score)
            model_results['answer_scores'].append(answer_score)
            model_results['details'].append({
                'query': query,
                'answer': answer,
                'retrieved_docs': retrieved_docs
            })

        # Calculate averages (NaN when there were no queries to score)
        model_results['avg_retrieval'] = average(model_results['retrieval_scores'])
        model_results['avg_answer'] = average(model_results['answer_scores'])

        results[model_name] = model_results

        print(f"\nModel {model_name} - Avg Retrieval: {model_results['avg_retrieval']:.2f}, Avg Answer: {model_results['avg_answer']:.2f}")

    return results, processed_documents, test_queries

### Experiment 1 (Run Experiment)

In [None]:
# Execute the experiment.
# main() returns (results, processed document chunks, test queries);
# final_results and queries are read again by the results-printing cell.
final_results, documents, queries = main()
print (final_results)

Loading documents...
Attempting to download https://raw.githubusercontent.com/ntomuro/CSC380/main/LLM_RAG/data/Human-Nutrition-2020-Edition-1598491699.txt to ./temp_docs/Human-Nutrition-2020-Edition-1598491699.txt...
Successfully downloaded https://raw.githubusercontent.com/ntomuro/CSC380/main/LLM_RAG/data/Human-Nutrition-2020-Edition-1598491699.txt
Successfully loaded ./temp_docs/Human-Nutrition-2020-Edition-1598491699.txt with LangChain loader
Attempting to download https://raw.githubusercontent.com/ntomuro/CSC380/main/LLM_RAG/data/dci190009_pdf.txt to ./temp_docs/dci190009_pdf.txt...
Successfully downloaded https://raw.githubusercontent.com/ntomuro/CSC380/main/LLM_RAG/data/dci190009_pdf.txt
Successfully loaded ./temp_docs/dci190009_pdf.txt with LangChain loader
Attempting to download https://raw.githubusercontent.com/ntomuro/CSC380/main/LLM_RAG/data/dci190014_pdf.txt to ./temp_docs/dci190014_pdf.txt...
Successfully downloaded https://raw.githubusercontent.com/ntomuro/CSC380/main/LLM

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Query 1: What is the pros and cons of taking in lots of protein if you are prediabetic?
Query: What is the pros and cons of taking in lots of protein if you are prediabetic?
Retrieved 3 documents
Answer: 
I'm sorry, I don't have enough information to answer that question. The context provided only mentions limited research on the impact of protein consumption on diabetes-related outcomes, but does not specifically address the pros and cons of high protein intake for prediabetic individuals. It would be best to consult a healthcare professional for personalized advice on this matter.
---
Retrieval score (1-3): 2
Answer score (1-3): 1

Query 2: What is the future vision for healthcare in the US? I'm concerned because I am 50 years old and my father was diabetic.
Query: What is the future vision for healthcare in the US? I'm concerned because I am 50 years old and my father was diabetic.
Retrieved 3 documents
Answer: 
I'm sorry, I don't have enough information to answer your question abo

### Experiment 1 (Print Results) -- Run this cell **after you finish writing your code**. Report the results table in your write-up!



In [None]:
# Analysis Helper Functions
# ----------------------------

def print_results_table(results: Dict):
    """
    Print the per-model summary table used in the report.

    Args:
        results: Mapping of model name to a dict holding at least the keys
            'avg_retrieval' and 'avg_answer' (numeric values).
    """
    double_rule = "=" * 60
    header_lines = (
        "\n" + double_rule,
        "EXPERIMENT RESULTS SUMMARY",
        double_rule,
        f"{'Model':<25} {'Avg Retrieval':<15} {'Avg Answer':<15}",
        "-" * 60,
    )
    for line in header_lines:
        print(line)

    # One row per model, printed in the mapping's iteration order.
    for name in results:
        summary = results[name]
        avg_retrieval = summary['avg_retrieval']
        avg_answer = summary['avg_answer']
        print(f"{name:<25} {avg_retrieval:<15.2f} {avg_answer:<15.2f}")

def compare_retrieval_for_query(results: Dict, query_index: int):
    """
    Print every model's retrieval and answer score for a single query.

    Args:
        results: Mapping of model name to a dict holding the lists
            'retrieval_scores' and 'answer_scores'.
        query_index: Zero-based position of the query in those lists; it is
            displayed one-based.
    """
    query_number = query_index + 1
    print(f"\nRetrieval comparison for Query {query_number}:")
    for name in results:
        per_model = results[name]
        print(
            f"  {name}: Retrieval={per_model['retrieval_scores'][query_index]}, "
            f"Answer={per_model['answer_scores'][query_index]}"
        )

# Print the final results
# ----------------------------
print_results_table(final_results)
# Walk the queries by position. Looking the position up with
# queries.index(query) returns the first match, so a query that appears twice
# would be reported under the wrong number (and each lookup rescans the list).
for query_index, _ in enumerate(queries):
    compare_retrieval_for_query(final_results, query_index)


EXPERIMENT RESULTS SUMMARY
Model                     Avg Retrieval   Avg Answer     
------------------------------------------------------------
all-MiniLM-L6-v2          1.50            1.75           
multi-qa-MiniLM-L6-cos-v1 2.25            1.25           

Retrieval comparison for Query 1:
  all-MiniLM-L6-v2: Retrieval=2, Answer=2
  multi-qa-MiniLM-L6-cos-v1: Retrieval=2, Answer=1

Retrieval comparison for Query 2:
  all-MiniLM-L6-v2: Retrieval=1, Answer=1
  multi-qa-MiniLM-L6-cos-v1: Retrieval=2, Answer=1

Retrieval comparison for Query 3:
  all-MiniLM-L6-v2: Retrieval=1, Answer=1
  multi-qa-MiniLM-L6-cos-v1: Retrieval=2, Answer=2

Retrieval comparison for Query 4:
  all-MiniLM-L6-v2: Retrieval=2, Answer=3
  multi-qa-MiniLM-L6-cos-v1: Retrieval=3, Answer=1


## **Experiment 2**: Temperature parameter

In [None]:
"""You run just answerable queries (three of them from above).
For each query, try three temperature settings: one low and one high value of your choice, plus 0.5."""

##
## IMPLEMENT YOUR OWN
##


## **Experiment 3**: LLM as Judge

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Load Mistral 7B model from Hugging Face -- the Judge (independent from RAG-LLM)!
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): device_map="auto" presumably lets the library choose where to
# place the weights; no dtype is given, so check that the model fits in the
# memory of the runtime you are using -- confirm.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

# Create a text generation pipeline that reuses the model and tokenizer loaded above
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

##
## YOU FILL THE REST
##
