In [5]:
# Install Required Packages
import sys
# Install all required packages - ALL FREE
!{sys.executable} -m pip install --quiet langchain faiss-cpu sentence-transformers transformers torch pydantic
!pip install -U langchain-community

print("✅ All packages installed successfully!")

Collecting langchain-community
  Downloading langchain_community-0.3.24-py3-none-any.whl.metadata (2.5 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB

In [6]:
# Upload  Document
from google.colab import files

# Upload your document file (boston.txt)
print("📁 Please upload your .txt file:")
uploaded = files.upload()

# Create docs directory and move uploaded files
!mkdir -p docs
for fn in uploaded.keys():
    !mv "{fn}" docs/

# Confirm files are in place
print("\n📋 Files in docs directory:")
!ls -lh docs

📁 Please upload your .txt file:


Saving boston_guide.txt to boston_guide.txt

📋 Files in docs directory:
total 4.0K
-rw-r--r-- 1 root root 2.4K May 28 18:54 boston_guide.txt


In [7]:
# Import Libraries and Setup
from pathlib import Path
from typing import List
import json
import re
from pydantic import BaseModel, Field

# LangChain imports - the core components for RAG
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
import transformers



In [8]:
# Define Output Schema (Pydantic Model)
# Creates the structured output format using Pydantic for OUTPUT PARSING
# REQUIREMENT: OUTPUT PARSING & FORMATTING
# Define the JSON output schema using Pydantic
#
# QAResponse is the structured result of one question/answer round. All four
# fields are required (`...` is passed as each Field's default).
# NOTE(review): documented with comments rather than a class docstring on
# purpose -- a docstring on a pydantic model is presumably copied into the
# generated JSON schema, which would change the format instructions derived
# from this class. Confirm before adding one.
class QAResponse(BaseModel):
    # The answer text shown to the user.
    answer: str = Field(..., description="The answer to the question")
    # Names of the documents the answer was drawn from.
    sources: List[str] = Field(..., description="Source document names")
    # Free-form string; parse_and_format_output() fills it with
    # "high", "medium" or "low".
    confidence: str = Field(..., description="Confidence level")
    # Word count of `answer`.
    word_count: int = Field(..., description="Number of words in answer")

# Build the parser that validates model output against QAResponse, and fetch
# the schema instructions that the structured prompt embeds.
parser = PydanticOutputParser(pydantic_object=QAResponse)
format_instructions = parser.get_format_instructions()

# Status banner followed by a 200-character peek at the instructions.
_instructions_preview = format_instructions[:200]
for _status_line in (
    "✅ Pydantic Model: QAResponse schema defined",
    "✅ Output Parser: PydanticOutputParser created",
    f"📋 Format Instructions Preview:\n{_instructions_preview}...",
):
    print(_status_line)

✅ Pydantic Model: QAResponse schema defined
✅ Output Parser: PydanticOutputParser created
📋 Format Instructions Preview:
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "ty...


In [9]:
# Input Parsing Functions
# Demonstrates INPUT PARSING with validation and cleaning functions
def validate_and_clean_input(question: str) -> str:
    """Validate a user question and normalise its whitespace and punctuation.

    Demonstrates INPUT PARSING: rejects empty input, trims the ends, collapses
    every run of whitespace to a single space, and appends a question mark
    when the text opens with a question word but does not already end in "?".

    Args:
        question: Raw question text as typed by the user.

    Returns:
        The cleaned question.

    Raises:
        ValueError: If ``question`` is empty, ``None`` or whitespace only.
    """
    print(f"📥 Raw Input: '{question}'")

    # Input validation
    if not question or not question.strip():
        raise ValueError("Empty question not allowed")

    # Clean input - trim, then collapse internal whitespace runs
    cleaned = re.sub(r'\s+', ' ', question.strip())

    # Add a question mark only when the FIRST WORD is a question word.
    # A plain prefix test would also fire on "However ...", "Whatever ..." or
    # "Howdy ...", turning statements into questions. "whom"/"whose" are
    # listed explicitly because the word boundary no longer lets "who" cover
    # them.
    opens_with_question_word = re.match(
        r"(?:what|how|when|where|why|who|whom|whose)\b", cleaned, re.IGNORECASE
    )
    if opens_with_question_word and not cleaned.endswith('?'):
        cleaned += '?'

    print(f"✅ Cleaned Input: '{cleaned}'")
    return cleaned

def parse_and_format_output(raw_answer: str, sources: List[str]) -> QAResponse:
    """Wrap a raw model answer in a validated QAResponse.

    Demonstrates OUTPUT PARSING - converting raw text to structured format.
    An empty, ``None`` or very short (< 3 characters after stripping) answer
    is replaced by a fixed "no information" message. Confidence is a simple
    heuristic on the answer's word count: more than 20 words is "high", more
    than 10 is "medium", anything else is "low".

    Args:
        raw_answer: Text produced by the language model (may be empty/None).
        sources: Names of the documents the answer was retrieved from.

    Returns:
        A QAResponse holding the stripped answer, the given sources, the
        heuristic confidence and the word count.
    """
    # `or ''` keeps a None answer from crashing the preview slice, so the
    # fallback just below is actually reachable for None.
    print(f"🔄 Processing Raw Output: '{(raw_answer or '')[:50]}...'")

    # Clean the output
    if not raw_answer or len(raw_answer.strip()) < 3:
        raw_answer = "No sufficient information found."

    cleaned_answer = raw_answer.strip()
    word_count = len(cleaned_answer.split())

    # Determine confidence based on answer length
    if word_count > 20:
        confidence = "high"
    elif word_count > 10:
        confidence = "medium"
    else:
        confidence = "low"

    # Create structured response using Pydantic
    structured_response = QAResponse(
        answer=cleaned_answer,
        sources=sources,
        confidence=confidence,
        word_count=word_count
    )

    print("✅ Structured Output Created using Pydantic")
    return structured_response

print("✅ Input Parser: validate_and_clean_input()")
print("✅ Output Parser: parse_and_format_output()")

✅ Input Parser: validate_and_clean_input()
✅ Output Parser: parse_and_format_output()


In [10]:
# Document Loading and Chunking
# First stage of RAG: read the source files, then cut them into small,
# slightly overlapping pieces for retrieval.
print("🔄 Starting Document Processing...")

# Pick up every .txt file under ./docs (the glob recurses) as UTF-8 text.
loader = DirectoryLoader(
    "./docs",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
)
docs = loader.load()
print(f"✅ Document Loading: {len(docs)} documents loaded")

# Chunking settings: up to 300 characters per chunk with a 30-character
# overlap; separators are tried in the order listed.
splitter_settings = {
    "separators": ["\n\n", "\n", " ", ""],
    "chunk_size": 300,
    "chunk_overlap": 30,
    "length_function": len,
}
splitter = RecursiveCharacterTextSplitter(**splitter_settings)
chunks = splitter.split_documents(docs)
print(f"✅ Text Chunking: {len(chunks)} chunks created")

# Show the first 100 characters of the first two chunks as a sanity check.
print("\n📄 Sample chunks:")
for chunk_number, chunk in enumerate(chunks[:2], start=1):
    chunk_preview = chunk.page_content[:100]
    print(f"Chunk {chunk_number}: {chunk_preview}...")

🔄 Starting Document Processing...
✅ Document Loading: 1 documents loaded
✅ Text Chunking: 10 chunks created

📄 Sample chunks:
Chunk 1: Boston: A Brief History & Visitor’s Guide

Boston, founded in 1630 by Puritan colonists from England...
Chunk 2: Key milestones:
- 1630: Settled by John Winthrop’s “Great Migration” fleet.
- 1692: Boston shaken by...


In [11]:
# Create Embeddings and Vector Store
# Embeds every chunk with a sentence-transformers model and indexes the
# vectors in FAISS so they can be searched by similarity.
print("🔄 Creating Vector Store...")

# FREE sentence-transformers embedding model
embedder = SentenceTransformerEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
print("✅ Embeddings: Using FREE sentence-transformers model")

# FAISS index built in-process from the chunks (no external API involved)
vectorstore = FAISS.from_documents(chunks, embedder)
print("✅ Vector Store: FAISS index created locally (FREE)")

# Smoke-test retrieval: fetch the two nearest chunks for a simple query and
# show the first 100 characters of each.
test_query = "Boston"
retrieved_docs = vectorstore.similarity_search(test_query, k=2)
print(f"\n🔍 Test Retrieval for '{test_query}':")
for doc_number, doc in enumerate(retrieved_docs, start=1):
    doc_preview = doc.page_content[:100]
    print(f"Document {doc_number}: {doc_preview}...")

🔄 Creating Vector Store...


  embedder = SentenceTransformerEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Embeddings: Using FREE sentence-transformers model
✅ Vector Store: FAISS index created locally (FREE)

🔍 Test Retrieval for 'Boston':
Document 1: Boston: A Brief History & Visitor’s Guide

Boston, founded in 1630 by Puritan colonists from England...
Document 2: Over nearly four centuries, Boston has grown from a small Puritan outpost into a global hub of educa...


In [12]:
# Setup Language Model
# What this does: Configures the FREE language model for text generation

print("🔄 Setting up Language Model...")

# Generation below samples (do_sample=True), so seed the random number
# generators first; otherwise Restart & Run All yields different answers on
# every run and the saved outputs cannot be reproduced.
SEED = 42
transformers.set_seed(SEED)

# Setup FREE Hugging Face language model
hf_pipe = transformers.pipeline(
    "text2text-generation",
    model="google/flan-t5-small",  # FREE model
    device=-1,  # CPU only (FREE)
    max_new_tokens=100,  # cap on generated answer length
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    pad_token_id=0,  # presumably the model's pad token id -- TODO confirm
    eos_token_id=1,  # presumably the model's end-of-sequence id -- TODO confirm
)
# Wrap the pipeline so LangChain chains can call it as an LLM.
llm = HuggingFacePipeline(pipeline=hf_pipe)
print("✅ LLM: Using FREE google/flan-t5-small model")

🔄 Setting up Language Model...


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use cpu


✅ LLM: Using FREE google/flan-t5-small model


  llm = HuggingFacePipeline(pipeline=hf_pipe)


In [13]:
# Create Prompt Templates
# Two templates over the same {context}/{question} inputs: a plain one, and
# one that additionally asks for JSON matching the parser's schema.

print("🔄 Creating Prompt Templates...")

# Each template is a list of paragraphs joined by a blank line.
_basic_template_text = "\n\n".join([
    "Answer the question using the provided context.",
    "Context: {context}",
    "Question: {question}",
    "Answer:",
])

_structured_template_text = "\n\n".join([
    "Answer the question using the context. Format your response as JSON.",
    "Context: {context}",
    "Question: {question}",
    "Required JSON format:\n{format_instructions}",
    "JSON:",
])

# Simple prompt for reliable text generation
basic_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=_basic_template_text,
)

# Structured prompt for JSON output; {format_instructions} is pre-filled here
# so callers only supply context and question.
structured_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=_structured_template_text,
    partial_variables={"format_instructions": format_instructions},
)

for _status_line in (
    "✅ Basic Prompt: Simple template for reliable answers",
    "✅ Structured Prompt: JSON template with format instructions",
):
    print(_status_line)

🔄 Creating Prompt Templates...
✅ Basic Prompt: Simple template for reliable answers
✅ Structured Prompt: JSON template with format instructions


In [14]:
# Create RAG Chains
# Two retrieval-QA chains that differ only in the prompt they use.

print("🔄 Building RAG Chains...")


def _make_retrieval_chain(prompt):
    """Build a "stuff" RetrievalQA chain over `vectorstore` using `prompt`.

    Each call creates its own top-2 retriever and asks the chain to return
    the retrieved source documents along with the answer.
    """
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(search_kwargs={"k": 2}),
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt},
    )


# Basic RAG chain for reliable text generation
basic_qa_chain = _make_retrieval_chain(basic_prompt)

# Structured RAG chain for JSON output attempts
structured_qa_chain = _make_retrieval_chain(structured_prompt)

print("✅ Basic RAG Chain: Reliable text generation pipeline")
print("✅ Structured RAG Chain: JSON-focused pipeline")
print("✅ Retrieval: Top-2 document retrieval configured")

🔄 Building RAG Chains...
✅ Basic RAG Chain: Reliable text generation pipeline
✅ Structured RAG Chain: JSON-focused pipeline
✅ Retrieval: Top-2 document retrieval configured


In [15]:
# Complete RAG Function
# What this does: The main function demonstrating all requirements together

def _unique_source_names(source_docs) -> List[str]:
    """Return each retrieved document's file name once, in retrieval order."""
    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # make the order of `sources` vary from run to run.
    return list(dict.fromkeys(Path(d.metadata["source"]).name for d in source_docs))


def _print_response_formats(structured_response: QAResponse) -> None:
    """Print a QAResponse as indented JSON, then as labelled plain text."""
    # Format 1: JSON. ensure_ascii=False keeps non-ASCII characters in the
    # answer (e.g. typographic quotes) readable instead of \uXXXX escapes.
    print("📄 JSON Format:")
    print(json.dumps(structured_response.model_dump(), indent=2, ensure_ascii=False))

    # Format 2: Human readable
    print("\n👤 Human-Readable Format:")
    print(f"Answer: {structured_response.answer}")
    print(f"Sources: {', '.join(structured_response.sources)}")
    print(f"Confidence: {structured_response.confidence}")
    print(f"Word Count: {structured_response.word_count}")


def run_complete_rag_demo(question: str) -> QAResponse:
    """
    Complete RAG pipeline demonstrating:
    1. Input parsing and validation
    2. RAG retrieval and generation
    3. Output parsing with Pydantic
    4. Multiple output formats

    The basic chain always produces the answer. The structured chain is then
    tried for a JSON answer that the Pydantic parser accepts; if that fails
    for any reason, the basic answer is wrapped by parse_and_format_output().

    Args:
        question: Raw user question.

    Returns:
        The structured QAResponse (also printed as JSON and as plain text).

    Raises:
        ValueError: Re-raised from validate_and_clean_input() when the
            question is empty.
    """

    print(f"🚀 COMPLETE RAG DEMONSTRATION: '{question}'")
    print("=" * 60)

    # STEP 1: INPUT PARSING
    print("STEP 1: INPUT PARSING & VALIDATION")
    try:
        cleaned_question = validate_and_clean_input(question)
    except ValueError as e:
        print(f"❌ Input validation failed: {e}")
        raise

    # STEP 2: RAG PROCESSING
    print("\nSTEP 2: RAG RETRIEVAL & GENERATION")
    inputs = {"query": cleaned_question}

    # Get basic answer first
    result = basic_qa_chain.invoke(inputs)
    raw_answer = result["result"].strip()
    source_docs = result["source_documents"]
    source_names = _unique_source_names(source_docs)

    print(f"✅ Retrieved {len(source_docs)} documents")
    print(f"✅ Generated Answer: '{raw_answer[:100]}...'")

    # STEP 3: TRY PYDANTIC PARSING (if possible)
    print("\nSTEP 3: OUTPUT PARSING ATTEMPTS")

    # Only attempt structured JSON when the basic answer looks usable.
    if raw_answer and len(raw_answer) > 5:
        try:
            print("🔄 Attempting structured JSON generation...")
            structured_result = structured_qa_chain.invoke(inputs)
            json_output = structured_result["result"].strip()

            if not json_output:
                raise ValueError("Empty JSON output")

            print(f"JSON Attempt: '{json_output[:100]}...'")
            # NOTE(review): on success the model's own sources, confidence and
            # word_count are returned as-is, without checking them against
            # the retrieved documents.
            structured_response = parser.parse(json_output)
            print("✅ SUCCESS: Pydantic JSON parsing worked!")

        except Exception as e:
            # Deliberately broad: any generation or parsing failure falls
            # back to the manually structured answer instead of aborting.
            print(f"⚠️ Structured parsing failed: {e}")
            print("🔧 Using manual structured output creation...")
            structured_response = parse_and_format_output(raw_answer, source_names)
    else:
        print("🔧 Using fallback structured output...")
        structured_response = parse_and_format_output(raw_answer, source_names)

    # STEP 4: OUTPUT FORMATTING
    print("\nSTEP 4: MULTIPLE OUTPUT FORMATS")
    _print_response_formats(structured_response)

    return structured_response

print("✅ Complete RAG function ready!")

✅ Complete RAG function ready!


In [16]:
# Test the Complete System
# What this does: Runs comprehensive tests showing all assignment requirements

print("🎯 TESTING ALL ASSIGNMENT REQUIREMENTS")
print("=" * 70)

# Each case is (question, expect_rejection). A case that is SUPPOSED to be
# rejected by input validation passes when ValueError is raised; counting
# that as a failure would under-report a correctly working pipeline.
test_cases = [
    ("How are summers in Boston?", False),
    ("  What is Boston known for?  ", False),  # Extra whitespace test
    ("Boston history", False),  # Short question
    ("", True),  # Empty input (error handling)
]
test_questions = [question for question, _ in test_cases]

successful_tests = 0
total_tests = len(test_cases)

for i, (test_q, expect_rejection) in enumerate(test_cases, 1):
    print(f"\n🧪 TEST {i}/{total_tests}: '{test_q}'")
    print("-" * 50)

    try:
        result = run_complete_rag_demo(test_q)
    except ValueError as e:
        if expect_rejection:
            print(f"✅ SUCCESS: Invalid input rejected as expected ({e})")
            successful_tests += 1
        else:
            print(f"❌ ERROR: {e}")
    except Exception as e:
        # Keep the loop going so one failing case does not hide the others.
        print(f"❌ ERROR: {e}")
    else:
        if expect_rejection:
            print("❌ ERROR: Invalid input was accepted")
        else:
            print(f"✅ SUCCESS: Generated structured {type(result).__name__}")
            successful_tests += 1

    print("=" * 70)

print(f"\n📊 RESULTS: {successful_tests}/{total_tests} tests passed")

🎯 TESTING ALL ASSIGNMENT REQUIREMENTS

🧪 TEST 1/4: 'How are summers in Boston?'
--------------------------------------------------
🚀 COMPLETE RAG DEMONSTRATION: 'How are summers in Boston?'
STEP 1: INPUT PARSING & VALIDATION
📥 Raw Input: 'How are summers in Boston?'
✅ Cleaned Input: 'How are summers in Boston?'

STEP 2: RAG RETRIEVAL & GENERATION


Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors


✅ Retrieved 2 documents
✅ Generated Answer: 'Over nearly four centuries, Boston has grown from a small Puritan outpost into a global hub of educa...'

STEP 3: OUTPUT PARSING ATTEMPTS
🔄 Attempting structured JSON generation...
⚠️ Structured parsing failed: Empty JSON output
🔧 Using manual structured output creation...
🔄 Processing Raw Output: 'Over nearly four centuries, Boston has grown from ...'
✅ Structured Output Created using Pydantic

STEP 4: MULTIPLE OUTPUT FORMATS
📄 JSON Format:
{
  "answer": "Over nearly four centuries, Boston has grown from a small Puritan outpost into a global hub of education, healthcare, finance, and innovation. ------",
  "sources": [
    "boston_guide.txt"
  ],
  "confidence": "high",
  "word_count": 23
}

👤 Human-Readable Format:
Answer: Over nearly four centuries, Boston has grown from a small Puritan outpost into a global hub of education, healthcare, finance, and innovation. ------
Sources: boston_guide.txt
Confidence: high
Word Count: 23
✅ SUCCESS: G