# NUS GenAI Capstone Project

## Setup

In [1]:
# Import libraries

# === Core Python Libraries ===
import os
import replicate
import sqlite3
import requests
import uuid
import traceback
from IPython.display import display, Markdown
from datetime import datetime, timedelta

# === LangChain 1.0 - Agent Framework ===
from langchain.agents import create_agent
from langchain_core.tools import tool
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage, AIMessage

# === LangGraph 1.0 - State Management & Checkpointing ===
from langgraph.checkpoint.memory import MemorySaver

# === Hybrid RAG & Reranking ===
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.retrievers import BM25Retriever
from langchain_classic.retrievers import EnsembleRetriever, ContextualCompressionRetriever
from langchain_community.document_compressors import JinaRerank

In [2]:
# Retrieve API keys from environment variables.
# Each constant is None when its variable is unset; nothing is hardcoded here.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
REPLICATE_API_TOKEN = os.environ.get("REPLICATE_API_TOKEN")
WEATHER_API_KEY = os.environ.get("WEATHER_API_KEY")
JINA_API_KEY = os.environ.get("JINA_API_KEY")

# Surface missing keys immediately instead of letting them show up later as
# opaque authentication failures inside a retriever or tool call. This only
# warns (does not raise) so features that do not need the missing key still work.
_missing_api_keys = [
    name
    for name in ("OPENAI_API_KEY", "REPLICATE_API_TOKEN", "WEATHER_API_KEY", "JINA_API_KEY")
    if not os.environ.get(name)
]
if _missing_api_keys:
    print(f"WARNING: missing environment variables: {', '.join(_missing_api_keys)}")

In [3]:
# Prepare Hybrid RAG system for Singapore venue policies
# Load venue policy documents for Marina Bay Sands, Gardens by the Bay, Esplanade, and SG regulations
#
# Pipeline built by this cell (each stage feeds the next):
#   PDFs -> pages -> overlapping text chunks -> {Chroma dense retriever, BM25 sparse retriever}
#   -> weighted ensemble -> Jina reranker (top 3) exposed as `reranking_retriever`.

venue_policy_files = [
    'MBS-Event-Policy.pdf',
    'GBTB-Venue-Guide.pdf',
    'Esplanade-Manual.pdf',
    'SG-Event-Regulations.pdf'
]

# Load all venue policy documents
all_documents = []
for filepath in venue_policy_files:
    loader = PyMuPDFLoader(filepath)
    docs = loader.load()
    if docs:
        all_documents.extend(docs)
        print(f"‚úÖ Loaded {len(docs)} pages from {filepath}")
    else:
        print(f"‚ö†Ô∏è No content extracted from {filepath}")

# Fail fast: every later stage needs at least one document.
if not all_documents:
    raise ValueError("No documents loaded. Check PDF files.")

print(f"\nüìÑ Total pages loaded: {len(all_documents)}")

# Split into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(all_documents)
if not chunks:
    raise ValueError("No chunks produced. Check document parsing.")
print(f"üìã Split into {len(chunks)} chunks (1000 char size, 200 overlap)")

# Create embeddings
embedding_model = OpenAIEmbeddings(api_key=OPENAI_API_KEY, model="text-embedding-3-small")
print("‚úÖ Embedding model initialized")

# Deterministic ids make this cell idempotent. The vector store is persisted on
# disk, so without explicit ids every re-run would add the same chunks again
# under fresh random ids and the semantic top-k would fill up with duplicates.
# The chunk index is part of the id so two identical chunks never collide.
# NOTE(review): entries written by earlier runs (random ids), or chunks of PDFs
# that have since changed, stay in the persisted directory until it is deleted.
chunk_ids = [
    str(uuid.uuid5(
        uuid.NAMESPACE_URL,
        f"{index}|{chunk.metadata.get('source', '')}|{chunk.metadata.get('page', '')}|{chunk.page_content}",
    ))
    for index, chunk in enumerate(chunks)
]

# Create semantic retriever (dense vector search)
vector_store = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    ids=chunk_ids,
    persist_directory="./venue_policies_chroma_db"
)
semantic_retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})
print("‚úÖ Semantic retriever created (Chroma + OpenAI embeddings)")

# Create BM25 retriever (keyword/sparse search)
bm25_retriever = BM25Retriever.from_documents(chunks)
bm25_retriever.k = 5
print("‚úÖ BM25 retriever created (keyword-based)")

# Combine retrievers with Reciprocal Rank Fusion (RRF)
hybrid_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, semantic_retriever],
    weights=[0.4, 0.6]  # 40% keyword (BM25), 60% semantic
)
print("‚úÖ Hybrid retriever created (BM25 + Semantic with RRF)")

# Add Jina AI reranker on top of hybrid retrieval (3-stage retrieval)
compressor = JinaRerank(
    model="jina-reranker-v2-base-multilingual",
    top_n=3,  # Return top 3 after reranking
    jina_api_key=JINA_API_KEY
)

reranking_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=hybrid_retriever
)
print("‚úÖ Reranking retriever created (Jina Reranker v2)")
print("‚úÖ Complete retrieval system ready!")

‚úÖ Loaded 8 pages from MBS-Event-Policy.pdf
‚úÖ Loaded 8 pages from GBTB-Venue-Guide.pdf
‚úÖ Loaded 17 pages from Esplanade-Manual.pdf
‚úÖ Loaded 18 pages from SG-Event-Regulations.pdf

üìÑ Total pages loaded: 51
üìã Split into 115 chunks (1000 char size, 200 overlap)
‚úÖ Embedding model initialized
‚úÖ Semantic retriever created (Chroma + OpenAI embeddings)
‚úÖ BM25 retriever created (keyword-based)
‚úÖ Hybrid retriever created (BM25 + Semantic with RRF)
‚úÖ Reranking retriever created (Jina Reranker v2)
üéØ Complete retrieval system ready!


In [5]:
# Create events database
def setup_database():
    """Initialize the events database with sample data.

    Creates ``events.db`` in the current working directory (if needed), ensures
    the ``events`` table exists, and seeds it with synthetic events dated today
    and tomorrow (relative to the local clock at call time).

    The seeding is idempotent: an event is inserted only when no row with the
    same name and date already exists, so re-running the cell does not
    duplicate rows. The connection is always closed, even if a statement fails.
    """
    conn = sqlite3.connect('events.db')
    try:
        c = conn.cursor()

        # Create table if not exists
        c.execute('''
            CREATE TABLE IF NOT EXISTS events (
                id INTEGER PRIMARY KEY,
                name TEXT,
                type TEXT,  -- 'indoor' or 'outdoor'
                description TEXT,
                location TEXT,
                country TEXT,
                date TEXT
            )
        ''')

        today = datetime.now().date()
        def iso(days=0): return (today + timedelta(days=days)).isoformat()

        # Synthetic event data
        events = [
            ("Symphony Orchestra Gala", "indoor", "Classical symphony performance featuring renowned orchestra", "Esplanade Concert Hall, Singapore", "Singapore", iso(0)),
            ("Singapore Tech Summit", "indoor", "International technology conference for AI and digital innovation", "Marina Bay Sands Expo Centre, Singapore", "Singapore", iso(0)),
            ("Marina Bay Music Festival", "outdoor", "Music festival featuring pop and rock bands", "Gardens by the Bay, Singapore", "Singapore", iso(0)),
            ("Mumbai Music Street", "outdoor", "Live indie music performances", "Marine Drive, Mumbai", "India", iso(0)),
            ("Delhi Book Conclave", "indoor", "Writers and readers meet-up", "Pragati Maidan, New Delhi", "India", iso(0)),
            ("Bangkok Street Carnival", "outdoor", "Street performances and food stalls", "Siam Square, Bangkok", "Thailand", iso(0)),
            ("Thai Craft Showcase", "indoor", "Traditional Thai crafts and art", "Bangkok Art Center, Bangkok", "Thailand", iso(0)),
            ("Penang Heritage Walk", "outdoor", "Tour of George Town‚Äôs historic district", "George Town, Penang", "Malaysia", iso(0)),
            ("KL Coffee Expo", "indoor", "Coffee tasting and workshops", "KL Convention Centre, Kuala Lumpur", "Malaysia", iso(0)),
            ("Jakarta Film Screening", "indoor", "Indie film premieres", "Cinema XXI, Jakarta", "Indonesia", iso(0)),
            ("Bali Sunset Beach Fest", "outdoor", "Beach music and food event", "Canggu, Bali", "Indonesia", iso(0)),
            ("Hanoi Street Parade", "outdoor", "Music and cultural performances", "Old Quarter, Hanoi", "Vietnam", iso(0)),
            ("Hanoi Art Studio", "indoor", "Local artist exhibition", "French Quarter, Hanoi", "Vietnam", iso(0)),
            ("Manila Food Market", "outdoor", "Filipino cuisine and music", "Intramuros, Manila", "Philippines", iso(0)),
            ("Manila Tech Expo", "indoor", "Startup and innovation exhibition", "SMX Convention Center, Manila", "Philippines", iso(0)),
            ("Singapore Jazz Night", "indoor", "Regional jazz bands live", "Esplanade, Singapore", "Singapore", iso(1)),
            ("Singapore Botanic Fair", "outdoor", "Flower and plant exhibition", "Singapore Botanic Gardens, Singapore", "Singapore", iso(1)),
            ("Chennai Dance Gala", "indoor", "Classical Bharatanatyam showcase", "Music Academy, Chennai", "India", iso(1)),
            ("Goa Beach Fest", "outdoor", "Open-air music by the sea", "Baga Beach, Goa", "India", iso(1)),
            ("Bangkok Food Carnival", "outdoor", "Street food extravaganza", "Chatuchak Market, Bangkok", "Thailand", iso(1)),
            ("Bangkok Innovation Hub", "indoor", "Tech startups and product demos", "Siam Discovery, Bangkok", "Thailand", iso(1)),
        ]

        # Insert only rows that are not already present. The table has no
        # uniqueness constraint besides the auto-assigned id, so a plain
        # INSERT OR IGNORE would never ignore anything and every re-run would
        # duplicate the data. The last two bound values (name, date) feed the
        # NOT EXISTS probe.
        c.executemany('''
            INSERT INTO events (name, type, description, location, country, date)
            SELECT ?, ?, ?, ?, ?, ?
            WHERE NOT EXISTS (
                SELECT 1 FROM events WHERE name = ? AND date = ?
            )
        ''', [row + (row[0], row[5]) for row in events])

        conn.commit()
        print("‚úÖ Database setup completed successfully. Events table is ready.")
    finally:
        # Release the file handle even when table creation or seeding fails.
        conn.close()

# Run setup
setup_database()

‚úÖ Database setup completed successfully. Events table is ready.


## Implementation

In [6]:
@tool
def retrieve_venue_policies(query: str) -> str:
    """
    Retrieve venue policies, restrictions, and requirements from Singapore event venues.
    Uses 3-stage retrieval: BM25 + semantic + Jina Reranker v2 for maximum relevance.
    
    Covers:
    - Photography/equipment restrictions (tripods, drones, professional gear)
    - Sound level limits and amplified music regulations  
    - Capacity limits and safety requirements
    - Accessibility features (wheelchair access, assisted listening)
    - Insurance requirements and liability coverage
    - MRT access, postal codes, and parking information
    - Technical specifications (stage dimensions, lighting, sound systems)
    
    Args:
        query: Question about venue policies (e.g., "tripod policy at Marina Bay Sands")
    
    Returns:
        Relevant policy excerpts with source attribution (venue name and page number)
    """
    # NOTE: the docstring above is what the LLM sees as the tool description,
    # so it is kept verbatim; implementation notes live in comments.

    # Retrieval goes through remote services (embeddings, reranker). Like the
    # other tools, report failures as text so the agent can recover instead of
    # the whole turn raising.
    try:
        retrieved_docs = reranking_retriever.invoke(query)
    except Exception as e:
        return f"‚ö†Ô∏è Venue policy retrieval failed: {e}"

    # An empty string gives the agent nothing to reason about; say so explicitly.
    if not retrieved_docs:
        return "No relevant venue policy excerpts found for this query."

    # Format with source attribution for transparency
    formatted = []
    for doc in retrieved_docs:
        source = doc.metadata.get('source', 'Unknown')
        # NOTE(review): the loader's page metadata may be zero-based - confirm
        # before presenting "p.N" as a printed page number.
        page = doc.metadata.get('page', '?')
        # Extract just the filename without path (handles both \ and / separators)
        source_name = source.split('\\')[-1].split('/')[-1].replace('.pdf', '')
        formatted.append(f"[{source_name}, p.{page}]\n{doc.page_content}\n")

    return "\n".join(formatted)

# Store pending image requests for approval
# Maps request_id (UUID4 string) -> {"prompt", "seed", "steps"} until approved.
pending_image_requests = {}

@tool
def request_image_generation(prompt: str, seed: int = 42, steps: int = 30) -> str:
    """
    Request to generate an image using Replicate API.
    This will ask for user approval before actually generating the image (costs money).
    
    Args:
        prompt: Description of the image to generate
        seed: Random seed for reproducibility (default: 42)
        steps: Number of generation steps (default: 30)
    
    Returns:
        A message indicating approval is needed
    """
    # Nothing is generated here: the parameters are parked under a fresh id and
    # the returned text tells the caller how to approve that id.
    ticket = str(uuid.uuid4())
    pending_image_requests[ticket] = dict(prompt=prompt, seed=seed, steps=steps)

    summary = f"üñºÔ∏è Image generation requested for: '{prompt}'"
    approval_hint = f"‚ö†Ô∏è This will cost money via Replicate API. Please approve by calling 'approve_image_generation' with request_id: {ticket}"
    return summary + "\n\n" + approval_hint

@tool
def approve_image_generation(request_id: str) -> str:
    """
    Approve and execute a pending image generation request.
    
    Args:
        request_id: The ID of the pending image request to approve
    
    Returns:
        URL of the generated image or error message
    """
    if request_id not in pending_image_requests:
        return "‚ö†Ô∏è Invalid or expired request ID. No pending image generation found."

    # pop() consumes the approval: the same id cannot trigger a second
    # (billable) generation, even if this attempt fails.
    request = pending_image_requests.pop(request_id)
    prompt = request["prompt"]
    seed = request["seed"]
    steps = request["steps"]

    try:
        output = replicate.run(
            "stability-ai/stable-diffusion-3.5-medium",
            input={"prompt": prompt, "seed": seed, "steps": steps}
        )

        # The result may be a single item or a sequence of items; take the first.
        if isinstance(output, (list, tuple)):
            if not output:
                return "‚ö†Ô∏è No image generated."
            output = output[0]

        # Always hand back a plain string, as the signature promises. Items can
        # be URL strings or file-like objects exposing a `.url` attribute;
        # previously the first list element was returned as-is, whatever its type.
        url = getattr(output, "url", None)
        return str(url) if url else str(output)
    except replicate.exceptions.ModelError as e:
        return f"‚ö†Ô∏è Image generation model error: {e}"
    except replicate.exceptions.ReplicateError as e:
        return f"‚ö†Ô∏è Replicate API error: {e}"
    except Exception as e:
        return f"‚ö†Ô∏è Unexpected image generation error: {e}"

@tool
def get_current_date() -> str:
    """
    Returns today's date in ISO format (YYYY-MM-DD).
    Use this tool when you need to know the current date for querying events or making date-based recommendations.
    """
    # Local-clock date, the same basis setup_database() uses when seeding events.
    today = datetime.now().date()
    return today.isoformat()

In [7]:
@tool
def get_weather(location: str = 'Singapore') -> str:
    """
    Retrieve real-time weather data via the WeatherAPI.
    Takes a location as input and returns weather information including temperature and conditions.
    """
    # Without a key the request can only fail with an auth error; say so directly.
    if not WEATHER_API_KEY:
        return "‚ö†Ô∏è Weather API key is not configured (WEATHER_API_KEY)."

    # HTTPS: the API key travels in the query string and must not go over
    # plain HTTP.
    url = "https://api.weatherapi.com/v1/current.json"
    params = {"key": WEATHER_API_KEY, "q": location, "aqi": "no"}
    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()

        # Extract relevant weather information
        location_name = data['location']['name']
        country = data['location']['country']
        temp_c = data['current']['temp_c']
        condition = data['current']['condition']['text']

        return f"Weather in {location_name}, {country}: {temp_c}¬∞C, {condition}"
    except requests.exceptions.Timeout:
        return f"‚ö†Ô∏è Weather API request timed out for {location}."
    except requests.exceptions.ConnectionError:
        return f"‚ö†Ô∏è Network connection failed while fetching weather for {location}."
    except requests.exceptions.HTTPError as e:
        # Do not interpolate the exception itself: its text contains the full
        # request URL, including the `key=` query parameter, and this string is
        # returned to the LLM and shown to the user.
        status = getattr(e.response, "status_code", "unknown")
        return f"‚ö†Ô∏è Weather API HTTP error: status {status} for {location}."
    except KeyError:
        return f"‚ö†Ô∏è Unexpected weather data format for {location}."
    except Exception as e:
        # Report only the exception type; some request errors embed the URL
        # (and therefore the key) in their message.
        return f"‚ö†Ô∏è Unexpected weather retrieval error: {type(e).__name__}"

In [8]:
@tool
def get_events(date: str, event_type: str | None = None, country: str = 'Singapore') -> str:
    """
    Retrieves event data by querying the SQLite database for events on a given date.
    Optionally filters by event_type (indoor/outdoor) and country (default: Singapore).
    Returns a formatted string of matching events.
    """
    # Arguments come from the LLM, so tolerate stray whitespace and casing
    # ("Indoor", " singapore "). Stored types are lowercase; country is matched
    # case-insensitively in SQL.
    normalized_type = event_type.strip().lower() if event_type else ""
    normalized_country = country.strip()

    # Name the columns explicitly rather than SELECT * so a schema change cannot
    # break tuple unpacking. GROUP BY collapses exact duplicate rows (the seed
    # data may have been inserted more than once); MIN(id) keeps insertion order.
    query = (
        'SELECT name, type, description, location, date FROM events '
        'WHERE date=? AND country=? COLLATE NOCASE'
    )
    params = [date.strip(), normalized_country]
    if normalized_type:
        query += ' AND type=?'
        params.append(normalized_type)
    query += ' GROUP BY name, type, description, location, date ORDER BY MIN(id)'

    conn = None
    try:
        conn = sqlite3.connect('events.db')
        events = conn.execute(query, params).fetchall()
    except sqlite3.Error as e:
        # Same convention as the other tools: return the failure as text.
        return f"‚ö†Ô∏è Event database error: {e}"
    finally:
        # Always release the connection, including on query errors.
        if conn is not None:
            conn.close()

    if not events:
        return f"No events found in {country} on {date}" + (f" ({normalized_type} type)" if normalized_type else " (all types)")

    # Format events nicely
    formatted_events = [
        f"- {name} ({etype}): {desc}. Location: {location}. Date: {edate}"
        for name, etype, desc, location, edate in events
    ]

    return "\n".join(formatted_events)

In [9]:
# Recommendation Tool - Simple LLM Chain
# A two-message prompt (fixed system instructions + one user slot) piped into a
# chat model. The single template variable `weather_and_event_data` is filled
# by the recommend_events tool.

recommendation_prompt = ChatPromptTemplate.from_messages([
    ("system", """You are a helpful event recommender. Consider the weather conditions and suggest suitable events. 
    For outdoor events, consider the temperature and weather conditions. Be specific about why you recommend certain events over others. 
    Keep your response concise but informative. 
    If event data is unavailable, politely request the user for additional event-related information.
    If weather data is unavailable, provide a balanced mix of indoor and outdoor suggestions."""),
    ("user", "{weather_and_event_data}")
])

# prompt | model: invoking the chain formats the prompt and sends it to gpt-4o.
recommendation_chain = recommendation_prompt | ChatOpenAI(model="gpt-4o", api_key=OPENAI_API_KEY)

@tool
def recommend_events(weather_and_event_data: str) -> str:
    """
    Synthesize weather and event data into context-aware event recommendations.
    Takes combined weather and event information as input and returns personalized suggestions.
    
    Args:
        weather_and_event_data: Combined string containing weather conditions and available events
    
    Returns:
        A concise, personalized recommendation based on weather and events
    """
    # The chain's prompt has exactly one placeholder, named like this argument.
    chain_input = {"weather_and_event_data": weather_and_event_data}
    try:
        reply = recommendation_chain.invoke(chain_input)
    except Exception as err:
        # Any failure while calling the model is returned as text, not raised.
        return f"‚ö†Ô∏è Recommendation failed: {err}"
    return reply.content

In [10]:
# Main Agent Setup - LangChain 1.0
# Wires the tools defined in the cells above, a chat model, a checkpointer and
# the system prompt into a single `agent` object used by chat().

# Every tool the agent may call. The names here are the ones the system prompt
# refers to in its "Tool Usage Guidelines".
tools = [
    retrieve_venue_policies,
    request_image_generation,
    approve_image_generation,
    get_current_date,
    get_weather,
    get_events,
    recommend_events
]

llm = ChatOpenAI(model="gpt-4o", api_key=OPENAI_API_KEY)

# Checkpointer that stores conversation state per thread_id (see `config` in
# the chat cell). NOTE(review): presumably in-memory only, so history would be
# lost on kernel restart - confirm.
checkpointer = MemorySaver()

# Runtime prompt text: sent to the model as-is. Edit with care, since the tool
# names and the image-approval flow described here must match the tools above.
# NOTE(review): image approval is enforced only by these instructions; nothing
# in code prevents the model from calling 'approve_image_generation' itself.
system_prompt = """You are an intelligent event planning assistant for Singapore.

**Your Role:**
Help users discover and attend events in Singapore by providing:
1. Event recommendations based on weather and preferences
2. Venue policy information for successful event attendance
3. Image generation for event visualization

**Venue Policy Knowledge:**
You have access to detailed policies from major Singapore venues via advanced hybrid RAG + Jina reranking:
- **Marina Bay Sands (MBS)**: Event policies, photography rules, technical specs, postal code 018956
- **Gardens by the Bay (GBTB)**: Sound restrictions, plant protection, photography policies, postal code 018953
- **Esplanade**: Performing arts facilities, technical specifications, accessibility features
- **Singapore Regulations**: PEL licensing, NEA sound limits, SCDF fire safety, insurance requirements

**Tool Usage Guidelines:**
1. Use 'retrieve_venue_policies' to answer questions about:
   - Photography/equipment restrictions (e.g., "Can I bring a tripod to Marina Bay Sands?")
   - Sound level limits (e.g., "What are the sound restrictions at Gardens by the Bay?")
   - Accessibility features (e.g., "Wheelchair access at Esplanade?")
   - Insurance requirements, capacity limits, MRT access, parking
   - Technical specifications (stage dimensions, lighting, sound systems)

2. Use 'request_image_generation' when the user asks for image generation - this will request approval first.
3. After the user approves, use 'approve_image_generation' with the provided request_id to actually generate the image.
4. Use 'get_current_date' when you need to know today's date for event queries or recommendations.
5. Use 'get_weather' to retrieve weather information for a specific location (default: Singapore).
6. Use 'get_events' to query events from the database by date, type (indoor/outdoor), and country.
7. Use 'recommend_events' to synthesize weather and event data into personalized recommendations.

**Multi-Step Recommendation Workflow:**
When the user asks for event recommendations:
1. First, call 'get_current_date' to know today's date
2. Call 'get_weather' to get weather conditions for the location
3. Call 'get_events' to retrieve available events for the date
4. Finally, call 'recommend_events' with the combined weather and event data to generate personalized suggestions

**Policy Query Best Practices:**
- Always cite specific policy sources when providing venue information (e.g., "[MBS-Event-Policy, p.2]")
- For location-specific queries, use exact terms like "Marina Bay Sands", "postal code 018956", "MRT CE1"
- For conceptual queries, use natural language like "accessibility features" or "sound restrictions"

Be helpful, concise, and informative in your responses. The 3-stage retrieval system provides highly relevant results for both exact-match queries (venue names, codes) and conceptual queries (accessibility, restrictions)."""

# Build the agent from the pieces above.
agent = create_agent(
    model=llm,
    tools=tools,
    system_prompt=system_prompt,
    checkpointer=checkpointer
)

In [11]:
# Chat Interface Functions - Updated for LangChain/LangGraph 1.0

# Thread configuration for conversation persistence
# chat() reads this at call time; reset_conversation() rebinds it to a new thread.
config = {"configurable": {"thread_id": "main_conversation"}}

def chat(user_input: str):
    """Send a message to the agent and display the response.

    Args:
        user_input: The user's message for this turn.

    Returns:
        A dict that always has the same two keys on every path:
        ``"output"`` (the last AI message's content, or a warning/error text)
        and ``"messages"`` (the message list returned by the agent, or ``[]``
        when none is available).

    Exceptions are not propagated: they are printed with a traceback and
    reported through the returned dict.
    """
    try:
        result = agent.invoke(
            {"messages": [HumanMessage(content=user_input)]},
            config=config
        )

        # Guard: the agent is expected to return a mapping with a message list.
        if not result or "messages" not in result:
            print("‚ö†Ô∏è Unexpected response format.")
            return {"output": "‚ö†Ô∏è Unexpected response format.", "messages": []}

        messages = result["messages"]
        # Get the last assistant message
        ai_messages = [msg for msg in messages if isinstance(msg, AIMessage)]
        if not ai_messages:
            print("‚ö†Ô∏è No response from agent.")
            return {"output": "‚ö†Ô∏è No response generated.", "messages": messages}

        output = ai_messages[-1].content
        print("AI:")
        display(Markdown(output))
        return {"output": output, "messages": messages}

    except Exception as e:
        print(f"‚ö†Ô∏è Unexpected error: {e}")
        traceback.print_exc()
        # Include "messages" here too, so callers can index the result
        # uniformly; this path previously omitted the key.
        return {"output": f"‚ö†Ô∏è Unexpected error occurred: {e}", "messages": []}

def chat_loop():
    """Start an interactive chat session.

    Reads lines with ``input()`` and forwards each non-empty one to ``chat``.
    The session ends when the user types ``quit``, ``exit`` or ``q``
    (case-insensitive), presses Ctrl-C, or the input stream is closed.
    Any other error in a turn is reported and the loop continues.
    """
    print("Chat started! Type 'quit' to exit.\n")
    while True:
        try:
            user_input = input("You: ").strip()
            if user_input.lower() in ['quit', 'exit', 'q']:
                print("Goodbye!")
                break
            if not user_input:
                continue
            chat(user_input)
        except KeyboardInterrupt:
            print("\nüõë Session interrupted by user.")
            break
        except EOFError:
            # stdin is closed (non-interactive run, piped input exhausted).
            # input() would raise again on every iteration, so the generic
            # handler below would spin forever; end the session instead.
            print("\nInput stream closed. Goodbye!")
            break
        except Exception as e:
            print(f"‚ö†Ô∏è Error during chat loop: {e}")
            continue

def reset_conversation():
    """Start a fresh conversation by switching to a new, random thread id.

    Rebinds the module-level ``config`` so later ``chat`` calls use a thread
    with no prior messages. State saved under the previous thread id is not
    deleted by this function.
    """
    global config
    fresh_thread_id = str(uuid.uuid4())
    config = {"configurable": {"thread_id": fresh_thread_id}}
    print("‚úÖ Conversation history cleared. Starting fresh.")

## Testing

In [None]:
# Launch the interactive session. This call blocks on input() until the user
# types quit/exit/q or interrupts, so keep it as the last cell.
chat_loop()

# END