# Setup

In [1]:
import torch
import spacy
from spacy import displacy
import IPython
from IPython.display import display, HTML

import pandas as pd
import numpy as np

from datasets import Dataset
from transformers import BertTokenizer, BertTokenizerFast, BertForSequenceClassification, TrainingArguments, Trainer,  DataCollatorWithPadding

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

#from huggingface_hub import login
#login()

print(IPython.__version__)

  from .autonotebook import tqdm as notebook_tqdm


8.37.0


In [2]:
print("PyTorch version:", torch.__version__)
print("MPS available:", torch.backends.mps.is_available())
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("MPS device is available and selected.")
else:
    device = torch.device("cpu")
    print("MPS device not available. Using CPU.")

x = torch.tensor([1, 2, 3], device=device)
print(x.device)

PyTorch version: 2.7.1
MPS available: True
MPS device is available and selected.
mps:0


In [3]:
# Load spaCy model
nlp = spacy.load("en_core_web_trf")

In [4]:
# All spaCy labels
print(nlp.get_pipe("ner").labels)

('CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART')


| **Label**  | **Purpose in Your Agent**                                                      |
| ---------- | ------------------------------------------------------------------------------ |
| `GPE`      | Detect cities, countries, districts (e.g., *Singapore*, *Kampong Glam*)        |
| `LOC`      | Identify general locations (e.g., *Marina Bay*, *Orchard Road*)                |
| `FAC`      | Capture facilities like *MRT*, *airport*, *hotel*                              |
| `DATE`     | Recognize travel dates (e.g., *3 June*, *next Monday*)                         |
| `TIME`     | Times of day for activities or bookings (e.g., *10 AM*, *evening*)             |
| `DURATION` | Trip length or durations (e.g., *3 days*, *2 nights*)                          |
| `ORG`      | Travel operators, hotel chains, airlines (e.g., *Expedia*, *Marina Bay Sands*) |
| `MONEY`    | Budget, pricing (e.g., *SGD 100*, *\$200*)                                     |
| `PERSON`   | User or people mentioned in dialogue (for chatbot personalization if needed)   |
| `EVENT`    | Named events or festivals (e.g., *Singapore Night Festival*)                   |
| `CARDINAL` | Generic numbers (e.g., *2 adults*, *4 attractions*)                            |
| `ORDINAL`  | Day number in trip or itinerary step (e.g., *first day*, *3rd night*)          |


# NLP Modules

In [5]:
# Rule-based dictionary of known terms
FOOD_TERMS = {'rojak', 'prawn mee', 'bubur cha cha', 'mee siam', 'itek-itek', 'popiah', 
              'fish head curry', 'pongal', 'kueh pie tee', 'char kway teow', 
              'hainanese chicken rice', 'mee goreng', 'wonton mee', 'nasi lemak', 
              'beef rendang', 'bakuteh', 'nasi padang', 'teochew porridge', 
              'yong tau foo', 'char koay teow', 'kueh salat', 'bak chor mee', 'chicken wings', 
              'curry puffs', 'kong bah pau', 'oyster omelette', 'bak kut teh', 'har jeong gai', 
              'kway chap', 'mee rebus ayam', 'laksa', 'mee rebus', 'wan tan mee', 'otah-otah', 
              'carrot cake', 'ayam buah keluak', 'satay', 'lor mee'}

TRANSPORT_TERMS = {"mrt", "ez-link", "bus pass", "circle line", "east west line"}
LOCATION_TERMS = {"marina bay", "kampong glam", "chinatown", "sentosa"}

SPECIAL_REQUIREMENT_TERMS = [
    # Accessibility
    "wheelchair", "disabled", "elderly", "mobility", "ramp", "accessible", "no stairs",
    
    # Dietary
    "halal", "vegetarian", "vegan", "gluten-free", "kosher",
    
    # Kid/baby-friendly
    "stroller", "baby", "infant", "kid-friendly", "child seat",
    
    # Pet-related
    "pet-friendly", "pets allowed", "dog", "cat", "no pets",

    # Sensory/environmental
    "quiet", "no smoking", "non-smoking", "low crowd", "avoid crowded", "no stairs", "no noise"
]

In [6]:
# Define essential fields that must be filled
ESSENTIAL_FIELDS = ["intent", "location", "date", "duration_days"]

## Load Trained Intent Detection Model

In [7]:
# Load trained model and tokenizer
model_path = "./intent_model"  # Change to your model directory
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizerFast.from_pretrained(model_path)

# Put model in eval mode
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

## Classify Intent

In [8]:
label2id = {'FindPlace': 0, 'BookFlight': 1, 'AskOpeningHour': 2, 'SearchHotel': 3, 'PlanItinerary': 4}
id2label = {v: k for k, v in label2id.items()}

In [9]:
def classify_intent(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    pred_id = torch.argmax(outputs.logits, dim=1).item()
    return id2label[pred_id]

## Understand the Entities

In [11]:
def understand_entities(text):
    doc = nlp(text)
    
    # Render dependencies
    dep_html = displacy.render(doc, style="dep", page=True)
    display(HTML(dep_html))

    # Render named entities
    ent_html = displacy.render(doc, style="ent")
    display(HTML(ent_html))

    for token in doc:
        print(f"{token.text:15} | Lemma: {token.lemma_:10} | POS: {token.pos_:8} | Tag: {token.tag_:6} | Dep: {token.dep_:12} | "
        f"Shape: {token.shape_:10} | Alpha: {token.is_alpha} | Stop: {token.is_stop}")

    return "Dependency tree saved to 'dep_tree.html', Entity visualization saved to 'entities.html'."

## Initial the Dialogue State

In [12]:
# Define base dialogue state format
def init_dialogue_state():
    return {
          "intent": None,
          "location": [],
          "date": [],
          "duration_days": [],
          "food": [],
          "budget": [],
          "transport": [],
          "event": [],
          "style": [],                   
          "num_kids": [],
          "num_adults": [],
          "special": [],
    }

# x Destination: Singapore, Sentosa
# Persona: family with kids, solo traveler
# Activity: shopping, food, studying
# Accommodation: 4-star hotel, budget hotel
# x Transport: flight, car, train, cruise, ferry
# x Duration: 3 days, two weeks
# x Date: July, 10 June 2025
# x Scope (Intent): overall trip planning, accommodation advice, food advice
# Tip: best time to visit, weather in December
# x Budget: under $1000", luxurious
# Custom: visa, passport validity
# more..


## Extract Entities

In [13]:
# Mapping spaCy labels to our labels
spacy_to_custom_labels = {
    "GPE": "location",
    "LOC": "location",
    "FAC": "location",
    "DATE": "date",
    "TIME": "date",
    "DURATION": "date",
    "EVENT": "event",
    "MONEY": "budget",
}

### Time Extraction

In [14]:
from symspellpy.symspellpy import SymSpell, Verbosity

# Initialize (load once)
sym_spell = SymSpell(max_dictionary_edit_distance=2)
sym_spell.load_dictionary("./frequency_dictionary_en_82_765.txt", 0, 1)

def correct_text(text):
    suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)
    return suggestions[0].term if suggestions else text

In [15]:
from rapidfuzz import fuzz, process

def fuzzy_match_time_unit(word):
    time_units = ["day", "days", "week", "weeks", "month", "months", "night", "nights"]
    result = process.extractOne(word, time_units, scorer=fuzz.ratio)
    if result:
        match, score = result[0], result[1]
        return match if score >= 80 else None
    return None

In [16]:
def classify_time_entity_fuzzy(ent):
    import re

    text = ent.text.lower().strip()
    corrected = correct_text(text)

    # Patterns
    month_keywords = [
        "january", "february", "march", "april", "may", "june",
        "july", "august", "september", "october", "november", "december",
        "jan", "feb", "mar", "apr", "jun", "jul", "aug", "sep", "sept", "oct", "nov", "dec"
    ]

    weekday_keywords = [
        "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
        "mondays", "tuesdays", "wednesdays", "thursdays", "fridays", "saturdays", "sundays"
    ]

    # If it looks like a range, treat it as date
    if re.search(r"\b(from|to|until|between)\b", corrected):
        return "date"

    # If contains explicit duration unit like "days", "weeks"
    unit = fuzzy_match_time_unit(corrected.split()[-1])
    if unit and any(char.isdigit() for char in corrected):
        return "duration_days"

    if any(month in corrected for month in month_keywords):
        return "date"

    if any(day in corrected for day in weekday_keywords):
        return "date"

    # Detect YYYY-MM-DD or DD/MM/YYYY patterns
    if re.search(r"\d{1,2}[/-]\d{1,2}[/-]\d{2,4}", corrected) or re.search(r"\d{4}-\d{2}-\d{2}", corrected):
        return "date"

    # Fallback to original label
    return ent.label_.lower()


In [17]:
from dateutil import parser

def normalize_date(text):
    try:
        dt = parser.parse(text, fuzzy=True, dayfirst=True)
        return dt.strftime('%Y-%m-%d')
    except:
        return None

In [18]:
def is_potential_noise_date(ent):
    text = ent.text.strip().lower()
    # If it's a plain number, be skeptical
    if text.isdigit():
        num = int(text)
        # Ages, counts, room numbers, etc.
        if 0 < num <= 30:
            # Check surrounding context for non-temporal signals
            left = ent.doc[max(ent.start - 2, 0):ent.start]
            right = ent.doc[ent.end:min(ent.end + 2, len(ent.doc))]
            window = " ".join([t.text.lower() for t in list(left) + list(right)])
            if re.search(r"(age|year[- ]?old|kid|child|room|group|people|class|seat)", window):
                return True
    # Also guard against short dates like "8" or "12"
    if len(text) <= 2 and text.isdigit():
        return True
    return False


from datetime import datetime

def compute_duration_from_dates(date_range):
    start = datetime.strptime(date_range["start"], "%Y-%m-%d")
    end = datetime.strptime(date_range["end"], "%Y-%m-%d")
    duration = (end - start).days
    return f"{duration} days" if duration > 0 else "Invalid date range"

In [19]:
import re

def is_relative_day(text):
    """
    Check if the input string refers to a relative day (e.g., today, tomorrow, next week).
    """
    text = text.lower().strip()

    relative_keywords = [
        "today", "tomorrow", "tonight", "yesterday",
        "this morning", "this evening", "this afternoon",
        "next week", "next month", "next year",
        "this week", "this month", "this year",
        "coming weekend", "this weekend", "next weekend"
    ]

    # Match relative phrases
    for phrase in relative_keywords:
        if phrase in text:
            return True

    # Regex to catch things like "in 3 days", "after 1 week", "within 2 months"
    relative_pattern = r"(in|after|within)\s+\d+\s+(day|days|week|weeks|month|months|year|years)"
    if re.search(relative_pattern, text):
        return True

    return False

### NER Extraction

In [20]:
# Extract using spaCy
def extract_with_spacy(text):
    doc = nlp(text)
    state = init_dialogue_state()
    
    for ent in doc.ents:
        term = ent.text.strip().lower()
        label = ent.label_
        
        # Handle location
        if label in ["GPE", "LOC"]:
            if term not in TRANSPORT_TERMS and term not in state["location"]:
                print(term)
                state["location"].append(term)
            continue
            
        # Handle DATE vs. duration vs. relative day
        if label == "DATE":
            if is_potential_noise_date(ent):
                continue  # skip likely misclassified number
            unit = classify_time_entity_fuzzy(ent)
            if unit == "date":
                if is_relative_day(term):
                    pass
                else:
                    norm = normalize_date(term)
                    if norm and norm not in state["date"]:
                        state["date"].append(norm)
            elif unit == "duration_days":
                if term not in state["duration_days"]:
                    state["duration_days"].append(term)
            continue  # skip further processing of this term
            
        # Mapped labels
        mapped_label = spacy_to_custom_labels.get(label, None)
        if mapped_label and term not in state[mapped_label]:
            state[mapped_label].append(term)        
        
        # Keyword-based overrides
        if term in FOOD_TERMS and term not in state["food"]:
            state["food"].append(term.lower())
        elif term in TRANSPORT_TERMS and term not in state["transport"]:
            state["transport"].append(term.lower())
        elif term in LOCATION_TERMS and term not in state["location"]:
            pass

    # Fallback token-level matches
    for token in doc:
        word = token.lemma_.lower()
        
        if word in FOOD_TERMS and word not in state["food"]:
            state["food"].append(word)
        elif word in TRANSPORT_TERMS and word not in state["transport"]:
            state["transport"].append(word)
        elif word in LOCATION_TERMS and word not in state["location"]:
            pass

    return state

In [21]:
test = "We’re interested in Singapore’s local culture and historical sights 7 July, but would prefer simplified names or explanations for easier understanding."
# print(extract_with_spacy(test))
result = extract_with_spacy(test)

for key, value in result.items():
    print(f"{key}: {value}")

singapore
intent: None
location: ['singapore']
date: ['2025-07-07']
duration_days: []
food: []
budget: []
transport: []
event: []
style: []
num_kids: []
num_adults: []
special: []


## Check Missing Essential Entities

In [22]:
def missing_fields(parsed_response):
    missing = []
    for field in ESSENTIAL_FIELDS:
        value = parsed_response.get(field)
        if value is None or (isinstance(value, list) and len(value) == 0):
            missing.append(field)
    return missing

In [23]:
def extract_or_clarify(user_query):
    # Try quick parse from your own rules or previous step
    parsed = quick_parse(user_query)  # Assume you have a lightweight parser

    missing = missing_fields(parsed)

    if missing:
        print(f"🤖 I need more info. Could you please provide: {', '.join(missing)}?")
        return None  # Await user's clarification
    else:
        return parsed


## Build Prompt

In [24]:
def build_prompt(user_query):
    return f"""
You are a travel assistant AI that extracts **only explicitly stated information** from user messages to help plan an itinerary.

🔍 Your task is to extract fields from the user query **without guessing or inferring**.  
- If something is not clearly mentioned, return `None` (for single values) or `[]` (for lists).  
- Normalize any fuzzy or descriptive terms to a known travel-friendly format.

Return a valid Python dictionary using this structure:
```python
dialogue_state = {{
    "location": List[str],             # Places or landmarks mentioned
    "date": List[str],                 # Exact or relative dates (e.g., "June 3", "next week")
    "duration_days": List[str],        # Durations like "3 days", "a week"
    "food": List[str],                 # Local foods, cuisines
    "budget": List[str],               # Budget phrases like "$300", "under $150"
    "transport": List[str],            # Modes of travel: MRT, bus, taxi
    "event": List[str],                # Activities like shopping, sightseeing, museum
    "style": List[str],                # Descriptive preferences: relaxed, luxury, tourist-friendly
    "num_kids": List[str],             # Number of children (if mentioned)
    "num_adults": List[str],           # Number of adults (if mentioned)
    "special": List[str]               # Special needs: halal, wheelchair access, baby stroller, others
}}

🗣 User Query:
"{user_query}"

🔚 Respond with only the dictionary. Do not include explanations, prefixes, or formatting like triple quotes.
"""

## Call LLM when Fallback

In [25]:
import ast
import json
import requests

def call_ollama_mistral(user_query):
    prompt = build_prompt(user_query)

    try:
        response = requests.post(
            "http://localhost:11434/api/generate",
            json={"model": "mistral", "prompt": prompt},
            stream=True,
            timeout=30
        )
    except requests.RequestException as e:
        print("❌ Ollama request failed:", e)
        return {}

    response_text = ""
    for line in response.iter_lines():
        if line:
            try:
                part = json.loads(line)
                if "response" in part:
                    response_text += part["response"]
            except json.JSONDecodeError:
                continue

    try:
        # Remove "dialogue_state =" or any prefix
        dict_start = response_text.find("{")
        dict_str = response_text[dict_start:].strip()

        # ✅ Use ast.literal_eval for Python-style literal
        parsed = ast.literal_eval(dict_str)
        return parsed
    except Exception as e:
        print("⚠️ Ollama dict parse error:", str(e))
        print("🔁 Raw response:", response_text)
        return {}

# Test ollama mistral    
user_query = 'Hello world!'
response = call_ollama_mistral(user_query)
print(response)


{'location': None, 'date': None, 'duration_days': None, 'food': None, 'budget': None, 'transport': None, 'event': None, 'style': None, 'num_kids': None, 'num_adults': None, 'special': None}


## Dialogue State Management

In [26]:
def merge_states(primary, fallback):
    merged = primary.copy()
    for key in merged:
        is_empty = merged[key] in [None, [], {}]
        has_fallback = fallback.get(key) not in [None, [], {}]
        if is_empty and has_fallback:
            merged[key] = fallback[key]
    return merged

In [27]:
def handle_non_itinerary_intent(intent, user_query):
    responses = {
        "FindPlace": "📍 I can help you find a place! What type of place are you looking for?",
        "BookFlight": "✈️ Sure! I can help you book a flight. When and where do you want to travel?",
        "AskOpeningHour": "⏰ Please tell me which place you'd like to know the opening hours for.",
        "SearchHotel": "🏨 Looking for a hotel? Let me know your destination and budget.",
    }
    return responses.get(intent, "🤖 I'm not sure how to help with that yet.")

In [28]:
import re

def detect_additional_signals(text):
    text_lower = text.lower()

    # Direct phrase match
    for term in SPECIAL_REQUIREMENT_TERMS:
        if term in text_lower:
            return True, term

    # Regex patterns for common needs
    patterns = [
        r"\b(no\s+stairs|no\s+smoking|non[-\s]?smoking)\b",
        r"\b(kid[-\s]?friendly|baby[-\s]?friendly|pet[-\s]?friendly)\b",
        r"\b(gluten[-\s]?free|wheelchair[-\s]?accessible)\b",
        r"\b(halal|kosher|vegetarian|vegan)\b",
        r"\b(avoid\s+(crowds|crowded))\b",
    ]

    for pattern in patterns:
        if re.search(pattern, text_lower):
            return True, pattern

    # Fallback: check noun chunks with spaCy
    doc = nlp(text)
    for chunk in doc.noun_chunks:
        chunk_text = chunk.text.lower()
        if any(term in chunk_text for term in SPECIAL_REQUIREMENT_TERMS):
            return True, chunk_text

    return False, None


In [29]:
from sentence_transformers import SentenceTransformer, util

st_model = SentenceTransformer("all-MiniLM-L6-v2")

NEW_PLAN_EXAMPLES = [
    "plan a new trip",
    "start a new itinerary",
    "create a new travel plan",
    "begin another journey",
    "forget the last trip",
    "make a new plan",
    "we want to go somewhere else now",
    "i'm planning a different trip",
    "start over",
    "next, I want to plan something new"
]

def is_new_plan(user_query, threshold=0.7):
    query_emb = st_model.encode(user_query, convert_to_tensor=True)
    example_embs = st_model.encode(NEW_PLAN_EXAMPLES, convert_to_tensor=True)

    similarity_scores = util.cos_sim(query_emb, example_embs)[0]
    max_score = float(similarity_scores.max())

    return max_score > threshold


## Confirmation

In [30]:
FINAL_CONFIRM_PHRASES = {
    "looks good", "that's fine", "okay", "confirm", "yes", "that works",
    "good to go", "done", "finalize", "proceed", "complete the plan"
}

def is_final_confirmation(text):
    return text.strip().lower() in FINAL_CONFIRM_PHRASES

In [31]:
YES_PHRASES = {"yes", "yeah", "sure", "of course", "yep", "affirmative", "let's go", "ok", "okay"}

def is_affirmative(text):
    return text.strip().lower() in YES_PHRASES

## RAG Simulator

In [32]:
def sent_prompt_to_llm(prompt):
    try:
        response = requests.post(
            "http://localhost:11434/api/generate",
            json={"model": "mistral", "prompt": prompt},
            stream=True,
            timeout=30
        )
    except requests.RequestException as e:
        print("❌ Ollama request failed:", e)
        return {}

    # Stream and collect response chunks
    output = ""
    for line in response.iter_lines():
        if line:
            try:
                data = json.loads(line.decode("utf-8"))
                output += data.get("response", "")
            except json.JSONDecodeError:
                continue  # Ignore malformed lines

    return output

# rag vector db simulator to generate factual context based on similarity
import json
def rag_vdb_sim(fact_count):
    if fact_count <= 0:
        return []
    elif fact_count > 3:
        fact_count = 3 # keep max at 3 to reduce tokens and speed up the response.
    
    response = sent_prompt_to_llm(
        f"You are a travel domain assistant. Generate {fact_count} factual travel-related information about Singapore. "
        "Return the output strictly as a JSON array of strings, where each array element contains one fact. "
        "Do not include any explanations or formatting outside the array."
    )
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        print("Failed to parse response into JSON array.", response)
        return []    

rag_chunks = rag_vdb_sim(3)
for chunk in rag_chunks:
    print(chunk)

Singapore is the world's third-largest financial center after New York City and London.
The Merlion, a mythical creature with a lion's head and the body of a fish, is the mascot and national personification of Singapore.
Gardens by the Bay, a 101-hectare park in central Singapore that consists of three waterfront gardens, is home to Supertree Grove, iconic vertical gardens up to 50 meters tall.


## Persona Detection

In [39]:
from transformers import BertTokenizerFast, AutoModelForSequenceClassification, RobertaTokenizerFast
import torch
import joblib
import numpy as np

# Load the tokenizer exactly the same as training
persona_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
persona_model = AutoModelForSequenceClassification.from_pretrained("./bert_multilabel_persona/checkpoint-2155")
persona_label_encoder = joblib.load("./bert_multilabel_persona/label_encoder.bin")  # This is a MultiLabelBinarizer

# persona_tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
# persona_model = AutoModelForSequenceClassification.from_pretrained("./roberta_multilabel_persona/checkpoint-2155")
# persona_label_encoder = joblib.load("./roberta_multilabel_persona/label_encoder.bin")  # This is a MultiLabelBinarizer

persona_model.eval()

def predict_personas(user_query, threshold=0.5):
    inputs = persona_tokenizer(user_query, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = persona_model(**inputs)
    logits = outputs.logits
    probs = torch.sigmoid(logits).squeeze().numpy()  # Convert logits to probabilities

    # Predict all classes above the threshold
    predicted_labels = np.where(probs >= threshold)[0]
    personas = persona_label_encoder.classes_[predicted_labels]
    return list(personas)


# Test
user_queries = [
    "We're 6 young adults (25-30) staying in Singapore for 6 days. We love outdoor activities, hiking trails, cycling, and unique experiences like night safaris. Include one rest day and show us images of adventure activities available.",
    "A family with a child in a wheelchair, maximize sightseeing in 3 days in Singapore",
    "We're tech conference attendees, maximize sightseeing in 3 days in Singapore with MRT travel",
    "We want a wellness retreat, maximize sightseeing in 3 days in Singapore for a weekend",
    "I'm a solo traveler, experience something unique in Singapore (from July 1 to July 5) avoiding crowded places",
    "A couple planning a honeymoon, experience something unique in Singapore (sometime in March) for a weekend including vegan options",
]
for query in user_queries:
    personas = predict_personas(query, threshold=0.5)
    print(personas)


['Adventure Seeker', 'Family Traveler']
['Adventure Seeker', 'Family Traveler']
[]
['Adventure Seeker', 'Luxury Seeker', 'Relaxation Seeker']
['Solo Traveler']
['Family Traveler']


## Main Pipeline

In [None]:
from prompt_gen import generate_prompt, generate_response

def process_user_input(user_query, dialogue_state=None, plan_finalized=False):
    """
    Travel chatbot processor for PlanItinerary (single intent only).
    
    Features:
    - Detects if user wants to start a new plan after one was proposed.
    - Handles structured dialogue (AI Agent / User).
    - Extracts essential fields via spaCy.
    - Calls LLM for additional details if signals found.
    """
    # Step 0: Initialize state if needed
    if dialogue_state is None:
        dialogue_state = init_dialogue_state()
        plan_finalized = False
       
    # Step 1: Handle post-finalization message
    if plan_finalized:
        if is_new_plan(user_query) or is_affirmative(user_query):
            dialogue_state = init_dialogue_state()
            plan_finalized = False
            return dialogue_state, "🆕 Got it! Starting a new itinerary. Let's begin.", plan_finalized
        else:
            return dialogue_state, (
                "🤖 Would you like to plan a new itinerary? "
                "You can say something like 'Plan a new trip to Tokyo'."
            ), plan_finalized

    # Step 2: Classify intent if not set
    if dialogue_state["intent"] is None:
        intent = classify_intent(user_query)
        if intent == "PlanItinerary":
            dialogue_state["intent"] = intent
        else:
            return dialogue_state, "❌ I can only help with itinerary planning for now.", plan_finalized

    # Step 3: Extract essential fields with spaCy
    spacy_state = extract_with_spacy(user_query)
    dialogue_state = merge_states(dialogue_state, spacy_state)

    # Only infer duration if not already provided explicitly #HACK
    # if not dialogue_state["duration_days"] and len(dialogue_state["date"]) >= 2:
    #     duration_str = compute_duration_from_dates(dialogue_state["date"])
    #     if duration_str:
    #         dialogue_state["duration_days"].append(duration_str)

    # Step 4: Check for missing essential fields
    essential_fields = ["intent"]
    # essential_fields = ["intent","location"]
    # essential_fields = ["location", "date"]
    # essential_fields = ["location", "date", "duration_days"]
    missing = [key for key in essential_fields if not dialogue_state[key]]

    if missing:
        return dialogue_state, (
            f"⚠️ I still need the following: {', '.join(missing)}.\n"
            "Could you please provide that?"
        ), plan_finalized

    #Step 5: Extract additional info if signals detected 
    if detect_additional_signals(user_query):
        llm_state = call_ollama_mistral(user_query)
        dialogue_state = merge_states(dialogue_state, llm_state)
    
    # Step 6: Finalize if user confirms
    if is_final_confirmation(user_query):
        plan_finalized = True
        response_msg = (
            "🎉 Great! Your itinerary has been finalized:\n"
            f"```json\n{json.dumps(dialogue_state, indent=2)}\n```\n"
            "Would you like to plan another trip?"
        )
        return dialogue_state, response_msg, plan_finalized

    # Step 7: Detect Persona
    dialogue_state['persona'] = predict_personas(user_query, threshold=0.5)

    # Step 8: Generate prompt
    # rag_chunks = rag_vdb_sim(3)
    rag_chunks = rag_vdb_sim(0)
    prompt = generate_prompt(user_query, dialogue_state, rag_chunks, 200)
    # return dialogue_state, prompt

    # Step 9: Send prompt to LLM to get travel advice
    # travel_advice = sent_prompt_to_llm(prompt)
    travel_response = generate_response(prompt)


    # # Step 10: Show proposed plan and wait for confirmation
    response_msg = (
        "✅ Here's your current itinerary summary:\n"
        f"{travel_response} \n\n"
        f"```json\n{json.dumps(dialogue_state, indent=2)}\n```\n"
        "Would you like to add or modify anything?\n"
        "Say 'looks good' or 'confirm' to finalize."
    )
    return dialogue_state, response_msg, plan_finalized, prompt

In [36]:
user_query = "We are a family of four—two adults and two kids aged 5 and 8—planning a trip to Singapore."
user_query = "We are a family of four—two adults and two kids aged 5 and 8—planning a trip to Singapore in 6 July for 3 days."
user_query = "We're a family of 4 with two children aged 6 and 9 visiting Singapore for 5 days. We love interactive science exhibits, nature parks, and kid-friendly activities. Can you suggest an itinerary with one rest day in the middle? Show us relevant attractions with images."
user_query = "We're 6 young adults (25-30) staying in Singapore for 6 days. We love outdoor activities, hiking trails, cycling, and unique experiences like night safaris. Include one rest day and show us images of adventure activities available."
user_query = "We're a group of 8 spanning three generations (grandparents, parents, teens) visiting for 7 days. Need activities suitable for all ages including accessible attractions, traditional food experiences, and family-friendly entertainment. Plan one rest day mid-week."
process_user_input(user_query)


({'intent': 'PlanItinerary',
  'location': ['None'],
  'date': ['2025-06-07'],
  'duration_days': ['7 days'],
  'food': ['traditional food experiences'],
  'budget': ['None'],
  'transport': ['None'],
  'event': ['activities suitable for all ages',
   'accessible attractions',
   'family-friendly entertainment'],
  'style': ['None'],
  'num_kids': ['None'],
  'num_adults': ['8'],
  'special': ['accessible attractions'],
  'persona': ['Family Traveler']},
 "You are a highly experienced travel expert and advisor for Singapore. Your task is to provide a well-structured and practical travel itinerary based on the user's needs.\n\nHere is the user's original request:\n We're a group of 8 spanning three generations (grandparents, parents, teens) visiting for 7 days. Need activities suitable for all ages including accessible attractions, traditional food experiences, and family-friendly entertainment. Plan one rest day mid-week.\n\nSpecifically, as a Family Traveler , the user wants to do Pla

In [41]:
import pandas as pd
import csv

# df = pd.read_csv("user_queries1.csv")
df = pd.read_csv("user_queries.csv")
print(df.head())

df['StructuredQuery'] = df['RawQuery'].apply(lambda q: process_user_input(q)[1]) #the fourth value in the returned tuple is the prompt_template
# df['StructuredQuery'] = df['StructuredQuery'].apply(lambda x: f'"{x}"')

# df.to_csv("structured_queries.csv", index=False, quoting=0)  # quoting=0 means no automatic quoting
df.to_csv("structured_queries_persona_bert.csv", index=False, quoting=0)  # quoting=0 means no automatic quoting

                                            RawQuery  StructuredQuery
0  We're a family of 4 with two children aged 6 a...              NaN
1  My husband and I (both 65+) are visiting Singa...              NaN
2  We're 6 young adults (25-30) staying in Singap...              NaN
3  Three colleagues extending our business trip f...              NaN
4  We're a group of 8 spanning three generations ...              NaN
singapore
singapore
little india
singapore
gardens by the bay
marina bay
singapore
singapore
singapore
laksa
kampong gam
marina bay
marina bay
sentosa
singapore
singapore
singapore
singapore
singapore
singapore
singapore
singapore
singapore
singapore
singapore
singapore
singapore
singapore
singapore
singapore
singapore
singapore
⚠️ Ollama dict parse error: malformed node or string on line 10: <ast.Call object at 0x39fdbe8c0>
🔁 Raw response:  {
    "location": ["Singapore"],
    "date": None,
    "duration_days": ["3 days"],
    "food": [],
    "budget": None,
    "transport

# Chatbot Dialogue

In [207]:
import gradio as gr

def chat_fn(message, history, state_bundle):
    dialogue_state, plan_finalized = state_bundle

    # Process input
    updated_state, reply, plan_finalized = process_user_input(message, dialogue_state, plan_finalized)

    # Append to history
    history.append((f"User: {message}", f"🤖AI Agent: {reply}"))

    return history, [updated_state, plan_finalized], ""  # Update both in Gradio state

CSS ="""
.contain { display: flex; flex-direction: column; }
#chatbot { flex-grow: 1; }
#component-0 { height: 100%; }
#component-1 { height: 800px; }
"""
with gr.Blocks(css=CSS) as demo:
    gr.Markdown("# Personal AI Travel Agent (PAT)")           # Visible big title in UI
    gr.Markdown("Where do you want to go today?")  # Description text

    chatbot = gr.Chatbot(
        elem_id="chatbot",
        height="100vh",
    )
    state = gr.State([init_dialogue_state(), False])  # 🧠 Bundle: [dialogue_state, plan_finalized]
    history = gr.State([])
    finalized_flag = gr.State(False)
    msg = gr.Textbox(label="Type your message here")
    msg.submit(
        fn=chat_fn,
        inputs=[msg, history, state],
        outputs=[chatbot, state, msg]
    )

demo.launch()


  chatbot = gr.Chatbot(


* Running on local URL:  http://127.0.0.1:7905
* To create a public link, set `share=True` in `launch()`.




singapore
Failed to parse response into JSON array.  [
  "Singapore is the world's only island city-state, located at the tip of the Malay Peninsula in Southeast Asia.",
  "The Singapore Zoo, known as Mandai Park, is renowned for its open-concept exhibits and has been named the best rainforest zoo in the world by BBC Wildlife Magazine.",
  "Raffles Hotel, a colonial-style luxury hotel, was established in 1887 and is where the Singapore Sling cocktail was first served."


In [80]:
# Example input
sample_input = [
    "I want to plan an itinerary that includes laksa, Kampong Gam and Marina Bay, and maybe take the MRT from 3 June 2025 to 9 June 2025.",
    "Plan a 3-day trip around Marina Bay and Sentosa, eat laksa, take the MRT, and spend under $150. On Day 2 we want to shop.",    
    "We are a family of four—two adults and two kids aged 5 and 8—planning a trip to Singapore.",
    "We'll be in Singapore for 6 days including arrival and departure, so we have 4 full days to explore.",
    "Please suggest daytime activities in Singapore that are fun and creative for young children.",
    "After our kids go to bed, we’d like some recommendations for fun and fashionable adult activities at night.",
    "We’re interested in Singapore’s local culture and historical sights, but would prefer simplified names or explanations for easier understanding.",
    "We prefer a relaxed itinerary: maximum two attractions per day, with one rest day in the middle to explore freely.",
    "Please generate a 6-day itinerary for our family trip to Singapore, considering only 4 full days for activities."
]


## Output the NER Extraction

In [None]:
#sample_dialogue_state = process_user_input(sample_input)
for s in sample_input:
    test = understand_entities(s)
    print(test)

In [None]:
for i, s in enumerate(sample_input):
    print("=" * 60)
    print(f"Sample: {i + 1}")
    print("=" * 60)
    
    dialogue_state = init_dialogue_state()
    sample_output = process_user_input(s, dialogue_state)
    print(sample_output)