In [1]:
import os
import re

import google.generativeai as genai
import numpy as np
import pandas as pd
from IPython.display import display

import chromadb_lib as cdb
import flight_utils as fu
import hotels_utils as hu


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Read the Gemini API key from the environment instead of embedding it in the
# notebook. A key stored in source is visible to everyone who can read the
# file (and its history), so the key that used to be here should be treated
# as compromised and rotated.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    # Fail here with a clear message rather than later inside an API call.
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Export it in the environment before starting the notebook kernel."
    )
genai.configure(api_key=GEMINI_API_KEY)

In [11]:
# Load the two Rome hotel listings, then stack them into a single frame with
# a fresh 0..n-1 index.
hotel1, hotel2 = (
    pd.read_csv(csv_path)
    for csv_path in ("rome_five_star_hotel.csv", "rome_four_star_hotels.csv")
)
hotels_df = pd.concat([hotel1, hotel2], ignore_index=True)

# Raw flight rows; they are parsed into structured itineraries further down.
flights_df = pd.read_csv("flights_data.csv")

In [None]:
from datetime import date, datetime
from typing import Optional, Literal
from pydantic import BaseModel, ValidationError
import json

# Coarse time-of-day buckets for flight preferences. The hour ranges behind
# each label are spelled out in the extraction prompt; nothing here enforces
# them.
TimeWindow = Literal["morning","afternoon","evening","night"]

# Structured form of a user's trip request. It is filled in from free text by
# `extractor` and then used to filter flights and hotels.
# NOTE: documented with `#` comments rather than a class docstring because
# pydantic copies a model's docstring into its JSON schema, and that schema is
# embedded verbatim in the extraction prompt.
class TripQuery(BaseModel):
    # Required route endpoints. The extraction prompt asks the model to
    # normalise places to IATA codes (e.g. "ZRH").
    origin: str
    destination: str
    # Travel dates; None when the user did not give them.
    start_date: Optional[date] = None
    end_date: Optional[date] = None
    # Party size: defaults to a single adult and no children.
    adults: int = 1
    children: int = 0
    # Fare class, restricted to these four spellings.
    cabin: Literal["ECONOMY","PREMIUM_ECONOMY","BUSINESS","FIRST"] = "ECONOMY"
    # Optional time-of-day preferences for the flight legs.
    depart_window: Optional[TimeWindow] = None
    arrive_window: Optional[TimeWindow] = None
    return_window: Optional[TimeWindow] = None
    # Hotel search filters. The prompt asks the model to infer city/country
    # from the destination.
    hotel_country: Optional[str] = None
    hotel_city: Optional[str] = None
    hotel_min_rating: Optional[float] = None
    # Free-text hotel preferences, used downstream as the semantic search query.
    hotel_prefs_text: Optional[str] = None

def extractor(text: str, model_name: str = "gemini-2.5-flash") -> Optional[TripQuery]:
    """
    Parse a free-text travel request into a TripQuery using a Gemini model.

    The model runs in JSON mode and is prompted with the TripQuery JSON
    schema; its reply is then validated with pydantic.

    Args:
        text: The user's travel request in natural language.
        model_name: Gemini model used for the extraction. Defaults to the
            model this notebook was written against.

    Returns:
        The validated TripQuery, or None when the input is blank, the model
        cannot be created or called, or its reply is not valid JSON that
        matches the schema. Failures are reported with print(), not raised.
    """
    # Nothing to extract from blank input, so skip the API round trip.
    if not isinstance(text, str) or not text.strip():
        print("Error: No user text was provided.")
        return None

    # Initialize the Generative Model with JSON mode enabled
    try:
        model = genai.GenerativeModel(
            model_name,
            generation_config={"response_mime_type": "application/json"}
        )
    except Exception as e:
        print(f"Error initializing the model: {e}")
        return None

    # Pydantic's `model_json_schema` generates a schema the LLM can follow
    schema = TripQuery.model_json_schema()

    # JSON-escape the user text so embedded quotes or newlines cannot break
    # out of the quoted USER TEXT section. For plain text this produces the
    # same `"..."` form the prompt always used.
    quoted_text = json.dumps(text, ensure_ascii=False)

    # The prompt provides the context, instructions, the schema, and the user text.
    # NOTE(review): instruction 2 mentions "provided mappings" but the prompt
    # contains none. Instruction 5 asks for the unmodified user text, while the
    # recorded run returned only the hotel-related fragment. Confirm which is
    # intended before rewording.
    prompt = f"""
    You are an expert travel assistant responsible for extracting structured data from user requests.
    Your goal is to parse the user's text and output a JSON object that strictly adheres to the provided schema.

    CONTEXT:
    - Today's date is: {datetime.now().strftime('%Y-%m-%d')}
    - Time window definitions: Departures between 6am-12pm are "morning", 12pm-5pm are "afternoon", 5pm-9pm are "evening", and 9pm-6am are "night".

    INSTRUCTIONS:
    1.  Analyze the user's text to extract all relevant travel details.
    2.  Use the provided mappings to normalize values. For example, if the user says "zurich", you must use the IATA code "ZRH". If they say "business class", use "BUSINESS".
    3.  If a value is not mentioned in the text, omit it or set it to null in the JSON.
    4.  Infer `hotel_city` and `hotel_country` from the main destination.
    5.  The `hotel_prefs_text` field should contain the original, unmodified user text.
    6.  Parse dates accurately. "Next Tuesday" should be calculated relative to today's date.
    7.  Your output MUST be a valid JSON object matching the schema below.

    SCHEMA:
    {json.dumps(schema, indent=2)}

    USER TEXT:
    {quoted_text}
    """

    # Step 1: call the model. Reading `.text` is part of the call because it
    # can itself raise (for example when no text candidate was returned).
    try:
        response = model.generate_content(prompt)
        raw_output = response.text
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

    # Step 2: decode the reply.
    try:
        json_data = json.loads(raw_output)
    except json.JSONDecodeError:
        print("Error: The model did not return valid JSON.")
        print("Model output:", raw_output)
        return None

    # Step 3: validate. `model_validate` reports a non-object reply (a list or
    # a bare string) as a ValidationError, where `TripQuery(**json_data)`
    # raised a TypeError that hid the model output.
    try:
        return TripQuery.model_validate(json_data)
    except ValidationError as e:
        print(f"Error: Pydantic validation failed.\n{e}")
        print("Model output:", raw_output)
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

In [12]:
# Set up the hotel index and load the combined hotel table into it.
# NOTE(review): the three positional arguments look like a storage directory,
# a collection name and an embedding-model name. Confirm against
# hotels_utils.init_hotel_index.
hotels_index = hu.init_hotel_index(
    "./chroma_storage",
    "hotel_information",
    "all-MiniLM-L6-v2",
)
hu.ingest_hotels(hotels_index, hotels_df)

# Turn the raw flight rows into structured itineraries, reading from the
# "summary" column.
parsed_flights = fu.parse_flights_df(flights_df, summary_col="summary")

In [8]:
# Sample request that drives the rest of the notebook. Adjacent literals are
# concatenated by Python, so this is one unbroken sentence.
user_input = (
    "Hi can you plan me a trip from Zurich to Rome "
    "from 10th of november 2025 to 24th november 2025, "
    "for 2 adults and 1 children, with an economy budget, evening flight "
    "and a preference for a hotel with 4 stars and near vatican"
)

In [17]:
# Parse the free-text request into a structured TripQuery.
trip = extractor(user_input)

# `extractor` returns None on any failure (it only prints the reason). Stop
# here with a clear message instead of letting a later cell fail on
# `trip.origin` with an opaque AttributeError.
assert trip is not None, "Trip extraction failed; see the error printed above."
print(trip)

origin='ZRH' destination='FCO' start_date=datetime.date(2025, 11, 10) end_date=datetime.date(2025, 11, 24) adults=2 children=1 cabin='ECONOMY' depart_window='evening' arrive_window=None return_window=None hotel_country='Italy' hotel_city='Rome' hotel_min_rating=4.0 hotel_prefs_text='a hotel with 4 stars and near vatican'


In [13]:
# Flights: pick the cheapest valid itinerary
# Every filter is taken straight from the parsed trip request.
flight_filters = {
    "origin": trip.origin,
    "destination": trip.destination,
    "start_date": trip.start_date,
    "end_date": trip.end_date,
    "fare_type": trip.cabin,
    "depart_window": trip.depart_window,
    "arrive_window": trip.arrive_window,
    "return_window": trip.return_window,
}
best_flight, flight_cands = fu.pick_cheapest(
    parsed_flights,
    return_top_n=3,  # keep a few for context
    **flight_filters,
)

# Hotels: semantic prefs + filters
# Rating floor applied when the user did not state one.
DEFAULT_MIN_HOTEL_RATING = 4.0

# Fall back to the default only when no rating was extracted at all.
# `trip.hotel_min_rating or 4.0` also replaced an explicit 0 ("any rating")
# with the default.
min_hotel_rating = (
    trip.hotel_min_rating
    if trip.hotel_min_rating is not None
    else DEFAULT_MIN_HOTEL_RATING
)

hotel_hits = hu.search_hotels(
    hotels_index,
    prefs_text=trip.hotel_prefs_text,
    top_n=5,
    country=trip.hotel_country,
    city=trip.hotel_city,
    min_rating=min_hotel_rating,
)

In [14]:
#gemini context formatting

def _dtfmt(ts):
    """Render a timestamp-like value as ``DD Mon YYYY HH:MM``."""
    parsed = pd.to_datetime(ts)
    return parsed.strftime("%d %b %Y %H:%M")


def flights_context(best, cands: pd.DataFrame) -> str:
    """
    Build the [FLIGHTS] text block given to the planning model.

    Args:
        best: The selected itinerary; only checked against None here.
        cands: Candidate itineraries, one row each, with outbound and return
            legs, price, currency, party size and cabin.

    Returns:
        One bullet line per candidate joined with newlines, or a fixed
        "no match" sentence when there is no best flight or no candidates.
    """
    nothing_found = best is None or cands.empty
    if nothing_found:
        return "No flight options matched the filters."

    def render(row) -> str:
        # Compose one bullet from its four parts: outbound leg, return leg,
        # price and travelling party.
        airline = row['airlines']
        outbound = f"{row['origin']} {_dtfmt(row['out_dep_dt'])} → {row['destination']} {_dtfmt(row['out_arr_dt'])}"
        inbound = f"{row['ret_origin']} {_dtfmt(row['ret_dep_dt'])} → {row['ret_destination']} {_dtfmt(row['ret_arr_dt'])}"
        fare = f"{row['currency']} {row['total_price']:.2f}"
        party = f"A{row['adults']} C{row['children']} {row['cabin'].upper()}"
        return f"* {airline} — {outbound} || Return {inbound} | {fare} | {party}"

    return "\n".join(render(row) for _, row in cands.iterrows())

def _format_rating(hit) -> str:
    """
    Return the rating label for one hotel hit.

    A usable numeric ``rating`` becomes e.g. ``4★``. Anything else (missing,
    zero, NaN, non-numeric text) falls back to ``rating_raw``, then ``N/A``.
    """
    import math

    try:
        stars = float(hit.get("rating"))
    except (TypeError, ValueError):
        # Missing or non-numeric rating; formatting it with `:.0f` would raise.
        stars = None

    # NaN is truthy, so a plain truthiness test would print "nan★".
    if stars is not None and math.isfinite(stars) and stars != 0:
        return f"{stars:.0f}★"
    return hit.get("rating_raw") or "N/A"


def hotels_context(hits)->str:
    """
    Build the [HOTELS] text block given to the planning model.

    Args:
        hits: Iterable of dict-like hotel records with optional keys
            ``name``, ``city``, ``country``, ``rating``, ``rating_raw``,
            ``address`` and ``website``.

    Returns:
        A four-line entry per hotel joined with newlines, or a fixed
        "no match" sentence when ``hits`` is empty or None.
    """
    if not hits:
        return "No hotel options matched the filters."
    lines=[]
    for h in hits:
        # `or` rather than a .get() default: a key present with a None value
        # would otherwise be printed as the literal text "None".
        name = h.get("name") or "N/A"
        city = h.get("city") or ""
        country = h.get("country") or ""
        addr = h.get("address") or ""
        url  = h.get("website") or ""
        lines.append(
            f"* {name} — {city}, {country}\n"
            f"  Rating: {_format_rating(h)}\n"
            f"  Address: {addr}\n"
            f"  Website: {url}"
        )
    return "\n".join(lines)

# Assemble the context handed to the planning model: a [FLIGHTS] section and
# a [HOTELS] section separated by a blank line, ending with a newline.
CTX = "\n".join(
    [
        "[FLIGHTS]",
        flights_context(best_flight, flight_cands),
        "",
        "[HOTELS]",
        hotels_context(hotel_hits),
        "",
    ]
)
print(CTX[:800])  # preview

#address website query fix

[FLIGHTS]
* Easyjet — ZRH 10 Nov 2025 21:55 → FCO 10 Nov 2025 23:30 || Return FCO 24 Nov 2025 06:00 → ZRH 24 Nov 2025 07:35 | EUR 278.03 | A2 C1 ECONOMY
* Swiss — ZRH 10 Nov 2025 17:50 → FCO 10 Nov 2025 19:25 || Return FCO 24 Nov 2025 20:15 → ZRH 24 Nov 2025 21:50 | EUR 298.37 | A2 C1 ECONOMY
* Easyjet — ZRH 10 Nov 2025 21:55 → FCO 10 Nov 2025 23:30 || Return FCO 24 Nov 2025 19:40 → ZRH 24 Nov 2025 21:20 | EUR 328.41 | A2 C1 ECONOMY

[HOTELS]
* Hotel Alimandi Vaticano — Rome, Italy
  Rating: 4★
  Address: Viale Vaticano 99 00165 Roma
  Website: http://www.alimandivaticanohotel.com
* Summit Roma Hotel — Rome, Italy
  Rating: 4★
  Address: VIA DELLA STAZIONE AURELIA, 99 ROME
  Website: WWW.SRH.IT
* Hotel della Conciliazione — Rome, Italy
  Rating: 4★
  Address: Via Borgo Pio 164 00193 Roma
 


In [15]:
# Role and ground rules for the planning model: answer only from the supplied
# flight/hotel context, ask for anything missing, and follow the output format.
SYSTEM_PROMPT = """You are a meticulous travel planning assistant.
Use only the context blocks provided for:
- Flight details
- Hotel options
Do not fabricate or assume details not present in the context.

When responding:
1) Extract key info (origin, destination, dates, travelers, budget/class preferences).
2) If anything is missing, ask the user for it.
3) Filter and pick the best options from the context according to the user's ask (cheapest flight).
4) Output EXACTLY in the required format."""

# Template for the final answer. The flight bullet layout mirrors the lines
# produced by flights_context (airline — origin time → destination time |
# price | party and class); the hotel fields mirror hotels_context.
FORMAT_RULES = """
Output Format:

Flights (Round Trip)
* Outbound (<origin> → <destination>)
  <airline(s)> — <origin> <departure date/time> → <destination> <arrival date/time> | <currency> <price> | <passenger breakdown> <class>
* Return (<destination> → <origin>)
  <airline(s)> — <destination> <departure date/time> → <origin> <arrival date/time>

Hotel
* Hotel Name: <hotel name>
* Rating: <star rating>
* Address: <address>
* Website: <URL>

Formatting Rules:
- Passenger breakdown: A = Adult, C = Child (e.g., A2 C1).
- Write the full class type (e.g., “ECONOMY”).
- Dates/times format: “DD Mon YYYY HH:MM”.
"""

# Per-request message: the raw user text followed by the retrieved context.
# Built with an f-string, so it captures `user_input` and `CTX` as they are
# when this cell runs; re-run the cell after changing either.
user_task = f"""User Message:
{user_input}

Context:
{CTX}

Follow the Output Format exactly. If any required field is missing in context, state what is missing and ask the user for it (instead of guessing)."""

# The system prompt is bound once, when the model object is created.
planner_settings = {
    "model_name": "gemini-2.5-flash",
    "system_instruction": SYSTEM_PROMPT,
}
model = genai.GenerativeModel(**planner_settings)

# Send the format rules first, then the user task with its context.
prompt_parts = [FORMAT_RULES, user_task]
resp = model.generate_content(prompt_parts)

print(resp.text)

Flights (Round Trip)
* Outbound (ZRH → FCO)
  Easyjet — ZRH 10 Nov 2025 21:55 → FCO 10 Nov 2025 23:30 | EUR 278.03 | A2 C1 ECONOMY
* Return (FCO → ZRH)
  Easyjet — FCO 24 Nov 2025 06:00 → ZRH 24 Nov 2025 07:35

Hotel
* Hotel Name: Hotel Alimandi Vaticano
* Rating: 4★
* Address: Viale Vaticano 99 00165 Roma
* Website: http://www.alimandivaticanohotel.com
