In [1]:
from main_workflow import *
import pandas as pd
from tqdm import tqdm
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
bs = pd.read_csv('baseline_responses.csv')
gc = pd.read_csv('golden_copy.csv')

In [3]:
import re

In [64]:

def extract_fields(text):
    """Extract critical fields from model or ground truth response text."""
    text = ' '.join(text.split())

    # patterns
    date_pattern = r"(?:\d{4}-\d{2}-\d{2}|\d{1,2}\s+\w{3,9}\s+\d{4})\s+\d{2}:\d{2}"
    hotel_pattern = r'Hotel(?: Name)?:\s*([A-Za-z0-9\s\-\&\,\.]+)'

    # outbound airline
    outbound_match = re.search(
        r"Outbound\s*\([^)]*\)\*?\s*([\w\s,&\-\.\']+?)\s*—",
        text, re.IGNORECASE | re.DOTALL
    )
    outbound_airline = outbound_match.group(1).strip() if outbound_match else None
    if outbound_airline:
        outbound_airline_set = set(i.lower().strip() for i in outbound_airline.split(','))
    else:
        outbound_airline_set = set()

    # inbound airline
    inbound_match = re.search(
        r"Return\s*\([^)]*\)\*?\s*([\w\s,&\-\.\']+?)\s*—",
        text, re.IGNORECASE | re.DOTALL
    )
    inbound_airline = inbound_match.group(1).strip() if inbound_match else None
    if inbound_airline:
        inbound_airline_set = set(i.lower().strip() for i in inbound_airline.split(','))
    else:
        inbound_airline_set = set()

    # datetimes for flights, first date is departure, third date is arrival
    datetimes = re.findall(date_pattern, text)
    dep_datetime = (datetimes[0], datetimes[1])
    arr_datetime = (datetimes[2], datetimes[3])

    # hotel name
    hotel_match = re.search(hotel_pattern, text, re.IGNORECASE)
    hotel_name = hotel_match.group(1).strip() if hotel_match else None

    return {
        'Outbound Airline': outbound_airline_set,
        'Departure Datetime': dep_datetime,
        'Inbound Airline': inbound_airline_set,
        'Arrival Datetime': arr_datetime,
        'Hotel Name': hotel_name
    }

In [65]:
def calculate_score(fields_1, fields_2):
    score = 0
    for field, value in fields_1.items():
        if value == fields_2[field]:
            score += 1
    return score

In [45]:
merged_df = pd.merge(bs, gc, on='user_query', how='outer')

In [46]:
df = merged_df[['user_query', 'response', 'golden_response']]

In [68]:
total = 0

for _, row in df.iterrows():
    try:
        fields1 = extract_fields(row['response'])
        fields2 = extract_fields(row['golden_response'])
    except Exception:
        print(row['response'])
        print(row['golden_response'])
        print()
    total += calculate_score(fields1, fields2)

Flights (Round Trip)
* Outbound (Frankfurt → Rome)
  No flight options were found for your specified dates and preferences. Please provide alternative dates or preferences, or we can look for flights with more flexible options.
* Return (Rome → Frankfurt)
  No flight options were found for your specified dates and preferences. Please provide alternative dates or preferences, or we can look for flights with more flexible options.

Hotel
* Hotel Name: In Rome
* Rating: 3★
* Address: Viale Manzoni 13, 13 ROME
* Website: http://www.inrome.com/in-rome-bed-and-breakfasts.htm
* Description: This lovely hotel is set in San Giovanni and offers a quiet stay with just 5 units. Pets are not allowed.
* Facilities: Hotel

Suggested Itinerary:
As flight details are currently unavailable, this itinerary is general. Once flights are booked, it can be refined.
*   **10 Nov 2025:** Arrive in Rome (FCO) from Frankfurt. Check into "In Rome" in the San Giovanni district. Spend the afternoon exploring the lo

In [58]:
test = """Flights (Round Trip)
Outbound (SIN → LAX)*
Cathay Pacific Airways — SIN 2025-11-10 18:00 → LAX 2025-11-10 20:55 | SGD 3,152.53 | A3 C1 ECONOMY
Return (LAX → SIN)*
Cathay Pacific Airways — LAX 2025-11-24 22:35 → SIN 2025-11-26 11:55
Hotel
Hotel Name: Shutters on the Beach
Address: 1 Pico Blvd, Santa Monica, CA 90405, USA
Website: http://www.shuttersonthebeach.com"""

In [61]:
extract_fields(test)

{'Outbound Airline': {'cathay pacific airways'},
 'Departure Datetime': ('2025-11-10 18:00', '2025-11-10 20:55'),
 'Inbound Airline': {'cathay pacific airways'},
 'Arrival Datetime': ('2025-11-24 22:35', '2025-11-26 11:55'),
 'Hotel Name': 'Shutters on the Beach Address'}

In [None]:
results = []

for idx, row in tqdm(gc.iterrows(), total = len(gc)):
    query = row['user_query']
    res = run_travel_bot(query)
    results.append([query, res])

baseline_responses = pd.DataFrame(results, columns=['user_query', 'response'])

In [136]:
bs = pd.read_csv('baseline_responses.csv')
gc = pd.read_csv('golden_copy.csv')

In [137]:
merged_df = pd.merge(bs, gc, on='user_query', how='outer')


In [150]:
import evaluate
import nltk
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /Users/nltyh/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [151]:
df = merged_df[['user_query', 'response', 'golden_response']]

In [152]:
bertscore = evaluate.load('bertscore')
meteor = evaluate.load('meteor')

[nltk_data] Downloading package wordnet to /Users/nltyh/nltk_data...
[nltk_data] Downloading package punkt_tab to /Users/nltyh/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/nltyh/nltk_data...


In [170]:
bertscore_df, avg_score = evaluator.bertEval()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [171]:
meteor_df, avg_meteor = evaluator.meteorEval()

In [173]:
avg_meteor

np.float64(0.425310123487645)

In [174]:
avg_score

np.float64(0.8550686163780017)