In [2]:
!pip install pydantic python-dateutil




In [3]:
import re
from enum import Enum
from typing import List, Dict, Optional, Any
from pydantic import BaseModel, Field
from dateutil import parser as dateparser


In [4]:
class Intent(str, Enum):
    """Intent labels that the rule-based classifier can assign to a query.

    The enum mixes in ``str``, so every member compares equal to its plain
    string value (for example ``Intent.FLIGHT_STATUS == "flight_status"``).
    """

    FLIGHT_STATUS = "flight_status"
    FLIGHT_ROUTE = "flight_route"
    JOURNEY_FEEDBACK = "journey_feedback"
    DELAY_ANALYSIS = "delay_analysis"
    PASSENGER_HISTORY = "passenger_history"
    AIRPORT_TRAFFIC = "airport_traffic"
    # Fallback used by classify_intent when no keyword rule scores above zero;
    # it has no keyword list of its own in INTENT_RULES.
    GENERAL_QA = "general_qa"


class ExtractedEntity(BaseModel):
    """A single entity found in a query by ``extract_entities``."""

    # Entity category. extract_entities emits "FLIGHT_NUMBER", "RECORD_LOCATOR",
    # "AIRPORT_CODE" and "JOURNEY_METRIC".
    type: str
    # The matched text (for JOURNEY_METRIC: the keyword that triggered the match).
    value: str
    # Canonical form used when filling slots (for JOURNEY_METRIC: the metric name).
    normalized: Optional[str] = None
    # Extractor-assigned confidence; defaults to 1.0 when not given.
    confidence: float = 1.0


class PreprocessResult(BaseModel):
    """Everything ``preprocess_query`` derives from one user query."""

    # The query exactly as the user supplied it.
    original_text: str
    # Output of normalize_text for that query.
    normalized_text: str
    # Intent chosen by classify_intent.
    intent: Intent
    # Entities returned by extract_entities, in extraction order.
    entities: List[ExtractedEntity]
    # Slot dictionary produced by build_slots from the entities.
    slots: Dict[str, Any]
    # Diagnostic data; preprocess_query stores the per-intent scores under
    # the "intent_scores" key. Defaults to a fresh empty dict per instance.
    debug: Dict[str, Any] = Field(default_factory=dict)


In [5]:
def normalize_text(text: str) -> str:
    """Return a lower-cased, punctuation-free, single-spaced copy of ``text``.

    Every character that is neither a word character (letter, digit or
    underscore) nor whitespace is replaced by a space, runs of whitespace are
    collapsed to a single space, and leading/trailing whitespace is removed.

    Args:
        text: Raw user query.

    Returns:
        The normalized query. An input made only of punctuation and/or
        whitespace yields the empty string.
    """
    text = re.sub(r"[^\w\s]", " ", text.lower())
    text = re.sub(r"\s+", " ", text)
    # Strip last: punctuation at either end has just been turned into spaces,
    # so stripping before the substitutions would leave a stray space behind
    # (e.g. "aa123?" -> "aa123 ").
    return text.strip()


In [6]:
# Keyword table driving classify_intent: each intent maps to the keywords or
# phrases that count as evidence for it. Every keyword found in the query adds
# one point to its intent's score. Keywords are looked up in text produced by
# normalize_text, so they are written in lower case without punctuation.
# Dictionary order matters: when two intents reach the same top score, the one
# listed first here wins.
INTENT_RULES = {
    Intent.FLIGHT_STATUS: ["status", "arrival", "departure", "on time", "delayed"],
    Intent.FLIGHT_ROUTE: ["from", "to", "route", "flies"],
    Intent.JOURNEY_FEEDBACK: ["food", "satisfaction", "feedback", "experience"],
    Intent.DELAY_ANALYSIS: ["delay", "late", "arrival delay"],
    Intent.PASSENGER_HISTORY: ["passenger", "history", "journeys", "flights taken"],
    Intent.AIRPORT_TRAFFIC: ["airport", "traffic", "busy"],
}


def _keyword_in_text(keyword: str, text: str) -> bool:
    """Return True if ``keyword`` occurs in ``text`` as a whole word or phrase.

    A plain ``keyword in text`` test also fires inside unrelated words ("to"
    inside "customer" or "history", "late" inside "translate"). Matching is
    therefore anchored on word boundaries, while a trailing ``s``, ``es``,
    ``d``, ``ed`` or ``ing`` is tolerated so that simple inflections such as
    "delays" or "delayed" still count for the keyword "delay".
    """
    pattern = r"\b" + re.escape(keyword) + r"(?:s|es|d|ed|ing)?\b"
    return re.search(pattern, text) is not None


def classify_intent(
    normalized_text: str,
    rules: Optional[Dict[Any, List[str]]] = None,
):
    """Score every intent by keyword hits and return the best one.

    Args:
        normalized_text: Query text as produced by ``normalize_text``
            (lower case, no punctuation).
        rules: Optional mapping of intent -> list of keywords/phrases. When
            omitted, the module-level ``INTENT_RULES`` table is used.

    Returns:
        A ``(best_intent, scores)`` tuple. ``scores`` maps every intent in the
        rule table to the number of its keywords found in the text. Ties are
        resolved in favour of the intent listed first in the rule table. If no
        keyword matches at all (or the table is empty), ``best_intent`` is
        ``Intent.GENERAL_QA``.
    """
    if rules is None:
        rules = INTENT_RULES

    scores = {
        intent: sum(1 for kw in keywords if _keyword_in_text(kw, normalized_text))
        for intent, keywords in rules.items()
    }

    # `default=None` keeps an empty rule table from raising ValueError.
    best_intent = max(scores, key=scores.get, default=None)
    if best_intent is None or scores[best_intent] == 0:
        best_intent = Intent.GENERAL_QA

    return best_intent, scores


In [7]:
# Token patterns used by extract_entities.
# FLIGHT_NUMBER: two letters followed by one to four digits, delimited by word
# boundaries; compiled case-insensitively. Group 1 is the whole flight number.
FLIGHT_NUMBER = re.compile(r"\b([A-Z]{2}\d{1,4})\b", re.IGNORECASE)
# RECORD_LOCATOR: exactly six characters from A-Z / 0-9 as a standalone token.
# Case-sensitive: only upper-case letters match.
RECORD_LOCATOR = re.compile(r"\b[A-Z0-9]{6}\b")
# AIRPORT_CODE: exactly three upper-case letters as a standalone token
# (case-sensitive).
AIRPORT_CODE = re.compile(r"\b[A-Z]{3}\b")

# Maps a journey metric name to the keywords that signal it in a query.
# extract_entities reports the keyword as the entity's `value` and the metric
# name (the dictionary key) as its `normalized` form.
METRIC_KEYWORDS = {
    "arrival_delay_minutes": ["delay", "late"],
    "food_satisfaction_score": ["food", "meal", "satisfaction"],
    "actual_flown_miles": ["miles", "distance"],
    "number_of_legs": ["legs", "connections"],
    "passenger_class": ["economy", "business", "first"]
}


In [8]:
def extract_entities(text: str) -> List[ExtractedEntity]:
    """Extract flight numbers, record locators, airport codes and journey metrics.

    Pattern matching runs on an upper-cased copy of ``text`` and keyword checks
    on a lower-cased copy, so the result does not depend on the caller's
    casing.

    Args:
        text: Query text, raw or already normalized.

    Returns:
        Entities in extraction order: flight numbers, then record locators
        (only when the text mentions "booking", "record", "locator" or
        "passenger"), then airport codes, then one JOURNEY_METRIC entity per
        matching keyword in ``METRIC_KEYWORDS``.

    NOTE(review): because the patterns are applied to upper-cased text, every
    standalone three-letter word is reported as an AIRPORT_CODE and every
    six-character alphanumeric token as a RECORD_LOCATOR. Treat both as
    candidates rather than confirmed codes.
    """
    # Compute each casing once instead of once per pattern.
    lowered = text.lower()
    uppered = text.upper()
    locator_triggers = ("booking", "record", "locator", "passenger")

    entities: List[ExtractedEntity] = []

    # Flight number
    for m in FLIGHT_NUMBER.finditer(uppered):
        entities.append(
            ExtractedEntity(
                type="FLIGHT_NUMBER",
                value=m.group(1),
                normalized=m.group(1),
                confidence=1.0
            )
        )

    # Record locator (Passenger) -- only searched when the query talks about a
    # booking/passenger; the check is case-insensitive.
    if any(trigger in lowered for trigger in locator_triggers):
        for m in RECORD_LOCATOR.finditer(uppered):
            candidate = m.group(0)
            # A trigger word that happens to be six letters long ("record")
            # matches the locator pattern itself; it is context, not a locator.
            if candidate.lower() in locator_triggers:
                continue
            entities.append(
                ExtractedEntity(
                    type="RECORD_LOCATOR",
                    value=candidate,
                    normalized=candidate,
                    confidence=0.95
                )
            )

    # Airport codes
    for m in AIRPORT_CODE.finditer(uppered):
        entities.append(
            ExtractedEntity(
                type="AIRPORT_CODE",
                value=m.group(0),
                normalized=m.group(0),
                confidence=1.0
            )
        )

    # Journey metrics: one entity per keyword hit, normalized to the metric name.
    for metric, kws in METRIC_KEYWORDS.items():
        for kw in kws:
            if kw in lowered:
                entities.append(
                    ExtractedEntity(
                        type="JOURNEY_METRIC",
                        value=kw,
                        normalized=metric,
                        confidence=0.9
                    )
                )

    return entities


In [9]:
def build_slots(entities: List[ExtractedEntity]) -> Dict[str, Any]:
    """Collapse extracted entities into a flat slot dictionary.

    Args:
        entities: Entities in extraction order; only ``type`` and
            ``normalized`` are read.

    Returns:
        A dict that may contain:
          * ``record_locator`` / ``flight_number`` -- the last entity of that
            type wins;
          * ``metrics`` -- distinct metric names in first-seen order;
          * ``origin_station`` / ``destination_station`` -- the first and
            second airport codes, set only when at least two were found.
        Keys with no matching entity are omitted.
    """
    slots: Dict[str, Any] = {}
    airports: List[str] = []

    for e in entities:
        if e.type == "RECORD_LOCATOR":
            slots["record_locator"] = e.normalized

        elif e.type == "FLIGHT_NUMBER":
            slots["flight_number"] = e.normalized

        elif e.type == "AIRPORT_CODE":
            airports.append(e.normalized)

        elif e.type == "JOURNEY_METRIC":
            metrics = slots.setdefault("metrics", [])
            # Several keywords can map to the same metric ("delay" and "late"
            # both mean arrival_delay_minutes); list each metric only once.
            if e.normalized not in metrics:
                metrics.append(e.normalized)

    if len(airports) >= 2:
        slots["origin_station"] = airports[0]
        slots["destination_station"] = airports[1]

    return slots


In [10]:
def preprocess_query(user_text: str) -> PreprocessResult:
    """Run the whole preprocessing pipeline on a single user query.

    The query is normalized, then the normalized text is used both to classify
    the intent and to extract entities; the entities are finally folded into
    slots.

    Args:
        user_text: The query as typed by the user.

    Returns:
        A ``PreprocessResult`` holding the original and normalized text, the
        chosen intent, the extracted entities, the derived slots, and the
        per-intent scores under ``debug["intent_scores"]``.
    """
    cleaned = normalize_text(user_text)
    label, score_table = classify_intent(cleaned)
    found = extract_entities(cleaned)

    return PreprocessResult(
        original_text=user_text,
        normalized_text=cleaned,
        intent=label,
        entities=found,
        slots=build_slots(found),
        debug={"intent_scores": score_table},
    )
