In [2]:
import sys
import os
from neo4j import GraphDatabase


# Add the current working directory to the import path so that modules stored
# next to this notebook can be imported by later cells.
sys.path.append(os.path.abspath("."))


def load_config(path="config.txt"):
    """Read ``KEY=VALUE`` settings from a plain-text file.

    Blank lines, lines starting with ``#`` and lines without ``=`` are ignored.
    Whitespace around keys and values is removed.

    Args:
        path: File to read. Defaults to ``config.txt`` in the working directory.

    Returns:
        dict mapping each key to its (string) value.

    Raises:
        OSError: If the file cannot be opened.
    """
    cfg = {}
    with open(path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            # Split on the FIRST "=" only: values such as passwords or URIs with
            # query strings may legitimately contain "=" themselves.
            key, _, value = line.partition("=")
            cfg[key.strip()] = value.strip()
    return cfg

# Load the settings once and expose the connection values as module-level
# constants (they are passed to GraphRetriever in a later cell).
# A key missing from the config file raises KeyError here.
config = load_config()

URI = config["URI"]
USERNAME = config["USERNAME"]
PASSWORD = config["PASSWORD"]

In [3]:
import re
from enum import Enum
from typing import List, Dict, Optional, Any
from pydantic import BaseModel, Field
from dateutil import parser as dateparser


In [4]:
import os
import time
from langchain_huggingface import ChatHuggingFace
from langchain_core.messages import HumanMessage
from transformers import pipeline
from langchain_community.llms import HuggingFacePipeline
from huggingface_hub import InferenceClient
from langchain_huggingface import HuggingFaceEndpoint
# Do not commit a real token here. Keep whatever HF_TOKEN the environment already
# provides and fall back to the placeholder only when none is set, so running
# this cell can no longer overwrite a valid credential.
os.environ.setdefault("HF_TOKEN", "add token id")


In [5]:
class Intent(str, Enum):
    """Intents a user query can be classified into.

    The ``str`` mix-in makes every member compare equal to its string value.
    """
    FLIGHT_STATUS = "flight_status"
    FLIGHT_ROUTE = "flight_route"
    JOURNEY_FEEDBACK = "journey_feedback"
    DELAY_ANALYSIS = "delay_analysis"
    PASSENGER_HISTORY = "passenger_history"
    AIRPORT_TRAFFIC = "airport_traffic"
    # Fallback chosen by classify_intent when no keyword rule scores.
    GENERAL_QA = "general_qa"


class ExtractedEntity(BaseModel):
    """One entity found in a user query by extract_entities."""
    # Entity category; extract_entities emits "FLIGHT_NUMBER", "RECORD_LOCATOR",
    # "AIRPORT_CODE" and "JOURNEY_METRIC".
    type: str
    # The matched text (for JOURNEY_METRIC: the keyword that matched).
    value: str
    # Canonical form read by build_slots (for JOURNEY_METRIC: the metric name).
    normalized: Optional[str] = None
    # Confidence assigned by the extractor; defaults to 1.0.
    confidence: float = 1.0


class PreprocessResult(BaseModel):
    """Everything preprocess_query derives from a single user query."""
    # The query exactly as the user typed it.
    original_text: str
    # Output of normalize_text for that query.
    normalized_text: str
    # Intent selected by classify_intent.
    intent: Intent
    # Entities returned by extract_entities.
    entities: List[ExtractedEntity]
    # Query parameters assembled by build_slots.
    slots: Dict[str, Any]
    # Diagnostic data; preprocess_query stores the per-intent scores here.
    debug: Dict[str, Any] = Field(default_factory=dict)


In [6]:
def normalize_text(text: str) -> str:
    """Lower-case a query, replace punctuation with spaces and tidy whitespace.

    Args:
        text: Raw user text.

    Returns:
        The text in lower case, with every character that is neither a word
        character nor whitespace replaced by a space, runs of whitespace
        collapsed to a single space, and no leading or trailing whitespace.
    """
    text = re.sub(r"[^\w\s]", " ", text.lower())
    # Trim LAST: punctuation at either end of the query has just been turned into
    # a space, so stripping before the substitution would leave it behind.
    return re.sub(r"\s+", " ", text).strip()


In [7]:
# Keyword rules used by classify_intent: every keyword found in the normalized
# query adds one point to its intent. Dict order matters, because max() keeps
# the first maximum and so ties go to the intent listed first.
INTENT_RULES = {
    Intent.FLIGHT_STATUS: ["status", "arrival", "departure", "on time", "delayed"],
    Intent.FLIGHT_ROUTE: ["from", "to", "route", "flies"],
    Intent.JOURNEY_FEEDBACK: ["food", "satisfaction", "feedback", "experience"],
    Intent.DELAY_ANALYSIS: ["delay", "late", "arrival delay"],
    Intent.PASSENGER_HISTORY: ["passenger", "history", "journeys", "flights taken"],
    Intent.AIRPORT_TRAFFIC: ["airport", "traffic", "busy"],
}


def classify_intent(normalized_text: str, rules=None):
    """Score each intent by keyword hits and pick the best one.

    A keyword counts only when it appears as a whole word or phrase (an optional
    plural "s" is tolerated). Plain substring matching would let "to" score
    inside "customer" or "history" and "late" inside "related".

    Args:
        normalized_text: Query text as produced by normalize_text.
        rules: Optional mapping of intent -> list of keywords. Defaults to the
            module-level INTENT_RULES.

    Returns:
        Tuple ``(best_intent, scores)`` where ``scores`` maps every intent in
        ``rules`` to its hit count. When nothing matches, ``best_intent`` is
        ``Intent.GENERAL_QA``. Ties go to the intent listed first in ``rules``.
    """
    if rules is None:
        rules = INTENT_RULES

    scores = {}
    for intent, keywords in rules.items():
        scores[intent] = sum(
            1
            for kw in keywords
            if re.search(rf"\b{re.escape(kw)}s?\b", normalized_text)
        )

    # No rules at all, or no keyword hit: fall back to the generic intent.
    if not scores or max(scores.values()) == 0:
        return Intent.GENERAL_QA, scores

    # max() returns the first maximum, so dict order breaks ties.
    best_intent = max(scores, key=scores.get)
    return best_intent, scores


In [8]:
# Two letters followed by one to four digits, e.g. "AA123" (case-insensitive).
FLIGHT_NUMBER = re.compile(r"\b([A-Z]{2}\d{1,4})\b", re.IGNORECASE)
# Any six-character run of upper-case letters/digits. The pattern is
# case-sensitive, but extract_entities upper-cases its input before applying it,
# so ordinary six-letter words match as well.
RECORD_LOCATOR = re.compile(r"\b[A-Z0-9]{6}\b")
# Any three upper-case letters.
# NOTE(review): extract_entities applies this to upper-cased text, so every
# three-letter word in the query (e.g. "the") is reported as an airport code.
AIRPORT_CODE = re.compile(r"\b[A-Z]{3}\b")

# Metric name -> keywords that indicate it. extract_entities tests each keyword
# with a substring check and emits one JOURNEY_METRIC entity per hit.
METRIC_KEYWORDS = {
    "arrival_delay_minutes": ["delay", "late"],
    "food_satisfaction_score": ["food", "meal", "satisfaction"],
    "actual_flown_miles": ["miles", "distance"],
    "number_of_legs": ["legs", "connections"],
    "passenger_class": ["economy", "business", "first"]
}


In [9]:
def extract_entities(text: str) -> List[ExtractedEntity]:
    """Extract flight numbers, record locators, airport codes and metric hints.

    The code patterns are applied to an upper-cased copy of ``text``; the
    keyword checks use ``text`` as given, so they expect lower-case input.

    Args:
        text: Query text (normally the output of normalize_text).

    Returns:
        Entities in this order: flight numbers, record locators, airport codes,
        journey metrics. Record locators are only looked for when the text
        mentions a booking-related trigger word.
    """
    locator_triggers = ("booking", "record", "locator", "passenger")
    # Upper-case once and reuse for all three code patterns.
    upper_text = text.upper()
    entities = []

    # Flight number
    for m in FLIGHT_NUMBER.finditer(upper_text):
        code = m.group(1)
        entities.append(
            ExtractedEntity(
                type="FLIGHT_NUMBER",
                value=code,
                normalized=code,
                confidence=1.0
            )
        )

    # Record locator (Passenger)
    if any(trigger in text for trigger in locator_triggers):
        # "record" is itself six letters long, so without this guard the trigger
        # word would always be returned as a locator - and could overwrite the
        # real one, because build_slots keeps the last locator it sees.
        trigger_words = {trigger.upper() for trigger in locator_triggers}
        for m in RECORD_LOCATOR.finditer(upper_text):
            token = m.group(0)
            if token in trigger_words:
                continue
            entities.append(
                ExtractedEntity(
                    type="RECORD_LOCATOR",
                    value=token,
                    normalized=token,
                    confidence=0.95
                )
            )

    # Airport codes
    # NOTE(review): on upper-cased text this matches every three-letter word;
    # callers that still have the original casing should filter the result.
    for m in AIRPORT_CODE.finditer(upper_text):
        token = m.group(0)
        entities.append(
            ExtractedEntity(
                type="AIRPORT_CODE",
                value=token,
                normalized=token,
                confidence=1.0
            )
        )

    # Journey metrics: one entity per keyword hit, normalized to the metric name.
    for metric, kws in METRIC_KEYWORDS.items():
        for kw in kws:
            if kw in text:
                entities.append(
                    ExtractedEntity(
                        type="JOURNEY_METRIC",
                        value=kw,
                        normalized=metric,
                        confidence=0.9
                    )
                )

    return entities


In [10]:
def build_slots(entities: List["ExtractedEntity"]) -> Dict[str, Any]:
    """Turn extracted entities into the slot dictionary used for KG queries.

    Args:
        entities: Objects exposing ``type`` and ``normalized`` attributes.

    Returns:
        A dict that may contain:
          * ``record_locator`` / ``flight_number``: the last entity of that type;
          * ``metrics``: metric names in first-seen order, without duplicates;
          * ``origin_station`` / ``destination_station``: the first two airport
            codes, set only when at least two were found.
    """
    slots: Dict[str, Any] = {}
    airports = []

    for e in entities:
        if e.type == "RECORD_LOCATOR":
            slots["record_locator"] = e.normalized

        elif e.type == "FLIGHT_NUMBER":
            slots["flight_number"] = e.normalized

        elif e.type == "AIRPORT_CODE":
            airports.append(e.normalized)

        elif e.type == "JOURNEY_METRIC":
            # Several keywords map to the same metric (e.g. "delay" and "late"),
            # so record each metric name only once.
            metrics = slots.setdefault("metrics", [])
            if e.normalized not in metrics:
                metrics.append(e.normalized)

    # A route needs both ends; a single airport code is not enough.
    if len(airports) >= 2:
        slots["origin_station"] = airports[0]
        slots["destination_station"] = airports[1]

    return slots


In [11]:
def _drop_unconfirmed_codes(entities, original_text):
    """Discard code-like matches that the user did not type in upper case.

    Entity extraction runs on lower-cased text, so any three-letter word looks
    like an airport code and any six-letter word like a record locator. When at
    least one match of such a type was typed in upper case in the original text,
    only the upper-case matches of that type are kept. When none was (e.g. the
    whole query is lower case), all matches of that type are kept unchanged.

    Returns:
        Tuple ``(kept_entities, dropped_values)``.
    """
    case_sensitive_types = ("AIRPORT_CODE", "RECORD_LOCATOR")
    typed_upper = set(re.findall(r"\b[A-Z0-9]+\b", original_text))
    confirmed_types = {
        e.type
        for e in entities
        if e.type in case_sensitive_types and e.value in typed_upper
    }

    kept, dropped = [], []
    for e in entities:
        if e.type in confirmed_types and e.value not in typed_upper:
            dropped.append(e.value)
        else:
            kept.append(e)
    return kept, dropped


def preprocess_query(user_text: str) -> PreprocessResult:
    """Run the full preprocessing pipeline on one user query.

    Steps: normalize the text, classify the intent, extract entities, remove
    code-like false positives using the original casing, and build the slots.

    Args:
        user_text: The query as typed by the user.

    Returns:
        PreprocessResult whose ``debug`` holds the per-intent scores and the
        values of any entities that were discarded as false positives.
    """
    normalized = normalize_text(user_text)

    intent, intent_scores = classify_intent(normalized)
    # Without this filter "show me the flights from LAX to JFK" would yield
    # origin "THE" and destination "LAX".
    entities, dropped = _drop_unconfirmed_codes(
        extract_entities(normalized), user_text
    )
    slots = build_slots(entities)

    return PreprocessResult(
        original_text=user_text,
        normalized_text=normalized,
        intent=intent,
        entities=entities,
        slots=slots,
        debug={
            "intent_scores": intent_scores,
            "dropped_entities": dropped
        }
    )


In [12]:
def route_to_component2(preprocess_result, graph_retriever, min_miles=2000):
    """Dispatch a preprocessed query to the matching knowledge-graph query.

    Args:
        preprocess_result: Object exposing ``intent`` and ``slots``.
        graph_retriever: Object exposing ``retrieve(query_name, params)``.
        min_miles: Distance threshold passed to the "long_flights" query that
            serves the delay-analysis intent. Defaults to 2000.

    Returns:
        Whatever ``graph_retriever.retrieve`` returns for a mapped intent, or a
        dict with a ``"message"`` key when the intent has no query mapped.
    """
    intent = preprocess_result.intent
    slots = preprocess_result.slots

    # (intent, query name, query parameters).
    # NOTE(review): slots that were not extracted are forwarded as None; how the
    # retriever handles None parameters is not visible here - confirm.
    routes = (
        (
            Intent.FLIGHT_ROUTE,
            "flights_between",
            {
                "origin": slots.get("origin_station"),
                "destination": slots.get("destination_station"),
            },
        ),
        (
            Intent.FLIGHT_STATUS,
            "journey_flight",
            {"flight_number": slots.get("flight_number")},
        ),
        (
            Intent.JOURNEY_FEEDBACK,
            "food_scores_by_passenger",
            {"record_locator": slots.get("record_locator")},
        ),
        (
            Intent.PASSENGER_HISTORY,
            "passenger_journeys",
            {"record_locator": slots.get("record_locator")},
        ),
        (
            Intent.DELAY_ANALYSIS,
            "long_flights",
            {"min_miles": min_miles},
        ),
    )

    # Compare with == (not a dict lookup) so a plain string intent matches too.
    for routed_intent, query_name, params in routes:
        if intent == routed_intent:
            return graph_retriever.retrieve(query_name, params)

    return {"message": "Intent recognized, but no KG query mapped."}


In [13]:
# Smoke-test the preprocessing pipeline on a sample route question and show
# what was extracted.
user_text = "Show me flights from LAX to IAX"
preprocess_result = preprocess_query(user_text)

print("Intent:", preprocess_result.intent)
print("Slots:", preprocess_result.slots)
print("Entities:", preprocess_result.entities)


Intent: Intent.FLIGHT_ROUTE
Slots: {'origin_station': 'LAX', 'destination_station': 'IAX'}
Entities: [ExtractedEntity(type='AIRPORT_CODE', value='LAX', normalized='LAX', confidence=1.0), ExtractedEntity(type='AIRPORT_CODE', value='IAX', normalized='IAX', confidence=1.0)]


In [14]:
from graph_retriever import GraphRetriever


In [15]:
# Create the retriever with the connection settings read from the config file.
gr = GraphRetriever(URI, USERNAME, PASSWORD)

print("GraphRetriever initialized.")


GraphRetriever initialized.


In [16]:
# Run the knowledge-graph query selected by the intent of the sample question.
kg_results = route_to_component2(preprocess_result, gr)
print("KG Results:", kg_results)


KG Results: [{'flight': '500'}, {'flight': '1414'}, {'flight': '1653'}, {'flight': '2385'}, {'flight': '2607'}, {'flight': '1715'}, {'flight': '388'}, {'flight': '1735'}, {'flight': '2300'}, {'flight': '2391'}, {'flight': '1636'}, {'flight': '784'}, {'flight': '297'}, {'flight': '1962'}, {'flight': '1681'}, {'flight': '2391'}, {'flight': '1414'}, {'flight': '1182'}, {'flight': '2281'}, {'flight': '2411'}, {'flight': '2608'}]


In [17]:
# Build embeddings with the "node2vec" method; the returned status message is
# displayed as the cell output.
print("Training Node2Vec embeddings...")
output = gr.build_embeddings(method="node2vec")
output

Training Node2Vec embeddings...


Computing transition probabilities:   0%|          | 0/7944 [00:00<?, ?it/s]

Generating walks (CPU: 2): 100%|██████████| 25/25 [00:00<00:00, 35.00it/s]
Generating walks (CPU: 1): 100%|██████████| 25/25 [00:00<00:00, 34.00it/s]


"Embedding model 'node2vec_embed' trained and indexed."

In [18]:
# Build embeddings with the "graphsage" method; the returned status message is
# displayed as the cell output.
print("Training GraphSAGE embeddings...")
output = gr.build_embeddings(method="graphsage")
output


Training GraphSAGE embeddings...


"Embedding model 'sage_embed' trained and indexed."

In [19]:
# Pick one Passenger node and keep its element id as the query node for the
# similarity searches below.
# NOTE(review): .single() is subscripted directly, so this presumably fails
# when the graph contains no Passenger node - confirm.
with gr.driver.session() as session:
    eid = session.run("""
        MATCH (p:Passenger)
        RETURN elementId(p) AS eid
        LIMIT 1
    """).single()["eid"]

# Five most similar Passenger nodes according to the node2vec embeddings.
print("Node2Vec similarity:")
node2vec_results = gr.retrieve(
    "similar_nodes",
    {
        "embedding_name": "node2vec_embed",
        "label": "Passenger",
        "node_eid": eid,
        "k": 5
    }
)
node2vec_results

Node2Vec similarity:


[{'id': '4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:0',
  'score': 0.9999275207519531},
 {'id': '4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:5239',
  'score': 0.9907218217849731},
 {'id': '4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:6619',
  'score': 0.9873120784759521},
 {'id': '4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:3139',
  'score': 0.9865827560424805},
 {'id': '4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:7099',
  'score': 0.9863818883895874}]

In [20]:
# Same similarity query for the same node, using the GraphSAGE embeddings.
# NOTE(review): the recorded output shows a score of exactly 1.0 for every
# neighbour, which suggests the vectors may be (nearly) identical - verify the
# GraphSAGE training before relying on these results.
print("GraphSAGE similarity:")
sage_results = gr.retrieve(
    "similar_nodes",
    {
        "embedding_name": "sage_embed",
        "label": "Passenger",
        "node_eid": eid,
        "k": 5
    }
)

sage_results

GraphSAGE similarity:


[{'id': '4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:8188', 'score': 1.0},
 {'id': '4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:8486', 'score': 1.0},
 {'id': '4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:7065', 'score': 1.0},
 {'id': '4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:8489', 'score': 1.0},
 {'id': '4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:7122', 'score': 1.0}]

In [21]:
def merge_retrieval_results(baseline_results, embedding_results):
    """Annotate Cypher results with scores from an embedding search.

    Args:
        baseline_results: Iterable of dicts with a ``"flight"`` key and an
            optional ``"origin"`` key.
        embedding_results: Iterable of dicts with ``"id"`` and ``"score"`` keys.

    Returns:
        One record per distinct ``"flight"`` value, in first-seen order (a later
        duplicate replaces the earlier record). Each record has the keys
        ``flight``, ``origin``, ``similarity`` and ``source``. An embedding hit
        whose ``id`` equals a flight key sets ``similarity`` and appends
        ``"embedding"`` to ``source``; hits without a match are ignored.
    """
    # A repeated key keeps its original position but takes the latest value.
    by_flight = {
        row.get("flight"): {
            "flight": row.get("flight"),
            "origin": row.get("origin"),
            "similarity": None,
            "source": ["cypher"],
        }
        for row in baseline_results
    }

    for hit in embedding_results:
        record = by_flight.get(hit.get("id"))
        if record is None:
            continue
        record["similarity"] = hit["score"]
        record["source"].append("embedding")

    return [*by_flight.values()]


In [22]:
def build_context(baseline, embeddings):
    """Render retrieval results as the plain-text context block for the prompt.

    Args:
        baseline: Iterable of dicts with an optional ``"flight"`` key (shown as
            "Unknown" when absent) and an optional ``"origin"`` key.
        embeddings: Iterable of dicts with optional ``"id"`` and ``"score"``
            keys (defaults "Unknown" and 0); scores are rounded to 3 decimals.

    Returns:
        A header, one line per baseline row, a second header and one line per
        embedding hit, every line terminated by a newline.
    """
    pieces = ["Retrieved airline operational insights:\n\n"]

    # Baseline (KG) results
    for row in baseline:
        flight = row.get("flight", "Unknown")
        origin = row.get("origin")
        pieces.append(
            f"Flight {flight} departing from {origin}.\n"
            if origin
            else f"Flight {flight}.\n"
        )

    # Embedding results
    pieces.append("\nSimilar nodes based on graph embeddings:\n")
    pieces.extend(
        f"Node {hit.get('id', 'Unknown')} with similarity score "
        f"{round(hit.get('score', 0), 3)}.\n"
        for hit in embeddings
    )

    return "".join(pieces)


In [23]:
# Combine the Cypher results with the node2vec neighbours and render the text
# context for the LLM prompt.
# NOTE(review): in the recorded outputs the baseline rows are keyed by flight
# number ('500', ...) while the embedding ids are node element ids
# ('4:...:0'), so merge_retrieval_results never finds a match; `merged` is
# also not used by any later cell visible here.
baseline_results = kg_results
embedding_results = node2vec_results

merged = merge_retrieval_results(baseline_results, embedding_results)
kg_context = build_context(baseline_results, embedding_results)

print(kg_context)


Retrieved airline operational insights:

Flight 500.
Flight 1414.
Flight 1653.
Flight 2385.
Flight 2607.
Flight 1715.
Flight 388.
Flight 1735.
Flight 2300.
Flight 2391.
Flight 1636.
Flight 784.
Flight 297.
Flight 1962.
Flight 1681.
Flight 2391.
Flight 1414.
Flight 1182.
Flight 2281.
Flight 2411.
Flight 2608.

Similar nodes based on graph embeddings:
Node 4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:0 with similarity score 1.0.
Node 4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:5239 with similarity score 0.991.
Node 4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:6619 with similarity score 0.987.
Node 4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:3139 with similarity score 0.987.
Node 4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:7099 with similarity score 0.986.



In [30]:
# The prompt is built in the following cell, right after build_prompt and
# user_question are defined. Calling build_prompt at this point raised NameError
# on a fresh kernel (Restart & Run All), so the premature call was removed.


In [31]:
def build_prompt(context, question):
    """Assemble the LLM prompt: persona, retrieved context, task, user question.

    Args:
        context: Text retrieved from the knowledge graph.
        question: The user's question.

    Returns:
        The prompt string; it starts and ends with a newline.
    """
    lines = (
        "",
        "Persona:",
        "You are an airline operations insight assistant.",
        "You help airline companies analyze flight delays,",
        "routes performance, and passenger satisfaction.",
        "",
        "Context:",
        "The following information was retrieved from the airline knowledge graph.",
        "Use ONLY this information. Do NOT use external knowledge.",
        "",
        f"{context}",
        "",
        "Task:",
        "Provide clear operational insights and recommendations",
        "based only on the context above.",
        "If the information is insufficient, say so clearly.",
        "",
        "User Question:",
        f"{question}",
        "",
    )
    return "\n".join(lines)
# Build and display the prompt for a sample question, using the context
# rendered from the retrieval results.
user_question = "Which route has the highest delays and what should the airline improve?"
prompt = build_prompt(kg_context, user_question)
print(prompt)




Persona:
You are an airline operations insight assistant.
You help airline companies analyze flight delays,
routes performance, and passenger satisfaction.

Context:
The following information was retrieved from the airline knowledge graph.
Use ONLY this information. Do NOT use external knowledge.

Retrieved airline operational insights:

Flight 500.
Flight 1414.
Flight 1653.
Flight 2385.
Flight 2607.
Flight 1715.
Flight 388.
Flight 1735.
Flight 2300.
Flight 2391.
Flight 1636.
Flight 784.
Flight 297.
Flight 1962.
Flight 1681.
Flight 2391.
Flight 1414.
Flight 1182.
Flight 2281.
Flight 2411.
Flight 2608.

Similar nodes based on graph embeddings:
Node 4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:0 with similarity score 1.0.
Node 4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:5239 with similarity score 0.991.
Node 4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:6619 with similarity score 0.987.
Node 4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:3139 with similarity score 0.987.
Node 4:5db1414e-a7ed-4877-a4a1-3456bfe83c

In [32]:
def load_chat_model(repo_id, max_new_tokens=300, temperature=0.2):
    """Load a text-generation model and wrap it for use with LangChain.

    Args:
        repo_id: Model identifier passed to ``transformers.pipeline``.
        max_new_tokens: Maximum number of tokens to generate per call.
        temperature: Sampling temperature. A value above 0 enables sampling;
            0 (or None) selects greedy decoding.

    Returns:
        A ``HuggingFacePipeline`` wrapping the loaded pipeline.
    """
    generation_kwargs = {"max_new_tokens": max_new_tokens}
    if temperature and temperature > 0:
        # temperature only takes effect when sampling is enabled; without
        # do_sample=True the pipeline decodes greedily and ignores it.
        generation_kwargs.update(do_sample=True, temperature=temperature)
    else:
        generation_kwargs["do_sample"] = False

    pipe = pipeline(
        "text-generation",
        model=repo_id,
        **generation_kwargs
    )

    return HuggingFacePipeline(pipeline=pipe)


In [33]:
# Display name -> model repository id for every model compared by run_all_models.
# NOTE(review): some of these repositories may require an accepted licence and
# a valid HF_TOKEN to download - confirm before running.
MODELS = {
    "Mistral": "mistralai/Mistral-7B-Instruct-v0.2",
    "Zephyr": "HuggingFaceH4/zephyr-7b-beta",
    "Llama2": "meta-llama/Llama-2-7b-chat-hf"
}


In [34]:
def run_all_models(prompt, models=None):
    """Run the same prompt through several models and time each generation.

    Args:
        prompt: Prompt text passed to every model.
        models: Optional mapping of display name -> repository id. Defaults to
            the module-level MODELS.

    Returns:
        dict mapping each model name to
        ``{"response": <model output>, "time_seconds": <float, 2 decimals>}``.
        The timing covers generation only, not model loading.
    """
    import gc

    if models is None:
        models = MODELS

    results = {}

    for name, repo in models.items():
        print(f"Running {name}...")
        llm = load_chat_model(repo)

        # perf_counter is monotonic, so clock adjustments cannot skew the timing.
        start = time.perf_counter()
        response = llm.invoke(prompt)
        elapsed = time.perf_counter() - start

        results[name] = {
            "response": response,
            "time_seconds": round(elapsed, 2)
        }

        # Release this model before the next one is loaded; otherwise the old
        # model stays referenced while the new one is being created and two
        # large models are held in memory at once.
        del llm
        gc.collect()

    return results

In [None]:
# Run the prompt through every model in MODELS and display the responses with
# their generation times. Each model is loaded in turn, so this cell is slow.
outputs = run_all_models(prompt)
outputs


Running Mistral...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use mps:0
