In [1]:
from graph_retriever import GraphRetriever
import os
import time
from langchain_huggingface import ChatHuggingFace
from langchain_core.messages import HumanMessage
from transformers import pipeline
from langchain_community.llms import HuggingFacePipeline
from huggingface_hub import InferenceClient
from langchain_huggingface import HuggingFaceEndpoint
# Prefer exporting HF_TOKEN in the shell (or a secrets file kept out of version control)
# over pasting a real token into this cell. setdefault keeps a token that is already
# present in the environment instead of overwriting it with the placeholder below.
os.environ.setdefault("HF_TOKEN", "add token id")


In [3]:
def load_config(path="config.txt"):
    """Parse a simple ``KEY=VALUE`` settings file into a dict.

    Args:
        path: File to read. Defaults to ``config.txt`` in the current working
            directory, which is what a zero-argument call reads.

    Returns:
        dict mapping each key to its value. Both are strings with surrounding
        whitespace removed.

    Raises:
        OSError: if the file cannot be opened.
    """
    cfg = {}
    with open(path, "r") as f:
        for raw_line in f:
            line = raw_line.strip()
            # Blank lines and '#' comment lines carry no setting.
            if not line or line.startswith("#"):
                continue
            # Split on the FIRST '=' only, so values that themselves contain '='
            # (passwords, URIs with query strings) are kept whole instead of
            # raising "too many values to unpack".
            key, sep, value = line.partition("=")
            key = key.strip()
            if not sep or not key:
                continue
            cfg[key] = value.strip()
    return cfg

# Read the connection settings once, using load_config() defined just above.
config = load_config()

# All three keys are required: a missing one raises KeyError on its line here.
URI = config["URI"]
USERNAME = config["USERNAME"]
PASSWORD = config["PASSWORD"]

# Only a confirmation is printed; the values are not echoed into the cell output.
print("Config loaded")


Config loaded


In [4]:
# Build the retriever from the settings read out of config.txt. Later cells call
# gr.retrieve(...) and gr.driver.session() on this object.
# NOTE(review): presumably this opens a Neo4j connection (a later error message
# mentions Neo4j) -- confirm against graph_retriever.GraphRetriever.
gr = GraphRetriever(URI, USERNAME, PASSWORD)


In [21]:

def get_route_delay_baseline(gr, origin, destination):
    """Run the retriever's "route_delay" query for one origin/destination pair.

    Args:
        gr: Object exposing ``retrieve(query_name, params)``.
        origin: Value sent as the ``origin`` query parameter.
        destination: Value sent as the ``destination`` query parameter.

    Returns:
        Whatever ``gr.retrieve`` returns, unchanged.
    """
    route_params = dict(origin=origin, destination=destination)
    return gr.retrieve("route_delay", route_params)

def get_route_embedding_results(gr, flight_eid, k=5):
    """Run the retriever's "similar_nodes" query for one Flight node.

    Args:
        gr: Object exposing ``retrieve(query_name, params)``.
        flight_eid: Sent as ``node_eid``; identifies the node to compare against.
        k: Sent as ``k`` (default 5).

    Returns:
        Whatever ``gr.retrieve`` returns, unchanged.
    """
    similarity_params = dict(
        embedding_name="node2vec_embed",
        label="Flight",
        node_eid=flight_eid,
        k=k,
    )
    return gr.retrieve("similar_nodes", similarity_params)


In [23]:
def get_flight_eid(gr, flight_number):
    """Look up the element id of the Flight node with the given flight number.

    Args:
        gr: Object whose ``driver.session()`` yields a context-managed session
            with a ``run(query, **params)`` method.
        flight_number: Bound to the ``$fn`` query parameter.

    Returns:
        The ``eid`` field of the first row returned by the query.

    Raises:
        ValueError: if the query returns no rows.
    """
    query = """
            MATCH (f:Flight {flight_number:$fn})
            RETURN elementId(f) AS eid
            """
    # Materialise the rows while the session is still open.
    with gr.driver.session() as session:
        rows = session.run(query, fn=flight_number).data()

    if rows:
        return rows[0]["eid"]
    raise ValueError("Flight not found in Neo4j")


In [24]:
# Baseline (Cypher) retrieval: the "flights_from" query with origin "BGX".
baseline_results = gr.retrieve(
    "flights_from",
    {"origin": "BGX"}
)


# Stop early on an empty result so the [0] index below cannot fail.
# NOTE(review): the message says "route", but the query above filters by origin only.
if not baseline_results:
    raise ValueError("No flights found for this route")


# Only the first returned flight is used as the seed for the embedding search.
flight_number = baseline_results[0]["flight"]

# Translate the flight number into the node's element id (see get_flight_eid).
flight_eid = get_flight_eid(gr, flight_number)

# Ask for the 5 nodes most similar to the seed flight.
embedding_results = get_route_embedding_results(
    gr,
    flight_eid,
    k=5
)


In [25]:
# Inspect the container type and raw contents returned by the "flights_from" query.
print(type(baseline_results))
print(baseline_results)


<class 'list'>
[{'flight': '272', 'origin': 'BGX'}]


In [27]:
# Same seed-flight lookup as the earlier retrieval cell, repeated here.
flight_number = baseline_results[0]["flight"]
flight_eid = get_flight_eid(gr, flight_number)
# Same "similar_nodes" request that get_route_embedding_results(gr, flight_eid, k=5) issues.
embedding_results = gr.retrieve(
    "similar_nodes",
    {
        "embedding_name": "node2vec_embed",
        "label": "Flight",
        "node_eid": flight_eid,
        "k": 5
    }
)
# Rename the retriever's keys ("id" -> "flight", "score" -> "similarity").
# NOTE(review): the merged output shown in this notebook has element-id-like strings under
# "flight" for these rows, so they will not match baseline rows keyed by flight number --
# confirm whether "id" should be mapped back to a flight number first.
embedding_results = [
    {"flight": r["id"], "similarity": r["score"]}
    for r in embedding_results
]


In [31]:
# Combine the Cypher rows and the embedding neighbours into one list.
# NOTE: merge_retrieval_results is defined in a cell further down this notebook, so that
# cell has to be executed before this one on a fresh kernel.
merged_results = merge_retrieval_results(
    baseline_results,
    embedding_results
)

# Bare expression so the notebook displays the merged list.
merged_results


[{'flight': '272',
  'route': None,
  'avg_delay': None,
  'rating': None,
  'similarity': None,
  'source': ['cypher']},
 {'flight': '4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:12',
  'route': None,
  'avg_delay': None,
  'rating': None,
  'similarity': 0.9996699094772339,
  'source': ['embedding']},
 {'flight': '4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:6236',
  'route': None,
  'avg_delay': None,
  'rating': None,
  'similarity': 0.8270696401596069,
  'source': ['embedding']},
 {'flight': '4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:8152',
  'route': None,
  'avg_delay': None,
  'rating': None,
  'similarity': 0.8151417970657349,
  'source': ['embedding']},
 {'flight': '4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:170',
  'route': None,
  'avg_delay': None,
  'rating': None,
  'similarity': 0.8136136531829834,
  'source': ['embedding']},
 {'flight': '4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:504',
  'route': None,
  'avg_delay': None,
  'rating': None,
  'similarity': 0.8081240653991699,
  'source': ['e

In [29]:
def _first_present(item, keys, skip_empty_str=False):
    """Return the first value in ``item`` under ``keys`` that is not None (else None)."""
    for key in keys:
        value = item.get(key)
        if value is None:
            continue
        if skip_empty_str and value == "":
            continue
        return value
    return None


def merge_retrieval_results(baseline_results, embedding_results):
    """Merge Cypher (baseline) rows and embedding rows into one record per flight.

    Rows are keyed by the first available of ``flight`` / ``flight_id`` / ``id``;
    rows with none of these are skipped.

    Args:
        baseline_results: Iterable of dicts from the Cypher / KG query. Optional
            fields read: ``route``, ``avg_delay_minutes`` or ``avg_delay``,
            ``passenger_rating`` or ``rating``.
        embedding_results: Iterable of dicts from the embedding search. Optional
            fields read: ``similarity`` or ``score``.

    Returns:
        list of dicts with keys ``flight``, ``route``, ``avg_delay``, ``rating``,
        ``similarity`` and ``source``, in first-seen order (baseline rows first).
        ``source`` lists which retrievers produced the flight.
    """
    id_keys = ("flight", "flight_id", "id")
    merged = {}

    # Handle baseline (Cypher / KG) results
    for item in baseline_results:
        flight_id = _first_present(item, id_keys, skip_empty_str=True)
        if flight_id is None:
            continue

        merged[flight_id] = {
            "flight": flight_id,
            "route": item.get("route"),
            # Explicit None checks: an average delay or rating of 0 is real data
            # and must not be discarded the way a truthiness test would.
            "avg_delay": _first_present(item, ("avg_delay_minutes", "avg_delay")),
            "rating": _first_present(item, ("passenger_rating", "rating")),
            "similarity": None,
            "source": ["cypher"]
        }

    # Handle embedding-based results
    for item in embedding_results:
        flight_id = _first_present(item, id_keys, skip_empty_str=True)
        if flight_id is None:
            continue

        # A similarity of 0.0 is kept for the same reason as a zero delay.
        similarity = _first_present(item, ("similarity", "score"))

        entry = merged.get(flight_id)
        if entry is None:
            merged[flight_id] = {
                "flight": flight_id,
                "route": None,
                "avg_delay": None,
                "rating": None,
                "similarity": similarity,
                "source": ["embedding"]
            }
            continue

        entry["similarity"] = similarity
        # Repeated embedding hits for one flight must not repeat the tag.
        if "embedding" not in entry["source"]:
            entry["source"].append("embedding")

    return list(merged.values())


In [48]:
def build_context(baseline, embeddings):
    """Render retrieval results as the plain-text context block for the prompt.

    Args:
        baseline: Iterable of dicts; each must have ``flight`` and ``origin``
            (a missing key raises KeyError).
        embeddings: Iterable of dicts; the node label is the first truthy value
            of ``flight`` / ``id`` / ``node`` (else "Unknown") and the score is
            the first truthy value of ``similarity`` / ``score`` (else 0).

    Returns:
        A multi-line string: a header, one line per baseline flight, a
        sub-header, then one line per embedding hit with the score rounded to
        three decimals.
    """
    pieces = ["Retrieved airline operational insights:\n\n"]
    pieces.extend(
        f"Flight {row['flight']} departing from {row['origin']}.\n"
        for row in baseline
    )
    pieces.append("\nSimilar nodes based on graph embeddings:\n")

    for hit in embeddings:
        label = "Unknown"
        for key in ("flight", "id", "node"):
            candidate = hit.get(key)
            if candidate:
                label = candidate
                break

        raw_score = hit.get("similarity") or hit.get("score") or 0
        pieces.append(
            f"Node {label} with similarity score {round(raw_score, 3)}.\n"
        )

    return "".join(pieces)


In [49]:
# Turn the raw retrieval results into the text block that is later embedded in the prompt,
# and print it for a visual check.
kg_context = build_context(baseline_results, embedding_results)
print(kg_context)


Retrieved airline operational insights:

Flight 272 departing from BGX.

Similar nodes based on graph embeddings:
Node 4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:12 with similarity score 1.0.
Node 4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:6236 with similarity score 0.827.
Node 4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:8152 with similarity score 0.815.
Node 4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:170 with similarity score 0.814.
Node 4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:504 with similarity score 0.808.



In [50]:
# NOTE: build_prompt and user_question are both defined in the next cell of this notebook;
# on a fresh kernel this line raises NameError unless that cell has been run first.
prompt = build_prompt(kg_context, user_question)


In [51]:
def build_prompt(context, question):
    """Assemble the full LLM prompt: persona, retrieved context, task, question.

    Args:
        context: Text inserted under the "Context:" section.
        question: Text inserted under the "User Question:" section.

    Returns:
        The prompt string. It starts and ends with a newline.
    """
    # One entry per output line; empty strings are the blank separator lines.
    prompt_lines = (
        "",
        "Persona:",
        "You are an airline operations insight assistant.",
        "You help airline companies analyze flight delays,",
        "routes performance, and passenger satisfaction.",
        "",
        "Context:",
        "The following information was retrieved from the airline knowledge graph.",
        "Use ONLY this information. Do NOT use external knowledge.",
        "",
        f"{context}",
        "",
        "Task:",
        "Provide clear operational insights and recommendations",
        "based only on the context above.",
        "If the information is insufficient, say so clearly.",
        "",
        "User Question:",
        f"{question}",
        "",
    )
    return "\n".join(prompt_lines)
# The question sent to every model, wrapped together with the retrieved KG context.
user_question = "Which route has the highest delays and what should the airline improve?"
prompt = build_prompt(kg_context, user_question)
# Print the final prompt so the exact text given to the models is visible in the notebook.
print(prompt)




Persona:
You are an airline operations insight assistant.
You help airline companies analyze flight delays,
routes performance, and passenger satisfaction.

Context:
The following information was retrieved from the airline knowledge graph.
Use ONLY this information. Do NOT use external knowledge.

Retrieved airline operational insights:

Flight 272 departing from BGX.

Similar nodes based on graph embeddings:
Node 4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:12 with similarity score 1.0.
Node 4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:6236 with similarity score 0.827.
Node 4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:8152 with similarity score 0.815.
Node 4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:170 with similarity score 0.814.
Node 4:5db1414e-a7ed-4877-a4a1-3456bfe83c2f:504 with similarity score 0.808.


Task:
Provide clear operational insights and recommendations
based only on the context above.
If the information is insufficient, say so clearly.

User Question:
Which route has the highest delays and

In [52]:
def load_chat_model(repo_id):
    """Build a text-generation pipeline for ``repo_id`` wrapped for LangChain.

    Args:
        repo_id: Model identifier passed to ``pipeline`` as ``model``.

    Returns:
        A ``HuggingFacePipeline`` wrapping the created pipeline.
    """
    # NOTE(review): temperature may have no effect unless sampling is enabled on
    # the pipeline -- confirm against the transformers generation docs.
    generation_settings = {"max_new_tokens": 300, "temperature": 0.2}
    text_generator = pipeline(
        "text-generation",
        model=repo_id,
        **generation_settings,
    )
    return HuggingFacePipeline(pipeline=text_generator)


In [53]:
# Display name -> model repo id. run_all_models iterates this mapping and passes each
# repo id to load_chat_model; the display name becomes the key of its result dict.
MODELS = {
    "Mistral": "mistralai/Mistral-7B-Instruct-v0.2",
    "Zephyr": "HuggingFaceH4/zephyr-7b-beta",
    "Llama2": "meta-llama/Llama-2-7b-chat-hf"
}


In [54]:
def run_all_models(prompt):
    """Send ``prompt`` to every model in ``MODELS`` and collect answers and timings.

    Models are loaded one at a time via ``load_chat_model``. Only the
    ``invoke`` call is timed; loading the model is not included.

    Args:
        prompt: Prompt passed unchanged to each model's ``invoke``.

    Returns:
        dict mapping each model's display name to
        ``{"response": <model output>, "time_seconds": <elapsed, 2 decimals>}``.
    """
    import gc  # local import: only needed here to release models between runs

    results = {}

    for name, repo in MODELS.items():
        print(f"Running {name}...")
        llm = load_chat_model(repo)

        # perf_counter is monotonic, so the measurement cannot be skewed by
        # wall-clock adjustments the way time.time() can.
        start = time.perf_counter()
        response = llm.invoke(prompt)
        elapsed = time.perf_counter() - start

        results[name] = {
            "response": response,
            "time_seconds": round(elapsed, 2)
        }

        # Drop this model before the next one is loaded; otherwise the old
        # model stays referenced until `llm` is rebound, so two large models
        # are alive at the same time.
        del llm
        gc.collect()

    return results

In [None]:
# Run the prompt through every configured model (each is loaded in turn, which triggers the
# downloads shown in the output below), then display the collected responses and timings.
outputs = run_all_models(prompt)
outputs


Running Mistral...


config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Device set to use mps:0
