In [16]:
from langchain.document_loaders import TextLoader

# Load your summary text
loader = TextLoader("data_explanation_section.txt", encoding="utf-8")
documents = loader.load()

# Preview sample content
print(documents[0].page_content[:500])



SECTION 1: DATA EXPLANATION

This dataset provides monthly-level flight performance data across U.S. airports and airlines. Each column gives insight into the volume, type, and causes of flight delays.

COLUMN EXPLANATIONS:

- year: Calendar year of the data. Helps track long-term trends.
- month: Month of the flight. Useful for seasonal pattern detection.
- carrier_name: The airline operating the flight. Key for comparing airline performance.
- airport_name: The destination airport. Important 


In [17]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=100
)

docs = text_splitter.split_documents(documents)

# Check sample chunk
print(docs[0].page_content)


SECTION 1: DATA EXPLANATION


In [18]:
from langchain.embeddings import SentenceTransformerEmbeddings

embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


In [19]:
from langchain.vectorstores import FAISS

vectorstore = FAISS.from_documents(docs, embedding_model)
#vectorstore.save_local("faiss_index")


In [20]:
from langchain.llms import LlamaCpp
import multiprocessing

llm = LlamaCpp(
    model_path="D:\\python\\models\\models--TinyLlama--TinyLlama-1.1B-Chat-v1.0\\tinyllama-1.1b-chat-q4_K_M.gguf",
    n_ctx=1024,
    max_tokens=128,
    temperature=0.6,
    top_p=0.9,
    n_batch=512,  # test 512 if possible
    n_threads=multiprocessing.cpu_count(),
    stop=["\n\n", "Question:"],
    verbose=False
)

retriever = vectorstore.as_retriever(search_kwargs={"k": 3},search_type="mmr")


llama_context: n_ctx_per_seq (1024) < n_ctx_train (2048) -- the full capacity of the model will not be utilized


In [21]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# Create a prompt that forces the model to stay grounded
custom_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are an AI assistant. Answer the question **only based on the context** below. Do not guess, do not invent follow-up questions.

Context:
{context}

Question:
{question}

Answer:
""")

# Create QA chain with custom prompt
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",  # Stuff simply pastes the retrieved docs into the prompt
    return_source_documents=True,
    chain_type_kwargs={"prompt": custom_prompt}
)


In [23]:
query = "What does the late_aircraft_delay:  column mean?"
result = qa_chain({"query": query})

# Print only the result first
print("✅ Answer:\n", result["result"])

# Then, show details (source) after
print("\n🔍 Source Excerpt:\n", result["source_documents"][0].page_content[:300])


✅ Answer:
 The late_aircraft_delay: column represents the total number of delayed flights (i.e., flights that were delayed at least 15 minutes). This measure reflects the frequency of delays caused by airline issues.

🔍 Source Excerpt:
 - security_ct: Delays from security problems (e.g., threats). Rare but critical.
- late_aircraft_ct: Delays due to incoming late flights. Shows cascading delay effects.
