In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import expr

# -----------------------
# Spark Session
# -----------------------
spark = SparkSession.builder.appName("N2sql").getOrCreate()

# -----------------------
# CONFIG
# -----------------------
TIME_TABLE = "default.adobe.test_data"  # replace with your actual table name
BLOCKLIST = ["INSERT","UPDATE","DELETE","DROP","ALTER","CREATE","TRUNCATE","MERGE"]

# -----------------------
# Schema Introspection
# -----------------------
def build_schema_text(table_name: str) -> str:
    cols = spark.catalog.listColumns(table_name)
    col_str = ", ".join([f"{c.name}:{c.dataType}" for c in cols])
    return f"{table_name}({col_str})"

SCHEMA_TEXT = build_schema_text(TIME_TABLE)
print("Schema:\n", SCHEMA_TEXT)

# -----------------------
# LLM SQL Generator
# -----------------------
def llm_generate_sql(user_question: str, schema_text: str) -> str:
    prompt = f"""
You are a Spark SQL generator.

Schema: {schema_text}

Rules:
- Use only this table: {TIME_TABLE}.
- SELECT only, no DML/DDL.
- Always include LIMIT 100 at the end.
- If you use aggregation, include GROUP BY.
- Output must be a single-line SQL query (no ``` or extra text).

User question: {user_question}
"""
    df_prompt = spark.createDataFrame([Row(prompt=prompt)])
    out_df = df_prompt.select(
        expr("query_model('default.oci_ai_models.xai.grok-4', prompt) as sql_text")
    )
    sql = out_df.collect()[0]["sql_text"]
    return sql.strip()

# -----------------------
# SQL Sanitizer
# -----------------------
def sanitize_sql(sql: str) -> str:
    if not sql.lower().startswith("select"):
        raise ValueError("Only SELECT queries allowed")
    for bad in BLOCKLIST:
        if bad in sql.upper():
            raise ValueError(f"Disallowed token: {bad}")
    if "limit" not in sql.lower():
        sql = sql.rstrip(";") + " LIMIT 100"
    return sql

# -----------------------
# Run SQL
# -----------------------
def run_sql(sql: str):
    print("\nRunning SQL:\n", sql)
    df = spark.sql(sql)
    df.show(20, truncate=False)
    return df

# -----------------------
# LLM Result Summarizer
# -----------------------
def llm_summarize_dataframe(df, max_rows=50):
    pdf = df.limit(max_rows).toPandas()
    table_text = pdf.to_csv(index=False)
    prompt = f"""
You are a data analyst. Summarize the following result table in 5 bullet points.
Highlight trends, top/bottom values, anomalies, and business meaning.

CSV:
{table_text}
"""
    df_prompt = spark.createDataFrame([Row(prompt=prompt)])
    out_df = df_prompt.select(
        expr("query_model('default.oci_ai_models.xai.grok-4', prompt) as summary")
    )
    return out_df.collect()[0]["summary"]

# -----------------------
# Main Agent Function
# -----------------------
def ask_time_table(question: str):
    raw_sql = llm_generate_sql(question, SCHEMA_TEXT)
    print("\nLLM Raw SQL:\n", raw_sql)

    safe_sql = sanitize_sql(raw_sql)
    df = run_sql(safe_sql)

    try:
        summary = llm_summarize_dataframe(df)
        print("\nLLM Analysis:\n", summary)
    except Exception as e:
        print("Summarization failed:", e)

    return df

# -----------------------
# Example Run
# -----------------------
if __name__ == "__main__":
    q = "Show the top 5 Item_Key by Revenue_at_Risk_USD"
    ask_time_table(q)


Schema:
 default.adobe.test_data(supplier_key:int, item_key:int, time_key:int, current_contract_price_usd:double, months_to_contract_expiry:int, current_avg_price_usd:double, annual_spend_usd:double, internal_min_price_usd:double, internal_max_price_usd:double, best_historical_bid_usd:double, price_gap_vs_bestbid:double, current_payment_terms:string, avg_days_to_pay:int, forecasted_volume:double, revenue_at_risk_usd:double, negotiation_priority_score:double)



LLM Raw SQL:
 SELECT item_key, SUM(revenue_at_risk_usd) AS total_revenue_at_risk FROM default.adobe.test_data GROUP BY item_key ORDER BY total_revenue_at_risk DESC LIMIT 100

Running SQL:
 SELECT item_key, SUM(revenue_at_risk_usd) AS total_revenue_at_risk FROM default.adobe.test_data GROUP BY item_key ORDER BY total_revenue_at_risk DESC LIMIT 100


+--------+---------------------+
|item_key|total_revenue_at_risk|
+--------+---------------------+
|10006   |4.026432456369999E9  |
|10001   |3.063578901999997E9  |
|10005   |1.338632333998E9     |
|10004   |9.790932062039998E8  |
|10003   |6.957904390559996E8  |
|10002   |5.069349754070001E8  |
+--------+---------------------+




LLM Analysis:
 - **Overall Trend**: The total revenue at risk shows a general declining pattern across item keys from 10006 to 10002, with values decreasing from billions to hundreds of millions, suggesting higher-numbered items may carry greater financial exposure or risk factors.
- **Top Value**: Item 10006 stands out with the highest revenue at risk of approximately $4.03 billion, representing over 30% of the total across all items and indicating it as a critical asset requiring immediate risk mitigation.
- **Bottom Value**: Item 10002 has the lowest revenue at risk at around $507 million, which is roughly 12% of the top item's value, potentially signaling lower priority for resource allocation in risk management.
- **Anomaly Detected**: Item 10001 exhibits an unusually high revenue at risk of about $3.06 billion, disrupting the otherwise sequential decline from 10006 to 10002; this could indicate data entry errors, unique risk events, or external factors warranting further investi