In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import expr, current_timestamp, col
import math

# Spark session used by every step below.
spark = (
    SparkSession.builder
    .appName("SupplierItem_Grouped_Analysis")
    .getOrCreate()
)

# Source tables: one fact table plus the three dimension tables joined onto it.
TABLE_FACTS = "default.adobe.test_data"
TABLE_SUPPLIER = "default.adobe.dim_supplier"
TABLE_ITEM = "default.adobe.dim_item"
TABLE_TIME = "default.adobe.dim_time"

# Destination table for the generated analyses (one row per supplier-item pair).
OUTPUT_TABLE = "default.adobe.supplier_item_group_analysis"

# Tunables.
MAX_ROWS_PER_GROUP = 500  # upper bound on rows collected (and sent to the LLM) per pair
SKIP_ALREADY_PROCESSED = True  # skip pairs that already have a row in OUTPUT_TABLE

# Create the Delta output table if it does not exist yet.
create_output_table_sql = f"""
CREATE TABLE IF NOT EXISTS {OUTPUT_TABLE} (
  Supplier_Key STRING,
  Item_Key STRING,
  Analysis STRING,
  run_timestamp TIMESTAMP
)
USING DELTA
"""
spark.sql(create_output_table_sql)
print("Output table ready:", OUTPUT_TABLE)

# Bind each source table to a DataFrame.
df_facts = spark.table(TABLE_FACTS)
df_supplier = spark.table(TABLE_SUPPLIER)
df_item = spark.table(TABLE_ITEM)
df_time = spark.table(TABLE_TIME)

# Report row counts; each count() runs a Spark job against the table.
print("Loaded tables:")
for count_label, frame in (
    (" facts rows:", df_facts),
    (" supplier rows:", df_supplier),
    (" item rows:", df_item),
    (" time rows:", df_time),
):
    print(count_label, frame.count())

# Enrich every fact row with supplier / item / time context. Left joins keep
# fact rows even when a dimension lookup finds no match. The aliases
# f / s / i / t are how later code addresses columns of each side.
supplier_match = col("f.supplier_key") == col("s.supplier_key")
item_match = col("f.item_key") == col("i.item_key")
time_match = col("f.time_key") == col("t.time_key")

joined = df_facts.alias("f")
joined = joined.join(df_supplier.alias("s"), supplier_match, "left")
joined = joined.join(df_item.alias("i"), item_match, "left")
joined = joined.join(df_time.alias("t"), time_match, "left")

# Distinct (supplier_key, item_key) pairs present in the joined facts; pairs
# with a null in either key are dropped.
groups_df = (
    joined
    .select("f.supplier_key", "f.item_key")
    .distinct()
    .na.drop()
)
groups = [(pair["supplier_key"], pair["item_key"]) for pair in groups_df.collect()]
print("Total supplier-item groups to consider:", len(groups))

# Pairs that already have an analysis row in OUTPUT_TABLE (only looked up when
# skipping is enabled). Any failure reading the table is reported and treated
# as "nothing processed yet".
processed_pairs = set()
if SKIP_ALREADY_PROCESSED:
    try:
        processed = (
            spark.table(OUTPUT_TABLE)
            .select("Supplier_Key", "Item_Key")
            .distinct()
            .collect()
        )
        processed_pairs = {(done["Supplier_Key"], done["Item_Key"]) for done in processed}
        print("Already processed groups found:", len(processed_pairs))
    except Exception as e:
        print("Could not read output table for processed pairs:", e)
        processed_pairs = set()

# Utility: create CSV text from rows (list of dicts)
def rows_to_csv(rows):
    """Serialize a list of dict rows into CSV text.

    The first line is the header. Column order is taken from the keys of the
    first row. ``None`` values, and keys missing from a later row, are written
    as empty fields. Lines are joined with a newline and no trailing newline
    is added.

    Header names and values are quoted the same way: a field containing a
    comma, a double quote, a carriage return or a line feed is wrapped in
    double quotes, with embedded double quotes doubled.

    Args:
        rows: List of dicts (for example ``Row.asDict()`` results).

    Returns:
        The CSV text, or an empty string when ``rows`` is empty.
    """
    if not rows:
        return ""

    def _csv_field(value):
        # One quoting rule for header and data so neither can break the layout.
        text = "" if value is None else str(value)
        if any(ch in text for ch in ',"\r\n'):
            text = '"' + text.replace('"', '""') + '"'
        return text

    # Column order comes from the first row so the output is deterministic.
    cols = list(rows[0].keys())
    csv_lines = [",".join(_csv_field(c) for c in cols)]
    csv_lines.extend(
        ",".join(_csv_field(r.get(c)) for c in cols)
        for r in rows
    )
    return "\n".join(csv_lines)

# LLM analysis function (single grouped CSV -> 1 analysis)
def llm_analyze_group(csv_text, supplier_key, item_key):
    """Ask the model for a bullet-point analysis of one supplier-item pair.

    The CSV is embedded in a fixed prompt, placed in a one-row DataFrame, and
    passed to the SQL function ``query_model`` with the model
    'default.oci_ai_models.xai.grok-4'.

    Args:
        csv_text: CSV text holding the rows for the pair.
        supplier_key: Supplier identifier; used only in the error message.
        item_key: Item identifier; used only in the error message.

    Returns:
        The model's response text with surrounding whitespace removed.

    Raises:
        ValueError: If the model result is NULL or blank, so that callers do
            not persist an empty analysis for the pair.
    """
    prompt = f"""
You are a concise business analyst. You will receive a CSV containing time-series procurement rows for one supplier-item pair.
Produce a clear analysis (5-7 bullet points). Focus on:
- price trend and volatility
- contract expiry risk (Months_to_Contract_Expiry)
- price gap vs best historical bid
- forecasted volume and spend implications
- negotiation priority and actionable recommendation

Do not include anything other than the 5-7 bullet points (each on its own line, starting with a dash).

CSV:
{csv_text}
"""
    df_prompt = spark.createDataFrame([Row(prompt=prompt)])
    out = df_prompt.select(expr("query_model('default.oci_ai_models.xai.grok-4', prompt) as analysis_text"))
    analysis = out.collect()[0]["analysis_text"]

    # Guard against a NULL or whitespace-only result: calling .strip() on None
    # would raise an opaque AttributeError, and a blank analysis would be saved
    # and then skipped on every later run.
    cleaned = analysis.strip() if analysis is not None else ""
    if not cleaned:
        raise ValueError(
            f"Model returned no analysis text for Supplier={supplier_key} Item={item_key}"
        )
    return cleaned

# Save result
def save_group_analysis(supplier_key, item_key, analysis_text):
    """Append one analysis row for a supplier-item pair to OUTPUT_TABLE.

    Both keys are stored as strings; ``run_timestamp`` is filled with Spark's
    ``current_timestamp()`` at write time.

    Args:
        supplier_key: Supplier identifier (converted with ``str``).
        item_key: Item identifier (converted with ``str``).
        analysis_text: Analysis text to store in the ``Analysis`` column.
    """
    column_names = ["Supplier_Key", "Item_Key", "Analysis"]
    record = (str(supplier_key), str(item_key), analysis_text)

    result_df = spark.createDataFrame([record], column_names)
    stamped_df = result_df.withColumn("run_timestamp", current_timestamp())

    # Append so that earlier analyses in the Delta table are kept.
    writer = stamped_df.write.mode("append").format("delta")
    writer.saveAsTable(OUTPUT_TABLE)

# Main loop: for each supplier-item pair, fetch its rows, render them as CSV,
# ask the LLM for one analysis and append the result to the output table.
total = len(groups)
processed_count = skipped_count = failed_count = 0

# Columns handed to the model for every pair; identical for all iterations,
# so the list is built once up front.
group_columns = [
    "f.supplier_key", "f.item_key", "f.time_key",
    "f.Current_Contract_Price_USD", "f.Months_to_Contract_Expiry", "f.Current_Avg_Price_USD",
    "f.Annual_Spend_USD", "f.Internal_Min_Price_USD", "f.Internal_Max_Price_USD",
    "f.Best_Historical_Bid_USD", "f.Price_Gap_vs_BestBid", "f.Current_Payment_Terms",
    "f.Avg_Days_to_Pay", "f.Forecasted_Volume", "f.Revenue_at_Risk_USD", "f.Negotiation_Priority_Score",
    "s.supplier_name", "i.item_number", "i.item_description", "t.year", "t.fiscal_quarter", "t.month",
]

for idx, (s_key, i_key) in enumerate(groups, start=1):
    # OUTPUT_TABLE stores both keys as strings, so compare in string form.
    key_tuple = (str(s_key), str(i_key))
    if SKIP_ALREADY_PROCESSED and key_tuple in processed_pairs:
        skipped_count += 1
        print(f"[{idx}/{total}] Skipping already-processed Supplier={s_key} Item={i_key}")
        continue

    print(f"[{idx}/{total}] Processing Supplier={s_key} Item={i_key}")

    # Collect this pair's rows in ascending time_key order, capped at
    # MAX_ROWS_PER_GROUP. NOTE(review): with ascending order the cap keeps the
    # earliest rows when a pair has more rows than the cap.
    try:
        same_supplier = col("f.supplier_key") == s_key
        same_item = col("f.item_key") == i_key
        group_query = (
            joined
            .filter(same_supplier & same_item)
            .select(*group_columns)
            .orderBy("f.time_key")
            .limit(MAX_ROWS_PER_GROUP)
        )
        group_rows = group_query.collect()
    except Exception as e:
        print(f"  Error fetching group rows for S={s_key} I={i_key}: {e}")
        failed_count += 1
        continue

    if not group_rows:
        print(f"  No fact rows for Supplier={s_key} Item={i_key}, skipping.")
        skipped_count += 1
        continue

    # Row objects -> plain dicts -> CSV text for the prompt.
    row_dicts = [r.asDict() for r in group_rows]
    csv_text = rows_to_csv(row_dicts)
    print(f"  Rows to send: {len(row_dicts)} (capped at {MAX_ROWS_PER_GROUP})")

    # A failure in either the model call or the save counts as one failed
    # pair; the loop then moves on to the next pair.
    try:
        analysis = llm_analyze_group(csv_text, s_key, i_key)
        save_group_analysis(s_key, i_key, analysis)
        processed_count += 1
        print(f"  Saved analysis for S={s_key} I={i_key}")
    except Exception as e:
        print(f"  LLM analysis/save failed for S={s_key} I={i_key}: {e}")
        failed_count += 1

print("Finished.")
print("Processed:", processed_count, "Skipped:", skipped_count, "Failed:", failed_count)


Output table ready: default.adobe.supplier_item_group_analysis


Loaded tables:


 facts rows: 1200
 supplier rows: 6


 item rows: 6
 time rows: 200


Total supplier-item groups to consider: 6


Already processed groups found: 0
[1/6] Processing Supplier=101 Item=10001


  Rows to send: 200 (capped at 500)


  Saved analysis for S=101 I=10001
[2/6] Processing Supplier=102 Item=10002


  Rows to send: 200 (capped at 500)


  Saved analysis for S=102 I=10002
[3/6] Processing Supplier=103 Item=10003


  Rows to send: 200 (capped at 500)


  Saved analysis for S=103 I=10003
[4/6] Processing Supplier=104 Item=10004


  Rows to send: 200 (capped at 500)


  Saved analysis for S=104 I=10004
[5/6] Processing Supplier=105 Item=10005


  Rows to send: 200 (capped at 500)


  Saved analysis for S=105 I=10005
[6/6] Processing Supplier=106 Item=10006


  Rows to send: 200 (capped at 500)


  Saved analysis for S=106 I=10006
Finished.
Processed: 6 Skipped: 0 Failed: 0
