# Consulting Data Pipeline

Processes consulting data for the Lobbi Consulting and Medicare Consulting entities.
Reads accounts, engagements, activities, implementations, and shared prospects to calculate:
- Pipeline value by stage and entity
- Lead conversion rates
- Engagement health scores

**Schedule:** Daily at 5:00 AM UTC

In [None]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import (
    col, lit, current_timestamp, when, coalesce, count, sum as spark_sum,
    avg, datediff, to_date, row_number, round as spark_round,
    months_between, current_date
)
from pyspark.sql.types import DoubleType, IntegerType
from datetime import datetime

# Lakehouse location: every table is addressed relative to this OneLake root.
LAKEHOUSE = "lh-consulting-data"
BASE_PATH = f"abfss://{LAKEHOUSE}@onelake.dfs.fabric.microsoft.com"

spark = SparkSession.builder.getOrCreate()


def _read_table(table_name):
    """Return the Delta table *table_name* from the lakehouse ``Tables`` folder."""
    return spark.read.format("delta").load(f"{BASE_PATH}/Tables/{table_name}")


# Source tables consumed by the cells below (loaded lazily by Spark).
df_accounts = _read_table("dv_accounts")
df_contacts = _read_table("dv_contacts")
df_engagements = _read_table("dv_engagements")
df_activities = _read_table("dv_activities")
df_prospects = _read_table("dv_sharedprospects")
df_implementations = _read_table("dv_implementations")

# Option-set code -> display name for the owning entity of an account.
ENTITY_LABELS = {
    100000000: "Lobbi Consulting",
    100000001: "Medicare Consulting",
}

# Option-set code -> display name for an engagement's status.
STATUS_LABELS = {
    100000000: "Proposed",
    100000001: "Negotiation",
    100000002: "Active",
    100000003: "On Hold",
    100000004: "Completed",
    100000005: "Canceled",
}

# Start-up log: timestamp first, then row counts. Each count() is a Spark
# action, so they are evaluated in the same order the lines are printed.
started_at = datetime.utcnow().isoformat()
print(f"Pipeline started at {started_at}")

accounts_n, engagements_n = df_accounts.count(), df_engagements.count()
print(f"Accounts: {accounts_n}, Engagements: {engagements_n}")

activities_n, prospects_n = df_activities.count(), df_prospects.count()
print(f"Activities: {activities_n}, Prospects: {prospects_n}")

implementations_n = df_implementations.count()
print(f"Implementations: {implementations_n}")

In [None]:
# ── Cell 2: Pipeline Value Analysis ──────────────────────────────────────
#
# Builds `agg_pipeline`: one row per (entity, engagement status) holding the
# engagement count, total / average value, total invoiced amount and a
# status-weighted pipeline value.


def _code_to_label(code_column, labels):
    """Map the option-set codes in *code_column* to display labels.

    Codes are tested in the insertion order of *labels*; any value that is
    not a key of *labels* (including NULL) becomes "Unknown".
    """
    mapping = None
    for code, label in labels.items():
        matches = col(code_column) == code
        mapping = when(matches, lit(label)) if mapping is None else mapping.when(matches, lit(label))
    return mapping.otherwise(lit("Unknown"))


# Multiplier applied to `total_value` for each status code; every status not
# listed here (On Hold, Canceled, unknown/NULL) is weighted 0.0.
_STAGE_WEIGHTS = {
    100000000: 0.1,  # Proposed
    100000001: 0.4,  # Negotiation
    100000002: 0.9,  # Active
    100000004: 1.0,  # Completed
}

# Typed working copies of the raw engagement columns.
_typed_columns = [
    ("value", col("tvs_value").cast(DoubleType())),
    ("invoiced", col("tvs_invoicedamount").cast(DoubleType())),
    ("status", col("tvs_status").cast(IntegerType())),
    ("eng_type", col("tvs_type").cast(IntegerType())),
    ("start_date", to_date(col("tvs_startdate"))),
    ("end_date", to_date(col("tvs_enddate"))),
]
df_eng = df_engagements
for _new_name, _expression in _typed_columns:
    df_eng = df_eng.withColumn(_new_name, _expression)

# Left join keeps engagements whose account is missing; those end up with a
# NULL entity_code and therefore the "Unknown" entity name.
_account_lookup = df_accounts.select(
    col("tvs_consultingaccountid").alias("acct_id"),
    col("tvs_entity").alias("entity_code"),
    col("tvs_name").alias("account_name"),
).alias("a")

df_eng_with_entity = (
    df_eng.alias("e")
    .join(_account_lookup, col("e.tvs_accountid") == col("a.acct_id"), "left")
    .withColumn("entity_name", _code_to_label("entity_code", ENTITY_LABELS))
)

# CASE expression for the weighted value, built from the weight table.
_weighted_value = None
for _status_code, _weight in _STAGE_WEIGHTS.items():
    _is_stage = col("status") == _status_code
    _stage_value = col("total_value") * _weight
    _weighted_value = (
        when(_is_stage, _stage_value)
        if _weighted_value is None
        else _weighted_value.when(_is_stage, _stage_value)
    )
_weighted_value = _weighted_value.otherwise(col("total_value") * 0.0)

# Aggregate per entity and status, then decorate with label and weighting.
df_pipeline = (
    df_eng_with_entity
    .groupBy("entity_name", "status")
    .agg(
        count("*").alias("engagement_count"),
        spark_sum("value").alias("total_value"),
        avg("value").alias("avg_value"),
        spark_sum("invoiced").alias("total_invoiced"),
    )
    .withColumn("status_label", _code_to_label("status", STATUS_LABELS))
    .withColumn("weighted_pipeline_value", _weighted_value)
    .withColumn("calculated_at", current_timestamp())
)

# Persist (full overwrite), then report the segment count and a preview.
_pipeline_path = f"{BASE_PATH}/Tables/agg_pipeline"
df_pipeline.write.format("delta").mode("overwrite").save(_pipeline_path)
print(f"Pipeline analysis written: {df_pipeline.count()} segments")

_preview_columns = [
    "entity_name", "status_label", "engagement_count", "total_value", "weighted_pipeline_value",
]
df_pipeline.select(*_preview_columns).show(truncate=False)

In [None]:
# ── Cell 3: Conversion Rates ─────────────────────────────────────────────
#
# Builds `agg_conversion_rates`: referral outcomes per (source entity,
# target entity) pair of the shared-prospects table.

# Option-set codes used by tvs_sourceentity / tvs_targetentity, in the order
# they are tested.
_REFERRAL_ENTITIES = (
    (100000000, "TAIA"),
    (100000001, "TVS"),
    (100000002, "Lobbi Consulting"),
    (100000003, "Medicare Consulting"),
    (100000004, "Media Company"),
)

# Prospect status codes counted below.
_PROSPECT_CONVERTED = 100000003
_PROSPECT_DISQUALIFIED = 100000004
_PROSPECT_STALE = 100000005


def _referral_entity_label(code_column):
    """Translate the entity code in *code_column* to its name ("Unknown" if unmatched)."""
    (first_code, first_label), *remaining = _REFERRAL_ENTITIES
    chain = when(col(code_column) == first_code, lit(first_label))
    for code, label in remaining:
        chain = chain.when(col(code_column) == code, lit(label))
    return chain.otherwise(lit("Unknown"))


# Days between referral and conversion; only populated for converted prospects.
_days_to_convert = (
    when(col("is_converted"), datediff(col("converted_date"), col("referral_date")))
    .otherwise(lit(None))
)

# Per-prospect enrichment: readable entity names, conversion flag and timing.
df_prospect_analysis = (
    df_prospects
    .withColumn("source_label", _referral_entity_label("tvs_sourceentity"))
    .withColumn("target_label", _referral_entity_label("tvs_targetentity"))
    .withColumn("is_converted", col("tvs_status") == _PROSPECT_CONVERTED)
    .withColumn("referral_date", to_date(col("tvs_referraldate")))
    .withColumn("converted_date", to_date(col("tvs_converteddate")))
    .withColumn("days_to_convert", _days_to_convert)
)

# count(when(cond, True)) counts only rows where cond holds, because the
# unmatched branch of when() is NULL and count() skips NULLs.
_outcome_aggregates = [
    count("*").alias("total_referrals"),
    count(when(col("is_converted"), True)).alias("converted"),
    count(when(col("tvs_status") == _PROSPECT_DISQUALIFIED, True)).alias("disqualified"),
    count(when(col("tvs_status") == _PROSPECT_STALE, True)).alias("stale"),
    avg("days_to_convert").alias("avg_days_to_convert"),
]

# Conversion rate is a percentage rounded to two decimals.
_conversion_pct = spark_round(col("converted") / col("total_referrals") * 100, 2)

df_conversion_rates = (
    df_prospect_analysis
    .groupBy("source_label", "target_label")
    .agg(*_outcome_aggregates)
    .withColumn("conversion_rate", _conversion_pct)
    .withColumn("calculated_at", current_timestamp())
)

# Persist (full overwrite), then report the pair count and print the table.
_conversion_path = f"{BASE_PATH}/Tables/agg_conversion_rates"
df_conversion_rates.write.format("delta").mode("overwrite").save(_conversion_path)
print(f"Conversion rates written: {df_conversion_rates.count()} entity pairs")
df_conversion_rates.show(truncate=False)

In [None]:
# ── Cell 4: Engagement Health Scores ─────────────────────────────────────
#
# Builds `agg_engagement_health`: one row per in-flight engagement with a
# 0-100 health score and a label derived from it. The score is the sum of
#   activity_score  (5 / 15 / 25, by number of activities)
#   progress_score  (average phase completion percent * 0.35)
#   risk_penalty    (0 / -10 / -20, by number of at-risk phases)
#   billing_score   (5 / 15 / 25, by invoiced share of engagement value)
#   a fixed base of 15
# clamped to the range [0, 100].

# Engagements in scope: status Active (100000002) or On Hold (100000003).
df_active_engagements = df_eng_with_entity.filter(col("status").isin([100000002, 100000003]))

# Activity volume and recency per engagement (keyed by tvs_regardingid).
df_activity_metrics = (
    df_activities
    .withColumn("completed", to_date(col("tvs_completeddate")))
    .groupBy(col("tvs_regardingid").alias("engagement_id"))
    .agg(
        count("*").alias("total_activities"),
        count(when(col("tvs_status") == 100000002, True)).alias("completed_activities"),
        spark_sum(when(col("tvs_type") == 100000000, lit(1)).otherwise(lit(0))).alias("meeting_count"),
        # Average age, in whole days, of the activities that have a completion
        # date. The previous expression cast the DATE to long, averaged it and
        # cast the result back to DATE; Spark does not support that round-trip
        # (it is rejected or evaluates to NULL depending on version/ANSI mode),
        # so the metric never reflected the data. Averaging datediff() directly
        # gives the intended value. Engagements without any completed activity
        # keep the original fallback of 0.
        # NOTE(review): this metric is not used by the score or the final
        # select below — confirm whether it should feed activity_score.
        coalesce(
            spark_round(avg(datediff(current_date(), col("completed"))), 0).cast(IntegerType()),
            lit(0),
        ).alias("avg_days_since_activity"),
    )
)

# Implementation progress per engagement.
df_impl_metrics = (
    df_implementations
    .groupBy(col("tvs_engagementid").alias("engagement_id"))
    .agg(
        count("*").alias("total_phases"),
        count(when(col("tvs_status") == 100000003, True)).alias("completed_phases"),
        count(when(col("tvs_status") == 100000002, True)).alias("at_risk_phases"),
        # Phases without a completion percent count as 0 % in the average.
        avg(coalesce(col("tvs_completionpercent"), lit(0))).alias("avg_completion_pct"),
    )
)

# Calculate engagement health
df_eng_health = (
    df_active_engagements
    # Left joins: engagements with no activities / no implementation phases
    # are kept, with NULL metrics that the coalesce() calls below turn into 0.
    # Both metric frames expose an `engagement_id` column, so the join
    # conditions reference it through the owning DataFrame.
    .join(df_activity_metrics, col("tvs_engagementid") == df_activity_metrics["engagement_id"], "left")
    .join(df_impl_metrics, col("tvs_engagementid") == df_impl_metrics["engagement_id"], "left")
    .withColumn("duration_months", months_between(current_date(), col("start_date")))
    # Score components
    .withColumn("activity_score",
        when(coalesce(col("total_activities"), lit(0)) >= 5, lit(25))
        .when(coalesce(col("total_activities"), lit(0)) >= 2, lit(15))
        .otherwise(lit(5))
    )
    .withColumn("progress_score",
        spark_round(coalesce(col("avg_completion_pct"), lit(0)) * 0.35, 2)
    )
    .withColumn("risk_penalty",
        when(coalesce(col("at_risk_phases"), lit(0)) > 2, lit(-20))
        .when(coalesce(col("at_risk_phases"), lit(0)) > 0, lit(-10))
        .otherwise(lit(0))
    )
    # A NULL invoiced amount or value fails both comparisons and scores 5.
    .withColumn("billing_score",
        when(col("invoiced") > col("value") * 0.7, lit(25))
        .when(col("invoiced") > col("value") * 0.3, lit(15))
        .otherwise(lit(5))
    )
    .withColumn("health_score",
        spark_round(
            col("activity_score") + col("progress_score") + col("risk_penalty") + col("billing_score") + lit(15),
            0
        )
    )
    # Clamp to [0, 100].
    .withColumn("health_score",
        when(col("health_score") > 100, lit(100))
        .when(col("health_score") < 0, lit(0))
        .otherwise(col("health_score"))
    )
    .withColumn("health_label",
        when(col("health_score") >= 75, lit("On Track"))
        .when(col("health_score") >= 50, lit("Needs Attention"))
        .when(col("health_score") >= 25, lit("At Risk"))
        .otherwise(lit("Critical"))
    )
    .withColumn("calculated_at", current_timestamp())
    .select(
        "tvs_engagementid", "tvs_name", "entity_name", "account_name",
        "value", "invoiced", "duration_months",
        "total_activities", "completed_activities", "total_phases",
        "completed_phases", "at_risk_phases", "avg_completion_pct",
        "health_score", "health_label", "calculated_at"
    )
)

# Persist (full overwrite), then report the row count and label distribution.
df_eng_health.write.format("delta").mode("overwrite").save(f"{BASE_PATH}/Tables/agg_engagement_health")
print(f"Engagement health written: {df_eng_health.count()} engagements")
print("\nHealth Distribution:")
df_eng_health.groupBy("health_label").count().show()
print(f"Pipeline completed at {datetime.utcnow().isoformat()}")