In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import time
import random

# Initialize Spark Session
# Note: For better visibility on the Spark UI, ensure you run this on a cluster 
# with at least 2 executors/workers.
spark = SparkSession.builder \
    .appName("SparkUIDebugger") \
    .getOrCreate()

# Set logging level for cleaner console output
spark.sparkContext.setLogLevel("WARN")

# --- 1. SETUP: Create a large baseline DataFrame (10 Million Records) ---
NUM_RECORDS = 10_000_000

# Create an RDD for efficient data generation
rdd = spark.sparkContext.parallelize(range(NUM_RECORDS))

# Convert RDD to DataFrame with a unique ID and a random group key (1 to 100)
base_df = rdd.map(lambda i: (i, random.randint(1, 100), f"Item_{i}")) \
             .toDF(["id", "group_key", "item_name"]) \
             .repartition(10) # Ensure data is split across partitions for visualization

print(f"Created base DataFrame with {NUM_RECORDS} records and 10 partitions.")
base_df.cache() # Cache for re-use and to demonstrate the Storage tab
base_df.count() # Action to materialize the cache


25/11/26 21:03:58 WARN Utils: Your hostname, user-HP-Pavilion-x360-Convertible-14-dh0xxx resolves to a loopback address: 127.0.1.1; using 192.168.1.24 instead (on interface wlo1)
25/11/26 21:03:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/26 21:03:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

Created base DataFrame with 10000000 records and 10 partitions.


                                                                                

10000000

In [None]:
# =======================================================================
# SCENARIO A: High Shuffle & Wide Transformation (Jobs and Stages Tab)
# Goal: Observe a large "Shuffle Write" and "Shuffle Read" step.
# =======================================================================

In [None]:
print("\n--- Running Scenario A: High Shuffle Aggregation ---")
print("Spark UI Tip: Look at the SQL/Graph tab for a large 'Exchange' node.")

# This operation requires a full redistribution of data across the network (Shuffle)
# to group all matching 'group_key' records together for the aggregation.
df_shuffled = base_df.groupBy("group_key").agg(
    F.count("id").alias("total_count"),
    F.max("id").alias("max_id")
)

# Trigger the action and measure the time
start_time_a = time.time()
df_shuffled.collect() # Use collect() to force computation to the driver
end_time_a = time.time()

print(f"Scenario A completed in {end_time_a - start_time_a:.2f} seconds.")
print("In the Spark UI: Go to the 'Stages' tab for the latest Job. Look for the 'Shuffle Read' and 'Shuffle Write' metrics. They should be significantly high.")
# 


In [None]:
# =======================================================================
# SCENARIO B: Data Skew (Stages Tab: Stragglers)
# Goal: Artificially create a straggler task where one task processes 99% of data.
# =======================================================================

print("\n--- Running Scenario B: Data Skew ---")
print("Spark UI Tip: Look for one task taking significantly longer than others in the 'Stages' tab.")

spark.conf.set("spark.sql.adaptive.enabled", "false")

# Create skewed data: 99% of records get 'group_key' = 9999
skewed_rdd = spark.sparkContext.parallelize(range(NUM_RECORDS))
skewed_df = skewed_rdd.map(lambda i: (i, 9999 if i < NUM_RECORDS * 0.99 else random.randint(1, 10), f"SkewItem_{i}")) \
                      .toDF(["id", "skew_key", "item_name"])

# The aggregation will force a shuffle on 'skew_key'. 
# Since almost all data belongs to key '9999', one partition's task 
# will be massive (the straggler).
df_skewed_agg = skewed_df.groupBy("skew_key").agg(F.count("id").alias("count"))

# Trigger the action
start_time_b = time.time()
df_skewed_agg.collect() 
end_time_b = time.time()

print(f"Scenario B completed in {end_time_b - start_time_b:.2f} seconds.")
print("In the Spark UI: Go to the 'Stages' tab. Locate the stage with the aggregation. You should see Task metrics where the 'Duration' of one task is far greater than the others, and its 'Input Size' is much larger (Data Skew).")
# 


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col
import random

# -------------------------------
# 1. Disable AQE
# -------------------------------

spark = (
    SparkSession.builder
        .appName("AQE_Off_Skew_Test")
        .config("spark.sql.adaptive.enabled", "false")  # Turn off AQE
        .config("spark.sql.adaptive.skewJoin.enabled", "false")
        .config("spark.sql.adaptive.coalescePartitions.enabled", "false")
        .getOrCreate()
)

spark.conf.set("spark.sql.shuffle.partitions", "50")   # Helps visualize skew
print("AQE Enabled?:", spark.conf.get("spark.sql.adaptive.enabled"))


# -------------------------------
# 2. Create skewed DataFrame
# -------------------------------

# Many rows with key=1 (majority) to create skew
skewed_data = []
for i in range(200000):  # 2 million rows
    if i < 180000:
        skewed_data.append((1, random.randint(1, 100)))
    else:
        skewed_data.append((i % 1000 + 2, random.randint(1, 100)))

df_skewed = spark.createDataFrame(skewed_data, ["id", "value"])


# -------------------------------
# 3. Create a small lookup table
# -------------------------------

lookup_data = [(i, f"name_{i}") for i in range(1, 500)]
df_lookup = spark.createDataFrame(lookup_data, ["id", "name"])


# -------------------------------
# 4. Trigger skew with a join
# -------------------------------

joined_df = df_skewed.join(df_lookup, "id", "inner")

print("Starting skewed join...")
joined_df.count()   # ACTION to force execution


# -------------------------------
# 5. Stop session
# -------------------------------

spark.stop()


In [2]:
# =======================================================================
# SCENARIO C: UDF Bottleneck (Stages Tab: Task CPU Time)
# Goal: Simulate a slow User Defined Function (UDF) that dominates CPU time.
# =======================================================================

print("\n--- Running Scenario C: UDF Bottleneck ---")
print("Spark UI Tip: Look at the 'Summary Metrics' in the Stages tab for high 'Executor CPU Time'.")

# 1. Define a very slow UDF that sleeps (simulates complex, non-optimized logic)
def slow_transform(val):
    """Simulates a slow, expensive operation."""
    time.sleep(0.0001) # Small sleep, but multiplies by 10 million rows
    return val.upper()

slow_udf = F.udf(slow_transform, StringType())

# 2. Apply the slow UDF to the large DataFrame (requires a local Python context execution)
df_slow_udf = base_df.withColumn("transformed_name", slow_udf(F.col("item_name")))

# 3. Trigger the action
start_time_c = time.time()
df_slow_udf.count() # Count forces execution
end_time_c = time.time()

print(f"Scenario C completed in {end_time_c - start_time_c:.2f} seconds.")
print("In the Spark UI: Go to the 'Stages' tab. This stage should show a high value for 'Executor CPU Time' and low 'Shuffle Read/Write', indicating the bottleneck is in computation, not networking.")
# 




--- Running Scenario C: UDF Bottleneck ---
Spark UI Tip: Look at the 'Summary Metrics' in the Stages tab for high 'Executor CPU Time'.
Scenario C completed in 0.48 seconds.
In the Spark UI: Go to the 'Stages' tab. This stage should show a high value for 'Executor CPU Time' and low 'Shuffle Read/Write', indicating the bottleneck is in computation, not networking.


In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import *
import threading
from datetime import datetime

from py4j.java_gateway import java_import

# Stores all metrics
job_metrics = {}
stage_metrics = {}
task_metrics = {}


class FullMetricsListener:
    def __init__(self):
        self.lock = threading.Lock()

    # --------------------------
    #   JOB METRICS
    # --------------------------
    def onJobStart(self, jobStart):
        with self.lock:
            job_id = jobStart.jobId()
            job_metrics[job_id] = {
                "job_id": job_id,
                "status": "RUNNING",
                "stage_ids": list(jobStart.stageIds()),
                "start_time": datetime.now(),
                "end_time": None,
                "duration_ms": None
            }

    def onJobEnd(self, jobEnd):
        with self.lock:
            job_id = jobEnd.jobId()
            status = (
                "SUCCEEDED" if "JobSucceeded" in str(jobEnd.jobResult()) 
                else "FAILED"
            )

            job_metrics[job_id]["status"] = status
            job_metrics[job_id]["end_time"] = datetime.now()

            start = job_metrics[job_id]["start_time"]
            end = job_metrics[job_id]["end_time"]
            job_metrics[job_id]["duration_ms"] = (end - start).total_seconds() * 1000

    # --------------------------
    #   STAGE METRICS
    # --------------------------
    def onStageSubmitted(self, stageSubmitted):
        with self.lock:
            info = stageSubmitted.stageInfo()
            stage_id = info.stageId()

            stage_metrics[stage_id] = {
                "stage_id": stage_id,
                "name": info.name(),
                "status": "RUNNING",
                "num_tasks": info.numTasks(),
                "start_time": datetime.now(),
                "end_time": None,
                "duration_ms": None,
                "shuffle_read_bytes": 0,
                "shuffle_write_bytes": 0,
                "input_bytes": 0,
                "output_bytes": 0,
                "executor_run_time_ms": 0,
                "gc_time_ms": 0
            }

    def onStageCompleted(self, stageCompleted):
        with self.lock:
            info = stageCompleted.stageInfo()
            stage_id = info.stageId()

            metrics = info.taskMetrics()

            stage_metrics[stage_id]["status"] = "SUCCEEDED"
            stage_metrics[stage_id]["end_time"] = datetime.now()

            start = stage_metrics[stage_id]["start_time"]
            end = stage_metrics[stage_id]["end_time"]

            stage_metrics[stage_id]["duration_ms"] = (end - start).total_seconds() * 1000

            # Metrics
            stage_metrics[stage_id]["shuffle_read_bytes"] = metrics.shuffleReadMetrics().totalBytesRead()
            stage_metrics[stage_id]["shuffle_write_bytes"] = metrics.shuffleWriteMetrics().bytesWritten()
            stage_metrics[stage_id]["input_bytes"] = metrics.inputMetrics().bytesRead()
            stage_metrics[stage_id]["output_bytes"] = metrics.outputMetrics().bytesWritten()
            stage_metrics[stage_id]["executor_run_time_ms"] = metrics.executorRunTime()
            stage_metrics[stage_id]["gc_time_ms"] = metrics.jvmGCTime()

    # --------------------------
    #   TASK METRICS
    # --------------------------
    def onTaskEnd(self, taskEnd):
        with self.lock:
            info = taskEnd.taskInfo()
            metrics = taskEnd.taskMetrics()

            task_id = info.taskId()

            task_metrics[task_id] = {
                "task_id": task_id,
                "stage_id": info.stageId(),
                "host": info.host(),
                "executor_id": info.executorId(),
                "status": "FINISHED",
                "duration_ms": info.duration(),
                "input_bytes": metrics.inputMetrics().bytesRead(),
                "output_bytes": metrics.outputMetrics().bytesWritten(),
                "shuffle_read_bytes": metrics.shuffleReadMetrics().totalBytesRead(),
                "shuffle_write_bytes": metrics.shuffleWriteMetrics().bytesWritten(),
                "executor_cpu_time_ms": metrics.executorCpuTime() / 1e6,
                "executor_run_time_ms": metrics.executorRunTime(),
                "gc_time_ms": metrics.jvmGCTime()
            }


In [4]:
sc = spark.sparkContext

gateway = sc._gateway
java_import(gateway.jvm, "org.apache.spark.*")

listener = FullMetricsListener()
jlistener = gateway.jvm.PythonListener(listener)

sc._jsc.sc().addSparkListener(jlistener)


TypeError: 'JavaPackage' object is not callable