In [1]:
import sys, os, socket
# Report the interpreter, working directory and host this kernel runs on.
print(
    "\n".join(
        (
            f"PY: {sys.executable}",
            f"CWD: {os.getcwd()}",
            f"HOSTNAME: {socket.gethostname()}",
        )
    )
)

PY: /opt/conda/bin/python
CWD: /home/jovyan/work/notebooks
HOSTNAME: spark-jupyter


In [2]:
# Print the Java runtime version visible from the notebook's shell.
!java -version

openjdk version "17.0.8.1" 2023-08-24
OpenJDK Runtime Environment (build 17.0.8.1+1-Ubuntu-0ubuntu122.04)
OpenJDK 64-Bit Server VM (build 17.0.8.1+1-Ubuntu-0ubuntu122.04, mixed mode, sharing)


In [3]:
import time
import csv
import gc
import pyspark
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark import SparkContext

# Stop a Spark session left over from an earlier run of this cell, if there is one
try:
    if 'spark' in globals() and spark is not None:
        spark.stop()
        del spark
except Exception as e:
    print("Fehler beim Stoppen:", e)

# Stop a SparkContext that is still active without a session variable pointing at it
try:
    sc = SparkContext._active_spark_context
    if sc is not None:
        sc.stop()
except Exception as e:
    print("Fehler beim Stoppen:", e)

# Run the garbage collector to release memory held by the stopped session
gc.collect()

# Select the Spark mode and which dimension is varied
MODE = "multi"      # one of: "local", "single", "multi"
TAG = "-var_fts"    # suffix of the result CSV file name; alternative: "-var_dpts"
VAR = "n"           # "n" varies the number of features, "m" the number of data points

# Seed plus fixed and variable numbers of features / data points
SEED = 42
DATAPOINTS_FIX = 50000
DATAPOINTS_VAR = range(500_000, 12_000_000, 500_000) # fine
# DATAPOINTS_VAR = range(1_000_000, 6_000_000, 1_000_000) # test
FEATURES_FIX = 10
FEATURES_VAR = [8, 16, 32, 64, 128, 256, 512, 768, 1_024, 1_536, 2_048]
# NS = [1_000, 2_000, 5_000, 10_000, 20_000, 50_000, 100_000, 200_000, 500_000, 1_000_000, 2_000_000, 5_000_000, 10_000_000, 20_000_000, 50_000_000]

# Resources for the Spark session
DRIVER_CORES = 2
DRIVER_MEMORY = "4g"
EXECUTOR_CORES = 4      # total executor cores; halved per executor in "multi" mode
EXECUTOR_MEMORY = 8     # total executor memory (used as "<value>g"); halved per executor in "multi" mode
NUM_PARTITIONS = 3 * EXECUTOR_CORES

# Settings shared by every mode
_COMMON_CONF = {
    "spark.driver.bindAddress": "0.0.0.0",
    "spark.ui.port": "4040",
    "spark.default.parallelism": NUM_PARTITIONS,
    "spark.sql.shuffle.partitions": NUM_PARTITIONS,
    "spark.sql.adaptive.enabled": "false",
}

# Driver settings shared by the two cluster modes
_CLUSTER_DRIVER_CONF = {
    "spark.driver.host": "spark-jupyter",
    "spark.driver.cores": DRIVER_CORES,
    "spark.driver.memory": DRIVER_MEMORY,
}

# mode -> (application name, master URL, mode-specific settings)
# NOTE(review): on a standalone master the number of executors may be governed by
# spark.cores.max / worker resources rather than spark.executor.instances — confirm in the Spark UI.
_MODE_SETTINGS = {
    "local": (
        "OLS-Local",
        f"local[{EXECUTOR_CORES}]",
        {
            "spark.driver.host": "spark-jupyter-local",
            # Everything runs inside the driver JVM, so it gets the executor memory budget
            "spark.driver.memory": f"{EXECUTOR_MEMORY}g",
        },
    ),
    "single": (
        "OLS-Single",
        "spark://spark-master:7077",
        {
            **_CLUSTER_DRIVER_CONF,
            "spark.executor.cores": EXECUTOR_CORES,
            "spark.executor.instances": 1,
            "spark.executor.memory": f"{EXECUTOR_MEMORY}g",
        },
    ),
    "multi": (
        "OLS-Multi",
        "spark://spark-master:7077",
        {
            **_CLUSTER_DRIVER_CONF,
            "spark.executor.cores": EXECUTOR_CORES // 2,
            "spark.executor.instances": 2,
            "spark.executor.memory": f"{EXECUTOR_MEMORY // 2}g",
        },
    ),
}

# Fail early with a clear message instead of a NameError on `spark` further down
if MODE not in _MODE_SETTINGS:
    raise ValueError(f"Unbekannter MODE {MODE!r}; erlaubt sind: {sorted(_MODE_SETTINGS)}")

_app_name, _master_url, _mode_conf = _MODE_SETTINGS[MODE]
_builder = SparkSession.builder.appName(_app_name).master(_master_url)
for _key, _value in {**_COMMON_CONF, **_mode_conf}.items():
    _builder = _builder.config(_key, _value)
spark = _builder.getOrCreate()

# Clear cached data
spark.catalog.clearCache()

print("PySpark:", pyspark.__version__)
print("Spark UI:", spark.sparkContext.uiWebUrl)
print("Default parallelism:", spark.conf.get("spark.default.parallelism"))
print("Anzahl Partitionen:", spark.conf.get("spark.sql.shuffle.partitions"))
print("Driver Memory:", spark.conf.get("spark.driver.memory"))

# Executor settings are only configured in the cluster modes; skip the ones that are unset
for _label, _key in (
    ("Anzahl Worker:", "spark.executor.instances"),
    ("CPU-Kerne pro Worker:", "spark.executor.cores"),
    ("Memory pro Worker:", "spark.executor.memory"),
):
    _value = spark.conf.get(_key, None)
    if _value is not None:
        print(_label, _value)

PySpark: 3.5.0
Spark UI: http://spark-jupyter:4040
Default parallelism: 12
Anzahl Partitionen: 12
Driver Memory: 4g
Anzahl Worker: 2
CPU-Kerne pro Worker: 2
Memory pro Worker: 4g


In [4]:
from pyspark.sql import functions as F
from pyspark.ml.functions import array_to_vector

def gen_spark_data_with_repartitioning(n:int, d:int, seed:int, parts:int):
    """Build a synthetic regression DataFrame with columns ``features`` and ``y``.

    ``spark.range(n)`` is repartitioned into ``parts`` partitions. Each row gets
    ``d`` values drawn with ``F.randn`` (seeds ``seed`` .. ``seed + d - 1``), and
    ``y`` is their weighted sum with weight ``j + 1`` for feature ``j`` plus
    ``0.1 * F.randn(seed + d)``. Uses the module-level ``spark`` session.
    """
    feature_cols = [F.randn(seed + idx) for idx in range(d)]
    weight_cols = [F.lit(float(idx + 1)) for idx in range(d)]
    features = F.array(*feature_cols)
    weights = F.array(*weight_cols)

    # Element-wise product followed by a fold gives the dot product features . weights
    weighted = F.zip_with(features, weights, lambda value, weight: value * weight)
    signal = F.aggregate(weighted, F.lit(0.0), lambda total, term: total + term)

    label = signal + 0.1 * F.randn(seed + d)

    base = spark.range(n).repartition(parts)
    return base.select(
        array_to_vector(features).alias("features"),
        label.alias("y"),
    )

def human_readable_size(num_bytes: int):
    """Format a byte count as a string in B, KB, MB or GB (factor 1024).

    Values below 1024 are printed as-is with the unit ``B``; larger values are
    scaled and shown with two decimals. Anything from 1024**3 upwards uses ``GB``.
    """
    if num_bytes < 1024:
        return f"{num_bytes} B"
    for power, unit in ((1, "KB"), (2, "MB")):
        if num_bytes < 1024 ** (power + 1):
            return f"{num_bytes / 1024 ** power:.2f} {unit}"
    return f"{num_bytes / 1024**3:.2f} GB"

def warm_up(spark):
    """Run one small count and one OLS fit so later measurements skip start-up costs.

    Generates a cached 1,000,000 x 10 dataset, materialises it with a count and
    fits a ``LinearRegression`` (normal solver) on it. The cached data is released
    afterwards so it does not occupy executor storage during the measured runs.

    Args:
        spark: Not used by this function; the data generator reads the
            module-level session. Kept so existing calls stay valid.
    """
    tmp = gen_spark_data_with_repartitioning(1_000_000, 10, 1, parts=NUM_PARTITIONS).cache()
    try:
        _ = tmp.agg(F.count("*")).collect()
        _ = LinearRegression(featuresCol="features", labelCol="y", regParam=0.0, elasticNetParam=0.0, solver="normal").fit(tmp)
    finally:
        # Block until the cache is dropped so the benchmarks start from a clean state
        tmp.unpersist(blocking=True)

In [5]:
from sparkmeasure import StageMetrics
from pyspark.sql import SparkSession, functions as F
from pyspark.storagelevel import StorageLevel
from pyspark.ml.regression import LinearRegression, LinearRegressionModel
import pandas as pd
import time

def c(key: str, default=None):
    """Read a config value from the active SparkSession.

    Returns ``default`` when the lookup on the session's ``conf`` raises.
    """
    session = SparkSession.getActiveSession()
    try:
        value = session.conf.get(key, default)
    except Exception:
        value = default
    return value

def g(key: str, metrics: StageMetrics) -> float:
    """Return one aggregated stage metric as a float.

    A missing key is read as 0 and a ``None`` value yields 0.0.
    """
    aggregated = metrics.aggregate_stagemetrics()
    value = aggregated.get(key, 0)
    if value is None:
        return 0.0
    return float(value)

def get_theoretical_size_bytes(m: int, n: int, d_size: int = 8):
    """Return ``(m * n + m) * d_size``: the size of an m x n matrix plus m extra values.

    The extra ``m`` values are presumably the label column — confirm with the caller.
    """
    value_count = m * n + m
    return value_count * d_size

def get_benchmark(m: int, n: int, t_fit: float, model: LinearRegressionModel, metrics: StageMetrics, mode: str):
    """Collect one result row (dict) for a single fit.

    Args:
        m: Number of data points of the fitted dataset.
        n: Number of features of the fitted dataset.
        t_fit: Wall-clock duration of the fit in seconds.
        model: The fitted model; its training summary supplies the quality metrics.
        metrics: StageMetrics instance whose begin()/end() enclosed the fit.
        mode: Label of the Spark mode, stored as-is in the row.

    Returns:
        dict with context, session configuration, model quality, timings,
        derived overheads, data movement and the theoretical dataset size.
        The stage metric values are assumed to be milliseconds / bytes, as the
        variable names suggest.
    """
    spark = SparkSession.getActiveSession()
    sc = spark.sparkContext

    # Aggregate the stage metrics once and read every key from that one snapshot
    aggregated = metrics.aggregate_stagemetrics()

    def metric(key: str) -> float:
        value = aggregated.get(key, 0)
        return 0.0 if value is None else float(value)

    elapsed_ms              = metric("elapsedTime")
    executorRun_ms          = metric("executorRunTime")
    cpu_ms                  = metric("executorCpuTime")
    jvm_gc_ms               = metric("jvmGCTime")
    # Read but not reported in the row (kept from the original implementation)
    scheduler_delay_ms      = metric("schedulerDelay")
    task_deser_ms           = metric("taskDeserializationTime")
    result_ser_ms           = metric("resultSerializationTime")
    shuffle_read_bytes      = metric("shuffleReadBytes")
    shuffle_write_bytes     = metric("shuffleWriteBytes")
    spilled_mem_bytes       = metric("memoryBytesSpilled")
    spilled_disk_bytes      = metric("diskBytesSpilled")
    bytes_read              = metric("bytesRead")
    bytes_written           = metric("bytesWritten")

    t_fit_ms                = t_fit * 1000
    # EXECUTOR_CORES is the total number of executor cores in every mode
    # (the "multi" mode splits it across two executors), i.e. the parallel task slots.
    total_cores             = EXECUTOR_CORES
    wall_overhead_ms        = (t_fit_ms - executorRun_ms / total_cores)
    nonCPU_overhead_ms      = max(0.0, executorRun_ms - cpu_ms)
    parallel_overhead_ms    = wall_overhead_ms + nonCPU_overhead_ms
    wall_overhead_pct       = (wall_overhead_ms / t_fit_ms) if t_fit_ms else None
    nonCPU_overhead_pct     = (nonCPU_overhead_ms / executorRun_ms) if executorRun_ms else None
    parallel_overhead_pct   = (parallel_overhead_ms / t_fit_ms) if t_fit_ms else None
    cpu_efficiency          = (cpu_ms / (total_cores * t_fit_ms)) if (total_cores * t_fit_ms) else None

    # Accessing `summary` on a PySpark model without a training summary raises,
    # which getattr's default does not cover, so consult `hasSummary` first.
    # Objects without a `hasSummary` attribute are treated as before.
    if getattr(model, "hasSummary", True):
        summary = getattr(model, "summary", None)
    else:
        summary = None

    row = {
        # Context
        "m_datapoints": m,
        "n_features": n,
        "mode": mode,

        # Driver / executors / partitions
        "driver_cores": c("spark.driver.cores", "0"),
        "driver_memory": c("spark.driver.memory", "0"),
        "executor_instances": c("spark.executor.instances", "0"),
        "executor_cores": c("spark.executor.cores", "0"),
        "executor_memory": c("spark.executor.memory", "0"),
        "default_parallelism": sc.defaultParallelism,
        "shuffle_partitions": c("spark.sql.shuffle.partitions", ""),
        "master": sc.master,

        # Model quality
        "r2": getattr(summary, "r2", None),
        "rmse": getattr(summary, "rootMeanSquaredError", None),
        "mae": getattr(summary, "meanAbsoluteError", None),
        "explainedVariance": getattr(summary, "explainedVariance", None),

        # Time / totals
        "t_fit_ms": t_fit_ms,
        "elapsed_ms": elapsed_ms,
        "executorRun_ms": executorRun_ms,
        "cpu_ms": cpu_ms,
        "gc_ms": jvm_gc_ms,

        # Overheads
        "wall_overhead_ms": wall_overhead_ms,
        "nonCPU_overhead_ms": nonCPU_overhead_ms,
        "parallel_overhead_ms": parallel_overhead_ms,

        # Overhead shares (fractions of the respective total)
        "wall_overhead_pct": wall_overhead_pct,
        "nonCPU_overhead_pct": nonCPU_overhead_pct,
        "parallel_overhead_pct": parallel_overhead_pct,

        # CPU time relative to the available core time (was computed but never reported)
        "cpu_efficiency": cpu_efficiency,

        # Data movement
        "bytes_read": bytes_read,
        "bytes_written": bytes_written,
        "shuffle_read": shuffle_read_bytes,
        "shuffle_write": shuffle_write_bytes,
        "spilled": (spilled_mem_bytes + spilled_disk_bytes),

        # Memory
        "theoretical_size_bytes": get_theoretical_size_bytes(m, n, d_size=8),
    }
    return row

In [6]:
def _benchmark_ols_fit(m, n):
    """Generate an m x n dataset, fit OLS on it and record timings and metrics.

    Appends to the module-level lists ``gen_times``, ``fit_times`` and ``results``.
    The cached dataset is released when the run is finished (or fails), so cached
    data from earlier iterations does not pile up in executor storage.
    """
    print(f"Datensatz vom Shape {m} x {n} generieren. Geschätze Größe: {human_readable_size(m * n * 8)}")

    # Generate the dataset; the count forces materialisation of the cache
    t0 = time.perf_counter()
    sdf = gen_spark_data_with_repartitioning(m, n, SEED, NUM_PARTITIONS).cache()
    try:
        _ = sdf.agg(F.count("*")).collect()
        t_gen = time.perf_counter() - t0

        print(f"gen: {t_gen:.2f}s, m={m}, n={n}, seed={SEED}")
        gen_times.append(round(t_gen, 2))

        # OLS regression
        lr = LinearRegression(featuresCol="features", labelCol="y", regParam=0.0, elasticNetParam=0.0, solver="normal")
        metrics = StageMetrics(spark)

        # Train the model and measure the time
        metrics.begin()
        t0 = time.perf_counter()
        model = lr.fit(sdf)
        t_fit = time.perf_counter() - t0
        metrics.end()

        # Pass the unrounded duration: rounding to 10 ms first would distort the
        # millisecond-based overhead figures derived from it.
        res = get_benchmark(m=m, n=n, t_fit=t_fit, model=model, metrics=metrics, mode=MODE)
        results.append(res)

        print(f"fit: {t_fit:.2f}s, R2={model.summary.r2:.4f}\n")
        fit_times.append(round(t_fit, 2))
    finally:
        # Only after the summary metrics above were read from the cached data
        sdf.unpersist(blocking=True)


def run_variable_datapoints(datapoints, features):
    """Benchmark the fit for each number of data points in ``datapoints`` at a fixed feature count."""
    for m in datapoints:
        _benchmark_ols_fit(m, features)


def run_variable_features(features, datapoints):
    """Benchmark the fit for each number of features in ``features`` at a fixed data point count."""
    for n in features:
        _benchmark_ols_fit(datapoints, n)

In [7]:
# Result collectors filled by the run_variable_* functions
gen_times = []
fit_times = []
results = []

# Reject an unknown setting before the warm-up instead of silently writing an empty CSV
if VAR not in ("m", "n"):
    raise ValueError(f"Unbekannte VAR {VAR!r}; erlaubt sind 'm' und 'n'")

print("Mode:", MODE)
print("Variable Größe:", VAR)
# Executor settings are only configured in the cluster modes; skip the ones that are unset
for _label, _key in (
    ("Driver Memory:", "spark.driver.memory"),
    ("Anzahl Worker:", "spark.executor.instances"),
    ("CPU-Kerne pro Worker:", "spark.executor.cores"),
    ("Memory pro Worker:", "spark.executor.memory"),
):
    _value = spark.conf.get(_key, None)
    if _value is not None:
        print(_label, _value)
print("\n")

# Warm up Spark
warm_up(spark)

# Run the series for the configured variable dimension
if VAR == "m":
    run_variable_datapoints(DATAPOINTS_VAR, FEATURES_FIX)
elif VAR == "n":
    run_variable_features(FEATURES_VAR, DATAPOINTS_FIX)

print("Zeiten für Generierung:", gen_times)
print("Zeiten für Training:", fit_times)

# Make sure the target directory exists so a long run is not lost at the very end
os.makedirs("../stats", exist_ok=True)
pd.DataFrame(results).to_csv(f"../stats/{MODE}{TAG}.csv", index=False)
print(f"{MODE}{TAG}.csv gespeichert")

Mode: multi
Variable Größe: n
Driver Memory: 4g
Anzahl Worker: 2
CPU-Kerne pro Worker: 2
Memory pro Worker: 4g


Datensatz vom Shape 50000 x 8 generieren. Geschätze Größe: 3.05 MB
gen: 0.81s, m=50000, n=8, seed=42
fit: 0.64s, R2=1.0000

Datensatz vom Shape 50000 x 16 generieren. Geschätze Größe: 6.10 MB
gen: 0.57s, m=50000, n=16, seed=42
fit: 0.55s, R2=1.0000

Datensatz vom Shape 50000 x 32 generieren. Geschätze Größe: 12.21 MB
gen: 0.67s, m=50000, n=32, seed=42
fit: 0.62s, R2=1.0000

Datensatz vom Shape 50000 x 64 generieren. Geschätze Größe: 24.41 MB
gen: 0.73s, m=50000, n=64, seed=42
fit: 0.53s, R2=1.0000

Datensatz vom Shape 50000 x 128 generieren. Geschätze Größe: 48.83 MB
gen: 1.08s, m=50000, n=128, seed=42
fit: 0.53s, R2=1.0000

Datensatz vom Shape 50000 x 256 generieren. Geschätze Größe: 97.66 MB
gen: 1.61s, m=50000, n=256, seed=42
fit: 0.94s, R2=1.0000

Datensatz vom Shape 50000 x 512 generieren. Geschätze Größe: 195.31 MB
gen: 3.00s, m=50000, n=512, seed=42
fit: 2.05s, R2=1.0