In [1]:
# ============================================================
# 1. Set up Spark Session
# ============================================================

from pyspark.sql import SparkSession

# Session options, kept together so they are easy to scan and tune.
spark_settings = {
    "spark.executor.memory": "4g",
    "spark.driver.memory": "4g",
    "spark.sql.execution.arrow.pyspark.enabled": "true",
}

# Apply each option to the builder in the order listed above.
session_builder = SparkSession.builder.appName("Seattle911")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Shared session handle used by every later cell.
spark = session_builder.getOrCreate()

# Only WARN and above are logged from here on.
spark.sparkContext.setLogLevel("WARN")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/23 03:29:58 WARN Utils: Your hostname, Phongs-MacBook-Pro-23.local, resolves to a loopback address: 127.0.0.1; using 10.0.0.46 instead (on interface en0)
25/11/23 03:29:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/23 03:29:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/11/23 03:29:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/11/23 03:29:59 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [2]:
# ============================================================
# 2. Data Import
# ============================================================

import os
from pathlib import Path

# Two directory levels above the current working directory, made absolute.
PROJECT_ROOT = Path("../../").resolve()

# Processed call-data file under <PROJECT_ROOT>/data/processed/.
data_path = PROJECT_ROOT.joinpath(
    "data", "processed", "calldata_20251019_processed.parquet"
)

# spark.read.parquet is given a plain string path rather than a Path object.
df = spark.read.parquet(str(data_path))

# Report how much was loaded.
row_total = df.count()
column_total = len(df.columns)
print(f"Loaded {row_total:,} rows with {column_total} columns")

Loaded 848,167 rows with 22 columns


In [3]:
# ============================================================
# 3. Data Filtering and Feature Setup
# ============================================================
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

# Column used as the regression target (its name indicates seconds).
label_col = "First SPD Call Sign Response Time (s)"

# Every candidate input column.
raw_feats = [
    'Priority', 'Dispatch Neighborhood', 'Dispatch Sector', 'Count Of Officers',
    'is_rush_hour', 'is_nighttime', 'priority_x_officers', 'priority_x_hour',
    'event_ts', 'year', 'month', 'day', 'day_of_week', 'hour',
    'TEMP', 'PRCP', 'COCO', 'weather_severity', 'is_raining', 'is_freezing',
]

# Columns that are string-indexed and one-hot encoded before assembly.
cat_feats = ["Dispatch Neighborhood", "Dispatch Sector"]

# Listed in raw_feats but passed to the model neither as numeric nor as
# categorical inputs.
# NOTE(review): the reason for leaving these two out is not recorded here.
_excluded_raw_feats = ("event_ts", "COCO")

# Numeric inputs: whatever remains of raw_feats, in its original order.
num_feats = [
    feat_name
    for feat_name in raw_feats
    if feat_name not in cat_feats and feat_name not in _excluded_raw_feats
]

def to_d(c):
    """Build a column expression for ``c`` cast to double with nulls replaced by 0.0.

    Args:
        c: Name of the column to convert.

    Returns:
        The first non-null of the double-cast column and the literal 0.0.

    NOTE(review): no cell visible in this notebook calls this helper.
    """
    as_double = F.col(c).cast("double")
    fallback = F.lit(0.0)
    return F.coalesce(as_double, fallback)

# Intermediate column names, one per categorical feature:
#   <name>_idx  - output of the string indexer
#   <name>_vec  - output of the one-hot encoder
idx_cols = [f"{name}_idx" for name in cat_feats]
vec_cols = [f"{name}_vec" for name in cat_feats]

# handleInvalid="keep" is set on the indexers (not on the encoders).
indexers = [
    StringIndexer(inputCol=source, outputCol=target, handleInvalid="keep")
    for source, target in zip(cat_feats, idx_cols)
]

# One encoder per indexed column.
encoders = [
    OneHotEncoder(inputCols=[source], outputCols=[target])
    for source, target in zip(idx_cols, vec_cols)
]

# Encoded categoricals first, then the numeric columns, assembled into "features".
assembled_feats = vec_cols + num_feats
assembler = VectorAssembler(inputCols=assembled_feats, outputCol="features")

# Copy the target into a column named "label" and drop rows where it is missing.
df_labeled = (
    df
    .withColumn("label", F.col(label_col))
    .dropna(subset=["label"])
)

# 80/20 split with a fixed seed; only the training part is cached.
train_split, test = df_labeled.randomSplit([0.8, 0.2], seed=42)
train = train_split.cache()

# Report both split sizes.
train_rows = train.count()
test_rows = test.count()
print(f"Train size: {train_rows:,}, Test size: {test_rows:,}")

                                                                                

Train size: 678,581, Test size: 169,586


                                                                                

In [4]:
# ============================================================
# 4. Random Forest Baseline Model
# ============================================================

from pyspark.ml.regression import RandomForestRegressor, RandomForestRegressionModel
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator


# Baseline hyper-parameters, gathered in one mapping so they can be read and
# changed in a single place.
rf_params = dict(
    featuresCol="features",
    labelCol="label",
    predictionCol="prediction",
    seed=42,
    numTrees=20,
    maxDepth=8,
    maxBins=64,
    subsamplingRate=0.8,
    featureSubsetStrategy="auto",
)
rf = RandomForestRegressor(**rf_params)

# Stage order: index categoricals -> one-hot encode -> assemble vector -> forest.
rf_pipeline = Pipeline(stages=[*indexers, *encoders, assembler, rf])

In [5]:
# ============================================================
# 5. Fit RF Model
# ============================================================

# Fit every pipeline stage (indexers, encoders, assembler, forest) on the training split.
rf_model = rf_pipeline.fit(train)
# Apply the fitted pipeline to the held-out test split; the evaluation cell reads rf_pred.
rf_pred = rf_model.transform(test)

25/11/23 03:30:07 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/11/23 03:30:13 WARN DAGScheduler: Broadcasting large task binary with size 1223.9 KiB
                                                                                

In [None]:
# ============================================================
# 6. Evaluate RF Model
# ============================================================

# One evaluator serves every metric; only metricName is overridden per call.
rf_evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")

# Keep just the two columns the evaluator reads and cache them, so the four
# metric passes below do not each re-run the whole pipeline over the test split.
rf_scored = rf_pred.select("label", "prediction").cache()
try:
    rf_metrics = {
        metric: rf_evaluator.evaluate(rf_scored, {rf_evaluator.metricName: metric})
        for metric in ("mae", "rmse", "mse", "r2")
    }
finally:
    # Release the cached rows whether or not evaluation succeeded.
    rf_scored.unpersist()

# Expose each metric under its own name for use elsewhere in the notebook.
rf_mae = rf_metrics["mae"]
rf_rmse = rf_metrics["rmse"]
rf_mse = rf_metrics["mse"]
rf_r2 = rf_metrics["r2"]

print("\n Random Forest Baseline Results:")
print(f"MAE:   {rf_mae:,.2f}")
print(f"RMSE:  {rf_rmse:,.2f}")
print(f"MSE:   {rf_mse:,.2f}")
# The superscript two is written as an escape so the label survives any
# re-encoding of the notebook file.
print(f"R\u00b2:    {rf_r2:,.4f}")

                                                                                


📊 Random Forest Baseline Results:
MAE:   956.28
RMSE:  1,318.02
MSE:   1,737,171.30
R²:    0.2164


In [None]:
# Stop the Spark session created in the first cell; nothing after this can use `spark`.
spark.stop()