In [0]:
%sql
-- Make nyc_taxi the current catalog and nyc_taxi_schema the current schema
-- for the statements that follow.
USE CATALOG nyc_taxi;
USE SCHEMA nyc_taxi_schema;

In [0]:
%sql
-- Create the sparkml_cache volume only if it is missing, so re-running this cell is harmless.
-- The notebook later points SPARKML_TEMP_DFS_PATH at this volume's /Volumes path.
CREATE VOLUME IF NOT EXISTS nyc_taxi.nyc_taxi_schema.sparkml_cache;

In [0]:
import os

#filesystem path of the sparkml_cache volume
#NOTE(review): presumably Spark ML reads SPARKML_TEMP_DFS_PATH as its temp storage location - confirm
sparkmlTempDfsPath = "/Volumes/nyc_taxi/nyc_taxi_schema/sparkml_cache"
os.environ["SPARKML_TEMP_DFS_PATH"] = sparkmlTempDfsPath

In [0]:
import gc

#drop model variables left over from earlier runs (intended to let Spark free cache)
staleModelNames = (
    "cvLr", "cvLrModel", "bestLrPipelineModel", "bestLrModel",
    "rfModel", "bestRfPipelineModel", "bestRfModel",
    "lrPipelineModel", "rfPipelineModel", "model",
)
for staleName in staleModelNames:
    #pop with a default does nothing when the name was never defined
    globals().pop(staleName, None)

#force a collection pass now that the references are gone
gc.collect()

In [0]:
#load the yellow-trips view from the nyc_taxi catalog
df = spark.read.table("nyc_taxi.nyc_taxi_schema.yellow_trips_csv_v")

#register the DataFrame as a temp view called `table`
df.createOrReplaceTempView("table")

#preview the rows first, then print the column types
display(df)
df.printSchema()

2.1 Data Preparation

In [0]:
from pyspark.sql.functions import (
    col, unix_timestamp, hour, dayofweek
)

#base dataset without junk column
df = df.drop("_rescued_data")

#columns passed to the model as numeric inputs
numericFeatures = [
    "passenger_count", "trip_distance", "trip_duration_min",
    "pickup_hour", "pickup_dow",
    "extra", "mta_tax", "tip_amount", "tolls_amount", "improvement_surcharge",
]

#columns that are indexed and one-hot encoded by the pipeline
categoricalFeatures = ["VendorID", "RateCodeID", "store_and_fwd_flag", "payment_type"]

#every feature plus the label (fare_amount) must be non-null
colsToKeepNotNull = [*numericFeatures, *categoricalFeatures, "fare_amount"]

dfReg = (
    df
    #derived features: epoch seconds for both timestamps, duration in minutes,
    #pickup hour and pickup day of week
    .withColumn("pickup_ts",  unix_timestamp("tpep_pickup_datetime"))
    .withColumn("dropoff_ts", unix_timestamp("tpep_dropoff_datetime"))
    .withColumn("trip_duration_min", (col("dropoff_ts") - col("pickup_ts")) / 60.0)
    .withColumn("pickup_hour", hour("tpep_pickup_datetime"))
    .withColumn("pickup_dow", dayofweek("tpep_pickup_datetime"))
    #basic sanity filters: keep rows where these four values are strictly positive
    .filter(col("fare_amount") > 0)
    .filter(col("trip_distance") > 0)
    .filter(col("trip_duration_min") > 0)
    .filter(col("passenger_count") > 0)
    #drop rows with a null in any feature or in the label
    .na.drop(subset=colsToKeepNotNull)
)

#70/30 train/test split with a fixed seed
trainDf, testDf = dfReg.randomSplit([0.7, 0.3], seed=42)

dfReg.printSchema()
display(dfReg)


In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

#derived column names for each categorical feature: <name>_idx after indexing, <name>_oh after encoding
indexedCols = [f"{c}_idx" for c in categoricalFeatures]
encodedCols = [f"{c}_oh" for c in categoricalFeatures]

#one StringIndexer per categorical column, with handleInvalid="keep"
indexers = [
    StringIndexer(inputCol=rawCol, outputCol=idxCol, handleInvalid="keep")
    for rawCol, idxCol in zip(categoricalFeatures, indexedCols)
]

#one-hot encode all indexed columns with a single encoder
encoder = OneHotEncoder(inputCols=indexedCols, outputCols=encodedCols)

#combine numeric columns and one-hot vectors into a single feature vector
assembler = VectorAssembler(
    inputCols=numericFeatures + encodedCols,
    outputCol="features_unscaled"
)

#scale for linear regression
#withMean=False keeps vector sparse to avoid huge memory usage
scaler = StandardScaler(
    inputCol="features_unscaled",
    outputCol="features",
    withMean=False,
    withStd=True
)

#linear regression on the scaled features, predicting fare_amount
lr = LinearRegression(
    featuresCol="features",
    labelCol="fare_amount",
    predictionCol="prediction"
)

#full pipeline: index -> encode -> assemble -> scale -> regress
lrPipeline = Pipeline(stages=[*indexers, encoder, assembler, scaler, lr])

In [0]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

#param grid: 2 regParam x 2 elasticNetParam x 1 maxIter = 4 candidate settings
#the trailing comments list the wider grids that were trimmed to keep the search cheap
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.0, 0.1]) # wider: [0.0, 0.01, 0.1]
    .addGrid(lr.elasticNetParam, [0.0, 0.5]) # wider: [0.0, 0.5, 1.0]
    .addGrid(lr.maxIter, [50]) # wider: [50, 100]
    .build()
)

#RMSE between fare_amount and the prediction column; used to rank candidates
evaluatorRmse = RegressionEvaluator(
    labelCol="fare_amount",
    predictionCol="prediction",
    metricName="rmse"
)

#cross-validated search over paramGrid using the whole lrPipeline as the estimator
cvLr = CrossValidator(
    estimator=lrPipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluatorRmse,
    numFolds=2,     #already the minimum; raise for a more stable estimate if resources allow
    parallelism=1,  #already the minimum (candidates fitted one at a time); raise to fit in parallel
    collectSubModels=False,
    seed=42         #fixes fold assignment so reruns are reproducible (same seed as the train/test split)
)

In [0]:
#run the cross-validated grid search on the training split
cvLrModel = cvLr.fit(dataset=trainDf)

#best fitted pipeline found by the search
bestLrPipelineModel = cvLrModel.bestModel
#last pipeline stage, i.e. the fitted model for the `lr` stage of lrPipeline
bestLrModel = bestLrPipelineModel.stages[-1]