In [1]:
from pyspark.sql import SparkSession

# Create the Spark session (or reuse one that is already running) that all
# later cells use to read data and train the model.
spark = (
    SparkSession.builder
    .appName("spark-assignment")
    .getOrCreate()
)

In [2]:
# Load the CSV: the first row supplies the column names and Spark infers each
# column's type from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("work/2019-04.csv")
)

In [3]:
# Keep only the columns used downstream: the three model inputs and the
# regression target (total_amount).
selected_columns = ["passenger_count", "pulocationid", "dolocationid", "total_amount"]
df_selected = df.select(*selected_columns)

# Preview the first 10 rows as a sanity check.
df_selected.show(n=10)

+---------------+------------+------------+------------+
|passenger_count|pulocationid|dolocationid|total_amount|
+---------------+------------+------------+------------+
|            1.0|       239.0|       239.0|         8.8|
|            1.0|       230.0|       100.0|         8.3|
|            1.0|        68.0|       127.0|       47.75|
|            1.0|        68.0|        68.0|         7.3|
|            1.0|        50.0|        42.0|       23.15|
|            1.0|        95.0|       196.0|         9.8|
|            1.0|       211.0|       211.0|         6.8|
|            1.0|       237.0|       162.0|         7.8|
|            1.0|       148.0|        37.0|        20.3|
|            1.0|       265.0|       265.0|        0.31|
+---------------+------------+------------+------------+
only showing top 10 rows



In [4]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml import Pipeline

In [5]:
# Pack the three input columns into the single vector column "features",
# which is the column the regressor reads.
feature_columns = ["passenger_count", "pulocationid", "dolocationid"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# The assembler is deliberately NOT applied to df_selected here: it runs as the
# first stage of the Pipeline. Pre-computing "features" on the frame would
# presumably make that stage fail because its output column already exists.

In [6]:
# 85% / 15% train/test split, seeded so repeated runs use the same seed.
split_weights = [0.85, 0.15]
trainDF, testDF = df_selected.randomSplit(split_weights, seed=42)

In [10]:
# Decision-tree regressor that predicts total_amount from the "features" vector.
# NOTE(review): maxDepth=30 appears to be the deepest tree Spark ML accepts, so
# the tree is free to fit the training data very closely - confirm this is intended.
# NOTE(review): pulocationid / dolocationid look like zone identifiers but are
# fed to the tree as plain continuous numbers - confirm this is intended.
dt = DecisionTreeRegressor(
    featuresCol="features",
    labelCol="total_amount",
    maxDepth=30,
    maxBins=50,
)

# Chain feature assembly and the regressor so one fit()/transform() call runs
# both stages in order.
pipeline = Pipeline(stages=[assembler, dt])

In [14]:
# Fit every pipeline stage on the training split only.
model = pipeline.fit(dataset=trainDF)

# Run the fitted pipeline on the held-out split; this adds a "prediction" column.
predictions = model.transform(dataset=testDF)

In [15]:

# Show the inputs next to the actual total_amount and the model's prediction
# for the first 10 test rows.
comparison_columns = [
    "passenger_count",
    "pulocationid",
    "dolocationid",
    "total_amount",
    "prediction",
]
predictions.select(*comparison_columns).show(n=10)

+---------------+------------+------------+------------+------------------+
|passenger_count|pulocationid|dolocationid|total_amount|        prediction|
+---------------+------------+------------+------------+------------------+
|            0.0|         4.0|        49.0|        26.0|            22.425|
|            0.0|         4.0|        75.0|       28.55|34.815999999999995|
|            0.0|         4.0|       234.0|        11.0|15.873333333333335|
|            0.0|         7.0|       193.0|        7.55| 14.15060606060606|
|            0.0|        10.0|       164.0|       64.42|31.207499999999996|
|            0.0|        10.0|       186.0|       76.77|21.299999999999997|
|            0.0|        12.0|        13.0|         9.3|14.381250000000001|
|            0.0|        12.0|        50.0|        21.3|            22.425|
|            0.0|        12.0|       234.0|        25.3|15.873333333333335|
|            0.0|        13.0|        13.0|         8.8|16.733469387755104|
+---------------+------------+------------+------------+------------------+

In [16]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator that computes RMSE between the true total_amount and the
# "prediction" column.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="total_amount",
    metricName="rmse",
)

# Compute the RMSE over the test-set predictions and report it.
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE):", rmse)

Root Mean Squared Error (RMSE): 9.994177702973076
