# Part 3: Machine Learning

##### Data Preparation for Modelling

In [0]:
# Load the combined yellow + green taxi dataset (Parquet) from DBFS into a Spark DataFrame
combined_data = spark.read.parquet("/dbfs/combined_yellow_green_dataset")

In [0]:
# Preview the first 5 rows of combined_data
combined_data.show(n=5)

+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+----------------+-------------------+------------------+----------+
|VendorID|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|duration_seconds|     duration_hours|         speed_mph|taxi_color|
+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+----------------+-------------------+------------------+----------+
|       1|2019-03-01 00:25:27|2

In [0]:
# Modelling pool: every trip picked up strictly before 1 April 2022 (i.e. data until March 2022)
final_model_data = combined_data.filter(combined_data["pickup_datetime"] < '2022-04-01 00:00:00')

# Hold-out set: every trip picked up on or after 1 April 2022 (no upper bound is applied here);
# it is used at the end of the notebook to test the trained model
apr_2022_data = combined_data.filter(combined_data["pickup_datetime"] >= '2022-04-01 00:00:00')

In [0]:
# Print the column names, data types and nullability of final_model_data
final_model_data.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- duration_seconds: long (nullable = true)
 |-- duration_hours: double (nullable = true)
 |-- speed_mph: double (nullable = true)
 |-- taxi_color: string (nullable = true)



In [0]:
# Columns to carry alongside the feature vector after the pipeline is applied.
# The target (total_amount) is placed last; congestion_surcharge is not included.
cols_list = [
    "VendorID",
    "pickup_datetime",
    "dropoff_datetime",
    "passenger_count",
    "trip_distance",
    "RatecodeID",
    "store_and_fwd_flag",
    "PULocationID",
    "DOLocationID",
    "payment_type",
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
    "duration_seconds",
    "duration_hours",
    "speed_mph",
    "taxi_color",
    "total_amount",
]

In [0]:
# Start with no pipeline stages; transformers are added to this list in the following cells
stages = list()

In [0]:
# Remove every row that has a missing value in any column used as a model feature
# NOTE(review): the label column total_amount is not part of this subset — confirm it has no nulls
final_model_data = final_model_data.dropna(
    subset=["trip_distance", "duration_hours", "RatecodeID", "PULocationID", "DOLocationID", "taxi_color"]
)

In [0]:
# Import OneHotEncoder, StringIndexer, VectorAssembler from pyspark.ml.feature 
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
# Encode the categorical taxi_color column in two steps:
# StringIndexer writes a numeric index to taxi_color_ind, then OneHotEncoder turns
# that index into the vector column taxi_color_ohe
col_indexer = StringIndexer(outputCol="taxi_color_ind", inputCol="taxi_color")
col_encoder = OneHotEncoder(outputCols=["taxi_color_ohe"], inputCols=["taxi_color_ind"])
# Queue both transformers, in this order, as pipeline stages
stages.extend([col_indexer, col_encoder])

In [0]:
# Importing VectorAssembler
from pyspark.ml.feature import VectorAssembler
# Assemble the numeric feature columns plus the one-hot encoded taxi colour
# into a single vector column named "features"
vector_assembler = VectorAssembler(
    outputCol="features",
    inputCols=[
        "trip_distance",
        "duration_hours",
        "RatecodeID",
        "PULocationID",
        "DOLocationID",
        "taxi_color_ohe",
    ],
)

In [0]:
# The assembler runs after the encoders, so it is appended as the last stage
stages.append(vector_assembler)

In [0]:
# Import Pipeline from pyspark.ml
from pyspark.ml import Pipeline
# Build a Pipeline that runs the collected stages in list order
pipeline = Pipeline().setStages(stages)

In [0]:
# Fit the pipeline stages on final_model_data; the result is the fitted pipeline model
pipeline_model = pipeline.fit(dataset=final_model_data)

In [0]:
# Run final_model_data through the fitted pipeline, which adds the encoded columns
# and the "features" vector column; the name final_model_data is rebound to the result
final_model_data = pipeline_model.transform(dataset=final_model_data)

In [0]:
# Keep the feature vector first, followed by the columns named in cols_list
# (which ends with the target, total_amount); intermediate encoder columns are dropped
final_model_data = final_model_data.select("features", *cols_list)
final_model_data.show()

+--------------------+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+----------------+-------------------+------------------+----------+------------+
|            features|VendorID|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|duration_seconds|     duration_hours|         speed_mph|taxi_color|total_amount|
+--------------------+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+----------------+-------------------+------------------+----------+------------+
|[3.7,0.1861111111...|       1|

In [0]:
# Give the prepared data one name per ML model. Both names are bound to the same
# DataFrame object as final_model_data (plain assignment does not copy it).
final_model_data_lr = final_model_data_dtr = final_model_data

##### Linear Regression Model

In [0]:
# Randomly split final_model_data_lr into 75% train / 25% test, seeded for repeatability
train_data, test_data = final_model_data_lr.randomSplit(weights=[0.75, 0.25], seed=8)

In [0]:
# Import LinearRegression from pyspark.ml.regression
from pyspark.ml.regression import LinearRegression

# Configure a linear regression that predicts total_amount from the features vector,
# then fit it on the training split
linearRegressor = LinearRegression(labelCol='total_amount', featuresCol='features')
lr_model = linearRegressor.fit(dataset=train_data)

In [0]:
# Get the summary object of the fitted linear regression model
# (the next cells read the training-data RMSE from it)
lr_model_summary = lr_model.summary

In [0]:
# Read the root mean squared error of the linear regression predictions on the train data
train_rmse = lr_model_summary.rootMeanSquaredError

In [0]:
# Print the linear regression RMSE on the train data
print(train_rmse)

145.3205340692088


In [0]:
# Score the training split with the fitted linear regression model;
# the output shown below carries an added "prediction" column
lr_model_train_preds = lr_model.transform(dataset=train_data)

In [0]:
# Preview the first 5 training rows together with their predictions
lr_model_train_preds.show(n=5)

+--------------------+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+----------------+--------------------+------------------+----------+------------+------------------+
|            features|VendorID|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|duration_seconds|      duration_hours|         speed_mph|taxi_color|total_amount|        prediction|
+--------------------+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+----------------+--------------------+------------------+----------+----

In [0]:
# Score the test split with the fitted linear regression model
lr_model_test_preds = lr_model.transform(dataset=test_data)

In [0]:
# Preview the first 5 test rows together with their predictions
lr_model_test_preds.show(n=5)

+--------------------+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+----------------+--------------------+------------------+----------+------------+------------------+
|            features|VendorID|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|duration_seconds|      duration_hours|         speed_mph|taxi_color|total_amount|        prediction|
+--------------------+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+----------------+--------------------+------------------+----------+----

In [0]:
# Evaluate the linear regression model on the test split and report its RMSE
lr_model_test_eval = lr_model.evaluate(dataset=test_data)
test_rmse = lr_model_test_eval.rootMeanSquaredError
print(test_rmse)

175.8268423362161


##### Decision Tree Regression model

In [0]:
# Randomly split final_model_data_dtr into 75% train / 25% test with the same weights
# and seed used for the linear regression split; train_data and test_data are rebound
train_data, test_data = final_model_data_dtr.randomSplit(weights=[0.75, 0.25], seed=8)

In [0]:
# Import DecisionTreeRegressor from pyspark.ml.regression
from pyspark.ml.regression import DecisionTreeRegressor
# Configure a decision tree regressor that predicts total_amount from the features vector,
# then fit it on the training split
decisionTreeRegressor = DecisionTreeRegressor(labelCol='total_amount', featuresCol='features')
dtr_model = decisionTreeRegressor.fit(dataset=train_data)

In [0]:
# Score the training split with the fitted decision tree model
dtr_model_train_preds = dtr_model.transform(dataset=train_data)
# Preview the first 5 training rows together with their predictions
dtr_model_train_preds.show(n=5)

+--------------------+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+----------------+--------------------+------------------+----------+------------+-----------------+
|            features|VendorID|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|duration_seconds|      duration_hours|         speed_mph|taxi_color|total_amount|       prediction|
+--------------------+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+----------------+--------------------+------------------+----------+------

In [0]:
# Import RegressionEvaluator from pyspark.ml.evaluation
from pyspark.ml.evaluation import RegressionEvaluator
# Evaluator that compares the "prediction" column against total_amount using RMSE
dtr_train_evaluator = RegressionEvaluator(metricName="rmse", predictionCol="prediction", labelCol="total_amount")
# RMSE of the decision tree on the training split
# (this rebinds train_rmse, which earlier held the linear regression value)
train_rmse = dtr_train_evaluator.evaluate(dataset=dtr_model_train_preds)
print(train_rmse)

145.31593906253318


In [0]:
# Score the test split with the fitted decision tree model
dtr_model_test_preds = dtr_model.transform(dataset=test_data)
# Preview the first 5 test rows together with their predictions
dtr_model_test_preds.show(n=5)

+--------------------+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+----------------+--------------------+------------------+----------+------------+-----------------+
|            features|VendorID|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|duration_seconds|      duration_hours|         speed_mph|taxi_color|total_amount|       prediction|
+--------------------+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+----------------+--------------------+------------------+----------+------

In [0]:
# Evaluator that compares the "prediction" column against total_amount using RMSE
dtr_test_evaluator = RegressionEvaluator(metricName="rmse", predictionCol="prediction", labelCol="total_amount")
# RMSE of the decision tree on the test split
# (this rebinds test_rmse, which earlier held the linear regression value)
test_rmse = dtr_test_evaluator.evaluate(dataset=dtr_model_test_preds)
print(test_rmse)

175.82357761288364


##### Prediction on April 2022 data

In [0]:
# Reset stages to an empty list before the pipeline stages are rebuilt for the April 2022 data
stages = list()

In [0]:
# Remove every April 2022 row that has a missing value in any column used as a model feature
# NOTE(review): the label column total_amount is not part of this subset — confirm it has no nulls
apr_2022_data = apr_2022_data.dropna(
    subset=["trip_distance", "duration_hours", "RatecodeID", "PULocationID", "DOLocationID", "taxi_color"]
)

In [0]:
# Import OneHotEncoder, StringIndexer, VectorAssembler from pyspark.ml.feature 
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
# Re-create the taxi_color encoders with the same column names as in the data-preparation section:
# StringIndexer writes taxi_color_ind, OneHotEncoder turns it into taxi_color_ohe
col_indexer = StringIndexer(outputCol="taxi_color_ind", inputCol="taxi_color")
col_encoder = OneHotEncoder(outputCols=["taxi_color_ohe"], inputCols=["taxi_color_ind"])
# Queue both transformers, in this order, as pipeline stages
stages.extend([col_indexer, col_encoder])

In [0]:
# Importing VectorAssembler
from pyspark.ml.feature import VectorAssembler
# Re-create the assembler that packs the feature columns and the one-hot encoded
# taxi colour into a single vector column named "features"
vector_assembler = VectorAssembler(
    outputCol="features",
    inputCols=[
        "trip_distance",
        "duration_hours",
        "RatecodeID",
        "PULocationID",
        "DOLocationID",
        "taxi_color_ohe",
    ],
)

In [0]:
# The assembler runs after the encoders, so it is appended as the last stage
stages.append(vector_assembler)

In [0]:
# Import Pipeline from pyspark.ml
from pyspark.ml import Pipeline
# Build a Pipeline that runs the collected stages in list order
pipeline = Pipeline().setStages(stages)

In [0]:
# Do NOT refit the feature pipeline on apr_2022_data.
# pipeline_model is intentionally left as the model fitted on final_model_data in the
# data-preparation section, so the hold-out month is encoded exactly as the training data was.
# Refitting here would let StringIndexer/OneHotEncoder learn the taxi_color -> index mapping
# (and the one-hot vector width) from the April data, which could differ from the encoding
# that dtr_model was trained on and make its predictions on this set unreliable.

In [0]:
# Run apr_2022_data through pipeline_model, which adds the encoded columns and the
# "features" vector column; the name apr_2022_data is rebound to the result
apr_2022_data = pipeline_model.transform(dataset=apr_2022_data)

In [0]:
# Keep the feature vector first, followed by the columns named in cols_list
# (which ends with the target, total_amount); intermediate encoder columns are dropped
apr_2022_data = apr_2022_data.select("features", *cols_list)
apr_2022_data.show()

+--------------------+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+----------------+-------------------+------------------+----------+------------+
|            features|VendorID|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|duration_seconds|     duration_hours|         speed_mph|taxi_color|total_amount|
+--------------------+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+----------------+-------------------+------------------+----------+------------+
|[2.26,0.172777777...|       2|

In [0]:
# Score apr_2022_data with the decision tree model that was fitted on the training split
dtr_model_apr_2022_preds = dtr_model.transform(dataset=apr_2022_data)
# Preview the first 5 April 2022 rows together with their predictions
dtr_model_apr_2022_preds.show(n=5)

+--------------------+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+----------------+-------------------+------------------+----------+------------+------------------+
|            features|VendorID|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|duration_seconds|     duration_hours|         speed_mph|taxi_color|total_amount|        prediction|
+--------------------+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+----------------+-------------------+------------------+----------+-------

In [0]:
# Evaluator that compares the "prediction" column against total_amount using RMSE
# (rebinds dtr_test_evaluator with the same settings as before)
dtr_test_evaluator = RegressionEvaluator(metricName="rmse", predictionCol="prediction", labelCol="total_amount")
# RMSE of the decision tree predictions on apr_2022_data
test_apr_2022_rmse = dtr_test_evaluator.evaluate(dataset=dtr_model_apr_2022_preds)
print(test_apr_2022_rmse)

6.471573438007627
