In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import hour, minute, dayofweek, month, sqrt, pow

from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.evaluation import RegressionMetrics

# Create (or reuse) the Spark session for this notebook: it runs against the
# local master and asks Spark to log at ERROR level.
spark = (
    SparkSession.builder
    .appName("NYCTaxiTripDurationRegression")
    .master("local[*]")
    .config("spark.log.level", "ERROR")
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext

your 131072x1 screen size is bogus. expect trouble


25/04/08 12:22:38 WARN Utils: Your hostname, jztr resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/04/08 12:22:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/04/08 12:22:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/08 12:22:39 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/04/08 12:22:39 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


# Data preparation

In [4]:
# Load the training CSV: the first row supplies the column names and the
# column types are inferred from the file contents.
train_csv_path = "../../../data/train.csv"
data = spark.read.options(header=True, inferSchema=True).csv(train_csv_path)

# Preview the first five rows.
data.show(5)

                                                                                

+---------+---------+-------------------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------+
|       id|vendor_id|    pickup_datetime|   dropoff_datetime|passenger_count|  pickup_longitude|   pickup_latitude| dropoff_longitude|  dropoff_latitude|store_and_fwd_flag|trip_duration|
+---------+---------+-------------------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------+
|id2875421|        2|2016-03-14 17:24:55|2016-03-14 17:32:30|              1| -73.9821548461914| 40.76793670654297|-73.96463012695312|40.765602111816406|                 N|          455|
|id2377394|        1|2016-06-12 00:43:35|2016-06-12 00:54:38|              1|-73.98041534423828|40.738563537597656|-73.99948120117188| 40.73115158081055|                 N|          663|
|id3858529|        2|2016-01-19 11:35:24|2016-01-19 12:10:48|    

# Data preprocessing

- **Columns**:
  - `id`: Unique identifier for each trip.
  - `vendor_id`: ID of the taxi vendor.
  - `pickup_datetime` and `dropoff_datetime`: Timestamps for the start and end of the trip.
  - `passenger_count`: Number of passengers in the taxi.
  - `pickup_longitude` and `pickup_latitude`: GPS coordinates of the pickup location.
  - `dropoff_longitude` and `dropoff_latitude`: GPS coordinates of the dropoff location.
  - `store_and_fwd_flag`: Whether the trip record was held in the vehicle's memory before sending to the server (`Y` or `N`).
  - `trip_duration`: Duration of the trip in seconds.

1. **Feature Extraction**:
  - Extracted additional features such as:
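    - `pickup_minutes`: Minutes elapsed since midnight at pickup time (`hour * 60 + minute`).
    - `pickup_dayofweek`: Day of the week of the pickup, using Spark's numbering (1 = Sunday, ..., 7 = Saturday).
    - `pickup_month`: Month of the year of the pickup (1-12).
    - `distance`: Euclidean distance between the pickup and dropoff coordinates, computed directly on the longitude/latitude values (i.e. in degrees, not a geodesic distance).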

2. **Filtering Invalid Data**:
  - Removed trips with:
    - `passenger_count` less than or equal to 0.
    - `trip_duration` of 22 hours or more (extreme outliers).
    - `distance` less than or equal to 0.

3. **Feature Assembly**:
  - Combined relevant features into a single vector using `VectorAssembler`. The selected features include:
    - `passenger_count`
    - `pickup_longitude`
    - `pickup_latitude`
    - `distance`
    - `pickup_minutes`
    - `pickup_dayofweek`
    - `pickup_month`

4. **Data Transformation**:
  - Transformed the data into a format suitable for machine learning by creating a `features` column and retaining the target variable `trip_duration`.


In [5]:
# Column expressions for the engineered features, named up front so the
# transformation chain below stays readable.
minutes_since_midnight = hour("pickup_datetime") * 60 + minute("pickup_datetime")
delta_longitude = data["pickup_longitude"] - data["dropoff_longitude"]
delta_latitude = data["pickup_latitude"] - data["dropoff_latitude"]
# Plain Euclidean norm of the coordinate differences (no geodesic correction).
straight_line_distance = sqrt(pow(delta_longitude, 2) + pow(delta_latitude, 2))

# Add the derived columns, then keep only rows with at least one passenger,
# a duration below 22 hours (22 * 3600 seconds), and a non-zero distance.
data = (
    data
    .withColumn("pickup_minutes", minutes_since_midnight)
    .withColumn("pickup_dayofweek", dayofweek("pickup_datetime"))
    .withColumn("pickup_month", month("pickup_datetime"))
    .withColumn("distance", straight_line_distance)
    .filter("passenger_count > 0")
    .filter("trip_duration < 22 * 3600")
    .filter("distance > 0")
)

In [6]:
from pyspark.ml.feature import VectorAssembler

# Model inputs, listed in the order they will occupy in the assembled vector.
feature_columns = [
    "passenger_count", "pickup_longitude", "pickup_latitude", "distance",
    "pickup_minutes", "pickup_dayofweek", "pickup_month",
]

# Pack the input columns into a single "features" vector column, then keep
# only that vector and the regression target.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
assembled_data = assembler.transform(data).select("features", "trip_duration")

In [7]:
# Convert each (features, trip_duration) row into an MLlib LabeledPoint.
#
# `toArray()` is used instead of `.values` on purpose: VectorAssembler may emit
# a SparseVector, and a SparseVector's `.values` holds only its non-zero
# entries. Building a DenseVector from that would produce a shorter vector with
# the features shifted out of position. `toArray()` always returns the
# full-length array for both dense and sparse vectors.
rdd_data = assembled_data.rdd.map(
    lambda row: LabeledPoint(row.trip_duration, DenseVector(row.features.toArray()))
)

# Keep the converted RDD in memory so later actions reuse it rather than
# recomputing the conversion.
rdd_data.cache()

# Sanity check: show one converted record.
print(rdd_data.take(1))

[Stage 3:>                                                          (0 + 1) / 1]

25/04/08 12:22:50 WARN BlockManager: Task 18 already completed, not releasing lock for rdd_20_0
[LabeledPoint(455.0, [1.0,-73.9821548461914,40.76793670654297,0.01767953949959892,1044.0,2.0,3.0])]


                                                                                

# Model Training:
  - Split the data into training and testing sets.
  - Train a Decision Tree Regressor using the training data.

In [8]:
# 80/20 train/test split; the fixed seed keeps the split reproducible.
split_weights = [0.8, 0.2]
train_rdd, test_rdd = rdd_data.randomSplit(split_weights, seed=42)

# Decision tree regressor settings. The empty categoricalFeaturesInfo map
# declares no categorical features, so every feature is treated as continuous.
tree_params = {
    "categoricalFeaturesInfo": {},
    "maxDepth": 10,
    "minInstancesPerNode": 5,
}

# Fit the regressor on the training portion only.
model = DecisionTree.trainRegressor(train_rdd, **tree_params)

25/04/08 12:22:50 WARN BlockManager: Task 19 already completed, not releasing lock for rdd_20_0


                                                                                

# Model Evaluation:
  - Evaluate the model's performance on the test data using metrics such as RMSE and MAE.

In [9]:
# Separate the held-out set into its feature vectors and its true labels.
test_features = test_rdd.map(lambda x: x.features)
test_labels = test_rdd.map(lambda lp: lp.label)

# Score the test features, then pair each prediction with its true label.
predictions = model.predict(test_features)
predictions_and_labels = predictions.zip(test_labels)

# RegressionMetrics computes the error statistics from (prediction, label) pairs.
metrics = RegressionMetrics(predictions_and_labels)

# Report the evaluation metrics.
rmse = metrics.rootMeanSquaredError
mae = metrics.meanAbsoluteError
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)



Root Mean Squared Error (RMSE): 587.1664523389743
Mean Absolute Error (MAE): 234.04290512445456


                                                                                