# Data Preprocessing for ML Training
Melissa Putri (1389438)

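This notebook aggregates the scaled Uber and Lyft trip data into hourly demand features, encodes them, and writes train/test splits as Parquet, ready for ML training.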

In [1]:
# Import Libraries, Functions, and Start Spark session

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import count, col, hour, month, year, dayofweek, avg, udf
from pyspark.sql.types import ArrayType, DoubleType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.regression import RandomForestRegressor, LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd


# Session settings, applied to the builder in the order listed.
# NOTE(review): the captured log refers to "executor driver", which suggests
# local mode; the executor.* settings may have no effect there — confirm.
spark_settings = [
    ("spark.sql.repl.eagerEval.enabled", True),      # rich DataFrame display in the notebook
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.sql.session.timeZone", "Etc/UTC"),       # interpret timestamps in UTC
    ("spark.driver.memory", "8g"),                   # driver memory
    ("spark.executor.memory", "8g"),                 # memory per executor
    ("spark.executor.instances", "4"),               # requested number of executors
    ("spark.driver.maxResultSize", "2g"),            # cap on results collected to the driver
    ("spark.sql.shuffle.partitions", "200"),         # partitions used for shuffles
]

session_builder = SparkSession.builder.appName("Data Preprocessing 2")
for option, setting in spark_settings:
    session_builder = session_builder.config(option, setting)

# Reuse an already-running session if there is one, otherwise start a new one
spark = session_builder.getOrCreate()

24/08/23 19:13:40 WARN Utils: Your hostname, Melissas-MacBook-Pro-2.local resolves to a loopback address: 127.0.0.1; using 192.168.0.3 instead (on interface en0)
24/08/23 19:13:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/23 19:13:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Read the scaled, curated trip-level data for each service (Parquet directories)
uber_data = spark.read.format('parquet').load('../data/curated/Uber_Scaled/')
lyft_data = spark.read.format('parquet').load('../data/curated/Lyft_Scaled/')


                                                                                

## Preparing Dataset for ML Training

In [3]:
# Aggregate data to count the number of trips per hour

def aggregate_demand_data(dataframe, service_name):
    """
    Summarise trip records into one demand row per (year, month, day_of_week, hour).

    Every trip sharing the same year, month, day of week and hour falls into
    one group. For each group the result holds the number of trips and the
    group mean of trip duration, total price and the weather columns, plus a
    constant column naming the service.

    :param dataframe: PySpark DataFrame with the columns 'year', 'month',
        'day_of_week', 'hour', 'trip_duration_min', 'total_price', 'avg_temp',
        'avg_max_temp', 'avg_min_temp', 'total_precipitation' and 'average_wind'.
    :param service_name: Label stored in the 'service_name' column ("Uber" or "Lyft").
    :return: Aggregated DataFrame with the grouping keys, 'trip_count', the
        averaged columns and 'service_name'.
    """
    group_keys = ('year', 'month', 'day_of_week', 'hour')

    # (source column, output column) pairs; each one is reduced with a mean.
    # 'total_precipitation' keeps its source name but holds the group mean,
    # not a sum.
    mean_columns = (
        ('trip_duration_min', 'avg_trip_duration'),
        ('total_price', 'avg_total_price'),
        ('avg_temp', 'avg_temp'),
        ('avg_max_temp', 'avg_max_temp'),
        ('avg_min_temp', 'avg_min_temp'),
        ('total_precipitation', 'total_precipitation'),
        ('average_wind', 'average_wind'),
    )

    trip_count = F.count('*').alias('trip_count')
    group_means = [F.avg(source).alias(target) for source, target in mean_columns]

    grouped = dataframe.groupBy(*group_keys)
    summary = grouped.agg(trip_count, *group_means)

    # Tag every row with the service it came from
    return summary.withColumn('service_name', F.lit(service_name))


# The aggregated frames have one row per (year, month, day_of_week, hour), but
# deriving them means scanning the full trip-level data. They are reused by
# several later actions (pipeline fits, show(), Parquet writes), so cache them:
# without this, each action re-runs the whole aggregation from the raw files.
# cache() is lazy — the data is materialised by the first action that needs it.
uber_demand = aggregate_demand_data(uber_data, "Uber").cache()
lyft_demand = aggregate_demand_data(lyft_data, "Lyft").cache()

In [4]:
# Categorical time features as (raw column, index column, one-hot vector column)
categorical_stages = [
    ('year', 'year_index', 'year_vec'),
    ('month', 'month_index', 'month_vec'),
    ('day_of_week', 'day_of_week_index', 'day_of_week_vec'),
]

# Step 1: map each categorical value to a numeric index
indexers = [
    StringIndexer(inputCol=raw_col, outputCol=index_col)
    for raw_col, index_col, _ in categorical_stages
]

# Step 2: one-hot encode each index column
encoders = [
    OneHotEncoder(inputCol=index_col, outputCol=vec_col)
    for _, index_col, vec_col in categorical_stages
]

# Step 3: assemble the model inputs into a single 'features' vector.
# 'hour' goes in as a plain numeric value; the categorical features go in as
# their one-hot vectors, followed by the aggregated numeric columns.
feature_cols = [
    'hour',
    'year_vec',
    'month_vec',
    'day_of_week_vec',
    'avg_trip_duration',
    'avg_total_price',
    'avg_temp',
    'avg_max_temp',
    'avg_min_temp',
    'total_precipitation',
    'average_wind',
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Full preprocessing pipeline: index -> one-hot encode -> assemble
pipeline = Pipeline(stages=[*indexers, *encoders, assembler])

# Fit a separate pipeline model for each service on its own demand data,
# then apply it to that same data
pipeline_uber = pipeline.fit(uber_demand)
uber_prep = pipeline_uber.transform(uber_demand)

pipeline_lyft = pipeline.fit(lyft_demand)
lyft_prep = pipeline_lyft.transform(lyft_demand)

                                                                                

In [5]:
# Keep only what modelling needs: the assembled feature vector, the target
# (trip_count) and the hour
model_columns = ['features', 'trip_count', 'hour']
uber_final = uber_prep.select(*model_columns)

# 80/20 train/test split; the fixed seed keeps the split repeatable
uber_splits = uber_final.randomSplit(weights=[0.8, 0.2], seed=42)
uber_train, uber_test = uber_splits

# Inspect the training split: schema first, then the first rows
uber_train.printSchema()
uber_train.show()

root
 |-- features: vector (nullable = true)
 |-- trip_count: long (nullable = false)
 |-- hour: integer (nullable = true)



24/08/23 19:13:52 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors

+--------------------+----------+----+
|            features|trip_count|hour|
+--------------------+----------+----+
|(26,[0,1,3,13,19,...|     81116|   1|
|(26,[0,1,3,13,19,...|     59006|   2|
|(26,[0,1,3,13,19,...|     30635|   4|
|(26,[0,1,3,13,19,...|     23586|   5|
|(26,[0,1,3,13,19,...|     26571|   6|
|(26,[0,1,3,13,19,...|     41247|   8|
|(26,[0,1,3,13,19,...|     64578|  10|
|(26,[0,1,3,13,19,...|     70653|  11|
|(26,[0,1,3,13,19,...|     71395|  12|
|(26,[0,1,3,13,19,...|     72254|  13|
|(26,[0,1,3,13,19,...|     74683|  15|
|(26,[0,1,3,13,19,...|     73745|  16|
|(26,[0,1,3,13,19,...|     75344|  17|
|(26,[0,1,3,13,19,...|     80151|  18|
|(26,[0,1,3,13,19,...|     78715|  19|
|(26,[0,1,3,13,19,...|     73673|  21|
|(26,[0,1,3,13,19,...|     68437|  22|
|(26,[0,1,3,13,19,...|     56780|  23|
|(26,[0,1,3,14,19,...|     14869|   2|
|(26,[0,1,3,14,19,...|     11443|   3|
+--------------------+----------+----+
only showing top 20 rows



                                                                                

In [6]:
# Keep only what modelling needs: the assembled feature vector, the target
# (trip_count) and the hour
model_columns = ['features', 'trip_count', 'hour']
lyft_final = lyft_prep.select(*model_columns)

# 80/20 train/test split; the fixed seed keeps the split repeatable
lyft_splits = lyft_final.randomSplit(weights=[0.8, 0.2], seed=42)
lyft_train, lyft_test = lyft_splits

# Inspect the training split: schema first, then the first rows
lyft_train.printSchema()
lyft_train.show()

root
 |-- features: vector (nullable = true)
 |-- trip_count: long (nullable = false)
 |-- hour: integer (nullable = true)





+--------------------+----------+----+
|            features|trip_count|hour|
+--------------------+----------+----+
|(26,[0,1,3,13,19,...|     38009|   1|
|(26,[0,1,3,13,19,...|     32086|   2|
|(26,[0,1,3,13,19,...|     17659|   4|
|(26,[0,1,3,13,19,...|     12294|   5|
|(26,[0,1,3,13,19,...|     13142|   6|
|(26,[0,1,3,13,19,...|     15012|   8|
|(26,[0,1,3,13,19,...|     22269|  10|
|(26,[0,1,3,13,19,...|     24285|  11|
|(26,[0,1,3,13,19,...|     24782|  12|
|(26,[0,1,3,13,19,...|     25910|  13|
|(26,[0,1,3,13,19,...|     26189|  15|
|(26,[0,1,3,13,19,...|     25745|  16|
|(26,[0,1,3,13,19,...|     27070|  17|
|(26,[0,1,3,13,19,...|     30094|  18|
|(26,[0,1,3,13,19,...|     30876|  19|
|(26,[0,1,3,13,19,...|     25002|  21|
|(26,[0,1,3,13,19,...|     24046|  22|
|(26,[0,1,3,13,19,...|     20763|  23|
|(26,[0,1,3,14,19,...|      6838|   2|
|(26,[0,1,3,14,19,...|      5622|   3|
+--------------------+----------+----+
only showing top 20 rows



                                                                                

## Export Preprocessed Data for Model Training

In [7]:
# Write each train/test split to its own Parquet directory.
# The default save mode raises an error if the target path already exists,
# which breaks any re-run of the notebook (including a re-run after a
# partially completed export). These directories are derived outputs, so
# replace them explicitly.
uber_train.write.mode('overwrite').parquet('../data/curated/Uber_Train')
uber_test.write.mode('overwrite').parquet('../data/curated/Uber_Test')
lyft_train.write.mode('overwrite').parquet('../data/curated/Lyft_Train')
lyft_test.write.mode('overwrite').parquet('../data/curated/Lyft_Test')

                                                                                

24/08/23 20:29:48 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 908970 ms exceeds timeout 120000 ms
24/08/23 20:29:48 WARN SparkContext: Killing executors is not supported by current scheduler.
24/08/23 20:29:48 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$