## Instantiating Spark

Import the PySpark library and the `SparkSession` class

In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Start a local Spark session (or reuse one that is already running) for this notebook
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("rdds_example")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/29 09:29:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Loading raw data into a Spark DataFrame

In [3]:
# Read every Parquet file matched by the two-level glob under data/raw/green
# (presumably one directory level per year and one per month - confirm against the data layout)
df_green = spark.read.parquet("data/raw/green/*/*")

                                                                                

In [4]:
# We want to implement this query, which we used in the previous session, but with RDDs

"""
SELECT
    EXTRACT(HOUR FROM lpep_pickup_datetime) AS hour,
    PULocationID AS zone,

    SUM(total_amount) as revenue,
    COUNT(1) as number_records
FROM green
WHERE lpep_pickup_datetime >= '2020-01-01 00:00:00'
GROUP BY 1,2
ORDER BY 1,2
"""

"\nSELECT\n    EXTRACT(HOUR FROM lpep_pickup_datetime) AS hour,\n    PULocationID AS zone,\n\n    SUM(total_amount) as revenue,\n    COUNT(1) as number_records\nFROM green\nWHERE lpep_pickup_datetime >= '2020-01-01 00:00:00'\nGROUP BY 1,2\nORDER BY 1,2\n"

In [5]:
# A DataFrame is built on top of an RDD; the `.rdd` attribute exposes that underlying RDD.
# Chaining `.take(5)` returns the first five elements as a list of `Row` objects -
# `Row` is a special object that is used for building dataframes
df_green.rdd.take(5)

                                                                                

[Row(VendorID=2, lpep_pickup_datetime=datetime.datetime(2019, 12, 18, 15, 52, 30), lpep_dropoff_datetime=datetime.datetime(2019, 12, 18, 15, 54, 39), store_and_fwd_flag='N', RatecodeID=1.0, PULocationID=264, DOLocationID=264, passenger_count=5.0, trip_distance=0.0, fare_amount=3.5, extra=0.5, mta_tax=0.5, tip_amount=0.01, tolls_amount=0.0, ehail_fee=None, improvement_surcharge=0.3, total_amount=4.81, payment_type=1.0, trip_type=1.0, congestion_surcharge=0.0),
 Row(VendorID=2, lpep_pickup_datetime=datetime.datetime(2020, 1, 1, 0, 45, 58), lpep_dropoff_datetime=datetime.datetime(2020, 1, 1, 0, 56, 39), store_and_fwd_flag='N', RatecodeID=5.0, PULocationID=66, DOLocationID=65, passenger_count=2.0, trip_distance=1.28, fare_amount=20.0, extra=0.0, mta_tax=0.0, tip_amount=4.06, tolls_amount=0.0, ehail_fee=None, improvement_surcharge=0.3, total_amount=24.36, payment_type=1.0, trip_type=2.0, congestion_surcharge=0.0),
 Row(VendorID=2, lpep_pickup_datetime=datetime.datetime(2020, 1, 1, 0, 41, 38

## Converting Spark DataFrame into a RDD

In [6]:
# Keep only the three columns the query reads, then drop down to the underlying RDD
rdd = (
    df_green
    .select("lpep_pickup_datetime", "PULocationID", "total_amount")
    .rdd
)

In [7]:
# Peek at the first five rows of the three-column RDD
rdd.take(5)

[Row(lpep_pickup_datetime=datetime.datetime(2019, 12, 18, 15, 52, 30), PULocationID=264, total_amount=4.81),
 Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 1, 0, 45, 58), PULocationID=66, total_amount=24.36),
 Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 1, 0, 41, 38), PULocationID=181, total_amount=15.34),
 Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 1, 0, 52, 46), PULocationID=129, total_amount=25.05),
 Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 1, 0, 19, 57), PULocationID=210, total_amount=11.3)]

## Re-creating query output on RDD

In [8]:
# To apply the `WHERE` clause from the query we use the RDD `.filter()` method.
# `.filter()` calls the given function on every element and keeps only the elements for which it returns True
from datetime import datetime as dt
rdd.filter(lambda row: row.lpep_pickup_datetime >= dt(2020,1,1)).take(1)

[Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 1, 0, 45, 58), PULocationID=66, total_amount=24.36)]

In [None]:
# The next step is to perform the `GROUP BY` clause - to do so we need to apply two methods.
# First, we use the `map()` method with the `transform` function to turn each row into a key-value pair.
# Next, we use `reduceByKey()` with the `calculate` function to combine all values that share a key,
# so that the final output has exactly one record, holding the aggregated values, per key

def transform(row):
    """Turn one trip row into a ``((hour, zone), (revenue, count))`` pair.

    The key holds the pickup hour and the pickup location ID, mirroring the
    query's ``GROUP BY``; the value holds the row's total amount and a count
    of 1, ready to be summed per key.
    """
    key = (row.lpep_pickup_datetime.hour, row.PULocationID)
    value = (row.total_amount, 1)
    return (key, value)

"""
The reduceByKey() method in Spark groups records by their key and applies the provided
function to aggregate values associated with the same key. The function operates pairwise, 
meaning it processes records iteratively rather than all at once.

Aggregation is done in a distributed and efficient manner. Spark applies the function to pairs of values 
first within each partition, then merges the intermediate results across partitions.

The function takes in two arguments at a time: the previously aggregated value and the next value, 
accumulating the result iteratively.
"""
def calculate(left_value, right_value):
    """Merge two ``(amount, count)`` values that belong to the same key.

    Both arguments are 2-tuples; the result is a 2-tuple holding the sum of
    the amounts and the sum of the counts.
    """
    (amount_a, count_a), (amount_b, count_b) = left_value, right_value
    return (amount_a + amount_b, count_a + count_b)
    
# Preview the aggregated (key, value) pairs produced by filter -> map -> reduceByKey
(
    rdd
    .filter(lambda row: row.lpep_pickup_datetime >= dt(2020, 1, 1))
    .map(transform)
    .reduceByKey(calculate)
    .take(10)
)

                                                                                

[((0, 66), (11809.39999999999, 557)),
 ((0, 210), (3354.7899999999972, 183)),
 ((1, 225), (2814.3199999999997, 112)),
 ((0, 82), (32809.18999999952, 2558)),
 ((0, 74), (37676.54999999988, 2704)),
 ((0, 134), (8018.680000000015, 597)),
 ((0, 42), (32414.179999999942, 2258)),
 ((0, 166), (15045.050000000041, 948)),
 ((0, 22), (1592.9499999999998, 58)),
 ((0, 130), (18727.24999999998, 1108))]

## Unpacking the RDD into DataFrame

In [11]:
# Now that we have recreated our query output as an RDD, we need to unpack it into a DataFrame
from collections import namedtuple

# We need this step because, if we converted our output directly into a DataFrame,
# the result would not carry the schema or the column names that we want
RevenueRow = namedtuple('RevenueRow', ['hour', 'zone', 'revenue', 'count'])

In [15]:
# Helper function to help unpack the rdd
def unwrap(row):
    """Flatten a ``((hour, zone), (revenue, count))`` pair into a ``RevenueRow``."""
    key, value = row[0], row[1]
    return RevenueRow(hour=key[0], zone=key[1], revenue=value[0], count=value[1])

# Run the full filter -> map -> reduceByKey pipeline, flatten each pair, and convert to a DataFrame
df_result = (
    rdd
    .filter(lambda row: row.lpep_pickup_datetime >= dt(2020, 1, 1))
    .map(transform)
    .reduceByKey(calculate)
    .map(unwrap)
    .toDF()
)

                                                                                

In [17]:
# Inspect the schema: field names come from the RevenueRow namedtuple; no explicit schema was passed to toDF()
df_result.schema

StructType([StructField('hour', LongType(), True), StructField('zone', LongType(), True), StructField('revenue', DoubleType(), True), StructField('count', LongType(), True)])

In [18]:
# Display the first rows of the result (`.show()` prints the top 20 here)
df_result.show()

+----+----+------------------+-----+
|hour|zone|           revenue|count|
+----+----+------------------+-----+
|   0|  66| 11809.39999999999|  557|
|   0| 210|3354.7899999999972|  183|
|   1| 225|2814.3199999999997|  112|
|   0|  82| 32809.18999999952| 2558|
|   0|  74| 37676.54999999988| 2704|
|   0| 134| 8018.680000000015|  597|
|   0|  42|32414.179999999942| 2258|
|   0| 166|15045.050000000041|  948|
|   0|  22|1592.9499999999998|   58|
|   0| 130| 18727.24999999998| 1108|
|   0| 226|11961.250000000007|  447|
|   0| 146|           5843.85|  281|
|   0| 190|            279.76|   18|
|   1| 145| 3621.969999999998|  181|
|   0|  70|            633.94|   30|
|   1|  25| 8350.610000000017|  486|
|   1|  49| 2498.299999999998|  145|
|   1|  41|22661.169999999925| 1706|
|   0| 198|            786.21|   28|
|   0| 258|            548.23|   21|
+----+----+------------------+-----+
only showing top 20 rows



In [19]:
# Persist the aggregated result as Parquet, replacing any output left by a previous run
df_result.write.mode("overwrite").parquet("tmp/green_revenue")

                                                                                