In [1]:
import pyspark
from pyspark.sql import SparkSession

# Build (or reuse, if one already exists) the SparkSession -- the object through
# which we talk to Spark. Master "local[*]" runs Spark locally with as many
# worker threads as there are logical cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

In [2]:
# Load the hourly revenue results stored as Parquet under 'tmp/green-revenue'
# (presumably written by an earlier step -- not shown in this notebook)
df_result = spark.read.parquet('tmp/green-revenue')

# Preview the first 5 rows
df_result.show(5)

+-------------------+----+-----------------+-----+
|               hour|zone|          revenue|count|
+-------------------+----+-----------------+-----+
|2020-01-18 11:00:00| 128|            33.51|    1|
|2020-01-08 20:00:00| 197|             10.3|    1|
|2020-01-02 22:00:00|  82|261.4800000000001|   22|
|2020-01-21 18:00:00| 220|           118.56|    2|
|2020-01-22 11:00:00| 257|           141.82|    3|
+-------------------+----+-----------------+-----+
only showing top 5 rows



In [3]:
# Read in the green trip data; the '*/*' glob matches every entry two directory
# levels below data/parquet/green (presumably year/month folders -- TODO confirm)
df_green = spark.read.parquet('data/parquet/green/*/*')

**`.mapPartitions()` function returns a *new* RDD by applying a function to *each partition* of a provided RDD**
- `.mapPartitions()` receives a data partition (via an RDD) as input, applies some function to that data, and produces *another* partition as output (via an RDD)
     - `.mapPartitions()` is similar to `.map()`; the difference is that the supplied function is called *once for each partition* (receiving an iterator over that partition's rows) rather than once for *every row*, so `.mapPartitions()` provides a facility to do heavy initializations (for example, database connections) *once for each partition* instead of doing it on *every DataFrame row*
         - This helps the performance of a job when dealing with heavy-weighted initialization on larger datasets
     - See "Spark `.map()` vs `.mapPartitions()` with Examples" for more: https://sparkbyexamples.com/spark/spark-map-vs-mappartitions-transformation/
- Suppose we have a 1TB dataset partitioned into chunks of 100MB
    - We want to process each chunk and produce *another* partition with *processed* data
    - Many applications benefit from this sequence of steps, such as ML
    - Suppose we have a trained ML model that we put *inside* the function passed to `.mapPartitions()`
    - Then, Spark will **chunk** a large dataset into smaller partitions and apply our model *to each partition*, outputting its predictions on said partition

**As a first example/use case, we implement a simple method to count the number of rows in each partition, to illustrate how to use `.mapPartitions()`**

In [4]:
def count_rows(partition):
    """Count the elements of a single partition.

    Args:
        partition: an iterable over the rows of one partition.

    Returns:
        A one-element list holding the number of rows seen. Returning a list
        (rather than a bare int) gives `.mapPartitions()` an iterable to
        flatten into the resulting RDD.
    """
    # Consume the iterable lazily, adding 1 per row, so nothing is kept in memory
    return [sum(1 for _ in partition)]

In [5]:
# Columns kept for the duration example
columns = [
    'VendorID',
    'lpep_pickup_datetime',
    'PULocationID',
    'DOLocationID',
    'trip_distance',
]

# Keep only those columns and drop down to the underlying RDD of Row objects
duration_rdd = (
    df_green
    .select(columns)
    .rdd
)

# One count per partition, gathered into a single list
duration_rdd.mapPartitions(count_rows).collect()

[447770,
 454484,
 238894,
 154607,
 154692,
 149681,
 149463,
 144966,
 135114,
 117302,
 107592,
 49952]

**We can also reimplement `count_rows()` using `pandas`**
- Note that although a `pandas.DataFrame` is simpler to manipulate, this approach *materializes the entire DataFrame in memory*

In [6]:
import pandas as pd

def count_rows_using_pandas(partition):
    """Count the rows of one partition by loading it into a pandas DataFrame.

    Args:
        partition: an iterable over the rows of one partition.

    Returns:
        A one-element list with the number of rows in the partition.
    """
    # NOTE: building the DataFrame materializes the whole partition in memory
    return [len(pd.DataFrame(partition, columns=columns))]

# Apply the pandas-based counter to every partition and gather the counts
duration_rdd.mapPartitions(count_rows_using_pandas).collect()

[447770,
 454484,
 238894,
 154607,
 154692,
 149681,
 149463,
 144966,
 135114,
 117302,
 107592,
 49952]

**See that we get the same results**

**Now, let's suppose we have a simple ML model that predicts trip duration as trip distance multiplied by 5 (i.e., we will create an application that predicts the duration of a trip)**

**First, select the necessary columns and convert the result to an RDD so we can apply our upcoming model to each partition**

In [7]:
# Pick the columns the "model" needs, then convert the DataFrame to an RDD of Rows
columns = [
    'VendorID', 'lpep_pickup_datetime',
    'PULocationID', 'DOLocationID',
    'trip_distance',
]

duration_rdd = df_green.select(columns).rdd

# Peek at the first three Rows
duration_rdd.take(3)

[Row(VendorID=2, lpep_pickup_datetime=datetime.datetime(2020, 1, 22, 13, 18, 32), PULocationID=244, DOLocationID=41, trip_distance=5.22),
 Row(VendorID=2, lpep_pickup_datetime=datetime.datetime(2020, 1, 23, 17, 54, 10), PULocationID=236, DOLocationID=263, trip_distance=0.87),
 Row(VendorID=2, lpep_pickup_datetime=datetime.datetime(2020, 1, 19, 10, 23, 37), PULocationID=166, DOLocationID=166, trip_distance=0.63)]

In [8]:
# # The code below returns [1] for each partition and flattens the list
# def apply_model_in_batch(partition):
#     return [1]

# duration_rdd.mapPartitions(apply_model_in_batch).collect()
# # [1, 1, 1, 1]  # four partitions

In [9]:
# The function below returns the number of rows in each partition; Spark flattens the per-partition lists
def apply_model_in_batch(partition):
    """Return the row count of one partition wrapped in a list.

    Args:
        partition: an iterable over the rows of one partition.

    Returns:
        A one-element list containing the number of rows iterated.
    """
    cnt = 0

    # enumerate(..., start=1) leaves cnt equal to the number of rows consumed;
    # for an empty partition the loop body never runs and cnt stays 0
    for cnt, _row in enumerate(partition, start=1):
        pass

    return [cnt]

# Run the counter on every partition and gather the per-partition counts into one list
duration_rdd.mapPartitions(apply_model_in_batch).collect()

[447770,
 454484,
 238894,
 154607,
 154692,
 149681,
 149463,
 144966,
 135114,
 117302,
 107592,
 49952]

**We see that our partitions are not really balanced. 2 (maybe 3) partitions are very large compared to others**

**Let's turn a partition into a `pandas` DataFrame**

In [10]:
# Fetch the first 10 Rows of the RDD as a plain Python list
rows = duration_rdd.take(10)

# Build a pandas DataFrame from them, labelling the columns with the names selected earlier
pd.DataFrame(rows, columns=columns)

Unnamed: 0,VendorID,lpep_pickup_datetime,PULocationID,DOLocationID,trip_distance
0,2.0,2020-01-22 13:18:32,244,41,5.22
1,2.0,2020-01-23 17:54:10,236,263,0.87
2,2.0,2020-01-19 10:23:37,166,166,0.63
3,2.0,2020-01-21 14:25:16,152,238,2.71
4,,2020-01-07 09:46:00,51,3,2.13
5,,2020-01-10 08:19:00,32,186,17.93
6,2.0,2020-01-15 10:57:40,130,228,24.04
7,2.0,2020-01-26 16:52:02,7,223,1.45
8,2.0,2020-01-24 21:57:19,244,87,10.94
9,2.0,2020-01-27 23:45:54,65,62,3.18


In [11]:
def apply_model_in_batch(rows):
    """Count the rows of one partition via a pandas DataFrame.

    Args:
        rows: an iterable over the rows of one partition.

    Returns:
        A one-element list with the number of rows in the partition.
    """
    # WARNING: the line immediately below materializes the entire partition in memory
    batch_df = pd.DataFrame(rows, columns=columns)

    return [len(batch_df)]

# Apply to every partition and gather the counts
duration_rdd.mapPartitions(apply_model_in_batch).collect()

[447770,
 454484,
 238894,
 154607,
 154692,
 149681,
 149463,
 144966,
 135114,
 117302,
 107592,
 49952]

**The above put the entire partition in a DataFrame, *which isn’t always good***

**If you want to solve it, you can use `itertools.islice()` to break a partition into 100,000-row subpartitions and treat them separately**

**Anyway, now, we are ready to apply a ML model**

**Normally we'd have a model that calculates predictions from an algorithm + data from a DataFrame. But since we don’t have a model yet, let’s calculate an arbitrary duration (5 minutes per km)**

In [12]:
def model_predict(df, minutes_per_distance_unit=5):
    """Stand-in for a trained model: predict trip duration from trip distance.

    A real model would be called here instead (e.g. `model.predict(df)`); this
    placeholder simply scales the distance by a constant factor.

    Args:
        df: a pandas DataFrame with a numeric 'trip_distance' column.
        minutes_per_distance_unit: multiplier applied to every distance.
            Defaults to 5, the arbitrary factor used throughout this notebook
            (the narrative describes it as "5 minutes per km").

    Returns:
        A pandas Series, aligned with `df`'s index, holding the predicted
        duration for each row.
    """
    # Bracket access is used rather than attribute access so the lookup can
    # never be confused with a DataFrame attribute or method of the same name
    return df['trip_distance'] * minutes_per_distance_unit

**We can also use `itertuples()` to iterate over DataFrame rows as namedtuples. For illustrative purposes:**

In [16]:
# Take a small sample of Rows and wrap them in a pandas DataFrame
rows = duration_rdd.take(5)
df = pd.DataFrame(rows, columns=columns)
# Materialize the iterator so we can see the tuples; each carries the DataFrame index as its first field
list(df.itertuples())

[Pandas(Index=0, VendorID=2.0, lpep_pickup_datetime=Timestamp('2020-01-22 13:18:32'), PULocationID=244, DOLocationID=41, trip_distance=5.22),
 Pandas(Index=1, VendorID=2.0, lpep_pickup_datetime=Timestamp('2020-01-23 17:54:10'), PULocationID=236, DOLocationID=263, trip_distance=0.87),
 Pandas(Index=2, VendorID=2.0, lpep_pickup_datetime=Timestamp('2020-01-19 10:23:37'), PULocationID=166, DOLocationID=166, trip_distance=0.63),
 Pandas(Index=3, VendorID=2.0, lpep_pickup_datetime=Timestamp('2020-01-21 14:25:16'), PULocationID=152, DOLocationID=238, trip_distance=2.71),
 Pandas(Index=4, VendorID=nan, lpep_pickup_datetime=Timestamp('2020-01-07 09:46:00'), PULocationID=51, DOLocationID=3, trip_distance=2.13)]

**The `yield` keyword in Python is similar to a `return` statement used for returning values or objects in Python**

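**However, there is an important difference: a function that contains `yield` is a *generator function*. Calling it returns a `generator` object instead of running the body and returning a single value; each `yield` then hands one value to the caller and pauses the function until the next value is requested. For example:**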

In [17]:
def infinite_seq(flag: bool):
    """Generate consecutive integers starting from 0.

    Args:
        flag: when falsy, the sequence never ends; when truthy, the generator
            stops once 10 has been yielded.

    Yields:
        0, 1, 2, ... (through 10 inclusive when `flag` is truthy).
    """
    current = 0

    # Stop only in "finite" mode, and only once the counter has moved past 10
    while not (flag and current > 10):
        yield current
        current += 1
            
# flag=False: the generator never stops on its own, so we only display the
# generator object here instead of consuming it (list(seq) would never finish)
seq = infinite_seq(False)
seq

<generator object infinite_seq at 0x000002CA0CF69BA0>

In [19]:
# flag=True: the generator stops after yielding 10, so it is safe to
# materialize it into a list (0 through 10 inclusive)
seq = infinite_seq(True)
list(seq)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

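**So, inside the function passed to `.mapPartitions()`, each row we `yield` is written to the resulting RDD, and the outputs of all partitions are then flattened into that single RDD**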

**We can use these (`yield` and `.itertuples()`) in our ML model batch processing**

In [20]:
def apply_model_in_batch(rows):
    """Score one partition and emit its rows together with the prediction.

    Args:
        rows: an iterable over the rows of one partition.

    Yields:
        One namedtuple per input row (as produced by `DataFrame.itertuples()`),
        carrying the DataFrame index, the original columns and an extra
        `predicted_duration` field.
    """
    # Materializes the whole partition in memory as a pandas DataFrame
    batch = pd.DataFrame(rows, columns=columns)

    # Run our "prediction" and attach it as a new column
    batch['predicted_duration'] = model_predict(batch)

    # Hand the rows back one at a time as tuples
    yield from batch.itertuples()

In [14]:
# Create the predictions: score every partition, convert the yielded tuples
# back into a Spark DataFrame, and drop the 'Index' field added by .itertuples()
df_predictions = (
    duration_rdd
    .mapPartitions(apply_model_in_batch)
    .toDF()
    .drop('Index')
)

In [15]:
# Inspect the predictions: display only the predicted_duration column
# (the output below is truncated to the top 20 rows)
df_predictions.select('predicted_duration').show()

+------------------+
|predicted_duration|
+------------------+
|26.099999999999998|
|              4.35|
|              3.15|
|             13.55|
|10.649999999999999|
|             89.65|
|120.19999999999999|
|              7.25|
|54.699999999999996|
|              15.9|
|               9.3|
|              8.75|
|             94.75|
|              46.2|
|             16.85|
|              14.6|
|             25.65|
|               5.0|
|2.9499999999999997|
|               0.0|
+------------------+
only showing top 20 rows



**In the real world, this type of application would use real-time (*streaming*) processing rather than *batch* processing, since we'd like to show the user the estimated duration of the trip as soon as they request a ride in their app**