# Start the SparkSession

In [1]:
import pandas as pd
import pyspark
from pyspark.sql import SparkSession

# create the SparkSession locally with as many CPUs cores as possible
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('test') \
    .getOrCreate()

In [2]:
# Read in the GROUP BY'd data from 04_pyspark_groupby.ipynb
df_green_revenue = spark.read.parquet('./data/report/revenue/green')
df_yellow_revenue = spark.read.parquet('./data/report/revenue/yellow')

In [3]:
# Inspect
df_green_revenue

DataFrame[hour: timestamp, zone: int, amount: double, number_records: bigint]

In [4]:
# Inspect
df_yellow_revenue

DataFrame[hour: timestamp, zone: int, amount: double, number_records: bigint]

**So, we have hour of trip, revenue zone, revenue, and number of trips**

**Let's JOIN them together**

# Join the Data

In [5]:
# Rename some columns
df_green_revenue_tmp = df_green_revenue \
    .withColumnRenamed('amount', 'green_amount') \
    .withColumnRenamed('number_records', 'green_number_records')

df_yellow_revenue_tmp = df_yellow_revenue \
    .withColumnRenamed('amount', 'yellow_amount') \
    .withColumnRenamed('number_records', 'yellow_number_records')

In [6]:
# perform an outer JOIN so that we keep ALL records
df_join = df_green_revenue_tmp.join(df_yellow_revenue_tmp, on=['hour', 'zone'], how='outer')

In [7]:
df_join.head(5)

[Row(hour=datetime.datetime(2020, 1, 1, 0, 0), zone=4, green_amount=None, green_number_records=None, yellow_amount=1004.3000000000001, yellow_number_records=57),
 Row(hour=datetime.datetime(2020, 1, 1, 0, 0), zone=10, green_amount=None, green_number_records=None, yellow_amount=42.41, yellow_number_records=2),
 Row(hour=datetime.datetime(2020, 1, 1, 0, 0), zone=56, green_amount=99.69, green_number_records=3, yellow_amount=18.1, yellow_number_records=2),
 Row(hour=datetime.datetime(2020, 1, 1, 0, 0), zone=74, green_amount=317.09000000000015, green_number_records=24, yellow_amount=586.2100000000002, yellow_number_records=47),
 Row(hour=datetime.datetime(2020, 1, 1, 0, 0), zone=88, green_amount=None, green_number_records=None, yellow_amount=823.8000000000002, yellow_number_records=36)]

In [8]:
df_join.show(5)

+-------------------+----+------------------+--------------------+------------------+---------------------+
|               hour|zone|      green_amount|green_number_records|     yellow_amount|yellow_number_records|
+-------------------+----+------------------+--------------------+------------------+---------------------+
|2020-01-01 00:00:00|   4|              null|                null|1004.3000000000001|                   57|
|2020-01-01 00:00:00|  10|              null|                null|             42.41|                    2|
|2020-01-01 00:00:00|  56|             99.69|                   3|              18.1|                    2|
|2020-01-01 00:00:00|  74|317.09000000000015|                  24| 586.2100000000002|                   47|
|2020-01-01 00:00:00|  88|              null|                null| 823.8000000000002|                   36|
+-------------------+----+------------------+--------------------+------------------+---------------------+
only showing top 5 rows



**Can see that 1/1/2020 in zone 56 has both green *and* yellow trips**

In [9]:
# Write the joined data to a Parquet file
df_join.write.parquet('./data/report/revenue/total', mode='overwrite')

In [10]:
!dir data\report\revenue

 Volume in drive C has no label.
 Volume Serial Number is 08A3-CF2D

 Directory of C:\Users\nimz\Documents\de_zoomcamp\week5_batch_processing\data\report\revenue

03/01/2024  07:57 PM    <DIR>          .
03/01/2024  07:57 PM    <DIR>          ..
02/27/2024  09:11 PM             4,112 .part-00000-596e3dca-5394-4d49-9d44-c8d07b1cacb2-c000.snappy.parquet.crc
02/27/2024  09:11 PM                 8 ._SUCCESS.crc
02/29/2024  08:34 PM    <DIR>          green
02/27/2024  09:11 PM           524,936 part-00000-596e3dca-5394-4d49-9d44-c8d07b1cacb2-c000.snappy.parquet
03/01/2024  07:57 PM    <DIR>          total
02/29/2024  08:35 PM    <DIR>          yellow
02/27/2024  09:11 PM                 0 _SUCCESS
               4 File(s)        529,056 bytes
               5 Dir(s)  237,492,846,592 bytes free


# Broadcast Join in Zone Data

In [11]:
# Reload the data
df_join = spark.read.parquet('./data/report/revenue/total')

In [12]:
# Bring in zone data
df_zones = spark.read.parquet('./data/zones/')

In [13]:
# do a new JOIN with our result along with the zone data
df_result = df_join.join(df_zones, on=df_join.zone == df_zones.LocationID)

# inspect
df_result

DataFrame[hour: timestamp, zone: int, green_amount: double, green_number_records: bigint, yellow_amount: double, yellow_number_records: bigint, LocationID: string, Borough: string, Zone: string, service_zone: string]

In [14]:
df_result.head(2)

[Row(hour=datetime.datetime(2020, 1, 1, 0, 0), zone=17, green_amount=195.03000000000003, green_number_records=9, yellow_amount=220.20999999999998, yellow_number_records=8, LocationID='17', Borough='Brooklyn', Zone='Bedford', service_zone='Boro Zone'),
 Row(hour=datetime.datetime(2020, 1, 1, 0, 0), zone=22, green_amount=15.8, green_number_records=1, yellow_amount=None, yellow_number_records=None, LocationID='22', Borough='Brooklyn', Zone='Bensonhurst West', service_zone='Boro Zone')]

In [15]:
df_result.show()

+-------------------+----+------------------+--------------------+------------------+---------------------+----------+---------+--------------------+------------+
|               hour|zone|      green_amount|green_number_records|     yellow_amount|yellow_number_records|LocationID|  Borough|                Zone|service_zone|
+-------------------+----+------------------+--------------------+------------------+---------------------+----------+---------+--------------------+------------+
|2020-01-01 00:00:00|  17|195.03000000000003|                   9|220.20999999999998|                    8|        17| Brooklyn|             Bedford|   Boro Zone|
|2020-01-01 00:00:00|  22|              15.8|                   1|              null|                 null|        22| Brooklyn|    Bensonhurst West|   Boro Zone|
|2020-01-01 00:00:00|  55|129.29000000000002|                   4|              null|                 null|        55| Brooklyn|        Coney Island|   Boro Zone|
|2020-01-01 00:00:00| 

In [16]:
# Drop un-needed columns
df_result = df_result.drop('LocationID', 'zone')

In [17]:
# See that we dropped the columns
df_result.show(10)

+-------------------+------------------+--------------------+------------------+---------------------+---------+------------+
|               hour|      green_amount|green_number_records|     yellow_amount|yellow_number_records|  Borough|service_zone|
+-------------------+------------------+--------------------+------------------+---------------------+---------+------------+
|2020-01-01 00:00:00|195.03000000000003|                   9|220.20999999999998|                    8| Brooklyn|   Boro Zone|
|2020-01-01 00:00:00|              15.8|                   1|              null|                 null| Brooklyn|   Boro Zone|
|2020-01-01 00:00:00|129.29000000000002|                   4|              null|                 null| Brooklyn|   Boro Zone|
|2020-01-01 00:00:00|              null|                null| 12573.81000000003|                  721|Manhattan| Yellow Zone|
|2020-01-01 00:00:00|              null|                null| 5010.450000000001|                  266|Manhattan| Yello

In [18]:
# Save to a Parquet file
df_result.write.parquet('./data/tmp/revenue-zones')

In [19]:
!dir .\data\tmp\

 Volume in drive C has no label.
 Volume Serial Number is 08A3-CF2D

 Directory of C:\Users\nimz\Documents\de_zoomcamp\week5_batch_processing\data\tmp

03/01/2024  07:58 PM    <DIR>          .
03/01/2024  07:58 PM    <DIR>          ..
03/01/2024  07:58 PM    <DIR>          revenue-zones
               0 File(s)              0 bytes
               3 Dir(s)  237,579,874,304 bytes free
