# Start the Spark Session

In [1]:
import pandas as pd
import pyspark
from pyspark.sql import SparkSession

# Build (or reuse) a local SparkSession; the "local[*]" master asks Spark
# to use as many CPU cores as possible on this machine.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

# Work with Data

In [2]:
# Load every green-trip parquet file matched by the glob into one DataFrame
green_parquet_glob = 'data/parquet/green/*/*'
df_green = spark.read.parquet(green_parquet_glob)

# Register the DataFrame as the temp view 'green' so it can be queried with SQL
# NOTE: createOrReplaceTempView is used because registerTempTable is deprecated
df_green.createOrReplaceTempView('green')

In [3]:
# SQL: hourly revenue per pickup zone for the green trips.
#
# For each (hour, PULocationID) pair whose pickup time is on or after
#   2020-01-01 00:00:00, sum total_amount and count the rows;
#   the result is ordered by hour, then by zone.
green_revenue_sql = """
    SELECT 
        date_trunc('hour', lpep_pickup_datetime) AS hour, 
        PULocationID AS zone,
        SUM(total_amount) AS amount,
        COUNT(1) AS number_records
    FROM
        green
    WHERE
        lpep_pickup_datetime >= '2020-01-01 00:00:00'
    GROUP BY
        1, 2
    ORDER BY
        1, 2
"""
df_green_revenue = spark.sql(green_revenue_sql)

# Preview the first 10 rows of the aggregate
df_green_revenue.show(10)

+-------------------+----+------------------+--------------+
|               hour|zone|            amount|number_records|
+-------------------+----+------------------+--------------+
|2020-01-01 00:00:00|   7| 769.7299999999998|            45|
|2020-01-01 00:00:00|  17|195.03000000000003|             9|
|2020-01-01 00:00:00|  18|               7.8|             1|
|2020-01-01 00:00:00|  22|              15.8|             1|
|2020-01-01 00:00:00|  24|              87.6|             3|
|2020-01-01 00:00:00|  25|             531.0|            26|
|2020-01-01 00:00:00|  29|              61.3|             1|
|2020-01-01 00:00:00|  32| 68.94999999999999|             2|
|2020-01-01 00:00:00|  33|317.27000000000004|            11|
|2020-01-01 00:00:00|  35|129.96000000000004|             5|
+-------------------+----+------------------+--------------+
only showing top 10 rows



In [4]:
# Write the green revenue report as parquet, split into 20 partitions;
# 'overwrite' mode replaces any existing output at the target path.
# NOTE(review): repartition() shuffles rows, so the ORDER BY ordering is
# presumably not retained in the written files -- confirm if order matters.
(
    df_green_revenue
    .repartition(20)
    .write
    .mode('overwrite')
    .parquet('./data/report/revenue/green')
)

In [5]:
# Yellow trips: same load -> aggregate -> write pipeline as the green data.
yellow_parquet_glob = 'data/parquet/yellow/*/*'
df_yellow = spark.read.parquet(yellow_parquet_glob)

# Register the DataFrame as the temp view 'yellow' so it can be queried with SQL
df_yellow.createOrReplaceTempView('yellow')

# For each (hour, PULocationID) pair whose pickup time is on or after
#   2020-01-01 00:00:00, sum total_amount and count the rows;
#   the result is ordered by hour, then by zone.
yellow_revenue_sql = """
    SELECT 
        date_trunc('hour', tpep_pickup_datetime) AS hour, 
        PULocationID AS zone,

        SUM(total_amount) AS amount,
        COUNT(1) AS number_records
    FROM
        yellow
    WHERE
        tpep_pickup_datetime >= '2020-01-01 00:00:00'
    GROUP BY
        1, 2
    ORDER BY
        1, 2
"""
df_yellow_revenue = spark.sql(yellow_revenue_sql)

# Write the yellow revenue report as parquet, split into 20 partitions;
# 'overwrite' mode replaces any existing output at the target path.
# NOTE(review): repartition() shuffles rows, so the ORDER BY ordering is
# presumably not retained in the written files -- confirm if order matters.
(
    df_yellow_revenue
    .repartition(20)
    .write
    .mode('overwrite')
    .parquet('./data/report/revenue/yellow')
)