In [1]:
import pyspark
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists
# (getOrCreate), using the "local[*]" master and the application name 'test'.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

In [3]:
import pandas as pd
from pyspark.sql import types

In [98]:
# Explicit schema for the green taxi trip data, built one column at a time.
# Each .add() call is (column name, Spark type, nullable); every column is nullable.
green_schema = (
    types.StructType()
    .add("VendorID", types.LongType(), True)
    .add("lpep_pickup_datetime", types.TimestampType(), True)
    .add("lpep_dropoff_datetime", types.TimestampType(), True)
    .add("store_and_fwd_flag", types.StringType(), True)
    .add("RatecodeID", types.DoubleType(), True)
    .add("PULocationID", types.LongType(), True)
    .add("DOLocationID", types.LongType(), True)
    .add("passenger_count", types.DoubleType(), True)
    .add("trip_distance", types.DoubleType(), True)
    .add("fare_amount", types.DoubleType(), True)
    .add("extra", types.DoubleType(), True)
    .add("mta_tax", types.DoubleType(), True)
    .add("tip_amount", types.DoubleType(), True)
    .add("tolls_amount", types.DoubleType(), True)
    .add("improvement_surcharge", types.DoubleType(), True)
    .add("total_amount", types.DoubleType(), True)
    .add("payment_type", types.DoubleType(), True)
    .add("trip_type", types.DoubleType(), True)
    .add("congestion_surcharge", types.DoubleType(), True)
)

# Explicit schema for the yellow taxi trip data, built one column at a time.
# Each .add() call is (column name, Spark type, nullable); every column is nullable.
yellow_schema = (
    types.StructType()
    .add("VendorID", types.LongType(), True)
    .add("tpep_pickup_datetime", types.TimestampType(), True)
    .add("tpep_dropoff_datetime", types.TimestampType(), True)
    .add("passenger_count", types.DoubleType(), True)
    .add("trip_distance", types.DoubleType(), True)
    .add("RatecodeID", types.DoubleType(), True)
    .add("store_and_fwd_flag", types.StringType(), True)
    .add("PULocationID", types.LongType(), True)
    .add("DOLocationID", types.LongType(), True)
    .add("payment_type", types.LongType(), True)
    .add("fare_amount", types.DoubleType(), True)
    .add("extra", types.DoubleType(), True)
    .add("mta_tax", types.DoubleType(), True)
    .add("tip_amount", types.DoubleType(), True)
    .add("tolls_amount", types.DoubleType(), True)
    .add("improvement_surcharge", types.DoubleType(), True)
    .add("total_amount", types.DoubleType(), True)
    .add("congestion_surcharge", types.DoubleType(), True)
    .add("airport_fee", types.DoubleType(), True)
)

In [96]:
# Read each monthly green-taxi dataset for 2021 and 2020 with the explicit
# green_schema and write it back out as Parquet, repartitioned into 4 partitions.
for month in range(1, 13):
    for year in [2021, 2020]:
        print(f"Processing data for {year}-{month:02d}")

        input_path = f"data/raw/green/{year}/{month:02d}/"
        output_path = f"data/pq/green/{year}/{month:02d}/"

        # The input is Parquet, so the CSV-only "header" option is not set here.
        df_green = (
            spark.read
            .schema(green_schema)
            .parquet(input_path)
        )

        # "overwrite" keeps this cell re-runnable: with the default save mode the
        # write raises as soon as the output directory already exists.
        (
            df_green
            .repartition(4)
            .write
            .mode("overwrite")
            .parquet(output_path)
        )

Processing data for 2021-01
Processing data for 2020-01


                                                                                

Processing data for 2021-02
Processing data for 2020-02


                                                                                

Processing data for 2021-03
Processing data for 2020-03


[Stage 100:>                                                        (0 + 4) / 4]                                                                                

Processing data for 2021-04
Processing data for 2020-04
Processing data for 2021-05
Processing data for 2020-05
Processing data for 2021-06
Processing data for 2020-06
Processing data for 2021-07
Processing data for 2020-07
Processing data for 2021-08
Processing data for 2020-08
Processing data for 2021-09
Processing data for 2020-09
Processing data for 2021-10
Processing data for 2020-10
Processing data for 2021-11
Processing data for 2020-11
Processing data for 2021-12
Processing data for 2020-12


In [100]:
# Read each monthly yellow-taxi dataset for 2021 and 2020 with the explicit
# yellow_schema and write it back out as Parquet, repartitioned into 4 partitions.
for month in range(1, 13):
    for year in [2021, 2020]:
        print(f"Processing data for {year}-{month:02d}")

        input_path = f"data/raw/yellow/{year}/{month:02d}/"
        output_path = f"data/pq/yellow/{year}/{month:02d}/"

        # The input is Parquet, so the CSV-only "header" option is not set here.
        df_yellow = (
            spark.read
            .schema(yellow_schema)
            .parquet(input_path)
        )

        # "overwrite" keeps this cell re-runnable: with the default save mode the
        # write raises as soon as the output directory already exists.
        (
            df_yellow
            .repartition(4)
            .write
            .mode("overwrite")
            .parquet(output_path)
        )

Processing data for 2021-01


                                                                                

Processing data for 2020-01


                                                                                

Processing data for 2021-02


                                                                                

Processing data for 2020-02


                                                                                

Processing data for 2021-03


                                                                                

Processing data for 2020-03


                                                                                

Processing data for 2021-04


                                                                                

Processing data for 2020-04
Processing data for 2021-05


                                                                                

Processing data for 2020-05


                                                                                

Processing data for 2021-06


                                                                                

Processing data for 2020-06


                                                                                

Processing data for 2021-07


                                                                                

Processing data for 2020-07


                                                                                

Processing data for 2021-08


                                                                                

Processing data for 2020-08


                                                                                

Processing data for 2021-09


                                                                                

Processing data for 2020-09


                                                                                

Processing data for 2021-10


                                                                                

Processing data for 2020-10


                                                                                

Processing data for 2021-11


                                                                                

Processing data for 2020-11


                                                                                

Processing data for 2021-12


                                                                                

Processing data for 2020-12


                                                                                