In [1]:
# Show which Spark version the active session is running.
print(spark.version)

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
60,application_1752432793451_0061,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

3.5.5-amzn-0

# Завдання 2: Імпорт, уніфікація та об’єднання

In [97]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType, LongType
import pyspark.sql.functions as F
import datetime
import boto3
import re

# Run date; interpolated into the result paths below so each day's output lands in its own folder.
date = datetime.date.today()
# Shared S3 client, used by load_taxi_data to list the source parquet files.
s3 = boto3.client("s3")

# Source bucket and key prefixes holding the green / yellow trip parquet files.
S3_BUCKET_NAME = "robot-dreams-source-data"
GREEN_PREFIX = "home-work-1/nyc_taxi/green"
YELLOW_PREFIX = "home-work-1/nyc_taxi/yellow"
YELLOW_TAXI_PATH = f"s3://{S3_BUCKET_NAME}/{YELLOW_PREFIX}/"
GREEN_TAXI_PATH = f"s3://{S3_BUCKET_NAME}/{GREEN_PREFIX}/"
TAXI_ZONE_LOOKUP_PATH = f"s3://{S3_BUCKET_NAME}/home-work-1/nyc_taxi/taxi_zone_lookup.csv"

# Destination bucket and parquet paths for the aggregated results.
# NOTE(review): "statstic" in the second path looks like a typo of "statistic"; it is left
# unchanged because it is part of the output location that consumers may already rely on.
S3_BUCKET_HW_NAME = "ovorobiov-hw"
ZONE_STATISTIC_HW_PATH = f"s3://{S3_BUCKET_HW_NAME}/results/zone_statistic/{date}/zone_statistic.parquet"
ZONE_DAYS_STATISTIC_HW_PATH = f"s3://{S3_BUCKET_HW_NAME}/results/zone_days_statstic/{date}/zone_days_statstic.parquet"

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Load data into data frame

In [None]:
def load_taxi_data(prefix, columns_to_cast):
    """
    Load taxi data from the source S3 bucket into a single Spark DataFrame.

    Every parquet file under the prefix is read on its own, projected through
    ``columns_to_cast`` so that all files share the same column types, and the
    per-file frames are then combined with ``unionByName``.

    Args:
        prefix (str): part of the path to the green or yellow parquet files
            (e.g. home-work-1/nyc_taxi/green), without a trailing slash
        columns_to_cast (list): list of strings (SQL expressions passed to
            ``selectExpr``) that cast columns into the proper data types
    Returns:
        taxi_df (DataFrame): taxi data in a Spark DataFrame
    Raises:
        ValueError: if no ``.parquet`` objects exist under the prefix
    """
    # list_objects_v2 returns at most 1000 keys per call, so walk every page
    # instead of relying on a single response.
    paginator = s3.get_paginator("list_objects_v2")
    pages = paginator.paginate(Bucket=S3_BUCKET_NAME, Prefix=f"{prefix}/")

    # Preparing paths to parquet files
    valid_parquet_files = [
        f"s3://{S3_BUCKET_NAME}/{obj['Key']}"
        for page in pages
        for obj in page.get("Contents", [])
        if obj["Key"].endswith(".parquet")
    ]

    # Fail with a clear message rather than an IndexError further down.
    if not valid_parquet_files:
        raise ValueError(
            f"No parquet files found under s3://{S3_BUCKET_NAME}/{prefix}/"
        )

    # Read each file separately so the casts are applied per file, then fold
    # the frames into one DataFrame.
    taxi_df = None
    for parquet_file in valid_parquet_files:
        file_df = spark.read.parquet(parquet_file).selectExpr(*columns_to_cast)
        taxi_df = file_df if taxi_df is None else taxi_df.unionByName(file_df)
    return taxi_df
    

## Yellow Taxi Schema

In [46]:
# Expected yellow-taxi schema: every field is nullable, listed as (column name, Spark type).
yellow_taxi_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("VendorID", LongType()),
        ("tpep_pickup_datetime", TimestampType()),
        ("tpep_dropoff_datetime", TimestampType()),
        ("passenger_count", LongType()),
        ("trip_distance", DoubleType()),
        ("RatecodeID", LongType()),
        ("store_and_fwd_flag", StringType()),
        ("PULocationID", LongType()),
        ("DOLocationID", LongType()),
        ("payment_type", LongType()),
        ("fare_amount", DoubleType()),
        ("extra", DoubleType()),
        ("mta_tax", DoubleType()),
        ("tip_amount", DoubleType()),
        ("tolls_amount", DoubleType()),
        ("improvement_surcharge", DoubleType()),
        ("total_amount", DoubleType()),
        ("congestion_surcharge", DoubleType()),
        ("airport_fee", DoubleType()),
    )
])

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Green Taxi Schema

In [47]:
# Expected green-taxi schema: every field is nullable, listed as (column name, Spark type).
green_taxi_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("VendorID", LongType()),
        ("lpep_pickup_datetime", TimestampType()),
        ("lpep_dropoff_datetime", TimestampType()),
        ("passenger_count", LongType()),
        ("trip_distance", DoubleType()),
        ("RatecodeID", LongType()),
        ("store_and_fwd_flag", StringType()),
        ("PULocationID", LongType()),
        ("DOLocationID", LongType()),
        ("payment_type", LongType()),
        ("fare_amount", DoubleType()),
        ("extra", DoubleType()),
        ("mta_tax", DoubleType()),
        ("tip_amount", DoubleType()),
        ("tolls_amount", DoubleType()),
        ("improvement_surcharge", DoubleType()),
        ("total_amount", DoubleType()),
        ("congestion_surcharge", DoubleType()),
        ("ehail_fee", DoubleType()),
        ("trip_type", DoubleType()),
    )
])


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Creating Yellow DF

In [None]:
# Reading with the standard options (mergeSchema, recursiveFileLookup and an explicit schema)
# did not cast the parquet files to the proper types, so the casts are expressed
# through selectExpr instead.

# One SQL cast expression per column, generated from (column, target SQL type) pairs.
yellow_casted_columns = [
    f"cast({column} as {sql_type}) as {column}"
    for column, sql_type in (
        ("VendorID", "bigint"),
        ("tpep_pickup_datetime", "timestamp"),
        ("tpep_dropoff_datetime", "timestamp"),
        ("passenger_count", "bigint"),
        ("trip_distance", "double"),
        ("RatecodeID", "bigint"),
        ("store_and_fwd_flag", "string"),
        ("PULocationID", "bigint"),
        ("DOLocationID", "bigint"),
        ("payment_type", "bigint"),
        ("fare_amount", "double"),
        ("extra", "double"),
        ("mta_tax", "double"),
        ("tip_amount", "double"),
        ("tolls_amount", "double"),
        ("improvement_surcharge", "double"),
        ("total_amount", "double"),
        ("congestion_surcharge", "double"),
        ("airport_fee", "double"),
    )
]

yellow_taxi_df = load_taxi_data(YELLOW_PREFIX, yellow_casted_columns)


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Creating Green DF

In [None]:
# Reading with the standard options (mergeSchema, recursiveFileLookup and an explicit schema)
# did not cast the parquet files to the proper types, so the casts are expressed
# through selectExpr instead.

# One SQL cast expression per column, generated from (column, target SQL type) pairs.
green_casted_columns = [
    f"cast({column} as {sql_type}) as {column}"
    for column, sql_type in (
        ("VendorID", "bigint"),
        ("lpep_pickup_datetime", "timestamp"),
        ("lpep_dropoff_datetime", "timestamp"),
        ("passenger_count", "bigint"),
        ("trip_distance", "double"),
        ("RatecodeID", "bigint"),
        ("store_and_fwd_flag", "string"),
        ("PULocationID", "bigint"),
        ("DOLocationID", "bigint"),
        ("payment_type", "bigint"),
        ("fare_amount", "double"),
        ("extra", "double"),
        ("mta_tax", "double"),
        ("tip_amount", "double"),
        ("tolls_amount", "double"),
        ("improvement_surcharge", "double"),
        ("total_amount", "double"),
        ("congestion_surcharge", "double"),
        ("ehail_fee", "double"),
        ("trip_type", "double"),
    )
]

green_taxi_df = load_taxi_data(GREEN_PREFIX, green_casted_columns)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Create Zone Lookup DF

In [50]:
# Zone lookup: keep only the numeric location id and the zone name, under snake_case names.
location_id_col = F.col("LocationID").cast("int").alias("location_id")
zone_name_col = F.col("Zone").alias("zone")

taxi_zone_lookup_df = (
    spark.read
    .option("header", True)
    .csv(TAXI_ZONE_LOOKUP_PATH)
    .select(location_id_col, zone_name_col)
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Identifying different columns in data frames 

In [54]:
# Checking if schema is similar in green and yellow DF.
# Field definitions (name, type, nullability, metadata) are compared as plain dicts,
# keyed by column name, so no eval() round-trip through strings is needed.
# Note: the result depends on when this cell runs. Before the alignment cells the
# taxi-specific columns are listed; after alignment the list is expected to be empty.

yellow_fields = {entry["name"]: entry for entry in yellow_taxi_df.schema.jsonValue()["fields"]}
green_fields = {entry["name"]: entry for entry in green_taxi_df.schema.jsonValue()["fields"]}

# A column differs when it is missing on one side or its definition is not identical.
diff_column_names = sorted(
    name
    for name in yellow_fields.keys() | green_fields.keys()
    if yellow_fields.get(name) != green_fields.get(name)
)
print(diff_column_names)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[]

## Preparing the green data frame for merging (column alignment)

In [52]:
# Align the green frame with the yellow one:
#   - tag every row with its taxi type,
#   - add airport_fee (not among the green columns) as a null double,
#   - rename the lpep_* timestamps to the shared pickup/dropoff names.
green_taxi_df = (
    green_taxi_df
    .withColumn("taxi_type", F.lit("green"))
    .withColumn("airport_fee", F.lit(None).cast(DoubleType()))
    .withColumnRenamed("lpep_pickup_datetime", "pickup_datetime")
    .withColumnRenamed("lpep_dropoff_datetime", "dropoff_datetime")
)


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Preparing the yellow data frame for merging (column alignment)

In [53]:
# Align the yellow frame with the green one:
#   - rename the tpep_* timestamps to the shared pickup/dropoff names,
#   - tag every row with its taxi type,
#   - add trip_type and ehail_fee (not among the yellow columns) as null doubles.
yellow_taxi_df = (
    yellow_taxi_df
    .withColumnRenamed("tpep_pickup_datetime", "pickup_datetime")
    .withColumnRenamed("tpep_dropoff_datetime", "dropoff_datetime")
    .withColumn("taxi_type", F.lit("yellow"))
    .withColumn("trip_type", F.lit(None).cast(DoubleType()))
    .withColumn("ehail_fee", F.lit(None).cast(DoubleType()))
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Union two data frames yellow_taxi_df and green_taxi_df

In [None]:
# Merging Yellow and Green DF into final DF.
# unionByName matches columns by name rather than by position, so the two frames
# only need the same set of column names, not the same column order.

raw_trips_df = green_taxi_df.unionByName(yellow_taxi_df)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Filter out anomalies (trip_distance < 0.1, fare_amount < 2, duration < 1 min)

In [77]:
# Trip length in whole seconds (timestamps cast to epoch seconds).
trip_duration_sec = (
    F.col("dropoff_datetime").cast(LongType())
    - F.col("pickup_datetime").cast(LongType())
)

# Keep only plausible trips: at least 0.1 distance units, a fare of at least 2,
# and a duration of at least 60 seconds.
filtered_trips_df = raw_trips_df.filter(
    (F.col("trip_distance") >= 0.1)
    & (F.col("fare_amount") >= 2)
    & (trip_duration_sec >= 60)
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Filtered raws: 1551959148

## Adding pickup_hour, pickup_day_of_week and duration_min columns

In [78]:
# Map Spark's dayofweek() result (1 = Sunday ... 7 = Saturday) to the day name.
week_day_map = F.create_map([
    F.lit(1), F.lit("Sunday"),
    F.lit(2), F.lit("Monday"),
    F.lit(3), F.lit("Tuesday"),
    F.lit(4), F.lit("Wednesday"),
    F.lit(5), F.lit("Thursday"),
    F.lit(6), F.lit("Friday"),
    F.lit(7), F.lit("Saturday"),
])

# Trip length in whole seconds (timestamps cast to epoch seconds).
trip_seconds = (
    F.col("dropoff_datetime").cast(LongType())
    - F.col("pickup_datetime").cast(LongType())
)

# Round AFTER converting to minutes. Previously round() wrapped the integer
# second difference (a no-op) and the division result was then truncated by
# the integer cast, so e.g. 119 seconds became 1 minute instead of 2.
filtered_trips_df = (
    filtered_trips_df
    .withColumn("pickup_hour", F.hour("pickup_datetime"))
    .withColumn("pickup_day_of_week", week_day_map[F.dayofweek("pickup_datetime")])
    .withColumn("duration_min", F.round(trip_seconds / 60).cast(IntegerType()))
)



VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Join taxi_zone_lookup_df with filtered_trips_df and add pickup_zone and dropoff_zone columns

In [79]:
# Zone name with a fallback for trips whose location id has no match in the lookup.
zone_or_unknown = F.when(F.col("zone").isNull(), "Unknown").otherwise(F.col("zone"))

# Step 1: resolve the pickup location id to a zone name, then discard the lookup columns.
trips_with_pickup_zone_df = (
    filtered_trips_df
    .join(
        taxi_zone_lookup_df,
        filtered_trips_df["PULocationID"] == taxi_zone_lookup_df.location_id,
        "left",
    )
    .withColumn("pickup_zone", zone_or_unknown)
    .drop(F.col("zone"), F.col("location_id"))
)

# Step 2: same lookup for the dropoff location id.
full_filtered_trips_df = (
    trips_with_pickup_zone_df
    .join(
        taxi_zone_lookup_df,
        filtered_trips_df["DOLocationID"] == taxi_zone_lookup_df.location_id,
        "left",
    )
    .withColumn("dropoff_zone", zone_or_unknown)
    .drop(F.col("zone"), F.col("location_id"))
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Завдання 3: Створення та агрегація фінального датафрейму zone_summary

## Base summary that includes total_trips, avg_trip_distance, avg_total_amount, avg_tip_amount, max_trip_distance, min_tip_amount

In [87]:
# Per-row 1/0 indicators; summing them yields the trip count per taxi type.
yellow_flag = F.when(F.col("taxi_type") == "yellow", 1).otherwise(0)
green_flag = F.when(F.col("taxi_type") == "green", 1).otherwise(0)

# One row per pickup zone with trip counts and distance / amount / tip statistics.
base_summary = (
    full_filtered_trips_df
    .groupBy("pickup_zone")
    .agg(
        F.count("*").alias("total_trips"),
        F.avg("trip_distance").alias("avg_trip_distance"),
        F.avg("total_amount").alias("avg_total_amount"),
        F.avg("tip_amount").alias("avg_tip_amount"),
        F.max("trip_distance").alias("max_trip_distance"),
        F.min("tip_amount").alias("min_tip_amount"),
        F.sum(yellow_flag).alias("yellow_count"),
        F.sum(green_flag).alias("green_count"),
    )
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Adding Yellow Share and Green Share and rounding the values

In [88]:


# Derive the per-zone share of yellow and green trips into a NEW frame.
# base_summary is deliberately not reassigned: overwriting it after dropping
# yellow_count / green_count made this cell fail when executed a second time.
summary_with_shares = (
    base_summary
    .withColumn("yellow_share", F.col("yellow_count") / F.col("total_trips"))
    .withColumn("green_share", F.col("green_count") / F.col("total_trips"))
    .drop("yellow_count", "green_count")
)

# Metrics that are rounded to two decimals in the final output, in output order.
rounded_metric_names = [
    "avg_trip_distance",
    "avg_total_amount",
    "avg_tip_amount",
    "max_trip_distance",
    "min_tip_amount",
    "yellow_share",
    "green_share",
]

final_summary = summary_with_shares.select(
    "pickup_zone",
    "total_trips",
    *[F.round(metric, 2).alias(metric) for metric in rounded_metric_names],
)

final_summary.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+-----------+-----------------+----------------+--------------+-----------------+--------------+------------+-----------+
|         pickup_zone|total_trips|avg_trip_distance|avg_total_amount|avg_tip_amount|max_trip_distance|min_tip_amount|yellow_share|green_share|
+--------------------+-----------+-----------------+----------------+--------------+-----------------+--------------+------------+-----------+
|         Westerleigh|       1847|             8.61|           40.07|          2.72|            116.8|           0.0|        0.65|       0.35|
|      Pelham Parkway|     107432|             15.5|           19.56|          0.63|        221944.04|           0.0|        0.18|       0.82|
|           Rego Park|     772746|             5.03|           16.89|          0.91|        185845.97|           0.0|        0.18|       0.82|
|   Kew Gardens Hills|      80940|            16.14|           23.28|          1.15|        215534.25|           0.0|        0.46|       0.54|

## Writing summary to s3

In [89]:
# Collapse to a single partition so one parquet file is produced, replacing any previous output at the path.
(
    final_summary
    .coalesce(1)
    .write
    .mode("overwrite")
    .parquet(ZONE_STATISTIC_HW_PATH)
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Завдання 4: Агрегація по днях тижня та зонах

## Group by pickup_zone and pickup_day_of_week. Calculate high_fare_share

In [99]:
# 1 for trips whose fare is above 30, otherwise 0; the sum is the high-fare trip count.
high_fare_flag = F.when(F.col("fare_amount") > 30, 1).otherwise(0)

# Per pickup zone and weekday: number of trips and the rounded share of high-fare trips.
second_summary = (
    full_filtered_trips_df
    .groupBy("pickup_zone", "pickup_day_of_week")
    .agg(
        F.count("*").alias("total_trips"),
        F.sum(high_fare_flag).alias("total_high_fares"),
    )
    .withColumn(
        "high_fare_share",
        F.round(F.col("total_high_fares") / F.col("total_trips"), 2),
    )
    .drop("total_high_fares")
)
second_summary.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+------------------+-----------+---------------+
|         pickup_zone|pickup_day_of_week|total_trips|high_fare_share|
+--------------------+------------------+-----------+---------------+
|           Stapleton|         Wednesday|       1049|           0.22|
|       Rockaway Park|         Wednesday|        949|           0.79|
|             Bedford|            Monday|     162420|           0.07|
|  Murray Hill-Queens|            Monday|       4317|           0.34|
|             Astoria|           Tuesday|     713590|           0.04|
|           Glen Oaks|            Monday|       2452|           0.51|
|          Auburndale|            Friday|       3207|           0.24|
|Forest Park/Highl...|         Wednesday|       1096|           0.24|
|          Bronx Park|         Wednesday|       4190|           0.23|
|Van Cortlandt Vil...|          Saturday|      19808|           0.12|
|    Brooklyn Heights|           Tuesday|     473166|           0.07|
| Lincoln Square Wes

## Writing summary to s3

In [96]:
# Collapse to a single partition so one parquet file is produced, replacing any previous output at the path.
(
    second_summary
    .coalesce(1)
    .write
    .mode("overwrite")
    .parquet(ZONE_DAYS_STATISTIC_HW_PATH)
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------+------------------+-----------+----------------+-------------------+
|pickup_zone|pickup_day_of_week|total_trips|total_high_fares|    high_fare_share|
+-----------+------------------+-----------+----------------+-------------------+
|  Stapleton|         Wednesday|       1049|             234|  0.223069590085796|
|  Stapleton|            Friday|        925|             236|0.25513513513513514|
|  Stapleton|            Sunday|        504|             218|0.43253968253968256|
|  Stapleton|           Tuesday|        565|             213| 0.3769911504424779|
|  Stapleton|            Monday|        840|             199| 0.2369047619047619|
|  Stapleton|          Thursday|       1181|             242|0.20491109229466553|
|  Stapleton|          Saturday|       1094|             254|0.23217550274223034|
+-----------+------------------+-----------+----------------+-------------------+