# NYC Taxi Parquet & Advanced Spark Concepts

This notebook teaches the Parquet file format and advanced Spark concepts — column pruning, partitioning, partition pruning, and small-file mitigation — using PySpark on Google Colab.

## 1. Spark Setup

In [None]:

# Install a headless Java 11 runtime for Spark (installer output is discarded).
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
# Spark 3.5.0 is a superseded release: downloads.apache.org only carries current
# releases, so the tarball is fetched from the permanent Apache archive instead.
# (wget -q prints nothing on failure, which would otherwise surface only as a tar error.)
!wget -q https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
# Unpack into the current directory -> ./spark-3.5.0-bin-hadoop3
!tar xf spark-3.5.0-bin-hadoop3.tgz


In [None]:

import glob
import os
import sys

# Point Spark at the Java runtime and at the unpacked Spark distribution.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"

# Put Spark's launcher scripts on PATH, but only once: re-running this cell
# must not keep appending duplicate entries.
_spark_bin = os.path.join(os.environ["SPARK_HOME"], "bin")
if _spark_bin not in os.environ["PATH"].split(os.pathsep):
    os.environ["PATH"] += os.pathsep + _spark_bin

# Make the PySpark sources bundled with this Spark download (and its py4j zip)
# importable, ahead of any other installed copy, so the Python side matches
# the Spark version that SPARK_HOME points to.
_spark_python = os.path.join(os.environ["SPARK_HOME"], "python")
_py4j_zips = glob.glob(os.path.join(_spark_python, "lib", "py4j-*.zip"))
for _entry in reversed([_spark_python, *_py4j_zips]):
    if _entry not in sys.path:
        sys.path.insert(0, _entry)


In [None]:

from pyspark.sql import SparkSession

# Create the session used by every later cell, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("NYC Taxi Parquet Teaching")
    .getOrCreate()
)


## 2. Load NYC Taxi Parquet Data

In [None]:

# The Parquet files are expected to have been uploaded to /content/nyc_taxi_parquet.
df = spark.read.format("parquet").load("/content/nyc_taxi_parquet/")

# Quick look at what was loaded: the schema first, then the first five rows.
df.printSchema()
df.show(n=5)


## 3. Column Pruning Demonstration

In [None]:

# Keep two columns, filter on one of them, and print the extended query plans
# so the columns read by the scan can be inspected.
(
    df.select("pickup_datetime", "fare_amount")
    .where("fare_amount > 50")
    .explain(extended=True)
)


## 4. Spark Partitions

In [None]:

# Partition count of the RDD behind `df`; as the cell's last expression, the value is displayed.
df.rdd.getNumPartitions()


In [None]:

# Redistribute the rows into 8 partitions, then display the resulting partition count.
df_repart = df.repartition(numPartitions=8)
df_repart.rdd.getNumPartitions()


## 5. Write Partitioned Parquet

In [None]:

from pyspark.sql.functions import month, year

# Derive the calendar year and month of each pickup as two extra columns;
# they serve as the partition keys when the data is written out.
df2 = (
    df
    .withColumn("pickup_year", year("pickup_datetime"))
    .withColumn("pickup_month", month("pickup_datetime"))
)


In [None]:

# Write df2 as Parquet, partitioned on disk by pickup year and month.
# "overwrite" replaces any output already present at this path.
df2.write.parquet(
    "/content/nyc_taxi_partitioned/",
    mode="overwrite",
    partitionBy=["pickup_year", "pickup_month"],
)


## 6. Partition Pruning

In [None]:

# Read back the dataset that was written partitioned by pickup_year / pickup_month.
df_part = spark.read.parquet("/content/nyc_taxi_partitioned/")

# Filter on both partition columns, count trips per pickup location, and print
# the extended plans so the partition filters applied to the scan can be inspected.
(
    df_part
    .where("pickup_year = 2023 AND pickup_month = 1")
    .groupBy("PULocationID")
    .count()
    .explain(extended=True)
)


## 7. Small Files Mitigation

In [None]:

# Reduce df2 to at most 4 partitions before writing, with the aim of producing
# fewer output files; the on-disk layout is still partitioned by year and month.
(
    df2.coalesce(4)
    .write
    .partitionBy("pickup_year", "pickup_month")
    .mode("overwrite")
    .parquet("/content/nyc_taxi_optimized/")
)


## 8. Analytical Query Example

In [None]:

from pyspark.sql.functions import avg

# Average fare and average trip distance per pickup month for 2023, in month order.
(
    df_part
    .where("pickup_year = 2023")
    .groupBy("pickup_month")
    .agg(
        avg("fare_amount").alias("avg_fare"),
        avg("trip_distance").alias("avg_distance"),
    )
    .orderBy("pickup_month")
    .show()
)
