# Checkpoint 1

## Fetch the data and persist it

In [2]:
!wget https://archive.org/download/nycTaxiTripData2013/trip_data.7z

--2025-03-09 15:22:58--  https://archive.org/download/nycTaxiTripData2013/trip_data.7z
Resolving archive.org (archive.org)... 207.241.224.2
Connecting to archive.org (archive.org)|207.241.224.2|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://ia902202.us.archive.org/28/items/nycTaxiTripData2013/trip_data.7z [following]
--2025-03-09 15:22:58--  https://ia902202.us.archive.org/28/items/nycTaxiTripData2013/trip_data.7z
Resolving ia902202.us.archive.org (ia902202.us.archive.org)... 207.241.228.62
Connecting to ia902202.us.archive.org (ia902202.us.archive.org)|207.241.228.62|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4102781969 (3.8G) [application/x-7z-compressed]
Saving to: ‘trip_data.7z’


2025-03-09 16:54:49 (727 KB/s) - ‘trip_data.7z’ saved [4102781969/4102781969]



In [None]:
!mv trip_data.7z input/

In [1]:
!pip install shapely==2.0.7 delta-spark==3.3.0 py7zr==0.22.0


Collecting shapely==2.0.7
  Using cached shapely-2.0.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (6.8 kB)
Collecting delta-spark==3.3.0
  Using cached delta_spark-3.3.0-py3-none-any.whl.metadata (2.0 kB)
Collecting py4j==0.10.9.7 (from pyspark<3.6.0,>=3.5.3->delta-spark==3.3.0)
  Using cached py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Using cached shapely-2.0.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (2.4 MB)
Using cached delta_spark-3.3.0-py3-none-any.whl (21 kB)
Using cached py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
Installing collected packages: py4j, shapely, delta-spark
Successfully installed delta-spark-3.3.0 py4j-0.10.9.7 shapely-2.0.7


In [3]:
import py7zr
import sys
import os

def extract_7z(archive_path, output_dir):
    """Unpack every member of a 7z archive into ``output_dir``.

    Args:
        archive_path: Path of the ``.7z`` file, opened read-only.
        output_dir: Directory passed to py7zr as the extraction target.

    Prints a one-line confirmation once extraction has finished.
    """
    with py7zr.SevenZipFile(archive_path, mode="r") as archive:
        archive.extractall(path=output_dir)
    print(f"Extracted {archive_path} to {output_dir}")

# Source archive and the directory its CSV files are unpacked into.
archive_path = "input/trip_data.7z"
output_dir = "input/prod"

# Create the output directory if it doesn't exist. exist_ok=True keeps the
# cell re-runnable and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

extract_7z(archive_path, output_dir)

Extracted input/trip_data.7z to input/prod


In [4]:
# IMPORTANT: on the first line (the header) of every extracted CSV, replace each
# ", " with "," in place. The space after the comma was breaking schema parsing.
# If this cell errors, run the same loop directly from a shell.
# NOTE(review): `sed -i` with no backup suffix is GNU sed syntax; BSD/macOS sed
# would need `-i ''` — confirm the target platform.
!for file in input/prod/trip_data_*.csv; do sed -i '1s/, /,/g' "$file"; done

## Spark session with its configuration

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, unix_timestamp, lag, when, count, avg, lead, sum as spark_sum
from pyspark.sql.window import Window
import json
from shapely.geometry import Point, Polygon, shape
from pyspark.sql.types import (
    StructType,
    StructField,
    IntegerType,
    StringType,
    DoubleType
)


In [2]:
# Initialize Spark Session
# Builder-time settings cover driver/executor memory, memory fractions,
# shuffle partitioning and parallelism, result-size limit, adaptive query
# execution and the Kryo serializer.
spark = (
    SparkSession.builder
    .appName("NYC Taxi Analysis")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.memory.fraction", "0.8")
    .config("spark.memory.storageFraction", "0.3")
    .config("spark.sql.shuffle.partitions", "100")
    .config("spark.default.parallelism", "20")
    .config("spark.executor.cores", "4")
    .config("spark.driver.maxResultSize", "2g")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)

# Runtime SQL settings applied on the live session (adaptive execution is set
# again here in addition to the builder config above).
spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "true")

# Relative directory where Spark stores checkpoint data.
spark.sparkContext.setCheckpointDir("checkpoints")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/09 19:26:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Load taxi rides dataset

In [3]:
# Define schema for the data
# Each pair is (column name, Spark type class); every field keeps the default
# nullability. Timestamps, trip time and trip distance are read as strings;
# passenger count as an integer and the four coordinates as doubles.
schema = StructType(
    [
        StructField(field_name, field_type())
        for field_name, field_type in (
            ("medallion", StringType),
            ("hack_license", StringType),
            ("vendor_id", StringType),
            ("rate_code", StringType),
            ("store_and_fwd_flag", StringType),
            ("pickup_datetime", StringType),
            ("dropoff_datetime", StringType),
            ("passenger_count", IntegerType),
            ("trip_time_in_secs", StringType),
            ("trip_distance", StringType),
            ("pickup_longitude", DoubleType),
            ("pickup_latitude", DoubleType),
            ("dropoff_longitude", DoubleType),
            ("dropoff_latitude", DoubleType),
        )
    ]
)


In [4]:
# data path
# if parquet data is loaded, skip cell
# The archive is extracted into input/prod and the CSV headers are fixed there,
# so the production glob must point at that directory.
prod_path = "input/prod/trip_data_*.csv"
sample_path = "input/sample.csv"

# Read all monthly CSV files with the explicit schema; the first line of each
# file is treated as a header.
taxi_df = (
    spark.read
    .option("header", True)
    .schema(schema)
    .csv(prod_path)
)


In [5]:
# Sanity check: show the first parsed row.
taxi_df.first()

                                                                                

Row(medallion='89D227B655E5C82AECF13C3F540D4CF4', hack_license='BA96DE419E711691B9445D6A6307C170', vendor_id='CMT', rate_code='1', store_and_fwd_flag='N', pickup_datetime='2013-01-01 15:11:48', dropoff_datetime='2013-01-01 15:18:10', passenger_count=4, trip_time_in_secs='382', trip_distance='1.00', pickup_longitude=-73.978165, pickup_latitude=40.757977, dropoff_longitude=-73.989838, dropoff_latitude=40.751171)

In [6]:
# All column names of the raw trip data, in schema order.
columns = [
    "medallion",
    "hack_license",
    "vendor_id",
    "rate_code",
    "store_and_fwd_flag",
    "pickup_datetime",
    "dropoff_datetime",
    "passenger_count",
    "trip_time_in_secs",
    "trip_distance",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
]

# Columns removed before the data is persisted.
columns_to_drop = [
    "hack_license",
    "vendor_id",
    "rate_code",
    "store_and_fwd_flag",
    "passenger_count",
    "trip_time_in_secs",
    "trip_distance",
]

# What remains: medallion, pickup/dropoff timestamps and the four coordinates.
df_dropped = taxi_df.drop(*columns_to_drop)

In [7]:
# Persist the trimmed dataset as parquet.
# mode("ignore") makes the write a no-op when the target path already exists,
# so re-running the notebook no longer raises "path already exists" and the
# cell does not have to be skipped by hand. Delete the directory to force a
# rewrite (e.g. after a failed or partial write).
df_dropped.write.mode("ignore").parquet("input/prod/taxi_data.parquet")

                                                                                