In [0]:
# --- Pipeline parameters ---------------------------------------------------
# Source table that the load cell reads from.
BRONZE_TABLE = "bronze.nyc_taxi_trip_records"
# Destination table that the load cell writes to.
SILVER_TABLE = "silver.nyc_taxi_trip_records"
# Pickup year kept by the load cell's filter.
YEAR_FILTER = 2023
# Lower and upper pickup-month bounds handed to the month filter.
START_MONTH, END_MONTH = 1, 5

In [0]:
import logging
from pyspark.sql.functions import col, year, month

# Build the silver trip table from bronze: keep trips picked up in YEAR_FILTER
# between START_MONTH and END_MONTH with a positive fare and passenger count,
# project the silver columns, de-duplicate, and publish as a Delta table
# partitioned by year/month.
spark.sql('CREATE DATABASE IF NOT EXISTS silver')

pickup_ts = col('tpep_pickup_datetime')

# All four predicates are ANDed together; a row must satisfy every one.
is_in_scope = (
    (year(pickup_ts) == YEAR_FILTER)
    & month(pickup_ts).between(START_MONTH, END_MONTH)
    & (col('total_amount') > 0)
    & (col('passenger_count') > 0)
)

# 'year' and 'month' are selected as-is from the bronze table (they are not
# derived in this cell) and are reused below as the partition columns.
# NOTE(review): the filter derives year/month from tpep_pickup_datetime while
# partitioning uses bronze's own year/month columns -- confirm they agree.
selected_cols = [
    'VendorID',
    'passenger_count',
    'total_amount',
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime',
    'taxi_color',
    'year',
    'month'
]

# Single expression from the bronze read to the final frame, so re-running the
# cell always starts from bronze rather than from a previously filtered `df`.
df = (
    spark.read.table(BRONZE_TABLE)
    .filter(is_in_scope)
    .select(*selected_cols)
    .dropDuplicates()
)

# Replace the table contents and schema in one Delta overwrite instead of
# DROP TABLE followed by an append: with the drop-first approach a failure
# anywhere between the drop and the end of the write left no silver table
# at all. 'overwriteSchema' lets the overwrite also replace the schema and
# partitioning, which is what dropping the table used to achieve.
(
    df.write
        .format('delta')
        .mode('overwrite')
        .option('overwriteSchema', 'true')
        .partitionBy('year', 'month')
        .saveAsTable(SILVER_TABLE)
)