In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, current_timestamp, date_sub, date_format, lit, to_date

In [2]:
# Create (or reuse) a Hive-enabled Spark session named "spark-nb" that connects
# to the master at spark://spark-master:7077.
spark = (
    SparkSession.builder
    .appName("spark-nb")
    .master("spark://spark-master:7077")
    .enableHiveSupport()
    .getOrCreate()
)


In [12]:
# Destination Hive table: receives the daily summary produced by this notebook.
output_table = "local_db.daily_transaction"

# Source Hive table: the trip records that get filtered and counted.
data_table = "local_db.sample_hive_table"

In [13]:
# Resolve the execution date (today) and the transaction date (yesterday, the
# day being reported on) in a single Spark query. Doing both in one statement
# avoids a second round-trip to the cluster and avoids interpolating a Python
# value back into SQL text. Spark evaluates current_date() once per query, so
# the two values are always consistent with each other.
# The returned Row is a tuple, so it unpacks positionally in SELECT order.
execution_date, transaction_date = spark.sql(
    "SELECT current_date() AS execution_date, "
    "date_sub(current_date(), 1) AS transaction_date"
).collect()[0]


In [14]:
# Load the source Hive table as a (lazy) DataFrame
data = spark.read.table(data_table)

In [15]:
# Keep only the trips whose pickup date equals transaction_date, then collapse
# them into one summary row: the date, the number of matching rows, and the
# time the calculation was made (formatted as a string).
daily_transaction = (
    data
    .filter(to_date(col("lpep_pickup_datetime")) == lit(transaction_date))
    .agg(
        lit(transaction_date).alias("transaction_date"),
        count("*").alias("total_transactions"),
        date_format(current_timestamp(), "yyyy-MM-dd HH:mm:ss").alias("calculated_at"),
    )
)

In [16]:
# Persist the summary row by appending it to the Hive output table.
# NOTE(review): "append" mode adds a new row on every execution, so running the
# notebook more than once for the same day leaves several rows with the same
# transaction_date — confirm this is intended.
(
    daily_transaction.write
    .format("hive")
    .mode("append")
    .saveAsTable(output_table)
)

In [17]:
# Preview a few rows of the input table. The name comes from data_table so the
# preview always shows the same table the job actually reads.
spark.sql(f"SELECT * FROM {data_table} LIMIT 5").toPandas()

  if not is_datetime64tz_dtype(pser.dtype):
  if is_datetime64tz_dtype(s.dtype):
  if not is_datetime64tz_dtype(pser.dtype):
  if is_datetime64tz_dtype(s.dtype):


Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2024-02-15 18:06:41,2024-02-15 18:08:56,N,1,75,75,1,0.71,5.1,2.5,0.5,1.82,0.0,,1.0,10.92,1,1,0.0
1,2,2024-02-15 17:53:07,2024-02-15 18:08:49,N,1,82,138,1,3.73,19.1,7.5,0.5,5.62,0.0,,1.0,33.72,1,1,0.0
2,2,2024-02-15 18:04:33,2024-02-15 18:08:17,N,1,247,247,1,0.56,-5.8,-2.5,-0.5,0.0,0.0,,-1.0,-9.8,4,1,0.0
3,2,2024-02-15 18:04:33,2024-02-15 18:08:17,N,1,247,247,1,0.56,5.8,2.5,0.5,0.0,0.0,,1.0,9.8,4,1,0.0
4,2,2024-02-15 17:54:37,2024-02-15 18:08:08,N,1,75,42,1,2.6,14.9,2.5,0.5,0.0,0.0,,1.0,18.9,2,1,0.0


In [18]:
# Preview a few rows of the output table. The name comes from output_table so
# the preview always shows the same table the job actually writes to.
spark.sql(f"SELECT * FROM {output_table} LIMIT 5").toPandas()

Unnamed: 0,transaction_date,total_transactions,calculated_at
0,2024-12-24,0,2024-12-25 04:54:49
1,2024-12-24,0,2024-12-25 04:49:20


In [19]:
# Optional cleanup, intentionally disabled: uncomment the next line to drop the output table.
# spark.sql("DROP TABLE IF EXISTS local_db.daily_transaction")

In [20]:
# List the tables registered in the local_db database
(
    spark
    .sql("SHOW TABLES IN local_db")
    .show()
)

+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
| local_db|   daily_transaction|      false|
| local_db|   sample_hive_table|      false|
| local_db|test_from_spark_s...|      false|
+---------+--------------------+-----------+



In [21]:
# Shut down the Spark session now that all reads and writes are done.
# Any cell that uses `spark` after this point needs the session to be created again.
spark.stop()