# Imports and Setup

In [0]:
from pyspark.sql.functions import col, upper, lower, initcap, trim, from_json, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from delta.tables import DeltaTable

In [0]:
# Catalog that prefixes every table name below; all names are fully
# qualified in the three-part form <catalog>.<schema>.<table>.
catalog = "fuel"
# bronze tables (stream inputs; only bronze_prices_table is read in the code shown here)
bronze_stations_table = f"{catalog}.bronze.nsw_fuel_stations_bronze"
bronze_prices_table = f"{catalog}.bronze.nsw_fuel_prices_bronze"
# silver tables (the cells shown here create/append to fuel and price, and read station)
silver_state_table = f"{catalog}.silver.state"
silver_brand_table = f"{catalog}.silver.brand"
silver_station_table = f"{catalog}.silver.station"
silver_fuel_table = f"{catalog}.silver.fuel"
silver_price_table = f"{catalog}.silver.price"

In [0]:
# Streaming (incremental) reader over the bronze prices Delta table; nothing is
# processed until a writeStream on this frame is started.
price_stream_df = spark.readStream.format("delta").table(bronze_prices_table)

In [0]:
def clean_name_string_col(col_exp):
    """Normalise a name-like string column.

    Applies, in order: ``trim`` -> ``lower`` -> ``initcap``.

    Args:
        col_exp: Column expression to normalise.

    Returns:
        The transformed column expression.
    """
    trimmed = trim(col_exp)
    lowered = lower(trimmed)
    return initcap(lowered)

def clean_acronym_string_col(col_exp):
    """Normalise an acronym/code-like string column.

    Applies, in order: ``trim`` -> ``upper``.

    Args:
        col_exp: Column expression to normalise.

    Returns:
        The transformed column expression.
    """
    trimmed = trim(col_exp)
    return upper(trimmed)

# Silver Table Batch Functions

## Fuel Table

In [0]:
def ingest_into_silver_fuel_table(batch_df, batch_id: int):
    """Add fuel codes seen in a micro-batch to the silver fuel table.

    Distinct non-null ``fueltype`` values are normalised with ``upper(trim(...))``
    (the same normalisation ``clean_acronym_string_col`` applies), and only codes
    not already present in ``silver_fuel_table`` are inserted. The table is
    created on first use.

    Args:
        batch_df: Micro-batch DataFrame handed over by ``foreachBatch``; must
            contain a ``fueltype`` column.
        batch_id: Micro-batch id supplied by ``foreachBatch`` (currently unused).
    """
    # Temp views are scoped to a SparkSession. Inside foreachBatch the
    # micro-batch may live in a different session than the notebook-global
    # `spark`, so every statement that touches the views registered below must
    # run through the batch's own session.
    session = batch_df.sparkSession

    batch_df.createOrReplaceTempView("temp_batch_prices")

    # clean up and make fuel unique
    cleaned_unique_fuel_df = session.sql("""
        select distinct upper(trim(fueltype)) as fuel_code
          from temp_batch_prices
          where fueltype is not null
    """)
    cleaned_unique_fuel_df.createOrReplaceTempView("temp_cleaned_fuels")

    # create silver fuel table if it doesn't exist
    session.sql(f"""
        create table if not exists {silver_fuel_table} (
          fuel_id bigint generated by default as identity,
          fuel_code string not null
        ) using delta
    """)

    # find new fuels not in silver fuel table (anti-join via left join + null check)
    fuels_to_add_df = session.sql(f"""
        select tcf.fuel_code
          from temp_cleaned_fuels tcf left join {silver_fuel_table} sfs on tcf.fuel_code = sfs.fuel_code
          where sfs.fuel_code is null
    """)
    fuels_to_add_df.createOrReplaceTempView("temp_fuels_to_add")

    # insert new fuels into silver fuel table; fuel_id is filled by the identity column
    session.sql(f"""
        insert into {silver_fuel_table} (fuel_code)
          select fuel_code
            from temp_fuels_to_add
    """)

## Price Table

In [0]:
def ingest_into_silver_price_table(batch_df, batch_id: int):
    """Append a micro-batch of bronze prices to the silver price table.

    Steps: resolve ``stationcode`` -> ``station_id`` and normalised ``fueltype``
    -> ``fuel_id`` via the silver lookup tables, parse ``lastupdated`` into the
    ``changed_at`` timestamp, keep only rows with ``0 < price < 995``, then
    insert into ``silver_price_table`` (created on first use).

    Args:
        batch_df: Micro-batch DataFrame handed over by ``foreachBatch``; must
            contain ``stationcode``, ``fueltype``, ``price`` and ``lastupdated``.
        batch_id: Micro-batch id supplied by ``foreachBatch`` (currently unused).

    NOTE(review): both lookups are left joins, so a row whose station or fuel is
    not yet in silver keeps a null id, while the target columns are declared
    NOT NULL -- the insert is then expected to fail rather than drop the row.
    Confirm that failing the batch is the intended behaviour.
    NOTE(review): this is a plain append; if a micro-batch is ever re-executed
    its rows would be inserted again -- confirm whether deduplication is needed.
    """
    # Temp views are scoped to a SparkSession. Inside foreachBatch the
    # micro-batch may live in a different session than the notebook-global
    # `spark`, so reads and SQL go through the batch's own session.
    session = batch_df.sparkSession

    # replace stationcode with station_id
    # (only the needed lookup columns are kept so that unrelated station columns
    # cannot collide with bronze column names)
    station_df = session.read.table(silver_station_table).select("station_id", "station_code")
    price_df = batch_df.join(
        station_df,
        on=batch_df.stationcode == station_df.station_code,
        how="left",
    )

    # prepare fueltype
    price_df = price_df.withColumn(
        "fuel_code",
        clean_acronym_string_col(col("fueltype"))
    )

    # replace fueltype with fuel_id
    fuel_df = session.read.table(silver_fuel_table).select("fuel_id", "fuel_code")
    price_df = price_df.join(fuel_df, on=price_df.fuel_code == fuel_df.fuel_code, how="left")

    # convert lastupdated (dd/MM/yyyy HH:mm:ss) to datetime
    price_df = price_df.withColumn(
        "changed_at",
        to_timestamp(col("lastupdated"), "dd/MM/yyyy HH:mm:ss")
    )

    # drop unnecessary columns
    price_df = price_df.select("station_id", "fuel_id", "price", "changed_at")

    # keep only prices strictly between 0 and 995
    price_df = price_df.filter((col("price") > 0) & (col("price") < 995))

    # create price table if not exists
    session.sql(f"""
        create table if not exists {silver_price_table} (
            price_id bigint generated by default as identity,
            station_id bigint not null,
            fuel_id bigint not null,
            price double,
            changed_at timestamp
        ) using delta
    """)

    # insert new price data; price_id is filled by the identity column
    price_df.createOrReplaceTempView("temp_new_prices")
    session.sql(f"""
        insert into {silver_price_table} (station_id, fuel_id, price, changed_at)
            select station_id, fuel_id, price, changed_at
            from temp_new_prices
    """)

# Write Stream

In [0]:
def bronze_to_silver(batch_df, batch_id: int):
    """foreachBatch handler that applies every price-driven silver ingest step.

    Args:
        batch_df: Micro-batch DataFrame of bronze price rows.
        batch_id: Micro-batch id supplied by ``foreachBatch``.
    """
    # Order matters: the price step looks fuel_id up in the silver fuel table,
    # so the fuel step has to run first.
    for ingest_step in (ingest_into_silver_fuel_table, ingest_into_silver_price_table):
        ingest_step(batch_df, batch_id)

# Start the bronze -> silver stream: every micro-batch is handed to bronze_to_silver.
query = (
    price_stream_df.writeStream
        .foreachBatch(bronze_to_silver)
        # batch-style run: process all changes available now
        .trigger(availableNow=True)
        # progress is tracked in this checkpoint directory
        .option("checkpointLocation", "/Workspace/nsw-fuel-project/silver_price_checkpoint")
        .start()
)

# Block this cell until the streaming query terminates.
query.awaitTermination()