In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from delta import *
import logging

In [2]:
# Parameters
# Selects the slice of the bronze table to merge: the read below keeps only rows
# whose `city` and `date` columns equal these values.
# NOTE(review): the recorded log output shows a different city than the value set
# here, so these look like defaults that are overridden at run time (e.g. by a
# parameter-injection tool) — confirm.
city = 'bern'
date = '2022-07-09'

In [4]:
# Paths
# Delta table locations used by this notebook:
#   - the bronze table is only read from (source of the daily slice);
#   - the silver table is created on the first run and upserted into afterwards.
BRONZE_TABLE_PATH = '/opt/data_lake/bronze/house_prices_raw'
SILVER_TABLE_PATH = '/opt/data_lake/silver/house_prices_merged'

In [5]:
# Send INFO-and-above records to the console with a timestamped format,
# then grab a logger named after this module for the rest of the notebook.
console_handler = logging.StreamHandler()
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
    handlers=[console_handler],
)
logger = logging.getLogger(__name__)

In [6]:
# Announce which city/date slice is about to be merged into the silver table.
start_message = f"Merging data from {city} from {date} to silver dataset ..."
logger.info(start_message)

2022-07-09 23:22:05,193 [INFO] Merging data from locarno from 2022-07-09 to silver dataset ...


In [7]:
# Build a Spark session with the Delta Lake SQL extension and catalog enabled.
builder = SparkSession.builder.appName("real-estate-etl")
builder = builder.config(
    "spark.sql.extensions",
    "io.delta.sql.DeltaSparkSessionExtension",
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

# configure_spark_with_delta_pip wraps the builder before the session is created.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Session-level setting: turn on Delta's automatic schema merging.
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")



:: loading settings :: url = jar:file:/usr/local/spark-3.1.2-bin-hadoop3.2/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-cb437f8d-6a00-4b10-8085-4156653c38a6;1.0
	confs: [default]
	found io.delta#delta-core_2.12;1.0.0 in central
	found org.antlr#antlr4;4.7 in central
	found org.antlr#antlr4-runtime;4.7 in central
	found org.antlr#antlr-runtime;3.5.2 in central
	found org.antlr#ST4;4.0.8 in central
	found org.abego.treelayout#org.abego.treelayout.core;1.0.3 in central
	found org.glassfish#javax.json;1.0.4 in central
	found com.ibm.icu#icu4j;58.2 in central
:: resolution report :: resolve 727ms :: artifacts dl 19ms
	:: modules in use:
	com.ibm.icu#icu4j;58.2 from central in [default]
	io.delta#delta-core_2.12;1.0.0 from central in [default]
	org.abego.treelayout#org.abego.treelayout.core;1.0.3 from central in [default]
	org.antlr#ST4;4.0.8 from central in [default]
	org.antlr#antlr-run

In [8]:
# Read the bronze Delta table and keep only the requested day and city.
is_requested_day = F.col('date') == date
is_requested_city = F.col('city') == city

bronze_reader = spark.read.format('delta')
daily_data = bronze_reader.load(BRONZE_TABLE_PATH).where(is_requested_day & is_requested_city)

                                                                                

In [9]:
# Open the silver table, creating it from the daily slice on the very first run.
silver_exists = DeltaTable.isDeltaTable(spark, SILVER_TABLE_PATH)
if not silver_exists:
    # No Delta table at the silver path yet: write the daily slice there,
    # partitioned by city.
    bootstrap_writer = (
        daily_data.write
        .format('delta')
        .partitionBy(['city'])
        .mode('overwrite')
        .option('mergeSchema', 'true')
    )
    bootstrap_writer.save(SILVER_TABLE_PATH)

silver_table = DeltaTable.forPath(spark, SILVER_TABLE_PATH)

                                                                                

In [10]:
# Upsert the daily slice into the silver table:
#   - rows whose (property_id, city) already exist in silver are UPDATED with all
#     columns from the daily slice;
#   - rows with no match are INSERTED.
#
# The match condition additionally pins the target side to the city being
# processed. `daily_data` only contains rows for `city`, so this does not change
# which rows match, but it gives Delta a constant predicate on the target table:
# the merge then only has to search that city's data instead of the whole table,
# and is less likely to conflict with concurrent writes for other cities.
# The constant is built with F.lit (not string formatting) so that any quote
# characters in `city` cannot break the condition.
#
# NOTE(review): a Delta merge fails when several source rows match the same target
# row; this assumes the daily slice has at most one row per (property_id, city)
# — confirm against the bronze ingestion.
merge_condition = (
    F.expr("silver_table.property_id=daily_data.property_id and silver_table.city=daily_data.city")
    & (F.col("silver_table.city") == F.lit(city))
)

(
    silver_table
    .alias("silver_table")
    .merge(daily_data.alias("daily_data"), merge_condition)
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)


22/07/09 23:23:55 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [10]:
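# # Vacuum table (currently disabled — uncomment the two lines below to run it)
# # Note: vacuum() called without arguments uses the table's default retention period.
# delta_table = DeltaTable.forPath(spark, SILVER_TABLE_PATH)
# delta_table.vacuum()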
