In [2]:
import pyspark
from delta import configure_spark_with_delta_pip

# Demo-only warehouse directory: it lives under /tmp, so its contents are
# wiped every time the container is shut down.
warehouse_location = "/tmp/spark-warehouse"

# Session builder with the Delta Lake SQL extension and catalog registered,
# and the managed-table warehouse pointed at the demo directory above.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("Delta")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.warehouse.dir", warehouse_location)
)

# Let the pip-installed delta package finish configuring the builder, then
# create the session (or reuse one that is already running).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [6]:
# Load the health tracker JSON file into a Spark DataFrame
df = spark.read.load("/health_tracker_data_2020_01.json", format="json")

In [None]:
# Print the DataFrame's schema as a tree to check the column names and types
# that Spark inferred from the JSON file.
df.printSchema()

In [7]:
# Persist the raw DataFrame as the bronze table, stored in Delta format
df.write.format("delta").saveAsTable("health_tracker_data_2020_01")

In [None]:
# List the tables registered in the current database; the bronze table saved
# above should appear here.
spark.catalog.listTables()

In [9]:
from pyspark.sql.functions import column, date_format, from_unixtime

# Build the cleaner silver table from the raw DataFrame:
#   - "timestamp" is converted by from_unixtime into a "yyyy-MM-dd HH:mm:ss"
#     string (assumes the raw column holds Unix epoch seconds -- confirm
#     against the source data).
#   - "date" is the calendar date of that timestamp.
# The date pattern must use lowercase "dd" (day of month). Uppercase "DD" means
# day of year, which only coincides with the day of month during January.
df_silver = (
    df
    .withColumn("timestamp", from_unixtime(column("timestamp")))
    .withColumn("date", date_format(column("timestamp"), format="yyyy-MM-dd"))
)

# Write the transformed DataFrame (df_silver), not the raw df; otherwise the
# silver table would just be a second copy of the bronze data.
# NOTE: saveAsTable uses the default error-if-exists mode, so a previously
# created "health_tracker_silver" table must be dropped before re-running.
df_silver.write.saveAsTable("health_tracker_silver", format="delta")

In [10]:
# List the catalog tables again to confirm that the silver table was
# registered alongside the bronze one.
spark.catalog.listTables()

[Table(name='health_tracker_data_2020_01', database='default', description=None, tableType='MANAGED', isTemporary=False),
 Table(name='health_tracker_silver', database='default', description=None, tableType='MANAGED', isTemporary=False)]

In [11]:
# Inspect the warehouse directory on disk: each managed table is expected to
# have its own subdirectory there.
%ls "/tmp/spark-warehouse"

[0m[01;34mhealth_tracker_data_2020_01[0m/  [01;34mhealth_tracker_silver[0m/


In [12]:
# Stop the Spark session now that the demo is finished.
spark.stop()