In [0]:
%sql
-- Demo database for the Delta Lake walkthrough; skipped when it already exists.
-- LOCATION pins the database's storage path on the data lake mount.
CREATE DATABASE IF NOT EXISTS f1_demo
  LOCATION '/mnt/formula1datalakekoto/demo'

In [0]:
# Read the raw results JSON for the 2021-03-28 folder from the data lake mount.
results_df = (
    spark.read
    .option('inferSchema', True)
    .json('/mnt/formula1datalakekoto/raw/2021-03-28/results.json')
)

In [0]:
# Save the results as a Delta table named f1_demo.results,
# overwriting whatever the table held before.
(
    results_df.write
    .format('delta')
    .mode('overwrite')
    .saveAsTable('f1_demo.results')
)

In [0]:
%sql
-- Register an external table over the files at the given data lake path.
-- IF NOT EXISTS keeps this cell re-runnable (matching the other CREATE cells),
-- and USING DELTA states the format explicitly instead of relying on the
-- workspace default.
-- NOTE(review): no cell in this notebook writes to this path; confirm Delta data
-- already exists there, since no column list is given here.
CREATE TABLE IF NOT EXISTS f1_demo.results_external
USING DELTA
LOCATION '/mnt/formula1datalakekoto/demo/results_external'

In [0]:
%sql
-- Inspect the current contents of the results table.
SELECT *
FROM f1_demo.results

In [0]:
%sql
-- SQL-flavoured update: rows with position 1..10 get points = 11 - position.
UPDATE f1_demo.results
   SET points = 11 - position
 WHERE position <= 10

In [0]:
from delta.tables import DeltaTable

# Python-API update on the Delta table at this path:
# for rows where position <= 10, points becomes 21 minus the current points value.
deltaTable = DeltaTable.forPath(spark, '/mnt/formula1datalakekoto/demo/results')
deltaTable.update(
    condition='position <= 10',
    set={'points': '21 - points'},
)

In [0]:
%sql
-- SQL-flavoured delete: drop every row with position greater than 10.
DELETE
  FROM f1_demo.results
 WHERE position > 10

In [0]:
from delta.tables import DeltaTable

# Python-API delete on the Delta table at this path: remove rows whose points equal 0.
deltaTable = DeltaTable.forPath(spark, '/mnt/formula1datalakekoto/demo/results')
deltaTable.delete(condition='points = 0')

Merge statements in Delta Lake

In [0]:
# "Day 1" batch: drivers with driverId <= 10, with the nested name struct
# flattened into forename / surname columns.
drivers_day_one_df = (
    spark.read
    .format('json')
    .option('inferSchema', True)
    .load('/mnt/formula1datalakekoto/raw/2021-03-28/drivers.json')
    .select('driverId', 'dob', 'name.forename', 'name.surname')
    .filter('driverId <= 10')
)

In [0]:
# Render the day-one batch to check it before it is merged.
display(drivers_day_one_df)

In [0]:
# Expose the day-one batch to the %sql cells under the view name drivers_day_one.
drivers_day_one_df.createOrReplaceTempView('drivers_day_one')

In [0]:
from pyspark.sql.functions import upper

# "Day 2" batch: drivers 6 through 15, with forename / surname upper-cased.
drivers_day_two_df = (
    spark.read
    .format('json')
    .option('inferSchema', True)
    .load('/mnt/formula1datalakekoto/raw/2021-03-28/drivers.json')
    .select(
        'driverId',
        'dob',
        upper('name.forename').alias('forename'),
        upper('name.surname').alias('surname'),
    )
    .filter('driverId BETWEEN 6 AND 15')
)

In [0]:
# Expose the day-two batch to the %sql cells under the view name drivers_day_two.
drivers_day_two_df.createOrReplaceTempView('drivers_day_two')

In [0]:
# Render the day-two batch to check it before it is merged.
display(drivers_day_two_df)

In [0]:
from pyspark.sql.functions import upper

# "Day 3" batch: drivers 1-5 and 16-20, with forename / surname upper-cased.
drivers_day_three_df = (
    spark.read
    .format('json')
    .option('inferSchema', True)
    .load('/mnt/formula1datalakekoto/raw/2021-03-28/drivers.json')
    .select(
        'driverId',
        'dob',
        upper('name.forename').alias('forename'),
        upper('name.surname').alias('surname'),
    )
    .filter('driverId BETWEEN 1 AND 5 OR driverId BETWEEN 16 AND 20')
)

In [0]:
%sql
-- Target table for the MERGE demos below.
-- USING DELTA is stated explicitly because the later MERGE, time-travel and
-- DeltaTable.forPath cells all require a Delta table; this avoids depending on
-- the workspace's default table format.
CREATE TABLE IF NOT EXISTS f1_demo.drivers_merge (
  driverId INT, 
  dob DATE, 
  forename STRING, 
  surname STRING,
  createdate DATE,   -- set when a row is first inserted by a merge
  updatedate DATE    -- set when an existing row is updated by a merge
)
USING DELTA

In [0]:
%sql

-- Day 1 upsert: rows from the drivers_day_one view are matched on driverId.
-- Matches are refreshed and stamped with updatedate; new ids are inserted with createdate.

MERGE INTO f1_demo.drivers_merge tgt
USING drivers_day_one upd
   ON tgt.driverId = upd.driverId
WHEN MATCHED THEN
  UPDATE SET
    tgt.dob        = upd.dob,
    tgt.forename   = upd.forename,
    tgt.surname    = upd.surname,
    tgt.updatedate = current_timestamp
WHEN NOT MATCHED THEN
  INSERT (driverId, dob, forename, surname, createdate)
  VALUES (driverId, dob, forename, surname, current_timestamp)

In [0]:
%sql
-- Check the merge target after the day-one upsert.
SELECT *
FROM f1_demo.drivers_merge

In [0]:
%sql

-- Day 2 upsert: same merge shape as day 1, sourced from the drivers_day_two view.

MERGE INTO f1_demo.drivers_merge tgt
USING drivers_day_two upd
   ON tgt.driverId = upd.driverId
WHEN MATCHED THEN
  UPDATE SET
    tgt.dob        = upd.dob,
    tgt.forename   = upd.forename,
    tgt.surname    = upd.surname,
    tgt.updatedate = current_timestamp
WHEN NOT MATCHED THEN
  INSERT (driverId, dob, forename, surname, createdate)
  VALUES (driverId, dob, forename, surname, current_timestamp)

In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import current_timestamp

# Day 3 upsert through the Delta Python API, matching on driverId.
# Column assignments are SQL expression strings evaluated by the merge.
matched_updates = {
    'tgt.dob': 'upd.dob',
    'tgt.forename': 'upd.forename',
    'tgt.surname': 'upd.surname',
    'tgt.updatedate': 'current_timestamp()',
}
new_row_values = {
    'driverId': 'upd.driverId',
    'dob': 'upd.dob',
    'forename': 'upd.forename',
    'surname': 'upd.surname',
    'createdate': 'current_timestamp()',
}

deltaTable = DeltaTable.forPath(spark, '/mnt/formula1datalakekoto/demo/drivers_merge')
(
    deltaTable.alias('tgt')
    .merge(drivers_day_three_df.alias('upd'), 'tgt.driverId = upd.driverId')
    .whenMatchedUpdate(set=matched_updates)
    .whenNotMatchedInsert(values=new_row_values)
    .execute()
)

In [0]:
%sql
-- List the table's commit history (versions, timestamps, operations).
DESCRIBE HISTORY f1_demo.drivers_merge

In [0]:
%sql

-- Time travel in SQL: read the table as it was at the given timestamp.
-- The timestamp is hard-coded from an earlier run's history output.
SELECT *
FROM f1_demo.drivers_merge TIMESTAMP AS OF '2025-06-20T06:26:29.000+00:00'

In [0]:
# Time travel through the DataFrame reader: load the Delta table at this path
# as of the given timestamp.
df = (
    spark.read
    .format('delta')
    .option('timestampAsOf', '2025-06-20T06:54:09.000+00:00')
    .load('/mnt/formula1datalakekoto/demo/drivers_merge')
)

In [0]:
# Render the time-travelled snapshot loaded in the previous cell.
display(df)

In [0]:
%sql
-- Turn off the retention-duration check for this session, then vacuum the table
-- with a zero-hour retention window.
-- NOTE(review): the check stays disabled for the rest of the session, and older
-- versions may stop being readable by time travel after this; confirm that is intended.
SET spark.databricks.delta.retentionDurationCheck.enabled = false;
VACUUM f1_demo.drivers_merge
  RETAIN 0 HOURS

In [0]:
%sql
-- Remove driver 1 so a later cell can demonstrate bringing the row back.
DELETE
  FROM f1_demo.drivers_merge
 WHERE driverId = 1;

In [0]:
%sql
-- Check the table contents after the delete.
SELECT *
FROM f1_demo.drivers_merge

In [0]:
%sql
-- Bring back rows that exist in an earlier version of drivers_merge but are
-- missing from the current one, by merging the table with a time-travel
-- snapshot of itself.
-- Fix: the source previously named f1_demo.drivers_merged, a table this
-- notebook never creates; the snapshot must come from f1_demo.drivers_merge.
-- The version number is hard-coded; check it against DESC HISTORY before running.
MERGE INTO f1_demo.drivers_merge tgt
USING f1_demo.drivers_merge VERSION AS OF 9 upd
ON tgt.driverId = upd.driverId
WHEN NOT MATCHED THEN
  INSERT (driverId, dob, forename, surname, createdate)
  VALUES (driverId, dob, forename, surname, current_timestamp)

In [0]:
%sql
-- Roll the whole table back to version 10 (version number hard-coded from the history output).
RESTORE TABLE f1_demo.drivers_merge
  TO VERSION AS OF 10

In [0]:
%sql
-- Parquet-format table with the same columns as drivers_merge,
-- used as the source for the CONVERT TO DELTA demo.
CREATE TABLE IF NOT EXISTS f1_demo.drivers_convert_to_delta (
    driverId   INT,
    dob        DATE,
    forename   STRING,
    surname    STRING,
    createdate DATE,
    updatedate DATE
)
USING PARQUET

In [0]:
%sql
-- Copy every row of the Delta merge table into the Parquet table.
INSERT INTO f1_demo.drivers_convert_to_delta
  SELECT *
  FROM f1_demo.drivers_merge

In [0]:
%sql
-- Convert the Parquet table created above to Delta format.
CONVERT TO DELTA
  f1_demo.drivers_convert_to_delta