# Delta Demo
1. Read Data
2. Write Managed Table
3. Write External Table
4. Write Partitioned Managed Table
5. Update Delta Table
6. Delete from Delta Table
7. Merge/Upsert to Delta Lake
8. History and Versioning
9. Vacuum (Delete History)
10. Convert Parquet Table to Delta
11. Convert Parquet File to Delta

In [0]:
# Import Modules
from delta.tables import DeltaTable
from pyspark.sql.functions import upper, current_timestamp

In [0]:
# Notebook parameter: the file date to process (widget default is 2021-03-28)
dbutils.widgets.text('FileDate', '2021-03-28')
file_date = dbutils.widgets.get('FileDate')

In [0]:
%run "../01-Setup/09-Global-Variables"

### Read Data

In [0]:
# Load the results JSON found under the folder for the selected file date
results_df = (
    spark.read
    .option('inferSchema', True)
    .json(f'{raw_inc_folder_path}/{file_date}/results.json')
)

# Preview the loaded rows
display(results_df)

constructorId,driverId,fastestLap,fastestLapSpeed,fastestLapTime,grid,laps,milliseconds,number,points,position,positionOrder,positionText,raceId,rank,resultId,statusId,time
131,1,44,207.235,1:34.015,2,56,5523897,44,25,1,1,1,1052,4,24966,1,1:32:03.897
9,830,41,208.984,1:33.228,1,56,5524642,33,18,2,2,2,1052,2,24967,1,+0.745
131,822,56,211.566,1:32.090,3,56,5561280,77,16,3,3,3,1052,1,24968,1,+37.383
1,846,38,206.398,1:34.396,7,56,5570363,4,12,4,4,4,1052,6,24969,1,+46.466
9,815,44,207.334,1:33.970,0,56,5575944,11,10,5,5,5,1052,3,24970,1,+52.047
6,844,39,205.112,1:34.988,4,56,5582987,16,8,6,6,6,1052,11,24971,1,+59.090
1,817,36,205.233,1:34.932,6,56,5589901,3,6,7,7,7,1052,10,24972,1,+66.004
6,832,48,206.151,1:34.509,8,56,5590997,55,4,8,8,8,1052,7,24973,1,+67.100
213,852,38,205.603,1:34.761,13,56,5609589,22,2,9,9,9,1052,8,24974,1,+85.692
117,840,31,205.378,1:34.865,10,56,5610610,18,1,10,10,10,1052,9,24975,1,+86.713


### Write Managed Table

In [0]:
# Persist the results in Delta format as a managed table,
# replacing whatever the table currently holds
(
    results_df.write
    .format('delta')
    .mode('overwrite')
    .saveAsTable('formula1_delta_demo.results_delta')
)

# .save(...)        -> writes Delta files to a path
# .saveAsTable(...) -> writes Delta files and creates a managed table

In [0]:
%sql
-- Query every row of the managed table
SELECT *
FROM formula1_delta_demo.results_delta;

constructorId,driverId,fastestLap,fastestLapSpeed,fastestLapTime,grid,laps,milliseconds,number,points,position,positionOrder,positionText,raceId,rank,resultId,statusId,time
131,1,44,207.235,1:34.015,2,56,5523897,44,25,1,1,1,1052,4,24966,1,1:32:03.897
9,830,41,208.984,1:33.228,1,56,5524642,33,18,2,2,2,1052,2,24967,1,+0.745
131,822,56,211.566,1:32.090,3,56,5561280,77,16,3,3,3,1052,1,24968,1,+37.383
1,846,38,206.398,1:34.396,7,56,5570363,4,12,4,4,4,1052,6,24969,1,+46.466
9,815,44,207.334,1:33.970,0,56,5575944,11,10,5,5,5,1052,3,24970,1,+52.047
6,844,39,205.112,1:34.988,4,56,5582987,16,8,6,6,6,1052,11,24971,1,+59.090
1,817,36,205.233,1:34.932,6,56,5589901,3,6,7,7,7,1052,10,24972,1,+66.004
6,832,48,206.151,1:34.509,8,56,5590997,55,4,8,8,8,1052,7,24973,1,+67.100
213,852,38,205.603,1:34.761,13,56,5609589,22,2,9,9,9,1052,8,24974,1,+85.692
117,840,31,205.378,1:34.865,10,56,5610610,18,1,10,10,10,1052,9,24975,1,+86.713


### Write External Table

In [0]:
# Write the results as Delta files to an explicit storage path
# (overwrites anything already stored there)
(
    results_df.write
    .format('delta')
    .mode('overwrite')
    .save(f'{delta_demo_folder_path}/results_ext')
)

# Load the Delta files back from that same path
results_ext_df = (
    spark.read
    .format('delta')
    .load(f'{delta_demo_folder_path}/results_ext')
)

# Preview the rows that were read back
display(results_ext_df)

constructorId,driverId,fastestLap,fastestLapSpeed,fastestLapTime,grid,laps,milliseconds,number,points,position,positionOrder,positionText,raceId,rank,resultId,statusId,time
131,1,44,207.235,1:34.015,2,56,5523897,44,25,1,1,1,1052,4,24966,1,1:32:03.897
9,830,41,208.984,1:33.228,1,56,5524642,33,18,2,2,2,1052,2,24967,1,+0.745
131,822,56,211.566,1:32.090,3,56,5561280,77,16,3,3,3,1052,1,24968,1,+37.383
1,846,38,206.398,1:34.396,7,56,5570363,4,12,4,4,4,1052,6,24969,1,+46.466
9,815,44,207.334,1:33.970,0,56,5575944,11,10,5,5,5,1052,3,24970,1,+52.047
6,844,39,205.112,1:34.988,4,56,5582987,16,8,6,6,6,1052,11,24971,1,+59.090
1,817,36,205.233,1:34.932,6,56,5589901,3,6,7,7,7,1052,10,24972,1,+66.004
6,832,48,206.151,1:34.509,8,56,5590997,55,4,8,8,8,1052,7,24973,1,+67.100
213,852,38,205.603,1:34.761,13,56,5609589,22,2,9,9,9,1052,8,24974,1,+85.692
117,840,31,205.378,1:34.865,10,56,5610610,18,1,10,10,10,1052,9,24975,1,+86.713


### Write Partitioned Managed Table

In [0]:
# Persist the results as a managed Delta table partitioned by constructorId,
# replacing whatever the table currently holds
(
    results_df.write
    .format('delta')
    .mode('overwrite')
    .partitionBy('constructorId')
    .saveAsTable('formula1_delta_demo.results_part_delta')
)

In [0]:
%sql
-- List the partitions of the partitioned managed table
SHOW PARTITIONS formula1_delta_demo.results_part_delta;

constructorId
1
3
6
9
51
131
117
214
210
213


### Update Delta Table

In [0]:
%sql
-- SQL update of the managed table: rows with position <= 10
-- get points = 11 - position
UPDATE formula1_delta_demo.results_delta
SET points = 11 - position
WHERE position <= 10;

-- Show the table after the update
SELECT *
FROM formula1_delta_demo.results_delta;

constructorId,driverId,fastestLap,fastestLapSpeed,fastestLapTime,grid,laps,milliseconds,number,points,position,positionOrder,positionText,raceId,rank,resultId,statusId,time
131,1,44,207.235,1:34.015,2,56,5523897,44,10,1,1,1,1052,4,24966,1,1:32:03.897
9,830,41,208.984,1:33.228,1,56,5524642,33,9,2,2,2,1052,2,24967,1,+0.745
131,822,56,211.566,1:32.090,3,56,5561280,77,8,3,3,3,1052,1,24968,1,+37.383
1,846,38,206.398,1:34.396,7,56,5570363,4,7,4,4,4,1052,6,24969,1,+46.466
9,815,44,207.334,1:33.970,0,56,5575944,11,6,5,5,5,1052,3,24970,1,+52.047
6,844,39,205.112,1:34.988,4,56,5582987,16,5,6,6,6,1052,11,24971,1,+59.090
1,817,36,205.233,1:34.932,6,56,5589901,3,4,7,7,7,1052,10,24972,1,+66.004
6,832,48,206.151,1:34.509,8,56,5590997,55,3,8,8,8,1052,7,24973,1,+67.100
213,852,38,205.603,1:34.761,13,56,5609589,22,2,9,9,9,1052,8,24974,1,+85.692
117,840,31,205.378,1:34.865,10,56,5610610,18,1,10,10,10,1052,9,24975,1,+86.713


In [0]:
# Update the managed table through the Delta Lake Python API.
# The table is looked up by its metastore name rather than a hand-built
# storage path, so this cell keeps working wherever the database stores the
# managed table's files.
deltaTable = DeltaTable.forName(spark, 'formula1_delta_demo.results_delta')

# Rows with position <= 10 get points set to the SQL expression 21 - position
deltaTable.update(
    'position <= 10',
    {'points': '21 - position'},
)

In [0]:
%sql
-- Show the table after the PySpark update
SELECT *
FROM formula1_delta_demo.results_delta;

constructorId,driverId,fastestLap,fastestLapSpeed,fastestLapTime,grid,laps,milliseconds,number,points,position,positionOrder,positionText,raceId,rank,resultId,statusId,time
131,1,44,207.235,1:34.015,2,56,5523897,44,20,1,1,1,1052,4,24966,1,1:32:03.897
9,830,41,208.984,1:33.228,1,56,5524642,33,19,2,2,2,1052,2,24967,1,+0.745
131,822,56,211.566,1:32.090,3,56,5561280,77,18,3,3,3,1052,1,24968,1,+37.383
1,846,38,206.398,1:34.396,7,56,5570363,4,17,4,4,4,1052,6,24969,1,+46.466
9,815,44,207.334,1:33.970,0,56,5575944,11,16,5,5,5,1052,3,24970,1,+52.047
6,844,39,205.112,1:34.988,4,56,5582987,16,15,6,6,6,1052,11,24971,1,+59.090
1,817,36,205.233,1:34.932,6,56,5589901,3,14,7,7,7,1052,10,24972,1,+66.004
6,832,48,206.151,1:34.509,8,56,5590997,55,13,8,8,8,1052,7,24973,1,+67.100
213,852,38,205.603,1:34.761,13,56,5609589,22,12,9,9,9,1052,8,24974,1,+85.692
117,840,31,205.378,1:34.865,10,56,5610610,18,11,10,10,10,1052,9,24975,1,+86.713


### Delete from Delta Table

In [0]:
%sql
-- SQL delete on the managed table: remove rows with position > 10
DELETE FROM formula1_delta_demo.results_delta
WHERE position > 10;

-- Show the table after the delete
SELECT *
FROM formula1_delta_demo.results_delta;

constructorId,driverId,fastestLap,fastestLapSpeed,fastestLapTime,grid,laps,milliseconds,number,points,position,positionOrder,positionText,raceId,rank,resultId,statusId,time
131,1,44,207.235,1:34.015,2,56,5523897,44,20,1,1,1,1052,4,24966,1,1:32:03.897
9,830,41,208.984,1:33.228,1,56,5524642,33,19,2,2,2,1052,2,24967,1,+0.745
131,822,56,211.566,1:32.090,3,56,5561280,77,18,3,3,3,1052,1,24968,1,+37.383
1,846,38,206.398,1:34.396,7,56,5570363,4,17,4,4,4,1052,6,24969,1,+46.466
9,815,44,207.334,1:33.970,0,56,5575944,11,16,5,5,5,1052,3,24970,1,+52.047
6,844,39,205.112,1:34.988,4,56,5582987,16,15,6,6,6,1052,11,24971,1,+59.090
1,817,36,205.233,1:34.932,6,56,5589901,3,14,7,7,7,1052,10,24972,1,+66.004
6,832,48,206.151,1:34.509,8,56,5590997,55,13,8,8,8,1052,7,24973,1,+67.100
213,852,38,205.603,1:34.761,13,56,5609589,22,12,9,9,9,1052,8,24974,1,+85.692
117,840,31,205.378,1:34.865,10,56,5610610,18,11,10,10,10,1052,9,24975,1,+86.713


In [0]:
# Delete from the managed table through the Delta Lake Python API.
# The table is looked up by its metastore name rather than a hand-built
# storage path, so this cell keeps working wherever the database stores the
# managed table's files.
deltaTable = DeltaTable.forName(spark, 'formula1_delta_demo.results_delta')

# Remove every row whose points value is 0
deltaTable.delete('points = 0')

In [0]:
%sql
-- Show the table after the PySpark delete
SELECT *
FROM formula1_delta_demo.results_delta;

constructorId,driverId,fastestLap,fastestLapSpeed,fastestLapTime,grid,laps,milliseconds,number,points,position,positionOrder,positionText,raceId,rank,resultId,statusId,time
131,1,44,207.235,1:34.015,2,56,5523897,44,20,1,1,1,1052,4,24966,1,1:32:03.897
9,830,41,208.984,1:33.228,1,56,5524642,33,19,2,2,2,1052,2,24967,1,+0.745
131,822,56,211.566,1:32.090,3,56,5561280,77,18,3,3,3,1052,1,24968,1,+37.383
1,846,38,206.398,1:34.396,7,56,5570363,4,17,4,4,4,1052,6,24969,1,+46.466
9,815,44,207.334,1:33.970,0,56,5575944,11,16,5,5,5,1052,3,24970,1,+52.047
6,844,39,205.112,1:34.988,4,56,5582987,16,15,6,6,6,1052,11,24971,1,+59.090
1,817,36,205.233,1:34.932,6,56,5589901,3,14,7,7,7,1052,10,24972,1,+66.004
6,832,48,206.151,1:34.509,8,56,5590997,55,13,8,8,8,1052,7,24973,1,+67.100
213,852,38,205.603,1:34.761,13,56,5609589,22,12,9,9,9,1052,8,24974,1,+85.692
117,840,31,205.378,1:34.865,10,56,5610610,18,11,10,10,10,1052,9,24975,1,+86.713


### Merge/Upsert to Delta Lake

In [0]:
# Day 1 demo data: drivers with driverId <= 10, names left as they are in the file
drivers_day1_df = (
    spark.read
    .option('inferSchema', True)
    .json(f'{raw_inc_folder_path}/{file_date}/drivers.json')
    .filter('driverId <= 10')
    .select('driverId', 'dob', 'name.forename', 'name.surname')
)

# Register a temp view so the SQL merge cell can use this frame as its source
drivers_day1_df.createOrReplaceTempView('drivers_day1')

# Preview the Day 1 rows
display(drivers_day1_df)

driverId,dob,forename,surname
1,1985-01-07,Lewis,Hamilton
2,1977-05-10,Nick,Heidfeld
3,1985-06-27,Nico,Rosberg
4,1981-07-29,Fernando,Alonso
5,1981-10-19,Heikki,Kovalainen
6,1985-01-11,Kazuki,Nakajima
7,1979-02-28,Sébastien,Bourdais
8,1979-10-17,Kimi,Räikkönen
9,1984-12-07,Robert,Kubica
10,1982-03-18,Timo,Glock


In [0]:
# Day 2 demo data: drivers 6 through 15, with forename and surname upper-cased
drivers_day2_df = (
    spark.read
    .option('inferSchema', True)
    .json(f'{raw_inc_folder_path}/{file_date}/drivers.json')
    .filter('driverId between 6 and 15')
    .select(
        'driverId',
        'dob',
        upper('name.forename').alias('forename'),
        upper('name.surname').alias('surname'),
    )
)

# Register a temp view so the SQL merge cell can use this frame as its source
drivers_day2_df.createOrReplaceTempView('drivers_day2')

# Preview the Day 2 rows
display(drivers_day2_df)

driverId,dob,forename,surname
6,1985-01-11,KAZUKI,NAKAJIMA
7,1979-02-28,SÉBASTIEN,BOURDAIS
8,1979-10-17,KIMI,RÄIKKÖNEN
9,1984-12-07,ROBERT,KUBICA
10,1982-03-18,TIMO,GLOCK
11,1977-01-28,TAKUMA,SATO
12,1985-07-25,NELSON,PIQUET JR.
13,1981-04-25,FELIPE,MASSA
14,1971-03-27,DAVID,COULTHARD
15,1974-07-13,JARNO,TRULLI


In [0]:
# Day 3 demo data: drivers 1-5 and 16-20, with forename and surname upper-cased.
# No temp view is registered; the PySpark merge uses this DataFrame directly.
drivers_day3_df = (
    spark.read
    .option('inferSchema', True)
    .json(f'{raw_inc_folder_path}/{file_date}/drivers.json')
    .filter('driverId between 1 and 5 or driverId between 16 and 20')
    .select(
        'driverId',
        'dob',
        upper('name.forename').alias('forename'),
        upper('name.surname').alias('surname'),
    )
)

# Preview the Day 3 rows
display(drivers_day3_df)

driverId,dob,forename,surname
1,1985-01-07,LEWIS,HAMILTON
2,1977-05-10,NICK,HEIDFELD
3,1985-06-27,NICO,ROSBERG
4,1981-07-29,FERNANDO,ALONSO
5,1981-10-19,HEIKKI,KOVALAINEN
16,1983-01-11,ADRIAN,SUTIL
17,1976-08-27,MARK,WEBBER
18,1980-01-19,JENSON,BUTTON
19,1979-04-18,ANTHONY,DAVIDSON
20,1987-07-03,SEBASTIAN,VETTEL


In [0]:
%sql
-- Start from a clean slate, then create the Delta table the merges write into
DROP TABLE IF EXISTS formula1_delta_demo.drivers_merge;

CREATE TABLE IF NOT EXISTS formula1_delta_demo.drivers_merge (
  driverId INT,
  dob DATE,
  forename STRING,
  surname STRING,
  createdDate DATE,
  updatedDate DATE
)
USING DELTA;

In [0]:
%sql
-- Upsert the Day 1 drivers with SQL:
--   matched driverId   -> refresh dob / forename / surname and stamp updatedDate
--   unmatched driverId -> insert the row and stamp createdDate
MERGE INTO formula1_delta_demo.drivers_merge AS tgt
USING drivers_day1 AS upt
  ON tgt.driverId = upt.driverId
WHEN MATCHED THEN UPDATE SET
  tgt.dob = upt.dob,
  tgt.forename = upt.forename,
  tgt.surname = upt.surname,
  tgt.updatedDate = current_timestamp
WHEN NOT MATCHED THEN
  INSERT (driverId, dob, forename, surname, createdDate)
  VALUES (driverId, dob, forename, surname, current_timestamp);

-- Show the table after the merge
SELECT *
FROM formula1_delta_demo.drivers_merge;

driverId,dob,forename,surname,createdDate,updatedDate
1,1985-01-07,Lewis,Hamilton,2023-08-21,
2,1977-05-10,Nick,Heidfeld,2023-08-21,
3,1985-06-27,Nico,Rosberg,2023-08-21,
4,1981-07-29,Fernando,Alonso,2023-08-21,
5,1981-10-19,Heikki,Kovalainen,2023-08-21,
6,1985-01-11,Kazuki,Nakajima,2023-08-21,
7,1979-02-28,Sébastien,Bourdais,2023-08-21,
8,1979-10-17,Kimi,Räikkönen,2023-08-21,
9,1984-12-07,Robert,Kubica,2023-08-21,
10,1982-03-18,Timo,Glock,2023-08-21,


In [0]:
%sql
-- Upsert the Day 2 drivers with SQL:
--   matched driverId   -> refresh dob / forename / surname and stamp updatedDate
--   unmatched driverId -> insert the row and stamp createdDate
MERGE INTO formula1_delta_demo.drivers_merge AS tgt
USING drivers_day2 AS upt
  ON tgt.driverId = upt.driverId
WHEN MATCHED THEN UPDATE SET
  tgt.dob = upt.dob,
  tgt.forename = upt.forename,
  tgt.surname = upt.surname,
  tgt.updatedDate = current_timestamp
WHEN NOT MATCHED THEN
  INSERT (driverId, dob, forename, surname, createdDate)
  VALUES (driverId, dob, forename, surname, current_timestamp);

-- Show the table after the merge
SELECT *
FROM formula1_delta_demo.drivers_merge;

driverId,dob,forename,surname,createdDate,updatedDate
6,1985-01-11,KAZUKI,NAKAJIMA,2023-08-21,2023-08-21
7,1979-02-28,SÉBASTIEN,BOURDAIS,2023-08-21,2023-08-21
8,1979-10-17,KIMI,RÄIKKÖNEN,2023-08-21,2023-08-21
9,1984-12-07,ROBERT,KUBICA,2023-08-21,2023-08-21
10,1982-03-18,TIMO,GLOCK,2023-08-21,2023-08-21
11,1977-01-28,TAKUMA,SATO,2023-08-21,
12,1985-07-25,NELSON,PIQUET JR.,2023-08-21,
13,1981-04-25,FELIPE,MASSA,2023-08-21,
14,1971-03-27,DAVID,COULTHARD,2023-08-21,
15,1974-07-13,JARNO,TRULLI,2023-08-21,


In [0]:
# Upsert the Day 3 drivers through the Delta Lake Python API.
# drivers_merge is a managed table, so it is looked up by its metastore name
# rather than a hand-built storage path; the cell then keeps working wherever
# the database stores the table's files.
deltaTable = DeltaTable.forName(spark, 'formula1_delta_demo.drivers_merge')

(
    deltaTable.alias('tgt')
    .merge(
        drivers_day3_df.alias('upt'),
        'tgt.driverId = upt.driverId',
    )
    # Matched driverId: refresh the descriptive columns and stamp updatedDate
    .whenMatchedUpdate(
        set={
            'dob': 'upt.dob',
            'forename': 'upt.forename',
            'surname': 'upt.surname',
            'updatedDate': 'current_timestamp()',
        }
    )
    # Unmatched driverId: insert the row and stamp createdDate
    .whenNotMatchedInsert(
        values={
            'driverId': 'upt.driverId',
            'dob': 'upt.dob',
            'forename': 'upt.forename',
            'surname': 'upt.surname',
            'createdDate': 'current_timestamp()',
        }
    )
    .execute()
)

In [0]:
%sql
-- Show the table after all three merges, ordered by driver
SELECT *
FROM formula1_delta_demo.drivers_merge
ORDER BY driverId ASC;

driverId,dob,forename,surname,createdDate,updatedDate
1,1985-01-07,LEWIS,HAMILTON,2023-08-21,2023-08-21
2,1977-05-10,NICK,HEIDFELD,2023-08-21,2023-08-21
3,1985-06-27,NICO,ROSBERG,2023-08-21,2023-08-21
4,1981-07-29,FERNANDO,ALONSO,2023-08-21,2023-08-21
5,1981-10-19,HEIKKI,KOVALAINEN,2023-08-21,2023-08-21
6,1985-01-11,KAZUKI,NAKAJIMA,2023-08-21,2023-08-21
7,1979-02-28,SÉBASTIEN,BOURDAIS,2023-08-21,2023-08-21
8,1979-10-17,KIMI,RÄIKKÖNEN,2023-08-21,2023-08-21
9,1984-12-07,ROBERT,KUBICA,2023-08-21,2023-08-21
10,1982-03-18,TIMO,GLOCK,2023-08-21,2023-08-21


### History and Versioning

In [0]:
%sql
-- List the recorded versions (operations) of the table
DESCRIBE HISTORY formula1_delta_demo.drivers_merge;

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
7,2023-08-21T15:06:47.000+0000,7187749001525811,nick.reyes@sebrands.com,VACUUM END,Map(status -> COMPLETED),,List(2007731451434353),0615-130520-q9luod2y,6.0,SnapshotIsolation,True,"Map(numDeletedFiles -> 0, numVacuumedDirectories -> 1)",,Databricks-Runtime/12.2.x-scala2.12
6,2023-08-21T15:06:43.000+0000,7187749001525811,nick.reyes@sebrands.com,VACUUM START,"Map(retentionCheckEnabled -> false, defaultRetentionMillis -> 604800000, specifiedRetentionMillis -> 0)",,List(2007731451434353),0615-130520-q9luod2y,5.0,SnapshotIsolation,True,"Map(numFilesToDelete -> 0, sizeOfDataToDelete -> 0)",,Databricks-Runtime/12.2.x-scala2.12
5,2023-08-21T15:04:58.000+0000,7187749001525811,nick.reyes@sebrands.com,VACUUM END,Map(status -> COMPLETED),,List(2007731451434353),0615-130520-q9luod2y,4.0,SnapshotIsolation,True,"Map(numDeletedFiles -> 2, numVacuumedDirectories -> 1)",,Databricks-Runtime/12.2.x-scala2.12
4,2023-08-21T15:04:53.000+0000,7187749001525811,nick.reyes@sebrands.com,VACUUM START,"Map(retentionCheckEnabled -> false, defaultRetentionMillis -> 604800000, specifiedRetentionMillis -> 0)",,List(2007731451434353),0615-130520-q9luod2y,3.0,SnapshotIsolation,True,"Map(numFilesToDelete -> 2, sizeOfDataToDelete -> 3987)",,Databricks-Runtime/12.2.x-scala2.12
3,2023-08-21T14:48:59.000+0000,7187749001525811,nick.reyes@sebrands.com,MERGE,"Map(predicate -> [""(cast(driverId#4116 as bigint) = driverId#194L)""], matchedPredicates -> [{""actionType"":""update""}], notMatchedPredicates -> [{""actionType"":""insert""}], notMatchedBySourcePredicates -> [])",,List(2007731451434353),0615-130520-q9luod2y,2.0,WriteSerializable,False,"Map(numTargetRowsCopied -> 0, numTargetRowsDeleted -> 0, numTargetFilesAdded -> 1, numTargetBytesAdded -> 2134, numTargetBytesRemoved -> 1914, numTargetDeletionVectorsAdded -> 0, numTargetRowsMatchedUpdated -> 5, executionTimeMs -> 2491, numTargetRowsInserted -> 5, numTargetRowsMatchedDeleted -> 0, scanTimeMs -> 1130, numTargetRowsUpdated -> 5, numOutputRows -> 10, numTargetDeletionVectorsRemoved -> 0, numTargetRowsNotMatchedBySourceUpdated -> 0, numTargetChangeFilesAdded -> 0, numSourceRows -> 10, numTargetFilesRemoved -> 1, numTargetRowsNotMatchedBySourceDeleted -> 0, rewriteTimeMs -> 1012)",,Databricks-Runtime/12.2.x-scala2.12
2,2023-08-21T14:35:58.000+0000,7187749001525811,nick.reyes@sebrands.com,MERGE,"Map(predicate -> [""(cast(driverId#2944 as bigint) = driverId#355L)""], matchedPredicates -> [{""actionType"":""update""}], notMatchedPredicates -> [{""actionType"":""insert""}], notMatchedBySourcePredicates -> [])",,List(2007731451434353),0615-130520-q9luod2y,1.0,WriteSerializable,False,"Map(numTargetRowsCopied -> 5, numTargetRowsDeleted -> 0, numTargetFilesAdded -> 2, numTargetBytesAdded -> 4052, numTargetBytesRemoved -> 2073, numTargetDeletionVectorsAdded -> 0, numTargetRowsMatchedUpdated -> 5, executionTimeMs -> 3161, numTargetRowsInserted -> 5, numTargetRowsMatchedDeleted -> 0, scanTimeMs -> 1454, numTargetRowsUpdated -> 5, numOutputRows -> 15, numTargetDeletionVectorsRemoved -> 0, numTargetRowsNotMatchedBySourceUpdated -> 0, numTargetChangeFilesAdded -> 0, numSourceRows -> 10, numTargetFilesRemoved -> 1, numTargetRowsNotMatchedBySourceDeleted -> 0, rewriteTimeMs -> 1342)",,Databricks-Runtime/12.2.x-scala2.12
1,2023-08-21T14:34:45.000+0000,7187749001525811,nick.reyes@sebrands.com,MERGE,"Map(predicate -> [""(cast(driverId#1910 as bigint) = driverId#239L)""], matchedPredicates -> [{""actionType"":""update""}], notMatchedPredicates -> [{""actionType"":""insert""}], notMatchedBySourcePredicates -> [])",,List(2007731451434353),0615-130520-q9luod2y,0.0,WriteSerializable,False,"Map(numTargetRowsCopied -> 0, numTargetRowsDeleted -> 0, numTargetFilesAdded -> 1, numTargetBytesAdded -> 2073, numTargetBytesRemoved -> 0, numTargetDeletionVectorsAdded -> 0, numTargetRowsMatchedUpdated -> 0, executionTimeMs -> 4954, numTargetRowsInserted -> 10, numTargetRowsMatchedDeleted -> 0, scanTimeMs -> 1689, numTargetRowsUpdated -> 0, numOutputRows -> 10, numTargetDeletionVectorsRemoved -> 0, numTargetRowsNotMatchedBySourceUpdated -> 0, numTargetChangeFilesAdded -> 0, numSourceRows -> 10, numTargetFilesRemoved -> 0, numTargetRowsNotMatchedBySourceDeleted -> 0, rewriteTimeMs -> 2807)",,Databricks-Runtime/12.2.x-scala2.12
0,2023-08-21T14:32:55.000+0000,7187749001525811,nick.reyes@sebrands.com,CREATE TABLE,"Map(isManaged -> true, description -> null, partitionBy -> [], properties -> {})",,List(2007731451434353),0615-130520-q9luod2y,,WriteSerializable,True,Map(),,Databricks-Runtime/12.2.x-scala2.12


In [0]:
%sql
-- Time travel with SQL: read the table as it was at version 1.
-- Alternative to the VERSION clause: timestamp as of '2023-08-21T14:34:45.000+0000'
SELECT *
FROM formula1_delta_demo.drivers_merge
VERSION AS OF 1;

driverId,dob,forename,surname,createdDate,updatedDate
1,1985-01-07,Lewis,Hamilton,2023-08-21,
2,1977-05-10,Nick,Heidfeld,2023-08-21,
3,1985-06-27,Nico,Rosberg,2023-08-21,
4,1981-07-29,Fernando,Alonso,2023-08-21,
5,1981-10-19,Heikki,Kovalainen,2023-08-21,
6,1985-01-11,Kazuki,Nakajima,2023-08-21,
7,1979-02-28,Sébastien,Bourdais,2023-08-21,
8,1979-10-17,Kimi,Räikkönen,2023-08-21,
9,1984-12-07,Robert,Kubica,2023-08-21,
10,1982-03-18,Timo,Glock,2023-08-21,


In [0]:
# Time travel with PySpark: read the table files as they were at version 1
df_version = (
    spark.read
    .format('delta')
    .option('versionAsOf', '1')
    .load(f'{delta_demo_folder_path}/drivers_merge')
)

display(df_version)

driverId,dob,forename,surname,createdDate,updatedDate
1,1985-01-07,Lewis,Hamilton,2023-08-21,
2,1977-05-10,Nick,Heidfeld,2023-08-21,
3,1985-06-27,Nico,Rosberg,2023-08-21,
4,1981-07-29,Fernando,Alonso,2023-08-21,
5,1981-10-19,Heikki,Kovalainen,2023-08-21,
6,1985-01-11,Kazuki,Nakajima,2023-08-21,
7,1979-02-28,Sébastien,Bourdais,2023-08-21,
8,1979-10-17,Kimi,Räikkönen,2023-08-21,
9,1984-12-07,Robert,Kubica,2023-08-21,
10,1982-03-18,Timo,Glock,2023-08-21,


In [0]:
# Time travel with PySpark: read the table files as they were at a given timestamp
df_timestamp = (
    spark.read
    .format('delta')
    .option('timestampAsOf', '2023-08-21T14:34:45.000+0000')
    .load(f'{delta_demo_folder_path}/drivers_merge')
)

display(df_timestamp)

driverId,dob,forename,surname,createdDate,updatedDate
1,1985-01-07,Lewis,Hamilton,2023-08-21,
2,1977-05-10,Nick,Heidfeld,2023-08-21,
3,1985-06-27,Nico,Rosberg,2023-08-21,
4,1981-07-29,Fernando,Alonso,2023-08-21,
5,1981-10-19,Heikki,Kovalainen,2023-08-21,
6,1985-01-11,Kazuki,Nakajima,2023-08-21,
7,1979-02-28,Sébastien,Bourdais,2023-08-21,
8,1979-10-17,Kimi,Räikkönen,2023-08-21,
9,1984-12-07,Robert,Kubica,2023-08-21,
10,1982-03-18,Timo,Glock,2023-08-21,


### Vacuum (Delete History)

In [0]:
%sql
-- Delete History
-- Retaining 0 hours is below the default 7-day retention, so the retention
-- duration check is switched off first to allow the lower value.
set spark.databricks.delta.retentionDurationCheck.enabled = false; -- default 7 days, option needed to set lower

-- Vacuum the table, retaining 0 hours of history
vacuum formula1_delta_demo.drivers_merge
retain 0 hours;

-- Display Data (Only Latest Version Available)
select *
from formula1_delta_demo.drivers_merge;

driverId,dob,forename,surname,createdDate,updatedDate
6,1985-01-11,KAZUKI,NAKAJIMA,2023-08-21,2023-08-21
7,1979-02-28,SÉBASTIEN,BOURDAIS,2023-08-21,2023-08-21
8,1979-10-17,KIMI,RÄIKKÖNEN,2023-08-21,2023-08-21
9,1984-12-07,ROBERT,KUBICA,2023-08-21,2023-08-21
10,1982-03-18,TIMO,GLOCK,2023-08-21,2023-08-21
11,1977-01-28,TAKUMA,SATO,2023-08-21,
12,1985-07-25,NELSON,PIQUET JR.,2023-08-21,
13,1981-04-25,FELIPE,MASSA,2023-08-21,
14,1971-03-27,DAVID,COULTHARD,2023-08-21,
15,1974-07-13,JARNO,TRULLI,2023-08-21,


### Convert Parquet Table to Delta

In [0]:
%sql
-- Start from a clean slate, then create a Parquet-format table
-- with the same columns as drivers_merge
DROP TABLE IF EXISTS formula1_delta_demo.drivers_convert_delta;

CREATE TABLE IF NOT EXISTS formula1_delta_demo.drivers_convert_delta (
  driverId INT,
  dob DATE,
  forename STRING,
  surname STRING,
  createdDate DATE,
  updatedDate DATE
)
USING PARQUET;

In [0]:
%sql
-- Copy every row of drivers_merge into the Parquet table
INSERT INTO formula1_delta_demo.drivers_convert_delta
SELECT *
FROM formula1_delta_demo.drivers_merge;

-- Show the Parquet table's contents
SELECT *
FROM formula1_delta_demo.drivers_convert_delta;

driverId,dob,forename,surname,createdDate,updatedDate
6,1985-01-11,KAZUKI,NAKAJIMA,2023-08-21,2023-08-21
7,1979-02-28,SÉBASTIEN,BOURDAIS,2023-08-21,2023-08-21
8,1979-10-17,KIMI,RÄIKKÖNEN,2023-08-21,2023-08-21
9,1984-12-07,ROBERT,KUBICA,2023-08-21,2023-08-21
10,1982-03-18,TIMO,GLOCK,2023-08-21,2023-08-21
11,1977-01-28,TAKUMA,SATO,2023-08-21,
12,1985-07-25,NELSON,PIQUET JR.,2023-08-21,
13,1981-04-25,FELIPE,MASSA,2023-08-21,
14,1971-03-27,DAVID,COULTHARD,2023-08-21,
15,1974-07-13,JARNO,TRULLI,2023-08-21,


In [0]:
%sql
-- Convert the Parquet table to Delta
CONVERT TO DELTA formula1_delta_demo.drivers_convert_delta;

### Convert Parquet File to Delta

In [0]:
# Load the rows of the drivers_convert_delta table into a DataFrame
parquet_df = spark.table('formula1_delta_demo.drivers_convert_delta')

# Write those rows out as plain Parquet files under the demo folder
(
    parquet_df.write
    .format('parquet')
    .save(f'{delta_demo_folder_path}/drivers_convert_delta_from_file')
)

In [0]:
%sql
-- Convert the Parquet files at this storage path to Delta.
-- NOTE(review): the path is hard-coded; presumably it is the same location as
-- {delta_demo_folder_path}/drivers_convert_delta_from_file written by the
-- previous cell -- confirm against the global variables notebook.
CONVERT TO DELTA parquet.`abfss://delta-demo@dbcourselakehouse.dfs.core.windows.net/drivers_convert_delta_from_file`