### ![Spark Logo Tiny](https://files.training.databricks.com/images/105/logo_spark_tiny.png) **Upsert usando MERGE**

#### **Crear Database**

In [None]:
%sql
-- Create the demo database (if it is not there yet) rooted at the given
-- storage location.
CREATE DATABASE IF NOT EXISTS f1_demo
  LOCATION '/mnt/formula1dl/demo';

#### **Leer archivo en un dataframe**

In [None]:
# Day-1 batch: drivers with driverId <= 10, keeping the id, date of birth and
# the two fields of the nested `name` struct as top-level columns.
# NOTE(review): `inferSchema` is documented as a CSV reader option; the JSON
# reader appears to infer the schema regardless -- confirm whether it is needed.
drivers_day1_df = (
    spark.read
    .option("inferSchema", True)
    .json("/mnt/formula1dl/raw/2021-03-28/drivers.json")
    .filter("driverId <= 10")
    .select("driverId", "dob", "name.forename", "name.surname")
)

In [None]:
# Print the day-1 rows (up to 20, the default for show) without truncating cell values.
drivers_day1_df.show(n=20, truncate=False)

+--------+----------+---------+----------+
|driverId|dob       |forename |surname   |
+--------+----------+---------+----------+
|1       |1985-01-07|Lewis    |Hamilton  |
|2       |1977-05-10|Nick     |Heidfeld  |
|3       |1985-06-27|Nico     |Rosberg   |
|4       |1981-07-29|Fernando |Alonso    |
|5       |1981-10-19|Heikki   |Kovalainen|
|6       |1985-01-11|Kazuki   |Nakajima  |
|7       |1979-02-28|Sébastien|Bourdais  |
|8       |1979-10-17|Kimi     |Räikkönen |
|9       |1984-12-07|Robert   |Kubica    |
|10      |1982-03-18|Timo     |Glock     |
+--------+----------+---------+----------+



In [None]:
# Expose the day-1 DataFrame to SQL cells as the temp view `drivers_day1`.
drivers_day1_df.createOrReplaceTempView(name="drivers_day1")

In [None]:
%sql
-- Inspect the day-1 temp view.
SELECT * FROM drivers_day1;

driverId,dob,forename,surname
1,1985-01-07,Lewis,Hamilton
2,1977-05-10,Nick,Heidfeld
3,1985-06-27,Nico,Rosberg
4,1981-07-29,Fernando,Alonso
5,1981-10-19,Heikki,Kovalainen
6,1985-01-11,Kazuki,Nakajima
7,1979-02-28,Sébastien,Bourdais
8,1979-10-17,Kimi,Räikkönen
9,1984-12-07,Robert,Kubica
10,1982-03-18,Timo,Glock


In [None]:
%sql
-- Show the column names and data types of the day-1 temp view.
DESCRIBE FORMATTED drivers_day1;

col_name,data_type,comment
driverId,bigint,
dob,string,
forename,string,
surname,string,


In [None]:
from pyspark.sql.functions import upper

# Day-2 batch: driverId 6..15 with forename/surname upper-cased, so rows that
# overlap with day 1 carry visibly different values.
# NOTE(review): `inferSchema` is documented as a CSV reader option; the JSON
# reader appears to infer the schema regardless -- confirm whether it is needed.
drivers_day2_df = (
    spark.read
    .option("inferSchema", True)
    .json("/mnt/formula1dl/raw/2021-03-28/drivers.json")
    .filter("driverId BETWEEN 6 AND 15")
    .select(
        "driverId",
        "dob",
        upper("name.forename").alias("forename"),
        upper("name.surname").alias("surname"),
    )
)

In [None]:
# Print the day-2 rows (up to 20, the default for show) without truncating cell values.
drivers_day2_df.show(n=20, truncate=False)

+--------+----------+---------+----------+
|driverId|dob       |forename |surname   |
+--------+----------+---------+----------+
|6       |1985-01-11|KAZUKI   |NAKAJIMA  |
|7       |1979-02-28|SÉBASTIEN|BOURDAIS  |
|8       |1979-10-17|KIMI     |RÄIKKÖNEN |
|9       |1984-12-07|ROBERT   |KUBICA    |
|10      |1982-03-18|TIMO     |GLOCK     |
|11      |1977-01-28|TAKUMA   |SATO      |
|12      |1985-07-25|NELSON   |PIQUET JR.|
|13      |1981-04-25|FELIPE   |MASSA     |
|14      |1971-03-27|DAVID    |COULTHARD |
|15      |1974-07-13|JARNO    |TRULLI    |
+--------+----------+---------+----------+



In [None]:
# Expose the day-2 DataFrame to SQL cells as the temp view `drivers_day2`.
drivers_day2_df.createOrReplaceTempView(name="drivers_day2")

In [None]:
%sql
-- Inspect the day-2 temp view.
SELECT * FROM drivers_day2;

driverId,dob,forename,surname
6,1985-01-11,KAZUKI,NAKAJIMA
7,1979-02-28,SÉBASTIEN,BOURDAIS
8,1979-10-17,KIMI,RÄIKKÖNEN
9,1984-12-07,ROBERT,KUBICA
10,1982-03-18,TIMO,GLOCK
11,1977-01-28,TAKUMA,SATO
12,1985-07-25,NELSON,PIQUET JR.
13,1981-04-25,FELIPE,MASSA
14,1971-03-27,DAVID,COULTHARD
15,1974-07-13,JARNO,TRULLI


In [None]:
from pyspark.sql.functions import upper

# Day-3 batch: driverId 1..5 plus driverId 16..20, with forename/surname
# upper-cased.
# NOTE(review): `inferSchema` is documented as a CSV reader option; the JSON
# reader appears to infer the schema regardless -- confirm whether it is needed.
drivers_day3_df = (
    spark.read
    .option("inferSchema", True)
    .json("/mnt/formula1dl/raw/2021-03-28/drivers.json")
    .filter("driverId BETWEEN 1 AND 5 OR driverId BETWEEN 16 AND 20")
    .select(
        "driverId",
        "dob",
        upper("name.forename").alias("forename"),
        upper("name.surname").alias("surname"),
    )
)

In [None]:
# Print the day-3 rows (up to 20, the default for show) without truncating cell values.
drivers_day3_df.show(n=20, truncate=False)

+--------+----------+---------+----------+
|driverId|dob       |forename |surname   |
+--------+----------+---------+----------+
|1       |1985-01-07|LEWIS    |HAMILTON  |
|2       |1977-05-10|NICK     |HEIDFELD  |
|3       |1985-06-27|NICO     |ROSBERG   |
|4       |1981-07-29|FERNANDO |ALONSO    |
|5       |1981-10-19|HEIKKI   |KOVALAINEN|
|16      |1983-01-11|ADRIAN   |SUTIL     |
|17      |1976-08-27|MARK     |WEBBER    |
|18      |1980-01-19|JENSON   |BUTTON    |
|19      |1979-04-18|ANTHONY  |DAVIDSON  |
|20      |1987-07-03|SEBASTIAN|VETTEL    |
+--------+----------+---------+----------+



Si creamos una tabla utilizando **Databricks SQL** y no le indicamos un **LOCATION** ni un **OPTIONS(path)**, la tabla se creará como **MANAGED TABLE**.

In [None]:
%sql
-- Target table for the MERGE (upsert) cells. No LOCATION / OPTIONS(path) is
-- given. createdDate and updatedDate are DATE columns.
CREATE TABLE IF NOT EXISTS f1_demo.drivers_merge (
  driverId    INT,
  dob         DATE,
  forename    STRING,
  surname     STRING,
  createdDate DATE,
  updatedDate DATE
)
USING DELTA -- Stating DELTA explicitly is not required for our table to be a Delta table

In [None]:
%sql
-- Show the columns and the detailed table information of the merge target.
DESCRIBE FORMATTED f1_demo.drivers_merge

col_name,data_type,comment
driverId,int,
dob,date,
forename,string,
surname,string,
createdDate,date,
updatedDate,date,
,,
# Detailed Table Information,,
Catalog,spark_catalog,
Database,f1_demo,


Day 1

In [None]:
%sql
-- Day-1 upsert. t: target table, s: source temp view.
-- Rows whose driverId already exists in the target are updated and stamped in
-- updatedDate; all other source rows are inserted with createdDate set
-- (updatedDate is not part of the insert, so it stays NULL).
MERGE INTO f1_demo.drivers_merge t
USING drivers_day1 s
  ON t.driverId = s.driverId
WHEN MATCHED THEN
  UPDATE SET
    t.dob         = s.dob,
    t.forename    = s.forename,
    t.surname     = s.surname,
    t.updatedDate = current_date()  -- audit columns are DATE, so store a date rather than a cast timestamp
WHEN NOT MATCHED THEN
  INSERT (driverId, dob, forename, surname, createdDate)
  VALUES (s.driverId, s.dob, s.forename, s.surname, current_date())

num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
10,0,0,10


In [None]:
%sql
-- Inspect the target after the day-1 merge; ordered so the display is
-- deterministic and easy to compare between runs.
SELECT * FROM f1_demo.drivers_merge
ORDER BY driverId;

driverId,dob,forename,surname,createdDate,updatedDate
1,1985-01-07,Lewis,Hamilton,2023-06-16,
2,1977-05-10,Nick,Heidfeld,2023-06-16,
3,1985-06-27,Nico,Rosberg,2023-06-16,
4,1981-07-29,Fernando,Alonso,2023-06-16,
5,1981-10-19,Heikki,Kovalainen,2023-06-16,
6,1985-01-11,Kazuki,Nakajima,2023-06-16,
7,1979-02-28,Sébastien,Bourdais,2023-06-16,
8,1979-10-17,Kimi,Räikkönen,2023-06-16,
9,1984-12-07,Robert,Kubica,2023-06-16,
10,1982-03-18,Timo,Glock,2023-06-16,


Day 2

In [None]:
%sql
-- Day-2 upsert. t: target table, s: source temp view.
-- Rows whose driverId already exists in the target are updated and stamped in
-- updatedDate; all other source rows are inserted with createdDate set
-- (updatedDate is not part of the insert, so it stays NULL).
MERGE INTO f1_demo.drivers_merge t
USING drivers_day2 s
  ON t.driverId = s.driverId
WHEN MATCHED THEN
  UPDATE SET
    t.dob         = s.dob,
    t.forename    = s.forename,
    t.surname     = s.surname,
    t.updatedDate = current_date()  -- audit columns are DATE, so store a date rather than a cast timestamp
WHEN NOT MATCHED THEN
  INSERT (driverId, dob, forename, surname, createdDate)
  VALUES (s.driverId, s.dob, s.forename, s.surname, current_date())

num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
10,5,0,5


In [None]:
%sql
-- Inspect the target after the day-2 merge; ordered so the display is
-- deterministic and easy to compare between runs.
SELECT * FROM f1_demo.drivers_merge
ORDER BY driverId;

driverId,dob,forename,surname,createdDate,updatedDate
6,1985-01-11,KAZUKI,NAKAJIMA,2023-06-16,2023-06-16
7,1979-02-28,SÉBASTIEN,BOURDAIS,2023-06-16,2023-06-16
8,1979-10-17,KIMI,RÄIKKÖNEN,2023-06-16,2023-06-16
9,1984-12-07,ROBERT,KUBICA,2023-06-16,2023-06-16
10,1982-03-18,TIMO,GLOCK,2023-06-16,2023-06-16
11,1977-01-28,TAKUMA,SATO,2023-06-16,
12,1985-07-25,NELSON,PIQUET JR.,2023-06-16,
13,1981-04-25,FELIPE,MASSA,2023-06-16,
14,1971-03-27,DAVID,COULTHARD,2023-06-16,
15,1974-07-13,JARNO,TRULLI,2023-06-16,


Day 3

In [None]:
from pyspark.sql.functions import current_timestamp
from delta.tables import DeltaTable

# Day-3 upsert through the Delta Lake Python API (same logic as the SQL MERGE
# cells for day 1 and day 2).
#
# The target is resolved by its metastore name rather than by a hard-coded
# storage path, so this cell keeps working even if the f1_demo database does
# not live under /mnt/formula1dl/demo.
deltaTable = DeltaTable.forName(spark, "f1_demo.drivers_merge")

# Target column -> SQL expression applied when the driverId already exists.
# The audit columns are DATE, so a date is stored rather than a cast timestamp.
day3_update_set = {
    "dob": "s.dob",
    "forename": "s.forename",
    "surname": "s.surname",
    "updatedDate": "current_date()",
}

# Target column -> SQL expression applied when the driverId is new.
# updatedDate is intentionally omitted, so it stays NULL for inserted rows.
day3_insert_values = {
    "driverId": "s.driverId",
    "dob": "s.dob",
    "forename": "s.forename",
    "surname": "s.surname",
    "createdDate": "current_date()",
}

(
    deltaTable.alias("t")
    .merge(drivers_day3_df.alias("s"), "t.driverId = s.driverId")
    .whenMatchedUpdate(set=day3_update_set)
    .whenNotMatchedInsert(values=day3_insert_values)
    .execute()
)

In [None]:
%sql
-- Inspect the target after the day-3 merge, ordered by driverId.
SELECT * FROM f1_demo.drivers_merge
ORDER BY driverId;

driverId,dob,forename,surname,createdDate,updatedDate
1,1985-01-07,LEWIS,HAMILTON,2023-06-16,2023-06-16
2,1977-05-10,NICK,HEIDFELD,2023-06-16,2023-06-16
3,1985-06-27,NICO,ROSBERG,2023-06-16,2023-06-16
4,1981-07-29,FERNANDO,ALONSO,2023-06-16,2023-06-16
5,1981-10-19,HEIKKI,KOVALAINEN,2023-06-16,2023-06-16
6,1985-01-11,KAZUKI,NAKAJIMA,2023-06-16,2023-06-16
7,1979-02-28,SÉBASTIEN,BOURDAIS,2023-06-16,2023-06-16
8,1979-10-17,KIMI,RÄIKKÖNEN,2023-06-16,2023-06-16
9,1984-12-07,ROBERT,KUBICA,2023-06-16,2023-06-16
10,1982-03-18,TIMO,GLOCK,2023-06-16,2023-06-16
