In [0]:
from delta import DeltaTable

# Declare the schema for default.some_people. The four base columns are plain
# data columns; full_name is a generated column whose value is derived from
# first_name and last_name by the SQL expression given below.
base_columns = [
    ("id", "LONG"),
    ("first_name", "STRING"),
    ("last_name", "STRING"),
    ("age", "LONG"),
]

some_people_builder = DeltaTable.create(spark).tableName("default.some_people")
for column_name, column_type in base_columns:
    some_people_builder = some_people_builder.addColumn(column_name, column_type)

# Ending the cell on execute() keeps the returned DeltaTable as the cell output.
some_people_builder.addColumn(
    "full_name",
    "STRING",
    generatedAlwaysAs="concat(first_name, ' ', last_name)",
).execute()


Out[1]: <delta.tables.DeltaTable at 0x7f2368b58a90>

In [0]:
# Read the table back through SQL; printed right after creation, it has the
# five declared columns and no rows.
(
    spark
    .sql("select * from some_people")
    .show()
)

+---+----------+---------+---+---------+
| id|first_name|last_name|age|full_name|
+---+----------+---------+---+---------+
+---+----------+---------+---+---------+



In [0]:
# Only the four base columns are supplied; full_name is deliberately left out
# of the DataFrame because it is the table's generated column.
people_rows = [
    (0, "Bob", "Loblaw", 23),
    (1, "Sue", "Grafton", None),
    (2, "Jim", "Carrey", 61),
]
people_columns = ("id", "first_name", "last_name", "age")
df = spark.createDataFrame(people_rows).toDF(*people_columns)

# Append to the existing Delta table.
(
    df.write
    .format("delta")
    .mode("append")
    .saveAsTable("some_people")
)


In [0]:
# Look the table up by name through the DeltaTable API and print its rows,
# including the full_name values that were filled in on write.
(
    DeltaTable.forName(spark, "some_people")
    .toDF()
    .show()
)

+---+----------+---------+----+-----------+
| id|first_name|last_name| age|  full_name|
+---+----------+---------+----+-----------+
|  0|       Bob|   Loblaw|  23| Bob Loblaw|
|  2|       Jim|   Carrey|  61| Jim Carrey|
|  1|       Sue|  Grafton|null|Sue Grafton|
+---+----------+---------+----+-----------+



# What happens when generated columns depend on columns with null values?

In [0]:
# Each row leaves one input of the generated column null: id 44 has no
# first_name and id 55 has no last_name.
rows_with_nulls = [(44, None, "Perkins", 20), (55, "Li", None, 30)]
df = spark.createDataFrame(rows_with_nulls).toDF(
    "id", "first_name", "last_name", "age"
)

# Append to the same Delta table as before.
df.write.format("delta").mode("append").saveAsTable("some_people")


In [0]:
# Print the table again to inspect full_name for the rows whose first_name or
# last_name is null.
(
    DeltaTable.forName(spark, "some_people")
    .toDF()
    .show()
)

+---+----------+---------+----+-----------+
| id|first_name|last_name| age|  full_name|
+---+----------+---------+----+-----------+
|  0|       Bob|   Loblaw|  23| Bob Loblaw|
|  2|       Jim|   Carrey|  61| Jim Carrey|
|  1|       Sue|  Grafton|null|Sue Grafton|
| 44|      null|  Perkins|  20|       null|
| 55|        Li|     null|  30|       null|
+---+----------+---------+----+-----------+



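As the output shows, `concat` returns null when any of its inputs is null, so `full_name` is null for the rows that are missing a first or last name. The generated column isn’t computed on the fly when the data is read; its values are computed and persisted when DataFrames are appended to the table.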

In [0]:
%sql
-- Demo table with a generated column: dateOfBirth is always derived from
-- birthDate by casting the timestamp to a DATE.
-- USING DELTA is stated explicitly because generated columns are a Delta Lake
-- feature; without it the table format depends on the session's default provider.
CREATE TABLE default.people10m (
  id INT,
  firstName STRING,
  middleName STRING,
  lastName STRING,
  gender STRING,
  birthDate TIMESTAMP,
  dateOfBirth DATE GENERATED ALWAYS AS (CAST(birthDate AS DATE)),
  ssn STRING,
  salary INT
) USING DELTA

In [0]:
# Print the newly created people10m table (schema only, no rows yet).
(
    spark
    .sql("select * from people10m")
    .show()
)

+---+---------+----------+--------+------+---------+-----------+---+------+
| id|firstName|middleName|lastName|gender|birthDate|dateOfBirth|ssn|salary|
+---+---------+----------+--------+------+---------+-----------+---+------+
+---+---------+----------+--------+------+---------+-----------+---+------+



In [0]:
from pyspark.sql.types import DateType
# Build default.people10mpython with the DeltaTable builder API. It has the
# same columns as the people10m SQL table; dateOfBirth is a generated column
# obtained by casting birthDate to DATE, and lastName carries a column comment.
(
    DeltaTable.create(spark)
    .tableName("default.people10mpython")
    .addColumn("id", "INT")
    .addColumn("firstName", "STRING")
    .addColumn("middleName", "STRING")
    .addColumn("lastName", "STRING", comment="surname")
    .addColumn("gender", "STRING")
    .addColumn("birthDate", "TIMESTAMP")
    .addColumn(
        "dateOfBirth",
        DateType(),
        generatedAlwaysAs="CAST(birthDate AS DATE)",
    )
    .addColumn("ssn", "STRING")
    .addColumn("salary", "INT")
    .execute()
)

Out[11]: <delta.tables.DeltaTable at 0x7f2358bcc8b0>

In [0]:
# Print the table created through the Python builder (schema only, no rows yet).
(
    spark
    .sql("select * from people10mpython")
    .show()
)

+---+---------+----------+--------+------+---------+-----------+---+------+
| id|firstName|middleName|lastName|gender|birthDate|dateOfBirth|ssn|salary|
+---+---------+----------+--------+------+---------+-----------+---+------+
+---+---------+----------+--------+------+---------+-----------+---+------+



In [0]:
%sql
-- Plain EXPLAIN: prints only the physical plan for the aggregation.
EXPLAIN
SELECT k, sum(v)
FROM VALUES (1, 2), (1, 3) t(k, v)
GROUP BY k;

plan
"== Physical Plan == AdaptiveSparkPlan isFinalPlan=false +- HashAggregate(keys=[k#2498], functions=[finalmerge_sum(merge sum#2503L) AS sum(v#2499)#2500L])  +- Exchange hashpartitioning(k#2498, 200), ENSURE_REQUIREMENTS, [plan_id=1048]  +- HashAggregate(keys=[k#2498], functions=[partial_sum(v#2499) AS sum#2503L])  +- LocalTableScan [k#2498, v#2499]"


In [0]:
%sql
-- EXPLAIN EXTENDED: prints the parsed, analyzed and optimized logical plans
-- followed by the physical plan.
EXPLAIN EXTENDED
SELECT k, sum(v)
FROM VALUES (1, 2), (1, 3) t(k, v)
GROUP BY k;

plan
"== Parsed Logical Plan == 'Aggregate ['k], ['k, unresolvedalias('sum('v), None)] +- 'SubqueryAlias t  +- 'UnresolvedInlineTable [k, v], [[1, 2], [1, 3]] == Analyzed Logical Plan == k: int, sum(v): bigint Aggregate [k#2515], [k#2515, sum(v#2516) AS sum(v)#2518L] +- SubqueryAlias t  +- LocalRelation [k#2515, v#2516] == Optimized Logical Plan == Aggregate [k#2515], [k#2515, sum(v#2516) AS sum(v)#2518L] +- LocalRelation [k#2515, v#2516] == Physical Plan == AdaptiveSparkPlan isFinalPlan=false +- HashAggregate(keys=[k#2515], functions=[finalmerge_sum(merge sum#2520L) AS sum(v#2516)#2517L], output=[k#2515, sum(v)#2518L])  +- Exchange hashpartitioning(k#2515, 200), ENSURE_REQUIREMENTS, [plan_id=1079]  +- HashAggregate(keys=[k#2515], functions=[partial_sum(v#2516) AS sum#2520L], output=[k#2515, sum#2520L])  +- LocalTableScan [k#2515, v#2516]"


In [0]:
%sql
-- EXPLAIN FORMATTED: prints a numbered outline of the physical plan followed
-- by a details section for each numbered node.
EXPLAIN FORMATTED
SELECT k, sum(v)
FROM VALUES (1, 2), (1, 3) t(k, v)
GROUP BY k;

plan
"== Physical Plan == AdaptiveSparkPlan (5) +- HashAggregate (4)  +- Exchange (3)  +- HashAggregate (2)  +- LocalTableScan (1) (1) LocalTableScan Output [2]: [k#2532, v#2533] Arguments: [k#2532, v#2533] (2) HashAggregate Input [2]: [k#2532, v#2533] Keys [1]: [k#2532] Functions [1]: [partial_sum(v#2533) AS sum#2537L] Aggregate Attributes [1]: [sum#2536L] Results [2]: [k#2532, sum#2537L] (3) Exchange Input [2]: [k#2532, sum#2537L] Arguments: hashpartitioning(k#2532, 200), ENSURE_REQUIREMENTS, [plan_id=1110] (4) HashAggregate Input [2]: [k#2532, sum#2537L] Keys [1]: [k#2532] Functions [1]: [finalmerge_sum(merge sum#2537L) AS sum(v#2533)#2534L] Aggregate Attributes [1]: [sum(v#2533)#2534L] Results [2]: [k#2532, sum(v#2533)#2534L AS sum(v)#2535L] (5) AdaptiveSparkPlan Output [2]: [k#2532, sum(v)#2535L] Arguments: isFinalPlan=false"


In [0]:
%sql
-- EXPLAIN COST: prints the optimized logical plan annotated with statistics
-- (such as sizeInBytes), followed by the physical plan.
EXPLAIN COST
SELECT k, sum(v)
FROM VALUES (1, 2), (1, 3) t(k, v)
GROUP BY k;

plan
"== Optimized Logical Plan == Aggregate [k#2549], [k#2549, sum(v#2550) AS sum(v)#2552L], Statistics(sizeInBytes=40.0 B) +- LocalRelation [k#2549, v#2550], Statistics(sizeInBytes=32.0 B) == Physical Plan == AdaptiveSparkPlan isFinalPlan=false +- HashAggregate(keys=[k#2549], functions=[finalmerge_sum(merge sum#2554L) AS sum(v#2550)#2551L], output=[k#2549, sum(v)#2552L])  +- Exchange hashpartitioning(k#2549, 200), ENSURE_REQUIREMENTS, [plan_id=1141]  +- HashAggregate(keys=[k#2549], functions=[partial_sum(v#2550) AS sum#2554L], output=[k#2549, sum#2554L])  +- LocalTableScan [k#2549, v#2550]"


In [0]:
%sql
-- EXPLAIN CODEGEN: reports the whole-stage generated code for the statement,
-- if there is any.
EXPLAIN CODEGEN
SELECT k, sum(v)
FROM VALUES (1, 2), (1, 3) t(k, v)
GROUP BY k;

plan
Found 0 WholeStageCodegen subtrees.
