# Delta tables with ArrayType columns

In [1]:
import delta
import pyspark
from delta import configure_spark_with_delta_pip
from pyspark.sql.utils import AnalysisException, IllegalArgumentException

In [2]:
# Session builder with the two settings that enable Delta Lake:
# the Delta SQL extension and the Delta-aware session catalog.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

In [3]:
# Let the Delta helper augment the builder (the log below shows it resolving
# io.delta#delta-core_2.12 through Ivy), then get or create the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-340-delta-240/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-3f29f8f2-7d67-4a28-b642-76748e463152;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 95ms :: artifacts dl 4ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     

In [4]:
# Small frame with an array column; the first row's array contains a null.
# No explicit types are given, so Spark infers them from the Python values.
df = spark.createDataFrame(
    data=[("abc", [1, None]), ("cd", [3, 4])],
    schema=["id", "numbers"],
)

In [5]:
# Display the rows; the null array element is rendered as `null`.
df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+---+---------+
| id|  numbers|
+---+---------+
|abc|[1, null]|
| cd|   [3, 4]|
+---+---------+



                                                                                

In [6]:
# Inspect the inferred schema: the array elements come out as `long`
# with containsNull = true.
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- numbers: array (nullable = true)
 |    |-- element: long (containsNull = true)



In [7]:
# An array with a concrete element type (even one containing nulls) writes fine.
# mode("overwrite") keeps the cell re-runnable: the default save mode raises
# if the target path already exists from a previous run of the notebook.
df.write.format("delta").mode("overwrite").save("tmp/some_delta_lake")

23/07/22 11:15:07 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


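## Nested null values

Arrays that merely *contain* nulls can be written to Delta as long as the element type is concrete. The cells below show that the write is only rejected when the nested element type itself is `NullType`, and how casting the column fixes that.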

In [13]:
# Expected failure, shown on purpose: each `numbers` array mixes a plain int
# with a nested list, and schema inference gives up on the field (presumably
# because no single element type fits). The error is caught and printed so
# that Restart & Run All can continue past this cell.
try:
    df2 = spark.createDataFrame(
        [("abc", [1, [None, 3]]), ("cd", [3, [4, 5]])], ["id", "numbers"]
    )
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: Unable to infer the type of the field numbers.

In [9]:
# Explicit imports instead of `import *`, so it is clear where each name
# used by the schema definitions below comes from.
from pyspark.sql.functions import col
from pyspark.sql.types import (
    ArrayType,
    IntegerType,
    NullType,
    StringType,
    StructField,
    StructType,
)

In [24]:
# Two sample rows; the last element of each tuple is a nested integer array,
# and the first inner array of the first row contains a null.
data2 = [
    ("James", "", "Smith", "36636", "M", 3000, [[1, None], [2, 2]]),
    ("Michael", "Rose", "", "40288", "M", 4000, [[3, 3], [4, 4]]),
]

# Explicit schema assembled field by field; every field and every array
# level is declared nullable.
schema = (
    StructType()
    .add("firstname", StringType(), True)
    .add("middlename", StringType(), True)
    .add("lastname", StringType(), True)
    .add("id", StringType(), True)
    .add("gender", StringType(), True)
    .add("salary", IntegerType(), True)
    .add("nums", ArrayType(ArrayType(IntegerType(), True), True), True)
)

In [25]:
# Build the frame against the explicit schema, then show the schema tree
# followed by the untruncated rows.
df = spark.createDataFrame(data2, schema)
df.printSchema()
df.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- nums: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: integer (containsNull = true)

+---------+----------+--------+-----+------+------+-------------------+
|firstname|middlename|lastname|id   |gender|salary|nums               |
+---------+----------+--------+-----+------+------+-------------------+
|James    |          |Smith   |36636|M     |3000  |[[1, null], [2, 2]]|
|Michael  |Rose      |        |40288|M     |4000  |[[3, 3], [4, 4]]   |
+---------+----------+--------+-----+------+------+-------------------+



In [26]:
# Nested integer arrays containing nulls are accepted by Delta.
# mode("overwrite") keeps the cell re-runnable: the default save mode raises
# if the target path already exists from a previous run of the notebook.
df.write.format("delta").mode("overwrite").save("tmp/some_delta_lake2")

In [27]:
# Same rows as before plus a trailing value that is always None.
data2 = [
    ("James", "", "Smith", "36636", "M", 3000, [[1, None], [2, 2]], None),
    ("Michael", "Rose", "", "40288", "M", 4000, [[3, 3], [4, 4]], None),
]

# Schema assembled field by field; the extra top-level `whatever` column is
# declared as NullType.
schema = (
    StructType()
    .add("firstname", StringType(), True)
    .add("middlename", StringType(), True)
    .add("lastname", StringType(), True)
    .add("id", StringType(), True)
    .add("gender", StringType(), True)
    .add("salary", IntegerType(), True)
    .add("nums", ArrayType(ArrayType(IntegerType(), True), True), True)
    .add("whatever", NullType(), True)
)

In [28]:
# Build the frame with the top-level NullType column and inspect it;
# printSchema reports that column as `void`.
df = spark.createDataFrame(data2, schema)
df.printSchema()
df.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- nums: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: integer (containsNull = true)
 |-- whatever: void (nullable = true)

+---------+----------+--------+-----+------+------+-------------------+--------+
|firstname|middlename|lastname|id   |gender|salary|nums               |whatever|
+---------+----------+--------+-----+------+------+-------------------+--------+
|James    |          |Smith   |36636|M     |3000  |[[1, null], [2, 2]]|null    |
|Michael  |Rose      |        |40288|M     |4000  |[[3, 3], [4, 4]]   |null    |
+---------+----------+--------+-----+------+------+-------------------+--------+



In [29]:
# A *top-level* NullType column does not stop the write.
# mode("overwrite") keeps the cell re-runnable: the default save mode raises
# if the target path already exists from a previous run of the notebook.
df.write.format("delta").mode("overwrite").save("tmp/some_delta_lake3")

In [31]:
# Rows whose nested arrays hold nothing but None values.
data2 = [
    ("James", "", "Smith", "36636", "M", 3000, [[None, None], [None, None]], None),
    ("Michael", "Rose", "", "40288", "M", 4000, [[None, None], [None, None]], None),
]

# Schema assembled field by field; this time the innermost element type of
# `nums` is NullType, i.e. the null type is nested inside an array.
schema = (
    StructType()
    .add("firstname", StringType(), True)
    .add("middlename", StringType(), True)
    .add("lastname", StringType(), True)
    .add("id", StringType(), True)
    .add("gender", StringType(), True)
    .add("salary", IntegerType(), True)
    .add("nums", ArrayType(ArrayType(NullType(), True), True), True)
    .add("whatever", NullType(), True)
)

In [32]:
# Build the frame with NullType nested inside the array column and inspect it;
# the innermost `nums` element is reported as `void`.
df = spark.createDataFrame(data2, schema)
df.printSchema()
df.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- nums: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: void (containsNull = true)
 |-- whatever: void (nullable = true)

+---------+----------+--------+-----+------+------+----------------------------+--------+
|firstname|middlename|lastname|id   |gender|salary|nums                        |whatever|
+---------+----------+--------+-----+------+------+----------------------------+--------+
|James    |          |Smith   |36636|M     |3000  |[[null, null], [null, null]]|null    |
|Michael  |Rose      |        |40288|M     |4000  |[[null, null], [null, null]]|null    |
+---------+----------+--------+-----+------+------+----------------------------+--------+



In [33]:
# Expected failure, shown on purpose: Delta refuses to write a NullType that
# is nested inside a complex type. The error is caught and printed so that
# Restart & Run All can continue past this cell.
try:
    df.write.format("delta").save("tmp/some_delta_lake4")
except AnalysisException as exc:
    print(f"{type(exc).__name__}: {exc}")

AnalysisException:  Found nested NullType in column nums which is of ArrayType. Delta doesn't support writing NullType in complex types.

In [52]:
# NOTE: this is a StructField (a named field descriptor), not a bare
# ArrayType; the cast in the following cell rejects it with an
# IllegalArgumentException.
desired_type = StructField("nums", ArrayType(ArrayType(IntegerType(), True), True), True)

In [54]:
# Expected failure, shown on purpose: `desired_type` is a StructField at this
# point, and its JSON form cannot be converted to a data type, so the cast
# raises. The error is caught and printed so that Restart & Run All can
# continue; `df` is left unchanged when the cast fails.
try:
    df = df.withColumn("nums", col("nums").cast(desired_type))
except IllegalArgumentException as exc:
    print(f"{type(exc).__name__}: {exc}")

IllegalArgumentException: Failed to convert the JSON string '{"metadata":{},"name":"nums","nullable":true,"type":{"containsNull":true,"elementType":{"containsNull":true,"elementType":"integer","type":"array"},"type":"array"}}' to a data type.

In [43]:
# Confirm the frame is unchanged: the innermost `nums` element is still `void`.
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- nums: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: void (containsNull = true)
 |-- whatever: void (nullable = true)



In [10]:
# Minimal reproduction: one name column plus nested arrays of only None.
data2 = [
    ("James", [[None, None], [None, None]]),
    ("Michael", [[None, None], [None, None]]),
]

# Two-field schema; the innermost element type of `nums` is NullType.
schema = (
    StructType()
    .add("firstname", StringType(), True)
    .add("nums", ArrayType(ArrayType(NullType(), True), True), True)
)

In [11]:
# Build the minimal frame and inspect it; the innermost `nums` element is
# reported as `void` before any cast is applied.
df = spark.createDataFrame(data2, schema)
df.printSchema()
df.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- nums: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: void (containsNull = true)

+---------+----------------------------+
|firstname|nums                        |
+---------+----------------------------+
|James    |[[null, null], [null, null]]|
|Michael  |[[null, null], [null, null]]|
+---------+----------------------------+



In [12]:
# Target type for the cast: a bare ArrayType (not wrapped in a StructField)
# whose innermost elements are integers.
desired_type = ArrayType(ArrayType(IntegerType(), True), True)

In [13]:
# Replace `nums` with a copy cast to the concrete integer array type.
df = df.withColumn("nums", col("nums").cast(desired_type))

In [14]:
# Verify the cast: `nums` is now an array of arrays of IntegerType.
df.schema

StructType([StructField('firstname', StringType(), True), StructField('nums', ArrayType(ArrayType(IntegerType(), True), True), True)])

In [15]:
# With the nested NullType cast to integer, the write is accepted.
# mode("overwrite") keeps the cell re-runnable: the default save mode raises
# if the target path already exists from a previous run of the notebook.
df.write.format("delta").mode("overwrite").save("tmp/some_delta_lake5")

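### Another way to cast the column

Instead of building the target `ArrayType` object, the same cast can be written with a type string such as `"array<array<int>>"`.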

In [16]:
# Recreate the minimal all-null input, since `df` was overwritten by the
# previous cast.
data2 = [
    ("James", [[None, None], [None, None]]),
    ("Michael", [[None, None], [None, None]]),
]

# Two-field schema; the innermost element type of `nums` is NullType.
schema = (
    StructType()
    .add("firstname", StringType(), True)
    .add("nums", ArrayType(ArrayType(NullType(), True), True), True)
)

In [17]:
# Build the uncast frame again and inspect it; the innermost `nums` element
# is back to `void`.
df = spark.createDataFrame(data2, schema)
df.printSchema()
df.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- nums: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: void (containsNull = true)

+---------+----------------------------+
|firstname|nums                        |
+---------+----------------------------+
|James    |[[null, null], [null, null]]|
|Michael  |[[null, null], [null, null]]|
+---------+----------------------------+



In [18]:
# Same cast as before, expressed as a type string instead of a DataType object.
df = df.withColumn("nums", col("nums").cast("array<array<int>>"))

In [19]:
# The values are still all null after the cast; without truncate=False the
# long `nums` cells are cut off in the display.
df.show()

+---------+--------------------+
|firstname|                nums|
+---------+--------------------+
|    James|[[null, null], [n...|
|  Michael|[[null, null], [n...|
+---------+--------------------+



In [21]:
# Verify the string-based cast: `nums` is an array of arrays of IntegerType.
df.schema

StructType([StructField('firstname', StringType(), True), StructField('nums', ArrayType(ArrayType(IntegerType(), True), True), True)])

In [20]:
# The string-cast frame has a concrete nested element type, so the write is accepted.
# mode("overwrite") keeps the cell re-runnable: the default save mode raises
# if the target path already exists from a previous run of the notebook.
df.write.format("delta").mode("overwrite").save("tmp/some_delta_lake6")