# Hive-style partitioning in Delta Lake

In [1]:
import delta
import pyspark
import pyspark.sql.functions as F
from delta import configure_spark_with_delta_pip

In [2]:
# Describe the session: an app name plus the two settings that register Delta
# Lake's SQL extension and install Delta's catalog as `spark_catalog`.
# Nothing is started here; this cell only configures the builder object.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

In [3]:
# Let the Delta helper add its package coordinates to the builder, then start
# (or reuse) the session. The Ivy log printed below is the dependency resolution
# for the Delta jars.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-330-delta-210/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-179ee808-595f-4183-a307-c3ea75b0c9ba;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.1.0 in central
	found io.delta#delta-storage;2.1.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 126ms :: artifacts dl 5ms
	:: modules in use:
	io.delta#delta-core_2.12;2.1.0 from central in [default]
	io.delta#delta-storage;2.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| 

23/04/06 12:32:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# Four sample people spread over three countries (two rows share "China").
# The column names are supplied directly as the schema argument.
df = spark.createDataFrame(
    [
        ("Ernesto", "Guevara", "Argentina"),
        ("Maria", "Sharapova", "Russia"),
        ("Bruce", "Lee", "China"),
        ("Jack", "Ma", "China"),
    ],
    ["first_name", "last_name", "country"],
)

In [5]:
# Preview the rows that are about to be written to the table.
df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+----------+---------+---------+
|first_name|last_name|  country|
+----------+---------+---------+
|   Ernesto|  Guevara|Argentina|
|     Maria|Sharapova|   Russia|
|     Bruce|      Lee|    China|
|      Jack|       Ma|    China|
+----------+---------+---------+



                                                                                

In [6]:
# Create the `country_people` Delta table, partitioned on disk by `country`.
# Repartitioning on the same column first groups each country's rows together
# before the write (the listing below shows one Parquet file per partition).
(
    df.repartition("country")
    .write.format("delta")
    .partitionBy("country")
    .saveAsTable("country_people")
)

                                                                                

In [7]:
# Inspect the table directory (shells out to the external `tree` command):
# one `country=<value>` folder per partition value, next to `_delta_log`.
!tree spark-warehouse/country_people

[01;34mspark-warehouse/country_people[0m
├── [01;34m_delta_log[0m
│   └── [00m00000000000000000000.json[0m
├── [01;34mcountry=Argentina[0m
│   └── [00mpart-00000-0e188daf-7ed1-4a46-9786-251e5a5b7c61.c000.snappy.parquet[0m
├── [01;34mcountry=China[0m
│   └── [00mpart-00000-69aeadfb-3692-4765-94bc-f4b271133b35.c000.snappy.parquet[0m
└── [01;34mcountry=Russia[0m
    └── [00mpart-00000-d3a4d532-74f9-4304-970d-b476cf296a07.c000.snappy.parquet[0m

4 directories, 4 files


## Add a partition to a Delta table

In [7]:
# Two new people from a country the table does not contain yet.
# Note: this rebinds `df`, replacing the DataFrame built earlier.
df = spark.createDataFrame(
    [
        ("Orlando", "Cabrera", "Colombia"),
        ("Carlos", "Vives", "Colombia"),
    ],
    ["first_name", "last_name", "country"],
)

In [8]:
# Append the new rows to the existing table using the same partition column;
# no separate "add partition" command is issued, only this append.
(
    df.repartition("country")
    .write.format("delta")
    .mode("append")
    .partitionBy("country")
    .saveAsTable("country_people")
)

                                                                                

In [9]:
# List the table directory again: expect a new `country=Colombia` folder and a
# second commit file in `_delta_log`.
!tree spark-warehouse/country_people

[01;34mspark-warehouse/country_people[0m
├── [01;34m_delta_log[0m
│   ├── [00m00000000000000000000.json[0m
│   └── [00m00000000000000000001.json[0m
├── [01;34mcountry=Argentina[0m
│   └── [00mpart-00000-03ceafc8-b9b5-4309-8457-6e50814aaa8b.c000.snappy.parquet[0m
├── [01;34mcountry=China[0m
│   └── [00mpart-00000-9a8d67fa-c23d-41a4-b570-a45405f9ad78.c000.snappy.parquet[0m
├── [01;34mcountry=Colombia[0m
│   └── [00mpart-00000-7e3d3d49-39e9-4eb2-ab92-22a485291f91.c000.snappy.parquet[0m
└── [01;34mcountry=Russia[0m
    └── [00mpart-00000-c49ca623-ea69-4088-8d85-c7c2de30cc28.c000.snappy.parquet[0m

5 directories, 6 files


## Remove a partition from a Delta table

In [10]:
# Look the table up by name to get a DeltaTable handle for the delete below.
dt = delta.DeltaTable.forName(spark, "country_people")

In [11]:
# Delete every row whose partition value is "Argentina". This is recorded as a
# new commit; the Parquet file itself stays on disk (see the listing below).
dt.delete(F.col("country") == "Argentina")

                                                                                

In [12]:
# List the directory after the delete: a third commit file appears in
# `_delta_log`, while `country=Argentina` and its data file are still present.
!tree spark-warehouse/country_people

[01;34mspark-warehouse/country_people[0m
├── [01;34m_delta_log[0m
│   ├── [00m00000000000000000000.json[0m
│   ├── [00m00000000000000000001.json[0m
│   └── [00m00000000000000000002.json[0m
├── [01;34mcountry=Argentina[0m
│   └── [00mpart-00000-03ceafc8-b9b5-4309-8457-6e50814aaa8b.c000.snappy.parquet[0m
├── [01;34mcountry=China[0m
│   └── [00mpart-00000-9a8d67fa-c23d-41a4-b570-a45405f9ad78.c000.snappy.parquet[0m
├── [01;34mcountry=Colombia[0m
│   └── [00mpart-00000-7e3d3d49-39e9-4eb2-ab92-22a485291f91.c000.snappy.parquet[0m
└── [01;34mcountry=Russia[0m
    └── [00mpart-00000-c49ca623-ea69-4088-8d85-c7c2de30cc28.c000.snappy.parquet[0m

5 directories, 7 files


In [13]:
# Re-create the table handle before reading the data back (same call as the
# earlier cell).
# NOTE(review): possibly redundant — confirm whether the existing `dt` already
# reflects the delete before removing this cell.
dt = delta.DeltaTable.forName(spark, "country_people")

In [14]:
# Read the table back: the Argentina row should no longer be returned, even
# though its file has not been physically removed yet.
dt.toDF().show()

+----------+---------+--------+
|first_name|last_name| country|
+----------+---------+--------+
|     Maria|Sharapova|  Russia|
|   Orlando|  Cabrera|Colombia|
|    Carlos|    Vives|Colombia|
|     Bruce|      Lee|   China|
|      Jack|       Ma|   China|
+----------+---------+--------+



In [15]:
# Turn off Delta's retention-duration safety check so the VACUUM cells below can
# use a 0-hour retention window.
# WARNING: demo only. Vacuuming with a very short retention removes the files
# needed for time travel and can break concurrent readers/writers; see the Delta
# Lake VACUUM documentation before doing this on a real table.
spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")

In [16]:
# First VACUUM pass: physically removes the Parquet file that the delete left
# unreferenced. The output reports one deletion, and the next listing shows
# `country=Argentina` as an empty directory.
spark.sql("VACUUM country_people RETAIN 0 HOURS").show(truncate=False)

                                                                                

Deleted 1 files and directories in a total of 5 directories.
+-----------------------------------------------------------------------------------------------------------------+
|path                                                                                                             |
+-----------------------------------------------------------------------------------------------------------------+
|file:/Users/matthew.powers/Documents/code/my_apps/delta-examples/notebooks/pyspark/spark-warehouse/country_people|
+-----------------------------------------------------------------------------------------------------------------+



In [17]:
# List the directory after the first VACUUM: the Argentina data file is gone but
# its now-empty partition folder is still there.
!tree spark-warehouse/country_people

[01;34mspark-warehouse/country_people[0m
├── [01;34m_delta_log[0m
│   ├── [00m00000000000000000000.json[0m
│   ├── [00m00000000000000000001.json[0m
│   └── [00m00000000000000000002.json[0m
├── [01;34mcountry=Argentina[0m
├── [01;34mcountry=China[0m
│   └── [00mpart-00000-9a8d67fa-c23d-41a4-b570-a45405f9ad78.c000.snappy.parquet[0m
├── [01;34mcountry=Colombia[0m
│   └── [00mpart-00000-7e3d3d49-39e9-4eb2-ab92-22a485291f91.c000.snappy.parquet[0m
└── [01;34mcountry=Russia[0m
    └── [00mpart-00000-c49ca623-ea69-4088-8d85-c7c2de30cc28.c000.snappy.parquet[0m

5 directories, 6 files


In [18]:
# Second VACUUM pass (identical command): this run again reports one deletion,
# and the following listing no longer contains the empty `country=Argentina`
# directory.
spark.sql("VACUUM country_people RETAIN 0 HOURS").show(truncate=False)

                                                                                

Deleted 1 files and directories in a total of 5 directories.
+-----------------------------------------------------------------------------------------------------------------+
|path                                                                                                             |
+-----------------------------------------------------------------------------------------------------------------+
|file:/Users/matthew.powers/Documents/code/my_apps/delta-examples/notebooks/pyspark/spark-warehouse/country_people|
+-----------------------------------------------------------------------------------------------------------------+



In [19]:
# Final listing: only the China, Colombia and Russia partitions remain on disk,
# together with the three commit files in `_delta_log`.
!tree spark-warehouse/country_people

[01;34mspark-warehouse/country_people[0m
├── [01;34m_delta_log[0m
│   ├── [00m00000000000000000000.json[0m
│   ├── [00m00000000000000000001.json[0m
│   └── [00m00000000000000000002.json[0m
├── [01;34mcountry=China[0m
│   └── [00mpart-00000-9a8d67fa-c23d-41a4-b570-a45405f9ad78.c000.snappy.parquet[0m
├── [01;34mcountry=Colombia[0m
│   └── [00mpart-00000-7e3d3d49-39e9-4eb2-ab92-22a485291f91.c000.snappy.parquet[0m
└── [01;34mcountry=Russia[0m
    └── [00mpart-00000-c49ca623-ea69-4088-8d85-c7c2de30cc28.c000.snappy.parquet[0m

4 directories, 6 files
