# Introduction

> The OPTIMIZE command in the data lakehouse platform is used to compact the small files of a Delta Lake table into fewer, larger files.

# Prerequisites

## Create Schema and Table

In [0]:
# Recursively delete the storage directory of quickstart_schema.users
# before the schema and table are recreated in the next cell.
dbutils.fs.rm(
    "dbfs:/user/hive/warehouse/quickstart_schema.db/users",
    recurse=True,
)

Out[3]: True

In [0]:
%sql
-- Start from a clean slate: drop the schema (and everything in it), then recreate it.
DROP SCHEMA IF EXISTS quickstart_schema CASCADE;

CREATE SCHEMA quickstart_schema;

-- Delta table partitioned by the three location columns.
CREATE TABLE quickstart_schema.users (
  id             INT,
  name           STRING,
  dob            DATE,
  email          STRING,
  gender         STRING,
  country        STRING,
  region         STRING,
  city           STRING,
  asset          INT,
  marital_status STRING
)
USING DELTA
PARTITIONED BY (country, region, city);

-- Show the column definitions of the newly created table.
DESCRIBE FORMATTED quickstart_schema.users;

col_name,data_type,comment
id,int,
name,string,
dob,date,
email,string,
gender,string,
country,string,
region,string,
city,string,
asset,int,
marital_status,string,


# Load data into partitioned table

In [0]:
# Read the CSV (first row is the header, column types are inferred) and append
# it to the partitioned Delta table.
# NOTE(review): the table history recorded at the end of this notebook lists two
# append commits, so this cell was presumably executed more than once (or with a
# different input file) -- confirm the intended inputs for a clean re-run.
(
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("dbfs:/FileStore/synechron/user_dataset/users_002.csv")
    .write
    .mode("append")
    .partitionBy("country", "region", "city")
    .saveAsTable("quickstart_schema.users")
)

In [0]:
# Directory of a single (country, region, city) partition inside the table location.
partition_path = (
    "dbfs:/user/hive/warehouse/quickstart_schema.db/users"
    "/country=India/region=Andaman and Nicobar Islands/city=Port Blair"
)

# Print only the last path segment (the file name) of every entry in the partition.
for entry in dbutils.fs.ls(partition_path):
    print(entry.path.rsplit("/", 1)[-1])

part-00000-160762c9-7bf0-4c56-b521-1d3764a47816.c000.snappy.parquet
part-00000-3738ad55-ece4-436e-8353-57004e3a486b.c000.snappy.parquet
part-00000-c2b52866-912f-40a6-a68b-9d675b34d7ec.c000.snappy.parquet


# DESCRIBE Table

In [0]:
%sql
-- Table-level metadata (location, partition columns, numFiles, sizeInBytes, ...)
-- captured before OPTIMIZE is run.
DESCRIBE DETAIL quickstart_schema.users;

format,id,name,description,location,createdAt,lastModified,partitionColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion,tableFeatures,statistics
delta,d688dc21-92f5-4cf9-9d02-c1abf713f0dc,spark_catalog.quickstart_schema.users,,dbfs:/user/hive/warehouse/quickstart_schema.db/users,2024-12-10T03:53:27.671+0000,2024-12-10T04:00:29.000+0000,"List(country, region, city)",531,1231803,Map(),1,2,"List(appendOnly, invariants)",Map()


# Optimize

In [0]:
%sql
-- Compact the small files of a single partition only. Dropping the WHERE clause
-- (i.e. OPTIMIZE quickstart_schema.users) would target the whole table instead.
OPTIMIZE quickstart_schema.users
WHERE country = 'India'
  AND region = 'Andaman and Nicobar Islands'
  AND city = 'Port Blair';

path,metrics
dbfs:/user/hive/warehouse/quickstart_schema.db/users,"List(1, 2, List(2808, 2808, 2808.0, 1, 2808), List(2546, 2552, 2549.0, 2, 5098), 1, null, 1, 2, 0, true, 0, 0, 1733804111660, 1733804123230, 8, 1, null, List(0, 0), 10, 10, 2861)"


In [0]:
# Display the raw Delta transaction-log entry for table version 3, one JSON action
# per row. In the recorded run this is the OPTIMIZE commit; the version number is
# specific to that run and will differ if the preceding cells are executed differently.
spark.read.text(
    "dbfs:/user/hive/warehouse/quickstart_schema.db/users/_delta_log/00000000000000000003.json"
).display()

value
"{""commitInfo"":{""timestamp"":1733804120095,""userId"":""6836536383695527"",""userName"":""naveenpn.trainer@gmail.com"",""operation"":""OPTIMIZE"",""operationParameters"":{""predicate"":""[\""((('country = India) AND ('region = Andaman and Nicobar Islands)) AND ('city = Port Blair))\""]"",""zOrderBy"":""[]"",""batchId"":""0"",""auto"":false},""notebook"":{""notebookId"":""4314377066703050""},""clusterId"":""1210-034629-cmqn1uuz"",""readVersion"":2,""isolationLevel"":""SnapshotIsolation"",""isBlindAppend"":false,""operationMetrics"":{""numRemovedFiles"":""2"",""numRemovedBytes"":""5098"",""p25FileSize"":""2808"",""numDeletionVectorsRemoved"":""0"",""minFileSize"":""2808"",""numAddedFiles"":""1"",""maxFileSize"":""2808"",""p75FileSize"":""2808"",""p50FileSize"":""2808"",""numAddedBytes"":""2808""},""engineInfo"":""Databricks-Runtime/12.2.x-scala2.12"",""txnId"":""6645183a-cbe0-4f47-8db5-fc88860028c4""}}"
"{""remove"":{""path"":""country=India/region=Andaman%20and%20Nicobar%20Islands/city=Port%20Blair/part-00000-3738ad55-ece4-436e-8353-57004e3a486b.c000.snappy.parquet"",""deletionTimestamp"":1733804116034,""dataChange"":false,""extendedFileMetadata"":true,""partitionValues"":{""country"":""India"",""region"":""Andaman and Nicobar Islands"",""city"":""Port Blair""},""size"":2552,""tags"":{""INSERTION_TIME"":""1733803049000000"",""MIN_INSERTION_TIME"":""1733803049000000"",""MAX_INSERTION_TIME"":""1733803049000000"",""OPTIMIZE_TARGET_SIZE"":""268435456""}}}"
"{""remove"":{""path"":""country=India/region=Andaman%20and%20Nicobar%20Islands/city=Port%20Blair/part-00000-160762c9-7bf0-4c56-b521-1d3764a47816.c000.snappy.parquet"",""deletionTimestamp"":1733804116034,""dataChange"":false,""extendedFileMetadata"":true,""partitionValues"":{""country"":""India"",""region"":""Andaman and Nicobar Islands"",""city"":""Port Blair""},""size"":2546,""tags"":{""INSERTION_TIME"":""1733803197000000"",""MIN_INSERTION_TIME"":""1733803197000000"",""MAX_INSERTION_TIME"":""1733803197000000"",""OPTIMIZE_TARGET_SIZE"":""268435456""}}}"
"{""add"":{""path"":""country=India/region=Andaman%20and%20Nicobar%20Islands/city=Port%20Blair/part-00000-c2b52866-912f-40a6-a68b-9d675b34d7ec.c000.snappy.parquet"",""partitionValues"":{""country"":""India"",""region"":""Andaman and Nicobar Islands"",""city"":""Port Blair""},""size"":2808,""modificationTime"":1733804120000,""dataChange"":false,""stats"":""{\""numRecords\"":11,\""minValues\"":{\""id\"":7,\""name\"":\""Alec Haynes\"",\""dob\"":\""2023-05-07\"",\""email\"":\""alechaynes@gmail.com\"",\""gender\"":\""Female\"",\""asset\"":155089,\""marital_status\"":\""Common Law\""},\""maxValues\"":{\""id\"":972,\""name\"":\""Vladimir Pierce\"",\""dob\"":\""2025-03-04\"",\""email\"":\""vladimirpierce@ymail.com\"",\""gender\"":\""Transgender\"",\""asset\"":967919,\""marital_status\"":\""Single\""},\""nullCount\"":{\""id\"":0,\""name\"":0,\""dob\"":0,\""email\"":0,\""gender\"":0,\""asset\"":0,\""marital_status\"":0}}"",""tags"":{""MAX_INSERTION_TIME"":""1733803197000000"",""INSERTION_TIME"":""1733803049000000"",""MIN_INSERTION_TIME"":""1733803049000000"",""OPTIMIZE_TARGET_SIZE"":""268435456""}}}"


In [0]:
%sql
-- Commit history of the Delta table, addressed by its storage path rather than by table name.
DESCRIBE HISTORY delta.`dbfs:/user/hive/warehouse/quickstart_schema.db/users`

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
3,2024-12-10T04:15:21.000+0000,6836536383695527,naveenpn.trainer@gmail.com,OPTIMIZE,"Map(predicate -> [""((('country = India) AND ('region = Andaman and Nicobar Islands)) AND ('city = Port Blair))""], zOrderBy -> [], batchId -> 0, auto -> false)",,List(4314377066703050),1210-034629-cmqn1uuz,2.0,SnapshotIsolation,False,"Map(numRemovedFiles -> 2, numRemovedBytes -> 5098, p25FileSize -> 2808, numDeletionVectorsRemoved -> 0, minFileSize -> 2808, numAddedFiles -> 1, maxFileSize -> 2808, p75FileSize -> 2808, p50FileSize -> 2808, numAddedBytes -> 2808)",,Databricks-Runtime/12.2.x-scala2.12
2,2024-12-10T04:00:29.000+0000,6836536383695527,naveenpn.trainer@gmail.com,WRITE,"Map(mode -> Append, partitionBy -> [])",,List(4314377066703050),1210-034629-cmqn1uuz,1.0,WriteSerializable,True,"Map(numFiles -> 270, numOutputRows -> 500, numOutputBytes -> 626131)",,Databricks-Runtime/12.2.x-scala2.12
1,2024-12-10T03:58:04.000+0000,6836536383695527,naveenpn.trainer@gmail.com,WRITE,"Map(mode -> Append, partitionBy -> [])",,List(4314377066703050),1210-034629-cmqn1uuz,0.0,WriteSerializable,True,"Map(numFiles -> 261, numOutputRows -> 500, numOutputBytes -> 605672)",,Databricks-Runtime/12.2.x-scala2.12
0,2024-12-10T03:53:30.000+0000,6836536383695527,naveenpn.trainer@gmail.com,CREATE TABLE,"Map(isManaged -> true, description -> null, partitionBy -> [""country"",""region"",""city""], properties -> {})",,List(4314377066703050),1210-034629-cmqn1uuz,,WriteSerializable,True,Map(),,Databricks-Runtime/12.2.x-scala2.12
