In [0]:
# Small-file simulation on a Delta table.
#
# Steps: load the October CSV, (re)create a Delta table from it, append several
# tiny batches so the table accumulates extra files, then compare the file
# count / total size reported by DESCRIBE DETAIL before and after OPTIMIZE.
#
# NOTE(review): `spark` is not defined in this cell; it is assumed to be an
# already-available SparkSession provided by the notebook runtime.

# Tunables for the experiment, kept in one place so the table name is not
# repeated in every statement.
SOURCE_CSV_PATH = "/Volumes/workspace/advecom/advecom_data/2019-Oct.csv"
TABLE_NAME = "oct_events_delta_new"
SMALL_APPEND_COUNT = 8    # how many tiny append commits to make
SMALL_APPEND_ROWS = 400   # rows written by each tiny append


def describe_table_detail(table_name):
    """Return the first row of ``DESCRIBE DETAIL <table_name>``.

    The caller reads the ``numFiles`` and ``sizeInBytes`` fields from the
    returned row.
    """
    return spark.sql(f"DESCRIBE DETAIL {table_name}").collect()[0]


def print_file_stats(heading, table_detail):
    """Print ``heading`` followed by the file count and total size in bytes.

    ``table_detail`` is a row as returned by ``describe_table_detail``.
    """
    print(heading)
    print("\nNumber of Files: ", table_detail['numFiles'])
    print("\nTotal File Size: ", table_detail['sizeInBytes'])


# Load data into spark dataFrame (first CSV line is treated as the header)
october_df = spark.read.option("header", True).csv(SOURCE_CSV_PATH)

# Drop table to avoid incorrect previous schemas issues
spark.sql(f"DROP TABLE IF EXISTS {TABLE_NAME}")

# Write dataFrame as new delta table
october_df.write.format("delta").mode("overwrite").saveAsTable(TABLE_NAME)

# Simulate small file problem by appending multiple times; each append is a
# separate small write to the same table.
for _ in range(SMALL_APPEND_COUNT):
    october_df.limit(SMALL_APPEND_ROWS).write.format("delta").mode("append").saveAsTable(TABLE_NAME)

# Check number and size of files before optimization.
# `tableinfo_before` is kept as a top-level name because the next cell reads it.
tableinfo_before = describe_table_detail(TABLE_NAME)
print_file_stats("Before Optimization:", tableinfo_before)

# Optimize table
spark.sql(f"OPTIMIZE {TABLE_NAME}")

# Check number and size of files after optimization.
# `tableinfo_after` is likewise read by the next cell.
tableinfo_after = describe_table_detail(TABLE_NAME)
print_file_stats("\n\nAfter Optimization:", tableinfo_after)



Before Optimization:

Number of Files:  51

Total File Size:  1493565463


After Optimization:

Number of Files:  41

Total File Size:  1499012468


In [0]:
# Observation from the simulation.
# Each figure is computed as (before - after) from the two DESCRIBE DETAIL rows
# captured in the previous cell, so a positive number is a reduction after
# OPTIMIZE and a negative number is an increase.
print("Observation from the simulation of small files problem")
for metric_label, detail_field in (
    ("Difference in Number of Files", 'numFiles'),
    ("Difference in File Size", 'sizeInBytes'),
):
    print(metric_label, tableinfo_before[detail_field] - tableinfo_after[detail_field])

Observation from the simulation of small files problem
Difference in Number of Files 10
Difference in File Size -5447005
