In [1]:
import pyspark
from pyspark.sql.functions import explode, col
from delta.pip_utils import configure_spark_with_delta_pip

In [8]:
spark = pyspark.sql.SparkSession.builder \
    .appName('Lab9') \
    .config('spark.driver.extraClassPath', '/opt/sqljdbc_4.2/enu/jre8/sqljdbc42.jar') \
    .config('spark.executor.extraClassPath', '/opt/sqljdbc_4.2/enu/jre8/sqljdbc42.jar') \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \

spark = configure_spark_with_delta_pip(spark).getOrCreate()

#### Spark server is accessible here (click on Spark UI):

In [9]:
spark

In [10]:
def display_head(df: pyspark.sql.DataFrame, n: int):
    return df.limit(n).toPandas().head(n)

In [11]:
df = spark.read \
      .format("json") \
      .option("multiline", "true") \
      .load("Nested.json")

In [12]:
df = df.select('*', 'pathLinkInfo.*').drop('pathLinkInfo').select('*', 'elevationGain.*').drop('elevationGain')

In [13]:
display_head(df, 2)

Unnamed: 0,CaptureDate,CaptureID,FID,Geometry,attribute1,changeType_source,identifier,alternateName,captureSpecification,cycleFacility,...,formsPartOfStreet,heightingMethod,matchStatus,pathName,sourceFID,startGradeSeparation,startNode,surfaceType,elevationAgainstDirection,elevationInDirection
0,2020-12-31T00:00:00,3,24b3sdf4353fc220,Geo223,Under Construction,insert,2,,Urban,,...,[4fb3afad-2c56-4946-83d1-d734508705e2],DTM,Matched,,MissingFID,0,147b7a78-04bf-4f9a-801d-61dc534a0a5,Sealed on surface,1.3,0


In [14]:
df.columns

['CaptureDate',
 'CaptureID',
 'FID',
 'Geometry',
 'attribute1',
 'changeType_source',
 'identifier',
 'alternateName',
 'captureSpecification',
 'cycleFacility',
 'endGradeSeparation',
 'endNode',
 'fictitious',
 'formOfWay',
 'formsPartOfPath',
 'formsPartOfStreet',
 'heightingMethod',
 'matchStatus',
 'pathName',
 'sourceFID',
 'startGradeSeparation',
 'startNode',
 'surfaceType',
 'elevationAgainstDirection',
 'elevationInDirection']

In [15]:
df = df.drop("attribute1", "endGradeSeparation", "elevationAgainstDirection","formsPartOfPath" )

In [16]:
df.columns

['CaptureDate',
 'CaptureID',
 'FID',
 'Geometry',
 'changeType_source',
 'identifier',
 'alternateName',
 'captureSpecification',
 'cycleFacility',
 'endNode',
 'fictitious',
 'formOfWay',
 'formsPartOfStreet',
 'heightingMethod',
 'matchStatus',
 'pathName',
 'sourceFID',
 'startGradeSeparation',
 'startNode',
 'surfaceType',
 'elevationInDirection']

In [17]:
display_head(df, 2)

Unnamed: 0,CaptureDate,CaptureID,FID,Geometry,changeType_source,identifier,alternateName,captureSpecification,cycleFacility,endNode,...,formOfWay,formsPartOfStreet,heightingMethod,matchStatus,pathName,sourceFID,startGradeSeparation,startNode,surfaceType,elevationInDirection
0,2020-12-31T00:00:00,3,24b3sdf4353fc220,Geo223,insert,2,,Urban,,eb7f6122-f765-4113-8b77-908934437ee96,...,Path,[4fb3afad-2c56-4946-83d1-d734508705e2],DTM,Matched,,MissingFID,0,147b7a78-04bf-4f9a-801d-61dc534a0a5,Sealed on surface,0


In [18]:
### No foldLeft in python - there is no need to use one if there would be a need to use sth like this we can call functools.reduce or itertools.accumulate

In [19]:
df = spark.read \
    .format("csv") \
    .option("header", "true") \
    .load("online-retail-dataset.csv")

In [20]:
display_head(df, 30)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850,United Kingdom
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,12/1/2010 8:26,7.65,17850,United Kingdom
6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,12/1/2010 8:26,4.25,17850,United Kingdom
7,536366,22633,HAND WARMER UNION JACK,6,12/1/2010 8:28,1.85,17850,United Kingdom
8,536366,22632,HAND WARMER RED POLKA DOT,6,12/1/2010 8:28,1.85,17850,United Kingdom
9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,12/1/2010 8:34,1.69,13047,United Kingdom


In [26]:
df.write \
    .format("delta") \
    .mode("overwrite") \
    .option("maxRecordsPerFile", 100) \
    .save('./delta')

23/06/24 19:50:10 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/06/24 19:50:11 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/06/24 19:50:12 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/06/24 19:50:12 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
23/06/24 19:50:12 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/06/24 19:50:12 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/06/24 19:50:13 WARN MemoryManager: Total allocation exceeds 95.00% 

In [38]:
from time import perf_counter
from delta.tables import DeltaTable

delta_table = DeltaTable.forPath(spark, './delta')

t0 = perf_counter()
num_records = delta_table.toDF().count()
t1 = perf_counter()
print(f'Number of records: {num_records} time before optimization: {t1-t0 :.4f} s')

delta_table.optimize()

t0 = perf_counter()
num_records = delta_table.toDF().count()
t1 = perf_counter()
print(f'Number of records: {num_records} time after optimization: {t1-t0 :.4f} s')


Number of records: 541909 time before optimization: 0.2281 s
Number of records: 541909 time after optimization: 0.2561 s


In [39]:
df.write \
    .format("delta") \
    .mode("overwrite") \
    .option("zOrderBy", "Country") \
    .save('./delta_zordered')

23/06/24 19:59:56 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/06/24 19:59:56 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
23/06/24 19:59:56 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
23/06/24 19:59:56 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
23/06/24 19:59:56 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
23/06/24 19:59:56 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
23/06/24 19:59:56 WARN MemoryManager: Total allocation exceeds 95.0

In [46]:
spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")
delta_table.vacuum(0)

                                                                                

Deleted 83 files and directories in a total of 1 directories.


DataFrame[]