In [50]:
import os

import traceback
import time
import random

from duckdb.duckdb.typing import TIME_TZ
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import (StructType, StructField, IntegerType, StringType, BooleanType)
from pyspark.sql import functions as F

Notebook about create a sample date frame and create a comparison with and without cache, endpoint

In [51]:
def create_sample_df(spark_session, records_count, mode):
    schema = StructType([StructField('author', StringType(), False),
                         StructField('title', StringType(), False),
                         StructField('rating', IntegerType(), False),
                         StructField('year', IntegerType(), False)])
    data = [("Author" + str(i % 100), "Title" + str(i % 200), random.randint(1,5), random.randint(1995,1997)) for i in range(records_count)]
    return spark_session.createDataFrame(data, schema)

def process_df_with_mode(df, mode):
    processed_df = df
    if mode == "USE_CACHE":
        processed_df = df.cache()
    elif mode == "USE_CHECKPOINT":
        processed_df =  df.checkpoint()
    return processed_df

In [52]:
spark = SparkSession.builder.appName("Spark Cache/Checkpoint Sampler").getOrCreate()
spark.sparkContext.setCheckpointDir("/tmp")
mode = "USE_CACHE"
df = create_sample_df(spark, 500000, mode)

Sampling with time metrics

In [53]:
print("Check with mode: " + str(mode))
t0 = time.process_time()
filter_df = df.filter("rating >= 5")
filter_df.show()

# Start processing dataframe bases on mode
filter_df = process_df_with_mode(filter_df, mode)

# Start do analysis
book_count_by_author = filter_df.groupBy(F.col("author")).count()
book_count_by_author.show()

book_count_by_year = filter_df.groupBy(F.col("year")).count()
book_count_by_year.show()

t1 = time.process_time()
elapsed_time = t1 - t0
print(f"Elapsed time: {elapsed_time:.5f} seconds")

Check with mode: USE_CACHE


25/02/10 20:29:27 WARN TaskSetManager: Stage 0 contains a task of very large size (1800 KiB). The maximum recommended task size is 1000 KiB.


+--------+-------+------+----+
|  author|  title|rating|year|
+--------+-------+------+----+
| Author3| Title3|     5|1997|
| Author7| Title7|     5|1997|
| Author8| Title8|     5|1995|
| Author9| Title9|     5|1995|
|Author12|Title12|     5|1997|
|Author21|Title21|     5|1996|
|Author22|Title22|     5|1997|
|Author24|Title24|     5|1996|
|Author26|Title26|     5|1997|
|Author31|Title31|     5|1995|
|Author33|Title33|     5|1996|
|Author39|Title39|     5|1997|
|Author45|Title45|     5|1996|
|Author46|Title46|     5|1997|
|Author50|Title50|     5|1996|
|Author55|Title55|     5|1996|
|Author66|Title66|     5|1997|
|Author77|Title77|     5|1995|
|Author87|Title87|     5|1996|
|Author90|Title90|     5|1996|
+--------+-------+------+----+
only showing top 20 rows



25/02/10 20:29:28 WARN TaskSetManager: Stage 1 contains a task of very large size (1800 KiB). The maximum recommended task size is 1000 KiB.
25/02/10 20:29:28 WARN TaskSetManager: Stage 2 contains a task of very large size (1800 KiB). The maximum recommended task size is 1000 KiB.
25/02/10 20:29:28 WARN TaskSetManager: Stage 5 contains a task of very large size (1800 KiB). The maximum recommended task size is 1000 KiB.


+--------+-----+
|  author|count|
+--------+-----+
|Author41|  964|
|Author58| 1052|
|Author46|  990|
|Author45|  973|
|Author39|  961|
|Author31| 1038|
|Author68| 1030|
|Author47| 1036|
| Author6| 1014|
|Author26| 1038|
|Author37| 1005|
|Author93|  980|
|Author73| 1000|
| Author0|  997|
|Author28|  988|
|Author48|  999|
|Author77| 1023|
|Author23| 1022|
|Author16| 1032|
|Author57|  975|
+--------+-----+
only showing top 20 rows

+----+-----+
|year|count|
+----+-----+
|1997|33326|
|1996|33332|
|1995|33448|
+----+-----+

Elapsed time: 0.01085 seconds


In [54]:
spark.stop()