In [1]:
import os

import traceback
import time
import random

from duckdb.duckdb.typing import TIME_TZ
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import (StructType, StructField, IntegerType, StringType, BooleanType)
from pyspark.sql import functions as F

This notebook creates a sample DataFrame and compares processing time with and without cache and checkpoint.

In [2]:
def create_sample_df(spark_session, records_count, mode, seed=None):
    """Build a DataFrame of synthetic book ratings.

    Row ``i`` holds ``("Author<i % 100>", "Title<i % 200>", rating, year)``,
    where ``rating`` is a random integer in 1..5 and ``year`` is a random
    integer in 1995..1997 (both bounds inclusive).

    Args:
        spark_session: Object whose ``createDataFrame(data, schema)`` builds
            the result (the notebook passes its SparkSession).
        records_count: Number of rows to generate; 0 or less gives no rows.
        mode: Not used by this function; kept so existing calls keep working.
        seed: Optional seed for the random columns. When given, the same seed
            always produces the same rows, which makes repeated benchmark
            runs comparable. When ``None`` (the default) the module-level
            ``random`` generator is used, exactly as before.

    Returns:
        Whatever ``spark_session.createDataFrame`` returns for the generated
        rows and the four-column, non-nullable schema.
    """
    schema = StructType([
        StructField('author', StringType(), False),
        StructField('title', StringType(), False),
        StructField('rating', IntegerType(), False),
        StructField('year', IntegerType(), False),
    ])

    # A private generator keeps a seeded call from disturbing global random
    # state; without a seed we fall back to the shared module-level generator.
    rng = random if seed is None else random.Random(seed)

    # Rating is drawn before year for every row, so the draw order is stable.
    data = [
        (
            "Author" + str(i % 100),
            "Title" + str(i % 200),
            rng.randint(1, 5),
            rng.randint(1995, 1997),
        )
        for i in range(records_count)
    ]
    return spark_session.createDataFrame(data, schema)

def process_df_with_mode(df, mode):
    """Apply the persistence strategy named by ``mode`` to ``df``.

    Args:
        df: Object exposing ``cache()`` and ``checkpoint()`` (the notebook
            passes a Spark DataFrame).
        mode: ``"USE_CACHE"`` returns ``df.cache()``; ``"USE_CHECKPOINT"``
            returns ``df.checkpoint()``; any other value returns ``df`` as-is.

    Returns:
        The result of the selected call, or ``df`` itself when no strategy
        matches.
    """
    if mode == "USE_CACHE":
        return df.cache()
    if mode == "USE_CHECKPOINT":
        return df.checkpoint()
    # Unrecognised mode: hand the frame back unchanged.
    return df

In [3]:
# Settings for this run, gathered in one place so they are easy to tune.
APP_NAME = "Spark Cache/Checkpoint Sampler"
CHECKPOINT_DIR = "/tmp"
RECORDS_COUNT = 10_000

# Strategy handed to process_df_with_mode: "USE_CACHE" or "USE_CHECKPOINT";
# any other value leaves the DataFrame untouched.
mode = "USE_CHECKPOINT"

spark = (
    SparkSession.builder
    .appName(APP_NAME)
    .getOrCreate()
)
# Directory where df.checkpoint() will write its files.
spark.sparkContext.setCheckpointDir(CHECKPOINT_DIR)

df = create_sample_df(spark, RECORDS_COUNT, mode)

25/02/16 20:07:08 WARN Utils: Your hostname, MacBook-Air-cua-Ngoc-2.local resolves to a loopback address: 127.0.0.1; using 192.168.1.7 instead (on interface en0)
25/02/16 20:07:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/16 20:07:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Sampling with time metrics

In [4]:
print(f"Check with mode: {mode}")

# Measure wall-clock time. time.process_time() only counts CPU time spent by
# this Python process, but the Spark jobs run in the JVM while the driver
# mostly waits, so CPU time would hide the real cost of each mode.
t0 = time.perf_counter()

filter_df = df.filter("rating >= 5")
filter_df.show()

# Apply the cache / checkpoint strategy selected by `mode`
filter_df = process_df_with_mode(filter_df, mode)

# Run two aggregations over the same filtered frame so that any reuse
# enabled by the chosen mode has a chance to show up in the timing
book_count_by_author = filter_df.groupBy(F.col("author")).count()
book_count_by_author.show()

book_count_by_year = filter_df.groupBy(F.col("year")).count()
book_count_by_year.show()

t1 = time.perf_counter()
elapsed_time = t1 - t0
print(f"Elapsed time: {elapsed_time:.5f} seconds")

Check with mode: USE_CHECKPOINT


                                                                                

+--------+-------+------+----+
|  author|  title|rating|year|
+--------+-------+------+----+
| Author1| Title1|     5|1995|
| Author2| Title2|     5|1996|
| Author5| Title5|     5|1996|
| Author6| Title6|     5|1995|
|Author31|Title31|     5|1996|
|Author37|Title37|     5|1995|
|Author38|Title38|     5|1997|
|Author46|Title46|     5|1995|
|Author49|Title49|     5|1997|
|Author56|Title56|     5|1997|
|Author66|Title66|     5|1995|
|Author68|Title68|     5|1996|
|Author84|Title84|     5|1996|
|Author85|Title85|     5|1996|
|Author86|Title86|     5|1995|
|Author87|Title87|     5|1997|
|Author92|Title92|     5|1997|
|Author94|Title94|     5|1996|
|Author96|Title96|     5|1997|
|Author97|Title97|     5|1997|
+--------+-------+------+----+
only showing top 20 rows

+--------+-----+
|  author|count|
+--------+-----+
|Author41|   18|
|Author58|   18|
|Author46|   22|
|Author31|   21|
|Author68|   20|
|Author47|   22|
| Author6|   15|
|Author37|   19|
|Author93|   21|
|Author28|   20|
| Author0

In [5]:
# Stop the SparkSession created earlier in this notebook now that the timed run is done.
spark.stop()