# Cache & Persist in RDDs

In [3]:
from pyspark.sql import SparkSession

# Build (or fetch) the Spark session for this notebook, using the "local[1]" master.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName('app')
    .getOrCreate()
)
sc = spark.sparkContext

# RDD over customer.csv, read with textFile; used by the RDD timing cells below.
rdd = sc.textFile('customer.csv')

In [11]:
import timeit

# Run four actions on the RDD back to back and report the total wall-clock
# time. The results of the actions are discarded; only the duration matters.
start = timeit.default_timer()
for action in (rdd.count, rdd.min, rdd.max, rdd.collect):
    action()
end = timeit.default_timer()

print("elapsed time:{}".format(end - start))

elapsed time:5.635955899953842


In [35]:
# Mark the RDD for caching. The call returns the RDD itself, which is why the
# cell echoes the RDD's description as its output.
# NOTE(review): per the Spark docs caching is lazy -- nothing is stored until
# the next action runs; confirm the timing cell is executed after this one.
rdd.cache()

customer.csv MapPartitionsRDD[5] at textFile at NativeMethodAccessorImpl.java:0

In [13]:
import timeit

# Same workload as the earlier RDD timing cell (count, min, max, collect),
# measured again so the two elapsed times can be compared.
start = timeit.default_timer()
for action in (rdd.count, rdd.min, rdd.max, rdd.collect):
    action()
end = timeit.default_timer()

print("elapsed time:{}".format(end - start))

elapsed time:5.440546400030144


In [34]:
# Release the RDD's cached/persisted state. unpersist() returns the RDD, so the
# cell output is the RDD's description.
rdd.unpersist()

customer.csv MapPartitionsRDD[5] at textFile at NativeMethodAccessorImpl.java:0

In [25]:
import pyspark

# Clear any existing storage level first, then persist with MEMORY_AND_DISK.
# unpersist() hands back the RDD, so the two calls can be chained and the cell
# still evaluates to the RDD.
# NOTE(review): the unpersist is presumably needed because Spark refuses to
# change the storage level of an RDD that already has one -- confirm.
rdd.unpersist().persist(pyspark.StorageLevel.MEMORY_AND_DISK)

customer.csv MapPartitionsRDD[5] at textFile at NativeMethodAccessorImpl.java:0

# cache(), persist() & unpersist() in DataFrames

In [70]:
import timeit

# Load the pipe-delimited population file (header row, inferred column types).
# The read happens before the timer starts, so it is not part of the measurement.
df = spark.read.options(
    header=True,
    inferSchema=True,
    delimiter='|',
).csv("uspopulation.csv")

# Timed section: build the per-city sum and trigger it with two actions.
start = timeit.default_timer()
df = (
    df.groupBy("City")
    .sum("2019_estimate")
)  # from here on `df` is the aggregated frame, not the raw file
df.collect()  # result is discarded
df.show()
end = timeit.default_timer()

print("elapsed time:{}".format(end - start))

+----------------+------------------+
|            City|sum(2019_estimate)|
+----------------+------------------+
|           Tyler|            106985|
|           Tempe|            195805|
|          Corona|            169868|
|     Springfield|            114230|
|        Thornton|            141464|
|         Phoenix|           1680992|
|        McKinney|            199177|
|    Fort Collins|            170243|
|         Anaheim|            350365|
|        Temecula|            114761|
|          Dallas|           1343573|
|San Francisco[g]|            881549|
|         Oakland|            433031|
|          Laredo|            262491|
|       Oceanside|            175742|
|      Scottsdale|            258069|
|      Naperville|            148449|
|         Fontana|            214547|
|     San Antonio|           1547253|
|     Bakersfield|            384145|
+----------------+------------------+
only showing top 20 rows

elapsed time:4.200958099914715


In [71]:
# Mark the aggregated DataFrame (City, sum(2019_estimate)) for caching. The call
# returns the DataFrame, so the cell output is its schema summary.
df.cache()

DataFrame[City: string, sum(2019_estimate): bigint]

In [81]:
import timeit

# Re-read the file and rebuild the same aggregation as the earlier DataFrame
# timing cell, so the elapsed time can be compared with the first run.
# The read itself happens before the timer starts.
df = spark.read.options(
    header=True,
    inferSchema=True,
    delimiter='|',
).csv("uspopulation.csv")

# Timed section: per-city sum followed by two actions.
start = timeit.default_timer()
df = (
    df.groupBy("City")
    .sum("2019_estimate")
)  # `df` is rebound to the aggregated frame
df.collect()  # result is discarded
df.show()
end = timeit.default_timer()

print("elapsed time:{}".format(end - start))

+----------------+------------------+
|            City|sum(2019_estimate)|
+----------------+------------------+
|           Tyler|            106985|
|           Tempe|            195805|
|          Corona|            169868|
|     Springfield|            114230|
|        Thornton|            141464|
|         Phoenix|           1680992|
|        McKinney|            199177|
|    Fort Collins|            170243|
|         Anaheim|            350365|
|        Temecula|            114761|
|          Dallas|           1343573|
|San Francisco[g]|            881549|
|         Oakland|            433031|
|          Laredo|            262491|
|       Oceanside|            175742|
|      Scottsdale|            258069|
|      Naperville|            148449|
|         Fontana|            214547|
|     San Antonio|           1547253|
|     Bakersfield|            384145|
+----------------+------------------+
only showing top 20 rows

elapsed time:1.4182319999672472


In [75]:
# Release the DataFrame's cached/persisted state. unpersist() returns the
# DataFrame, so the cell output is its schema summary.
df.unpersist()

DataFrame[City: string, sum(2019_estimate): bigint]

In [66]:
import pyspark

# Persist the aggregated DataFrame with an explicit storage level
# (MEMORY_AND_DISK). persist() returns the DataFrame, so the cell output is its
# schema summary.
df.persist(pyspark.StorageLevel.MEMORY_AND_DISK)

DataFrame[City: string, sum(2019_estimate): bigint]