# Caching

In [0]:
from pyspark.storagelevel import StorageLevel
from pyspark.sql.types import *
import pyspark.sql.functions as F

In [0]:
%run ../PySpark/DatasetSourcePath

In [0]:
# `sourcePath` is not defined in this notebook; it is expected to come from the
# DatasetSourcePath notebook pulled in by the `%run` cell above.
customers_file = sourcePath + "/dataset/data_skew/customers.parquet"

# Read the customers Parquet dataset into a DataFrame.
df_customers = spark.read.format("parquet").load(customers_file)

In [0]:
# Peek at the first five rows without truncating long column values.
df_customers.show(n=5, truncate=False)

In [0]:
# Base frame for the caching demo: Boston customers tagged with an age bucket.
#
# Bucket rules for `customer_group`:
#   20-30 (inclusive) -> "young"
#   31-50 (inclusive) -> "mid"
#   over 50           -> "old"
#   anything else     -> "kid" (ages below 20, and rows whose age matches no
#                        branch, e.g. a null age)
df_base = (
    df_customers
    .filter(F.col("city") == "boston")
    .withColumn(
        "customer_group",
        F.when(
            F.col("age").between(20, 30),
            F.lit("young")
        )
        .when(
            F.col("age").between(31, 50),
            F.lit("mid")
        )
        .when(
            # Was `> 51`, which left age 51 unmatched (the "mid" bucket ends at
            # 50) so those customers were mislabelled as "kid".
            F.col("age") > 50,
            F.lit("old")
        )
        .otherwise(F.lit("kid"))
    )
    .select("cust_id", "name", "age", "gender", "birthday", "zip", "city", "customer_group")
)

# cache() only marks the frame for caching; nothing is computed until an action
# runs, so the show() below is what first triggers the computation.
df_base.cache()
df_base.show(5, False)

In [0]:
# First frame derived from the cached base: adds a constant marker column and
# the element at index 2 of `birthday` split on "/" (presumably the year --
# confirm against the date format in the dataset).
df1 = df_base.withColumn(
    "test_column_1", F.lit("test_column_1")
).withColumn(
    "birth_year", F.split(F.col("birthday"), "/").getItem(2)
)

# Print the extended plans so the effect of caching df_base can be inspected,
# then preview five untruncated rows.
df1.explain(extended=True)
df1.show(n=5, truncate=False)

In [0]:
# Second frame derived from the same cached base: adds a different marker
# column and the element at index 1 of `birthday` split on "/" (presumably the
# month -- confirm against the date format in the dataset).
df2 = df_base.withColumn(
    "test_column_2", F.lit("test_column_2")
).withColumn(
    "birth_month", F.split(F.col("birthday"), "/").getItem(1)
)

# Print the extended plans, then preview five untruncated rows.
df2.explain(extended=True)
df2.show(n=5, truncate=False)

## `StorageLevel` Types:

(As of Spark `3.4`)

- `DISK_ONLY`: memory efficient, but CPU intensive and slow to access, because data is serialized when stored on disk
- `DISK_ONLY_2`: disk only, replicated 2x
- `DISK_ONLY_3`: disk only, replicated 3x

- `MEMORY_AND_DISK`: spills to disk if there's no space in memory
- `MEMORY_AND_DISK_2`: memory and disk, replicated 2x
- `MEMORY_AND_DISK_DESER` (default): same as `MEMORY_AND_DISK`, but data held in memory is kept deserialized for fast access

- `MEMORY_ONLY`: CPU efficient, memory intensive
- `MEMORY_ONLY_2`: memory only, replicated 2x - for resilience, if one executor fails

**Note**: 
- `SER` levels are CPU intensive but save memory because the data is compact, while `DESER` levels are CPU efficient but memory intensive
- Data takes less space on disk because it is stored in serialized format, whereas in memory it is kept deserialized as JVM objects for faster access

### When to use what?
```
Storage Level    Space used  CPU time  In memory  On-disk  Serialized
---------------------------------------------------------------------
MEMORY_ONLY          High        Low       Y          N        N         
MEMORY_ONLY_SER      Low         High      Y          N        Y     
MEMORY_AND_DISK      High        Medium    Some       Some     Some  
MEMORY_AND_DISK_SER  Low         High      Some       Some     Y     
DISK_ONLY            Low         High      N          Y        Y     
```

In [0]:
# Drop the existing cache entry first, then re-register df_base with a
# disk-only storage level replicated 2x.
df_base.unpersist()
df_base.persist(StorageLevel.DISK_ONLY_2)

# NOTE: this rebinds `df2` (defined in an earlier cell) to a frame with the
# same derived columns as `df1`; the name is kept because a later cell calls
# df2.explain().
df2 = df_base.withColumn(
    "test_column_1", F.lit("test_column_1")
).withColumn(
    "birth_year", F.split(F.col("birthday"), "/").getItem(2)
)

# Preview five untruncated rows.
df2.show(n=5, truncate=False)

In [0]:
# Print the extended plans for df2 as currently bound.
df2.explain(extended=True)