# Spark Caching

In [1]:
# https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [2]:
# Baseline session: runs on YARN with every other setting left at its default.
spark = SparkSession.builder.appName("Yarn Default").master("yarn").getOrCreate()

In [7]:
# getOrCreate() returns an already-running session as-is, and executor sizing
# cannot be changed once its SparkContext has started. Stop any live session
# first so the settings below actually take effect.
_active_session = SparkSession.getActiveSession()
if _active_session is not None:
    _active_session.stop()

# Session for the caching experiments: three small executors with GC logging.
# All values are passed as strings for consistency.
spark = (
    SparkSession
      .builder
      .appName("Caching")
      .master("yarn")
      .config("spark.yarn.am.memoryOverhead", "384")
      .config("spark.driver.memory", "1000m")
      .config("spark.driver.extraJavaOptions", "")
      .config("spark.executor.instances", "3")
      .config("spark.executor.cores", "4")
      .config("spark.executor.memory", "1000m")
      # NOTE(review): NewSize=950M nearly fills the 1000m executor heap --
      # presumably intentional for the GC demonstration; confirm.
      .config("spark.executor.extraJavaOptions", "-XX:NewSize=950M -XX:+PrintGCDetails -XX:+PrintGCTimeStamps")
      .getOrCreate()
)

## Подготовка данных

In [3]:
# Explicit schema for the yellow-taxi trip file, built from (column, type)
# pairs listed in the order the columns appear in the data.
schema = StructType([
    StructField(column_name, column_type())
    for column_name, column_type in [
        ("VendorID", LongType),
        ("tpep_pickup_datetime", TimestampType),
        ("tpep_dropoff_datetime", TimestampType),
        ("passenger_count", DoubleType),
        ("trip_distance", DoubleType),
        ("RatecodeID", DoubleType),
        ("store_and_fwd_flag", StringType),
        ("PULocationID", LongType),
        ("DOLocationID", LongType),
        ("payment_type", LongType),
        ("fare_amount", DoubleType),
        ("extra", DoubleType),
        ("mta_tax", DoubleType),
        ("tip_amount", DoubleType),
        ("tolls_amount", DoubleType),
        ("improvement_surcharge", DoubleType),
        ("total_amount", DoubleType),
        ("congestion_surcharge", DoubleType),
        ("airport_fee", DoubleType),
    ]
])

In [4]:
# Download the yellow_tripdata_2023-01 parquet file to /tmp/taxi.parquet on the notebook host.
! wget -O /tmp/taxi.parquet 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'

--2023-04-10 07:00:49--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 52.85.114.39, 52.85.114.180, 52.85.114.114, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|52.85.114.39|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 47673370 (45M) [application/x-www-form-urlencoded]
Saving to: ‘/tmp/taxi.parquet’


2023-04-10 07:00:56 (7.66 MB/s) - ‘/tmp/taxi.parquet’ saved [47673370/47673370]



In [5]:
# Copy the downloaded file to /tmp on worker-1 (as user hdfs) so it can be loaded into HDFS from there.
! scp /tmp/taxi.parquet hdfs@worker-1:/tmp

taxi.parquet                                  100%   45MB  88.4MB/s   00:00    


In [6]:
! ssh hdfs@worker-1 'hdfs dfs -put /tmp/taxi.parquet /user/jovyan'

In [7]:
# List /user/jovyan in HDFS to confirm that taxi.parquet was uploaded.
! ssh hdfs@worker-1 'hdfs dfs -ls /user/jovyan'

Found 2 items
drwxr-xr-x   - jovyan supergroup          0 2023-04-10 07:00 /user/jovyan/.sparkStaging
-rw-r--r--   3 hdfs   supergroup   47673370 2023-04-10 07:01 /user/jovyan/taxi.parquet


In [9]:
# Rewrite the raw file as the "taxi" dataset, repartitioned into four
# partitions keyed by passenger_count; an existing "taxi" output is replaced.
(
    spark.read
        .schema(schema)
        .parquet("taxi.parquet")
        .repartition(4, "passenger_count")
        .write
        .mode("overwrite")
        .parquet("taxi")
)

In [10]:
# Load the repartitioned "taxi" dataset using the explicit schema.
taxi_df = (
    spark.read
        .schema(schema)
        .parquet("taxi")
)

In [11]:
# Count rows per Spark partition to see how the loaded data is distributed.
(
    taxi_df
        .groupBy(F.spark_partition_id())
        .count()
        .show()
)

+--------------------+-------+
|SPARK_PARTITION_ID()|  count|
+--------------------+-------+
|                   0|2493241|
|                   1| 573525|
+--------------------+-------+



## Особенности кэширования

Закэшируем запрос:

In [12]:
# Trips with fewer than five passengers, marked for caching.
# cache() is lazy: nothing is stored until an action is executed on `result`.
result = (
    taxi_df
        .filter(taxi_df.passenger_count < 5)
        .cache()
)

In [13]:
# count() is an action that evaluates the query over every partition (so a
# cached DataFrame ends up fully materialized) without shipping rows to the
# driver. take(5) then fetches only the five rows needed for the preview,
# instead of collecting the whole result just to slice it.
result.count()
result.take(5)

[Row(VendorID=2, tpep_pickup_datetime=datetime.datetime(2023, 1, 1, 0, 32, 10), tpep_dropoff_datetime=datetime.datetime(2023, 1, 1, 0, 40, 36), passenger_count=1.0, trip_distance=0.97, RatecodeID=1.0, store_and_fwd_flag='N', PULocationID=161, DOLocationID=141, payment_type=2, fare_amount=9.3, extra=1.0, mta_tax=0.5, tip_amount=0.0, tolls_amount=0.0, improvement_surcharge=1.0, total_amount=14.3, congestion_surcharge=2.5, airport_fee=0.0),
 Row(VendorID=2, tpep_pickup_datetime=datetime.datetime(2023, 1, 1, 0, 55, 8), tpep_dropoff_datetime=datetime.datetime(2023, 1, 1, 1, 1, 27), passenger_count=1.0, trip_distance=1.1, RatecodeID=1.0, store_and_fwd_flag='N', PULocationID=43, DOLocationID=237, payment_type=1, fare_amount=7.9, extra=1.0, mta_tax=0.5, tip_amount=4.0, tolls_amount=0.0, improvement_surcharge=1.0, total_amount=16.9, congestion_surcharge=2.5, airport_fee=0.0),
 Row(VendorID=2, tpep_pickup_datetime=datetime.datetime(2023, 1, 1, 0, 25, 4), tpep_dropoff_datetime=datetime.datetime(2

In [9]:
# Number of trips per passenger_count value, most frequent first.
(
    taxi_df
        .groupBy("passenger_count")
        .count()
        .orderBy(F.desc("count"))
        .show()
)

+---------------+-------+
|passenger_count|  count|
+---------------+-------+
|            1.0|2261400|
|            2.0| 451536|
|            3.0| 106353|
|           null|  71743|
|            4.0|  53745|
|            0.0|  51164|
|            5.0|  42681|
|            6.0|  28124|
|            8.0|     13|
|            7.0|      6|
|            9.0|      1|
+---------------+-------+



In [3]:
# Shut down the current session so the next one can be created with different settings.
spark.stop()

In [None]:
# Session with a larger executor heap (2g) and a reduced storage fraction.
spark = (
    SparkSession
      .builder
      .appName("Yarn Increase Heap")
      .master("yarn")
      .config("spark.yarn.am.memoryOverhead", 384)
      .config("spark.executor.instances", 3)
      .config("spark.executor.cores", 4)
      .config("spark.executor.memory", "2g")
      # Key was misspelled ("storrageFraction"); Spark ignores unknown keys
      # silently, so the setting never took effect.
      .config("spark.memory.storageFraction", "0.1")
      .config("spark.executor.extraJavaOptions", "-XX:NewSize=950M -XX:+PrintGCDetails -XX:+PrintGCTimeStamps")
      .getOrCreate()
)