# Spark Caching

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [None]:
# Create (or reuse) a session that runs on YARN with every resource setting
# left at its default value.
spark = SparkSession.builder.master("yarn").appName("Yarn Default").getOrCreate()

## Подготовка данных

In [None]:
# Explicit schema for the yellow-taxi trip data, built from (column name,
# Spark type) pairs. Column order is significant and is kept as listed.
schema = StructType([
    StructField(column_name, column_type())
    for column_name, column_type in (
        ("VendorID", LongType),
        ("tpep_pickup_datetime", TimestampType),
        ("tpep_dropoff_datetime", TimestampType),
        ("passenger_count", DoubleType),
        ("trip_distance", DoubleType),
        ("RatecodeID", DoubleType),
        ("store_and_fwd_flag", StringType),
        ("PULocationID", LongType),
        ("DOLocationID", LongType),
        ("payment_type", LongType),
        ("fare_amount", DoubleType),
        ("extra", DoubleType),
        ("mta_tax", DoubleType),
        ("tip_amount", DoubleType),
        ("tolls_amount", DoubleType),
        ("improvement_surcharge", DoubleType),
        ("total_amount", DoubleType),
        ("congestion_surcharge", DoubleType),
        ("airport_fee", DoubleType),
    )
])

In [None]:
# Download the January 2023 yellow-taxi trip file to /tmp/taxi.parquet
# (-O overwrites any previous download).
! wget -O /tmp/taxi.parquet 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'

In [None]:
# Copy the downloaded file into /tmp on worker-1 as the hdfs user.
! scp /tmp/taxi.parquet hdfs@worker-1:/tmp

In [None]:
! ssh hdfs@worker-1 'hdfs dfs -put /tmp/taxi.parquet /user/jovyan'

In [None]:
# List /user/jovyan in HDFS to check that the upload is there.
! ssh hdfs@worker-1 'hdfs dfs -ls /user/jovyan'

In [None]:
# Rewrite the raw file as the "taxi" dataset: read it with the explicit
# schema, hash-repartition into 4 partitions keyed by passenger_count, and
# overwrite any previous output.
(
    spark.read.schema(schema)
    .parquet("taxi.parquet")
    .repartition(4, "passenger_count")
    .write.mode("overwrite")
    .parquet("taxi")
)

In [None]:
# Load the prepared "taxi" dataset with the explicit schema.
taxi_df = (
    spark.read
    .format("parquet")
    .schema(schema)
    .load("taxi")
)

In [None]:
def query(df):
    """Return the distinct rows of *df* whose passenger_count is below 5."""
    few_passengers = df.passenger_count < 5
    filtered = df.filter(few_passengers)
    return filtered.distinct()

## Оригинальный запрос

Выполним запрос:

In [None]:
# Build the query over the prepared data; count() is the action that
# actually executes it.
result = query(taxi_df)
result.count()

In [None]:
# Stop the current session so the next cell can build one with different
# settings (getOrCreate would otherwise hand back the running session).
spark.stop()

## Увеличить число воркеров

In [None]:
# Session for the "more workers" experiment: the only change from the
# defaults is requesting three executors from YARN. The application name
# describes that change so the run is labelled correctly in the YARN/Spark UI
# (it previously said "Increase Heap", although no memory setting is touched).
spark = (
    SparkSession
    .builder
    .appName("Yarn Increase Workers")
    .master("yarn")
    .config("spark.executor.instances", 3)
    .getOrCreate()
)

In [None]:
# The earlier taxi_df belongs to a session that has been stopped, and running
# an action on it fails with a stopped-SparkContext error. Reload the dataset
# from the session that is active now before querying it.
taxi_df = spark.read.schema(schema).parquet("taxi")
result = query(taxi_df)
result.count()

In [None]:
# Stop the current session so the next cell can build one with different
# settings (getOrCreate would otherwise hand back the running session).
spark.stop()

## Оптимизация spill данных при исполнении

In [None]:
# Session for the spill experiment: three executors with 2g of heap each,
# a larger unified memory region, and no part of it reserved for storage.
spark = (
    SparkSession
    .builder
    .appName("Yarn Eliminate Spills")
    .master("yarn")
    .config("spark.executor.instances", 3)
    .config("spark.executor.memory", "2g")
    # Share of the heap given to the unified execution + storage region.
    .config("spark.memory.fraction", "0.8")
    # Correct key is "storageFraction"; the misspelled "storrageFraction" is
    # not a Spark setting and was silently ignored. 0.0 means no storage
    # memory is protected from eviction by execution.
    .config("spark.memory.storageFraction", "0.0")
    .getOrCreate()
)

In [None]:
# The earlier taxi_df belongs to a session that has been stopped, and running
# an action on it fails with a stopped-SparkContext error. Reload the dataset
# from the session that is active now before querying it.
taxi_df = spark.read.schema(schema).parquet("taxi")
result = query(taxi_df)
result.count()

In [None]:
# Stop the current session so the next cell can build one with different
# settings (getOrCreate would otherwise hand back the running session).
spark.stop()

## Оптимизация параллелизма

In [None]:
# Session for the parallelism experiment: three executors, each with three
# cores and 2g of heap, so up to nine tasks can run at once.
spark = (
    SparkSession
    .builder
    .appName("Yarn Increase Parallelism")
    .master("yarn")
    .config("spark.executor.instances", 3)
    .config("spark.executor.cores", 3)
    .config("spark.executor.memory", "2g")
    # Share of the heap given to the unified execution + storage region.
    .config("spark.memory.fraction", "0.7")
    # Correct key is "storageFraction"; the misspelled "storrageFraction" is
    # not a Spark setting and was silently ignored.
    .config("spark.memory.storageFraction", "0.0")
    .getOrCreate()
)

In [None]:
# The earlier taxi_df belongs to a session that has been stopped, and running
# an action on it fails with a stopped-SparkContext error. Reload the dataset
# from the session that is active now before querying it.
taxi_df = spark.read.schema(schema).parquet("taxi")
result = query(taxi_df)
result.count()

In [None]:
# Stop the current session so the next cell can build one with different
# settings (getOrCreate would otherwise hand back the running session).
spark.stop()

## Избавление от перекошенных (skewed) партиций

In [None]:
# Session for the skew experiment. The resources match the parallelism run;
# only the query cell that follows changes. The application name is distinct
# so this run can be told apart from the parallelism run in the YARN/Spark UI
# (it was previously a copy of "Yarn Increase Parallelism").
spark = (
    SparkSession
    .builder
    .appName("Yarn Eliminate Skew")
    .master("yarn")
    .config("spark.executor.instances", 3)
    .config("spark.executor.cores", 3)
    .config("spark.executor.memory", "2g")
    # Share of the heap given to the unified execution + storage region.
    .config("spark.memory.fraction", "0.7")
    # Correct key is "storageFraction"; the misspelled "storrageFraction" is
    # not a Spark setting and was silently ignored.
    .config("spark.memory.storageFraction", "0.0")
    .getOrCreate()
)

In [None]:
# The earlier taxi_df belongs to a session that has been stopped, and running
# an action on it fails with a stopped-SparkContext error. Reload the dataset
# from the session that is active now.
taxi_df = spark.read.schema(schema).parquet("taxi")
# repartition(3) without a key redistributes rows round-robin, so the three
# partitions are evenly sized regardless of passenger_count.
result = query(taxi_df.repartition(3))
result.count()

In [None]:
# Stop the current session so the next cell can build one with different
# settings (getOrCreate would otherwise hand back the running session).
spark.stop()

## Бакеты

In [None]:
# Session for the bucketing experiment: same resources as the parallelism
# run, plus a warehouse directory for the managed table written by
# saveAsTable.
spark = (
    SparkSession
    .builder
    .appName("Yarn Bucketing")
    .master("yarn")
    .config("spark.executor.instances", 3)
    .config("spark.executor.cores", 3)
    .config("spark.executor.memory", "2g")
    # Share of the heap given to the unified execution + storage region.
    .config("spark.memory.fraction", "0.7")
    # Correct key is "storageFraction"; the misspelled "storrageFraction" is
    # not a Spark setting and was silently ignored.
    .config("spark.memory.storageFraction", "0.0")
    .config("spark.sql.warehouse.dir", "spark-warehouse")
    .getOrCreate()
)

In [None]:
# The earlier taxi_df belongs to a session that has been stopped, and writing
# from it fails with a stopped-SparkContext error. Reload the dataset from
# the session that is active now.
taxi_df = spark.read.schema(schema).parquet("taxi")
# Save it as the table "taxi", bucketed into 9 buckets by passenger_count,
# replacing any previous version of the table.
taxi_df.write.mode("overwrite").bucketBy(9, "passenger_count").saveAsTable("taxi")

In [None]:
# Read the "taxi" table back through the catalog and run the same query on
# it; count() is the action that triggers execution.
taxi = spark.table("taxi")
result = query(taxi)
result.count()