In [1]:
import os
# Extra arguments handed to the PySpark launcher: fetch the Dropwizard
# metrics-servlets package and start a pyspark shell.
# NOTE(review): presumably this must be set before the SparkSession is created — confirm.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages io.dropwizard.metrics:metrics-servlets:4.2.0 pyspark-shell'

from pyspark.sql import SparkSession, Window
# NOTE(review): the wildcard imports put every exported name into the notebook
# namespace; the shared-variables cell later rebinds `sum` to the Python builtin,
# which suggests the builtin is shadowed here. Explicit imports would avoid that.
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Local SparkSession on 4 cores, shared by every cell in this notebook.
# Automatic broadcast joins are disabled so that joins only use a broadcast when
# broadcast() is requested explicitly. -1 is the value documented by Spark for
# disabling the threshold (0 only happens to behave similarly).
spark = (
    SparkSession.builder
    .config("spark.sql.autoBroadcastJoinThreshold", -1)
    .appName("Spark Optimizations")
    .master("local[4]")
    .getOrCreate()
)

In [2]:
# Load the movies JSON data set and spread it over 200 partitions.
movies_df = (
    spark.read
    .format("json")
    .option("inferSchema", "true")
    .load("data/movies")
    .repartition(200)
)

In [3]:
# Questions for this section:
# - What is wrong with a SinglePartition plan?
# - How can a column be added with row_number() and count()?
# - read.parquet(...).count() makes use of the schema
#   (NOTE(review): the original note was terse — confirm the intended point).

# A window with an empty partitionBy() spans the whole data set, ordered by
# Title with nulls placed last.
whole_dataset = Window.partitionBy().orderBy(col("Title").asc_nulls_last())

In [4]:
# Number every title over the un-partitioned window, then print the physical
# plan to see how Spark executes it.
single_part_df = movies_df.select(
    col("Title"),
    row_number().over(whole_dataset).alias("row_num"),
)
single_part_df.explain()

In [5]:
# First 10 numbered titles, without truncating long values.
single_part_df.show(10, truncate=False)

In [31]:
# Same numbering idea without a window: monotonically_increasing_id() assigns
# the ids instead. Print the plan to compare it with the windowed version.
non_single_part_df = movies_df.select(
    col("Title"),
    monotonically_increasing_id().alias("row_num"),
)
non_single_part_df.explain()
# single_part_df.sample(0.1).show()

In [49]:
# The first partition starts at 0,
# the second at 8589934592, i.e. 1 << 33,
# and every following partition starts another 1 << 33 higher.
non_single_part_df.show(17, truncate=False)

In [51]:
# 2
# How can ALL the data be read from the cache?
# Partial caching: only the partitions that some action actually computed are
# cached, which is why part of the data can come from the cache and the rest
# from the source.

# 10000 consecutive ids split into 100 partitions.
# cache() == .persist(StorageLevel.MEMORY_AND_DISK)
partition_of_100_df = spark.range(start=0, end=10000, step=1, numPartitions=100).cache()


In [56]:
# An action that touches only one partition caches only that fraction of the data
# (1% here) - see "Fraction Cached" at http://localhost:4040/storage/
# A partially filled cache can give inconsistent results; use .count() to put ALL the data into the cache.
# Deserialized storage keeps rows as Java objects; serialized storage keeps them as Array[Byte].

partition_of_100_df.count()

In [58]:
# Peek at the first 10 rows of the cached range.
partition_of_100_df.show(n=10)

In [59]:
# Shows data on local disk and disk spill.
# InMemoryRelation - the cached relation that holds the data.

partition_of_100_df.explain()
# InMemoryTableScan - the scan over the cached relation (loads the data into the cache on first use).


In [None]:
# 3 Coalesce vs repartition


# 4 Join optimisation

In [60]:
# Fact DataFrame: crime records read from CSV with a header row and an
# inferred schema.
crime_facts = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("data/crimes/crime.csv")
)

# Mark the DataFrame as cached and run count() so the data is actually read.
crime_facts.cache().count()

In [64]:
# Catalyst optimiser

# Moving a filter closer to the data source: the filter is written after the
# aggregation here; the extended plan printed below shows where it ends up.
grouped_crime_df = (
    crime_facts
    .groupBy(col("OFFENSE_CODE"))
    .count()
    .filter(col("OFFENSE_CODE") == 1402)
)

grouped_crime_df.explain(extended=True)
grouped_crime_df.show()

In [67]:
# A small dimension table.
# NOTE(review): the "с" in the variable name below looks like a Cyrillic letter
# (U+0441) rather than a Latin "c" — confirm, and if so rename it consistently
# in every cell that uses it.
offense_сodes = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("data/crimes/offense_codes.csv")
)

offense_сodes.show(5, truncate=False)

In [72]:
# Sort-merge join is what Spark picks by default here.
# Count the crimes per offence name for names starting with "ROBBERY",
# most frequent first.
rob_sort_merge_df = (
    crime_facts
    .join(offense_сodes, col("CODE") == col("OFFENSE_CODE"))
    .filter(col("NAME").startswith("ROBBERY"))
    .groupBy(col("NAME"))
    .count()
    .orderBy(col("count").desc())
)

rob_sort_merge_df.explain(extended=True)

In [75]:
# Top 5 robbery categories from the default join, untruncated.
rob_sort_merge_df.show(5, truncate=False)

In [76]:
# Compare with a broadcast join: the same query, but the small dimension table
# is explicitly wrapped in broadcast().
rob_broadcast_df = (
    crime_facts
    .join(broadcast(offense_сodes), col("CODE") == col("OFFENSE_CODE"))
    .filter(col("NAME").startswith("ROBBERY"))
    .groupBy(col("NAME"))
    .count()
    .orderBy(col("count").desc())
)

rob_broadcast_df.explain(extended=True)

In [78]:
# Top 5 robbery categories from the broadcast join, untruncated.
rob_broadcast_df.show(5, truncate=False)

# Shared variables

In [97]:
# Spark guarantees that accumulators are updated exactly once inside each action.
# Spark does NOT guarantee exactly-once accumulator updates inside transformations.

# `builtins` is the supported way to reach the real builtin functions;
# `__builtins__` is a CPython implementation detail that may be a module or a dict.
import builtins

sc = spark.sparkContext

# Accumulators are written on the EXECUTORS and read on the DRIVER.
accum = sc.accumulator(0)

# Broadcast variables are written on the DRIVER and read on the EXECUTORS.
broadcastVar = sc.broadcast([10, 20, 30])
broadcastVar.value  # not displayed: only the last expression of a cell is shown

# Rebind `sum` to the Python builtin (presumably shadowed by the wildcard
# pyspark.sql.functions import at the top of the notebook).
sum = builtins.sum


def my_mapper(x):
    """Return ``x`` plus the sum of the broadcast list.

    Also adds 1 to ``accum`` on every call. Updating an accumulator inside a
    transformation is discouraged because the update is not guaranteed to be
    applied exactly once.
    """
    accum.add(1)  # better not to do this
    # Use builtins.sum explicitly so the mapper does not depend on what the
    # global name `sum` is currently bound to.
    return x + builtins.sum(broadcastVar.value)


# foreach() is an action and returns nothing, so `res` is always None;
# the outcome of this job is observed through `accum`.
res = sc.parallelize([1, 2, 3, 4]) \
  .map(my_mapper) \
  .foreach(lambda x: accum.add(x))

# Without task re-execution: 4 (one per my_mapper call) + 61 + 62 + 63 + 64 = 254.
accum.value