In [167]:
from time import sleep

from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast, col, expr
from pyspark.sql.types import *

# Local-mode session; getOrCreate() returns the existing session if one is already running.
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)
# Broadcast-join threshold: 1_000_000 is roughly 1 MB (Spark documents this setting in bytes).
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", 1_000_000)
sc = spark.sparkContext

### Как создать DataFrame

In [168]:
# Read the JSON files under data/cars without supplying a schema.
# NOTE(review): "inferSchema" is documented as a CSV reader option; the JSON reader appears to
# infer the schema whenever none is given - confirm whether this option has any effect here.
first_df = (
    spark.read
    .format("json")
    .option("inferSchema", "true")
    .load("data/cars")
)

In [169]:
# Print the schema of the DataFrame that was loaded without an explicit schema.
first_df.printSchema()

In [170]:
# Display the first 10 rows without truncating long cell values.
first_df.show(n=10, truncate=False)

In [171]:
# Defining the schema by hand: one .add(name, type) call per column, in column order.
# Every field keeps the default nullable=True, exactly as StructField(name, type) would.
cars_schema = (
    StructType()
    .add("Name", StringType())
    .add("Acceleration", DoubleType())
    .add("Cylinders", IntegerType())
    .add("Displacement", DoubleType())
    .add("Horsepower", LongType())
    .add("Miles_per_Gallon", DoubleType())
    .add("Origin", StringType())
    .add("Weight_in_lbs", LongType())
    .add("Year", StringType())
)

In [172]:
# Load the same JSON data, this time applying the hand-written schema.
cars_df = (
    spark.read
    .format("json")
    .schema(cars_schema)
    .load("data/cars")
)

In [173]:
# Show the first 5 cars with at least 7 cylinders, without truncating cell values.
(
    cars_df
    .where(col("Cylinders") >= 7)
    .show(n=5, truncate=False)
)

In [174]:
# Count the cars with at least 7 cylinders using the DataFrame API.
# The count is the cell's last expression, so it is shown as the cell output.
(
    cars_df
    .where(col("Cylinders") >= 7)
    .count()
)

In [175]:
# Compare the execution time with the same filter expressed through the RDD API:
# here the predicate is a Python lambda evaluated on each Row.
(
    cars_df.rdd
    .filter(lambda car: car["Cylinders"] >= 7)
    .count()
)

In [176]:
# How to compare the schemas of two DataFrames

# Re-read the data without an explicit schema so there is a schema to compare against.
first_df_inferSchema = (
    spark.read
    .format("json")
    .option("inferSchema", "true")
    .load("data/cars")
)

first_df_inferSchema.printSchema()
cars_df.printSchema()

# Compare the schemas as sets of fields, so the order of the fields does not matter.
print("Схемы равны?", set(first_df_inferSchema.schema) == set(cars_df.schema))

In [177]:
# Turn both schemas into sets of fields so they can be diffed in either direction.
inferred_schema = set(first_df_inferSchema.schema)
actual_schema = set(cars_df.schema)

# Fields present on one side only (a field differing in any attribute shows up on both sides).
print('поля только в вычисленной схеме:', inferred_schema.difference(actual_schema))
print('поля только в заданной схеме:', actual_schema.difference(inferred_schema))

### Catalyst Optimiser = логическая оптимизация и кодогенерация

In [178]:
# Adaptive Query Execution (AQE) =
#    Coalescing Post Shuffle Partitions
#  + Converting sort-merge join to broadcast join
#  + Optimizing Skew Join
#
# The settings are read in the same order as the placeholders in the template below.
# NOTE: the third value is the autoBroadcastJoinThreshold setting, not an on/off flag.
print("""AQE Configuration:
\tAQE enabled: {}
\tCoalescing Post Shuffle Partitions enabled: {}
\tConverting sort-merge join to broadcast join: {}
\tOptimizing Skew Join: {}
""".format(
    *[
        spark.conf.get(setting)
        for setting in (
            "spark.sql.adaptive.enabled",
            "spark.sql.adaptive.coalescePartitions.enabled",
            "spark.sql.autoBroadcastJoinThreshold",
            "spark.sql.adaptive.skewJoin.enabled",
        )
    ]
))

In [179]:
# Cars with more than 4 cylinders, plus a derived column "new" = Acceleration + 10,
# ordered by Horsepower (descending) and then by Acceleration (ascending).
most_powered_df = (
    cars_df
    .where(cars_df["Cylinders"] > 4)
    .withColumn("new", expr("Acceleration + 10"))
    .sort(cars_df["Horsepower"].desc(), cars_df["Acceleration"].asc())
)

# Extended explain: prints the logical plans as well as the physical plan.
most_powered_df.explain(True)

In [180]:
# Join the filtered/sorted DataFrame back to the full cars DataFrame on the "Name" column.
for_plan_df = most_powered_df.join(cars_df, on="Name")

In [181]:
# Print the physical plan of the join built in the previous cell.
for_plan_df.explain()

In [182]:
# Lower the automatic broadcast-join threshold to 10 (Spark documents this setting in bytes).
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", 10)
# Read the setting back; as the cell's last expression it is shown as the cell output.
spark.conf.get("spark.sql.autoBroadcastJoinThreshold")

In [183]:
# The plan for for_plan_df has already been built, so broadcast join will still be used
for_plan_df.explain()

In [184]:
# For a new join operation (the very same one) broadcast join is no longer available
most_powered_df.join(cars_df, on="Name").explain()

In [185]:
# A broadcast join can be requested explicitly with the broadcast() hint.
# broadcast is imported in the notebook's import cell together with the other
# pyspark.sql.functions names, so this cell does not need its own import.
(
    most_powered_df
    .join(broadcast(cars_df), "Name")
    .explain()
)

In [187]:
# Display the first 10 rows of the filtered and sorted DataFrame.
most_powered_df.show(n=10)