In [1]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import *
from pyspark.sql import functions as F

# Build (or reuse) a local SparkSession running on 4 worker threads.
# The legacy time-parser policy is switched on explicitly through config.
spark = (
    SparkSession.builder
    .appName("Spark Window UDF")
    .master("local[4]")
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
    .getOrCreate()
)

In [2]:
# Sample rows as (employee_name, department, salary) tuples.
# ("James", "Sales", 3000) appears twice, and several salaries tie.
simpleData = [
    ("James", "Sales", 3000),
    ("John", "ServiceDesk", 4600),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
]

In [3]:
# Column names applied, in order, to the positional tuples in simpleData.
employee_columns = ["employee_name", "department", "salary"]

# Create the DataFrame from the raw tuples, then rename its columns.
employeeDF = spark.createDataFrame(simpleData).toDF(*employee_columns)

In [4]:
# Preview the first 10 rows without truncating long cell values.
employeeDF.show(n=10, truncate=False)

# Window functions

In [6]:
# Expose the DataFrame to Spark SQL under the view name "employee".
employeeDF.createOrReplaceTempView(name="employee")

In [7]:
# How to find the second-highest salary among employees:
# number all rows by descending salary and keep the salary whose dense_rank is 2.
# row_number and rank are computed in the same subquery alongside dense_rank.
second_salary_query = """select distinct salary from (
                                select
                                    employee_name,
                                    department,
                                    salary,
                                    row_number() OVER (ORDER BY salary DESC) as row_num,
                                    rank() OVER (ORDER BY salary DESC) as rank,
                                    dense_rank() OVER (ORDER BY salary DESC) as dense_rank
                              from employee) where dense_rank = 2"""

result_sql_df = spark.sql(second_salary_query)

In [8]:
# Print the plan Spark built for the second-salary query.
result_sql_df.explain(extended=False)

In [9]:
# Display the second-salary result (default row limit and truncation).
result_sql_df.show(n=20, truncate=True)

## Window functions in Spark DSL

In [10]:
# Window ordered by salary, highest first. There is no partitionBy, so the
# window spans the whole DataFrame. A per-department variant would be:
#   Window.partitionBy("department").orderBy(F.col("salary").desc())
windowSpec = Window.orderBy(F.desc("salary"))

In [11]:
# Three ranking functions evaluated over the same salary-descending window.
rank_col = F.rank().over(windowSpec)
row_number_col = F.row_number().over(windowSpec)
dense_rank_col = F.dense_rank().over(windowSpec)

# Append them in the order rank, row_number, dense_rank.
result_with_rank_df = (
    employeeDF
    .withColumn("rank", rank_col)
    .withColumn("row_number", row_number_col)
    .withColumn("dense_rank", dense_rank_col)
)

In [12]:
# Print the plan for the DSL ranking query.
result_with_rank_df.explain(extended=False)

In [13]:
# Display the rows together with their rank, row_number and dense_rank.
result_with_rank_df.show(n=20, truncate=True)

In [14]:
# Using a single partition is dangerous: never write code like this in PRODUCTION.
# Note that COUNT(*) OVER () here spans the entire row set, which may not fit
# in memory => OOM.
query = """
select 
    employee_name, 
    department, 
    salary, 
    count(*) OVER () as cnt
from employee
"""

result_sql_df = spark.sql(sqlQuery=query)

In [15]:
# Print the plan for the unpartitioned COUNT(*) OVER () query.
result_sql_df.explain(extended=False)

In [16]:
# Display every employee row next to the cnt column.
result_sql_df.show(n=20, truncate=True)

In [27]:
# Spark DSL counterpart of `count(*) OVER ()` (single partition — anti-pattern).
# The window specification is deliberately empty: with no partitioning and no
# ordering every row belongs to one window, so each row receives the total row
# count. An ordered window (such as windowSpec) would instead produce a running
# count, which is why it is not reused here. The column is named "cnt" to match
# the SQL version of this query.
whole_table_window = Window.partitionBy()
single_part_df_1 = employeeDF.withColumn(
    "cnt",
    F.count(F.lit(1)).over(whole_table_window),
)

In [28]:
# Print the plan for the unpartitioned DSL window count.
single_part_df_1.explain(extended=False)

In [29]:
# Display the result of the unpartitioned DSL window count.
single_part_df_1.show(n=20, truncate=True)

In [30]:
# The right way: compute the total once with a plain count() action,
# then attach that number to every row as a literal — no window involved.
cnt = employeeDF.count()

count_literal = F.lit(cnt)
result_with_count_df = employeeDF.withColumn("count", count_literal)

In [31]:
# Print the plan for the literal-count variant.
result_with_count_df.explain(extended=False)

In [32]:
# Display the rows with the precomputed total in the "count" column.
result_with_count_df.show(n=20, truncate=True)

In [33]:
# Do not use row_number without partitioning => Exchange SinglePartition
row_number_over_all = F.row_number().over(windowSpec)

single_part_df_2 = employeeDF.withColumn("row_num", row_number_over_all)

In [34]:
# Print the plan for the unpartitioned row_number query.
single_part_df_2.explain(extended=False)

In [35]:
# Display the rows numbered by the unpartitioned window.
single_part_df_2.show(n=20, truncate=True)

In [37]:
# A better approach: generate row ids with monotonically_increasing_id(),
# which needs no window specification.
unique_id_col = F.monotonically_increasing_id()
result_with_uniq_num = employeeDF.withColumn("row_num", unique_id_col)

# Report how many partitions employeeDF has (the printed label is Russian for
# "number of partitions" and is kept as-is).
partition_count = employeeDF.rdd.getNumPartitions()
print("Колличество партиций: ", partition_count)

In [38]:
# Print the plan for the monotonically_increasing_id variant.
result_with_uniq_num.explain(extended=False)

In [39]:
# Display the rows with their generated row_num ids.
result_with_uniq_num.show(n=20, truncate=True)

# UDF, UDAF: User-Defined Functions

In [40]:
# Define the UDF body as a plain Python callable.
def lambda_is_between_1972_and_1974(year):
    """Return whether ``year`` lies in the inclusive range 1972..1974.

    Args:
        year: An integer year, or ``None`` when the input value is missing
            (for example a SQL NULL handed to the function as a UDF argument).

    Returns:
        ``True`` or ``False`` for an actual year. ``None`` when ``year`` is
        ``None``, so a missing input yields a missing output instead of
        raising ``TypeError`` on the comparison.
    """
    if year is None:
        return None
    return 1972 <= year <= 1974

In [41]:
# Wrap the callable as a UDF for the DataFrame API via the `udf` function.
# The return type is declared explicitly: without it PySpark falls back to
# StringType, and the boolean result would come back as a string column.
is_between_1972_and_1974 = F.udf(lambda_is_between_1972_and_1974, "boolean")

In [42]:
# ...or register it under a SQL-callable name through the session's UDF registry.
# The return type is declared explicitly so that SQL sees a real boolean
# (the default for a plain Python function is a string result).
spark.udf.register("isBetween72And74", lambda_is_between_1972_and_1974, "boolean")

In [44]:
# Load the cars dataset from JSON under the relative path data/cars.
# NOTE(review): "inferSchema" is documented as a CSV reader option; presumably
# it has no effect on the JSON reader — confirm before relying on it.
cars = (
    spark.read
    .option("inferSchema", "true")
    .json("data/cars")
)

# Preview the first 5 rows.
cars.show(n=5)

In [47]:
# SQL-expression variant: call the registered UDF by name on the leading four
# characters of Year cast to int, drop rows with a null Year, then deduplicate.
year_flag_exprs = [
    "Year",
    "isBetween72And74(CAST(SUBSTR(YEAR, 0, 4) as int)) as between_72_and_74",
]

cars_72_74_df = (
    cars
    .selectExpr(*year_flag_exprs)
    .filter("Year is not null")
    .distinct()
)

In [48]:
# Print the plan for the selectExpr + registered-UDF query.
cars_72_74_df.explain(extended=False)

In [50]:
# Display the distinct Year values with their between_72_and_74 flag.
cars_72_74_df.show(n=20, truncate=True)

In [51]:
# DataFrame-API variant of the same query, using the `udf`-wrapped callable.
# The derived column is aliased so it carries the same name as in the
# selectExpr variant rather than an auto-generated expression string.
# NOTE(review): Spark's substring positions are 1-based; a start of 0 appears
# to be treated as the first character — confirm.
year_as_int = cars.Year.substr(0, 4).cast("int")

cars_72_74_df = (
    cars
    .select(
        cars.Year,
        is_between_1972_and_1974(year_as_int).alias("between_72_and_74"),
    )
    .filter(F.col("Year").isNotNull())
    .distinct()
)

In [52]:
# Print the plan for the DataFrame-API UDF query.
cars_72_74_df.explain(extended=False)

In [53]:
# Display the result of the DataFrame-API UDF query.
cars_72_74_df.show(n=20, truncate=True)

In [56]:
# Expose the cars DataFrame to Spark SQL under the view name "cars".
cars.createOrReplaceTempView(name="cars")

# The inner query derives an integer Y from Year for rows with a non-null Year;
# the outer query keeps the distinct (Name, Year) pairs for which the
# registered UDF evaluates to true on Y.
query = """
SELECT DISTINCT Name
     , Year
  FROM (
    SELECT CAST(SUBSTR(YEAR, 0, 4) as int) Y
         , *
      FROM cars
     WHERE Year IS NOT NULL
    )
 WHERE isBetween72And74(Y) = true
"""

cars_72_74_df = spark.sql(sqlQuery=query)

In [57]:
# Print the plan for the pure-SQL UDF query.
cars_72_74_df.explain(extended=False)

In [58]:
# Display the first 5 matching (Name, Year) rows without truncation.
cars_72_74_df.show(n=5, truncate=False)