In [1]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import *
from pyspark.sql import functions as F

# Create (or reuse) a Spark session running in local mode with 4 worker
# threads, with Arrow-based Spark <-> pandas conversion enabled.
spark = (
    SparkSession.builder
    .appName("Spark Pandas UDF")
    .master("local[4]")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)

# Optional: add .config("spark.sql.execution.arrow.maxRecordsPerBatch", "10000")
# to the builder chain above to set the maximum number of records per Arrow batch.

In [2]:
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "false")
df = spark.range(1 << 10).toDF("id").withColumn("x", rand())

%time pandas_df = df.toPandas()

CPU times: user 406 ms, sys: 96 ms, total: 502 ms
Wall time: 2.67 s


In [3]:
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
df = spark.range(1 << 10).toDF("id").withColumn("x", rand())

%time pandas_df = df.toPandas()

CPU times: user 876 µs, sys: 17.5 ms, total: 18.3 ms
Wall time: 675 ms


In [4]:
import numpy as np
import pandas as pd
import pyarrow as pa

In [5]:
# Print the current value of the Arrow conversion flag
# (the label is Russian for "Arrow enabled:").
print(
    "Arrow включен:",
    spark.conf.get("spark.sql.execution.arrow.pyspark.enabled"),
)

Arrow включен: true


In [6]:
# Small pandas frame with a numeric column (containing NaN), a string column
# and a boolean column, converted into an Arrow table.
pdf = pd.DataFrame(
    {
        "temp": [-17, np.nan, 4],
        "month": ["january", "february", "march"],
        "full": [True, False, True],
    }
)
table = pa.Table.from_pandas(pdf)
table

pyarrow.Table
temp: double
month: string
full: bool
----
temp: [[-17,null,4]]
month: [["january","february","march"]]
full: [[true,false,true]]

In [7]:
# Convert the Arrow table back to pandas and show its first rows.
pdf = table.to_pandas()
pdf.head(5)

Unnamed: 0,temp,month,full
0,-17.0,january,True
1,,february,False
2,4.0,march,True


In [8]:
# Base roster of (employee_name, department, salary) records.
emps_base = [
    ("James", "Sales", 3000), ("John", "ServiceDesk", 4600), ("Michael", "Sales", 4600), ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000), ("James", "Sales", 3000), ("Scott", "Finance", 3300), ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000), ("Kumar", "Marketing", 2000), ("Saif", "Sales", 4100),
]

# Fixed seed so that Restart & Run All yields the same row order every time.
EMPS_SEED = 42

# Repeat every base record 500 times to get a larger data set.
emps = [record for record in emps_base for _ in range(500)]

# Shuffle in place with a seeded generator; elements stay plain tuples.
np.random.default_rng(EMPS_SEED).shuffle(emps)

In [9]:
# Turn the generated records into a Spark DataFrame with named columns and
# write it as Parquet, replacing any previous output at that path.
(
    spark.createDataFrame(emps)
    .toDF("employee_name", "department", "salary")
    .write
    .mode("overwrite")
    .parquet("../out/emps")
)

In [10]:
import pyarrow.parquet as pq

# Read the Parquet directory back with pyarrow and show the first five rows.
empsTable = pq.ParquetDataset("../out/emps").read()
empsTable[:5]

pyarrow.Table
employee_name: string
department: string
salary: int64
----
employee_name: [["Michael","Kumar","James","Jen","Saif"]]
department: [["Sales","Marketing","Sales","Finance","Sales"]]
salary: [[4600,2000,3000,3900,4100]]

In [11]:
# Arrow table -> pandas DataFrame -> Spark DataFrame.
pdf = empsTable.to_pandas()
empsDf = spark.createDataFrame(data=pdf)
empsDf.show(n=5)

  [(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]


+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|      Michael|     Sales|  4600|
|        Kumar| Marketing|  2000|
|        James|     Sales|  3000|
|          Jen|   Finance|  3900|
|         Saif|     Sales|  4100|
+-------------+----------+------+
only showing top 5 rows



In [12]:
# Collect the Spark DataFrame into pandas and show its first rows.
empsPdf = empsDf.toPandas()
empsPdf.head(5)

Unnamed: 0,employee_name,department,salary
0,Michael,Sales,4600
1,Kumar,Marketing,2000
2,James,Sales,3000
3,Jen,Finance,3900
4,Saif,Sales,4100


# Pandas UDFs

In [13]:
# Reload the employees data set from the Parquet directory written earlier.
empsDf = spark.read.format("parquet").load("../out/emps")

In [28]:
@F.pandas_udf("long")
def doubleSalary(salary: pd.Series) -> pd.Series:
    """Vectorized pandas UDF: return every value of ``salary`` multiplied by two."""
    doubled = salary * 2
    return doubled


# Register under the same name so the UDF can also be called from Spark SQL.
spark.udf.register("doubleSalary", doubleSalary)

<function __main__.doubleSalary(salary: pandas.core.series.Series) -> pandas.core.series.Series>

In [17]:
# Add a column computed by the pandas UDF through the DataFrame API.
(
    empsDf
    .withColumn("new_salary", doubleSalary(F.col("salary")))
    .show(5)
)

+-------------+----------+------+----------+
|employee_name|department|salary|new_salary|
+-------------+----------+------+----------+
|         Saif|     Sales|  4100|      8200|
|         Saif|     Sales|  4100|      8200|
|      Michael|     Sales|  4600|      9200|
|       Robert|     Sales|  4100|      8200|
|       Robert|     Sales|  4100|      8200|
+-------------+----------+------+----------+
only showing top 5 rows



In [18]:
# Expose the DataFrame to Spark SQL and call the registered UDF from a query.
empsDf.createOrReplaceTempView("emps")
spark.sql(
    "select employee_name, department, salary, doubleSalary(salary) new_salary from emps"
).show(5)

+-------------+----------+------+----------+
|employee_name|department|salary|new_salary|
+-------------+----------+------+----------+
|         Saif|     Sales|  4100|      8200|
|         Saif|     Sales|  4100|      8200|
|      Michael|     Sales|  4600|      9200|
|       Robert|     Sales|  4100|      8200|
|       Robert|     Sales|  4100|      8200|
+-------------+----------+------+----------+
only showing top 5 rows



In [19]:
# Use the explicit ``F.`` namespace (as ``doubleSalary`` does) instead of
# relying on the wildcard import for ``pandas_udf``.
@F.pandas_udf("double")
def mean_udf(v: pd.Series) -> float:
    """Series-to-scalar pandas UDF: return the arithmetic mean of ``v``.

    Args:
        v: values of one column, delivered as a pandas Series.

    Returns:
        The mean of ``v``, as computed by ``pandas.Series.mean``.
    """
    return v.mean()

In [20]:
# Mean salary over the whole data set, computed by the aggregating pandas UDF.
empsDf.select(mean_udf(empsDf.salary)).show()

+-----------------+
| mean_udf(salary)|
+-----------------+
|3509.090909090909|
+-----------------+



In [27]:
# Mean salary per department, using the pandas UDF as a grouped aggregate.
(
    empsDf
    .groupBy("department")
    .agg(mean_udf(empsDf["salary"]))
    .show()
)

+-----------+----------------+
| department|mean_udf(salary)|
+-----------+----------------+
|    Finance|          3400.0|
|  Marketing|          2500.0|
|      Sales|          3760.0|
|ServiceDesk|          4600.0|
+-----------+----------------+



In [22]:
# One window per department; the pandas UDF is evaluated over each window and
# its result is attached to every row as ``mean_v``.
windowSpec = Window.partitionBy("department")

(
    empsDf
    .withColumn("mean_v", mean_udf(empsDf["salary"]).over(windowSpec))
    .show(5)
)

+-------------+----------+------+------+
|employee_name|department|salary|mean_v|
+-------------+----------+------+------+
|        Scott|   Finance|  3300|3400.0|
|        Maria|   Finance|  3000|3400.0|
|        Maria|   Finance|  3000|3400.0|
|        Scott|   Finance|  3300|3400.0|
|          Jen|   Finance|  3900|3400.0|
+-------------+----------+------+------+
only showing top 5 rows



In [38]:
def subtract_mean(pdf: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``pdf`` with an added ``sal_diff`` column.

    ``sal_diff`` is each row's ``salary`` minus the mean ``salary`` of the
    frame passed in. The input frame itself is not modified.
    """
    salaries = pdf["salary"]
    centered = salaries - salaries.mean()
    return pdf.assign(sal_diff=centered)

In [39]:
# Run subtract_mean once per department group and show the combined result.
# NOTE(review): the schema declares salary as double, while the column read
# from Parquet is integral (int64 in the pyarrow schema) - confirm the
# widening is intended.
(
    empsDf
    .groupBy("department")
    .applyInPandas(
        subtract_mean,
        schema="employee_name string, department string, salary double, sal_diff double",
    )
    .show(5)
)

+-------------+----------+------+--------+
|employee_name|department|salary|sal_diff|
+-------------+----------+------+--------+
|        Scott|   Finance|3300.0|  -100.0|
|        Maria|   Finance|3000.0|  -400.0|
|        Maria|   Finance|3000.0|  -400.0|
|        Scott|   Finance|3300.0|  -100.0|
|          Jen|   Finance|3900.0|   500.0|
+-------------+----------+------+--------+
only showing top 5 rows

