## Prerequisites for Spark and Arrow

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType, StructField, StructType, DoubleType, IntegerType
import pyspark.pandas as ps
from pyspark.sql import functions as F
import pandas as pd
import numpy as np

In [2]:
# Local-testing session only: comment this cell out when running on Databricks.
builder = (
    SparkSession.builder
    .master("local[4]")
    .appName("pandas-on-spark")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "1g")
    .config("spark.ui.enabled", "false")
)
# The pandas API on Spark automatically picks up this session and its configuration.
spark = builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/02 09:40:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Scalar UDFs vs pandas UDFs

see [this post](https://gist.github.com/BryanCutler/0b0c820c1beb5ffc40618c462912195f) for more

A short excursion: pandas UDFs vs. plain Python UDFs.

In [3]:
# 2**24 ids spread over 16 partitions, plus two random columns p1 and p2.
df = (
    spark.range(1 << 24, numPartitions=16)
    .toDF("id")
    .withColumn("p1", F.rand())
    .withColumn("p2", F.rand())
)

In [4]:
from math import log, exp

def scalar_func(p1, p2):
    """Compute ``p1 * p2 / w`` (with ``w = 0.5``) in log space for scalar inputs.

    Args:
        p1: First factor, a non-negative Python float.
        p2: Second factor, a non-negative Python float.

    Returns:
        ``exp(log(p1) + log(p2) - log(w))`` as a float, or ``0.0`` when either
        factor is exactly zero.

    Raises:
        ValueError: If either factor is negative (``math.log`` domain error).
    """
    w = 0.5
    # math.log(0.0) raises ValueError, whereas the NumPy-based vect_func yields
    # -inf for the log and therefore 0.0 overall. Short-circuit zeros so that
    # both implementations agree and a zero sample cannot abort the job.
    if p1 == 0 or p2 == 0:
        return 0.0
    return exp(log(p1) + log(p2) - log(w))

In [5]:
my_udf = F.udf(scalar_func, DoubleType())

result = df.withColumn("p", my_udf(F.col("p1"), F.col("p2")))

%timeit result.filter("p < 1.0").count()



4.15 s ± 22.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


                                                                                

In [6]:
def vect_func(p1, p2):
    """Vectorised counterpart of ``scalar_func``: ``p1 * p2 / w`` with ``w = 0.5``.

    The computation is done in log space with NumPy ufuncs, so ``p1`` and
    ``p2`` may be scalars or array-likes that NumPy can operate on.

    Returns:
        ``np.exp(np.log(p1) + np.log(p2) - np.log(w))``.
    """
    w = 0.5
    log_of_product = np.log(p1) + np.log(p2)
    return np.exp(log_of_product - np.log(w))

In [7]:
my_udf = F.pandas_udf(vect_func, DoubleType())

result = df.withColumn("p", my_udf(F.col("p1"), F.col("p2")))

%timeit result.filter("p < 1.0").count()



1.23 s ± 18.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


                                                                                

In [None]:
from math import log, exp

# NOTE(review): this cell re-defines scalar_func, which is already defined in an
# earlier cell of this notebook; consider keeping only one definition.
def scalar_func(p1, p2):
    """Compute ``p1 * p2 / w`` (with ``w = 0.5``) in log space for scalar inputs.

    Args:
        p1: First factor, a non-negative Python float.
        p2: Second factor, a non-negative Python float.

    Returns:
        ``exp(log(p1) + log(p2) - log(w))`` as a float, or ``0.0`` when either
        factor is exactly zero.

    Raises:
        ValueError: If either factor is negative (``math.log`` domain error).
    """
    w = 0.5
    # math.log(0.0) raises ValueError; treat a zero factor as a zero product
    # instead of failing, which matches what the NumPy-based vect_func returns.
    if p1 == 0 or p2 == 0:
        return 0.0
    return exp(log(p1) + log(p2) - log(w))

Executing a `transform` or `apply` statement differs for ps Series and ps DataFrames:
- For [Series](https://spark.apache.org/docs/latest/api/python/reference/pyspark.pandas/api/pyspark.pandas.Series.apply.html) the function is applied elementwise
- For [DataFrames](https://spark.apache.org/docs/latest/api/python/reference/pyspark.pandas/api/pyspark.pandas.DataFrame.transform.html) each function takes a pandas Series, and the pandas API on Spark computes the functions in a distributed manner (see [transform_apply](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/transform_apply.html)).

In [50]:
def get_max_ps(x: pd.Series) -> np.int64:
    """Return the maximum of one batch of a column.

    Intended for ``ps.DataFrame.apply``. There the function receives a
    *pandas* Series holding one batch of the column, not a pandas-on-Spark
    Series, which is why the parameter is annotated ``pd.Series``. The
    result is therefore one maximum per batch, not a single global maximum.

    Args:
        x: A pandas Series containing one batch of column values.

    Returns:
        The largest value in ``x``.
    """
    return x.max()

In [51]:
def get_max(x: np.int64) -> np.float64:
    """Return ``np.max(x)``.

    Used with ``ps.Series.apply``, which calls the function element by
    element, so for a scalar ``x`` the value itself is returned.
    """
    largest = np.max(x)
    return largest

In [54]:
# Series.apply calls get_max once per element, so each value maps to itself.
(
    ps.Series(range(10000))
    .apply(get_max)
    .head()
)

0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [55]:
# DataFrame.apply hands get_max_ps one batch of the column at a time,
# so the result holds one maximum per batch.
(
    ps.DataFrame({"A": range(10000)})[["A"]]
    .apply(get_max_ps)
    .head()
)

0    2499
1    4999
2    7499
3    9999
dtype: int64

How is this done with [pandas UDFs](https://spark.apache.org/docs/3.1.2/api/python/reference/api/pyspark.sql.functions.pandas_udf.html)?

- Note that the type hint should use `pandas.Series` in all cases.

In [56]:
# The Spark return type is "long" (64-bit) so that it matches the np.int64
# annotation; a 32-bit "int" could not hold maxima above 2**31 - 1.
@F.pandas_udf("long")
def get_max_pd(x: pd.Series) -> np.int64:
    """Series-to-scalar pandas UDF returning the maximum of a column.

    Args:
        x: A pandas Series with the column values passed in by Spark.

    Returns:
        The largest value in ``x``.
    """
    return x.max()

In [59]:
# Build a Spark DataFrame from a pandas frame with a single column "A" (0..99999),
# then display how many partitions it has.
sdf = spark.createDataFrame(
    pd.DataFrame({"A": range(100000)})
)
sdf.rdd.getNumPartitions()

4

In [60]:
# Apply the pandas UDF to column A and print the resulting one-row frame.
(
    sdf
    .select(get_max_pd(F.col("A")))
    .show()
)

+-------------+
|get_max_pd(A)|
+-------------+
|        99999|
+-------------+

