In [1]:
import pandas as pd
from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import LongType

# Declare the function and create the UDF
def multiply_func(a: pd.Series, b: pd.Series) -> pd.Series:
    """Return the element-wise product of two pandas Series.

    Written against plain pandas objects, so it can be called directly on
    local data as well as wrapped with ``pandas_udf``.

    Args:
        a: Left-hand operand.
        b: Right-hand operand.

    Returns:
        The result of ``a * b``.
    """
    product = a * b
    return product

# Wrap multiply_func as a pandas UDF whose declared Spark return type is long.
# NOTE(review): the sample data below is float, so the pandas result is float64
# while the Spark column is declared LongType — confirm that is intended
# (fractional products would not be representable as long).
multiply = pandas_udf(multiply_func, returnType=LongType())

# The function for a pandas_udf should be able to execute with local pandas data
x = pd.Series([1.0, 2.0, 3.0])
print(multiply_func(x, x))
# 0    1.0
# 1    4.0
# 2    9.0
# dtype: float64


# Create a Spark DataFrame, 'spark' is an existing SparkSession
df = spark.createDataFrame(pd.DataFrame(x, columns=["x"]))

# Execute function as a Spark vectorized UDF
df1 = df.select(multiply(col("x"), col("x")))
df1.show()

0    1.0
1    4.0
2    9.0
dtype: float64
+-------------------+
|multiply_func(x, x)|
+-------------------+
|                  1|
|                  4|
|                  9|
+-------------------+



In [2]:
import pandas as pd
from pyspark.sql.functions import pandas_udf

# Build a small one-column pandas frame locally, then hand it to Spark.
# 'spark' is an existing SparkSession.
pdf = pd.DataFrame({"a": [1, 2, 3]})
df = spark.createDataFrame(pdf)

@pandas_udf('long')
def pandas_plus_one(series: pd.Series) -> pd.Series:
    """Return the given Series with one added to every element.

    Registered as a pandas UDF with a declared return type of 'long'.

    Args:
        series: The values to increment.

    Returns:
        The result of ``series + 1``.
    """
    incremented = series + 1
    return incremented

# Apply the UDF to column "a" and print the resulting single-column DataFrame.
(
    df
    .select(pandas_plus_one(df["a"]))
    .show()
)

+------------------+
|pandas_plus_one(a)|
+------------------+
|                 2|
|                 3|
|                 4|
+------------------+

