In [1]:
"""
Author: Matt Martin
Date: 2/7/24
Desc: Demonstrates how to register a UDF in Spark so that it can be called from Spark SQL.
"""

## create the spark connection/instance
from pyspark.sql import SparkSession

# Build the session as one parenthesised chain; each .config() call sets a
# single Spark property before the session is created (or fetched).
spark = (
    SparkSession.builder
    .appName("sequence")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.instances", 10)
    .getOrCreate()
)

24/02/07 07:19:15 WARN Utils: Your hostname, MattMBP.local resolves to a loopback address: 127.0.0.1; using 172.20.3.16 instead (on interface en0)
24/02/07 07:19:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/07 07:19:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *

In [3]:
@udf(returnType=IntegerType())
def add_some_stuff(x, y) -> int:
    """Add two column values; usable as a Spark UDF.

    Args:
        x: first operand, or None when the column value is SQL NULL.
        y: second operand, or None when the column value is SQL NULL.

    Returns:
        ``x + y``, or None (SQL NULL) when either operand is None.
    """
    # Spark hands SQL NULLs to Python UDFs as None. `None + 1` raises
    # TypeError and fails the whole query, so propagate NULL instead,
    # which is also what the built-in SQL `+` operator does.
    if x is None or y is None:
        return None
    return x + y



In [6]:
# Expose the Python UDF to Spark SQL under the name used in SQL text.
spark.udf.register(name='udf_add_some_stuff', f=add_some_stuff)

<pyspark.sql.udf.UserDefinedFunction at 0x1133d41d0>

In [4]:
# Two sample rows, each holding the x and y operands passed to the UDF.
data = [dict(x=1, y=3), dict(x=5, y=2)]
df = spark.createDataFrame(data)

# Make the DataFrame queryable from Spark SQL as the table "numbers".
df.createOrReplaceTempView("numbers")

In [7]:
# Call the registered UDF by its SQL name against the "numbers" temp view.
sql = """
    select x, y, udf_add_some_stuff(x,y) as res
    from numbers
"""
query_result = spark.sql(sql)
query_result.show()

                                                                                

+---+---+---+
|  x|  y|res|
+---+---+---+
|  1|  3|  4|
|  5|  2|  7|
+---+---+---+

