# 5 Spark SQL and DataFrames: Interacting with External Data Sources

In [None]:
from pyspark.sql import SparkSession

# Session handle used by the cells below; the application name is "ch5".
spark: SparkSession = (
    SparkSession.builder
    .appName("ch5")
    .getOrCreate()
)

In addition to the built-ins in `pyspark.sql.functions`, PySpark users can define their own functions, called user-defined functions (UDFs). The function below includes a `None` check so that null inputs pass through unchanged. The same logic is then implemented as a SQL query with a `CASE` expression.

In [None]:
from pyspark.sql.types import LongType


def cubed(s):
    """Return ``s`` raised to the third power, passing ``None`` through.

    The ``None`` guard keeps a missing input from reaching the ``**``
    operator, which would otherwise raise ``TypeError``.
    """
    return None if s is None else s**3


# Make the Python function callable from SQL under the name "cubed";
# its result type is declared as LongType.
spark.udf.register("cubed", cubed, returnType=LongType())

In [None]:
from pyspark.sql import types


# Ten rows produced by spark.range; the queries below read them as column "id".
df1 = spark.range(10)

# One non-null value plus two NULL rows so the NULL handling is exercised.
schema = types.StructType(
    [
        types.StructField(name="id", dataType=types.IntegerType(), nullable=True),
    ]
)
df2 = spark.createDataFrame(data=[[21], [None], [None]], schema=schema)

# Stack both inputs and expose the result to SQL as "udf_test".
df = df1.union(df2)
df.createOrReplaceTempView("udf_test")

# Variant 1: cube each id by calling the UDF named "cubed" from SQL.
spark.sql("select id, cubed(id) AS id_cubed from udf_test").show(truncate=False)

# Variant 2: the same computation written directly in SQL with a CASE expression.
query = """ 
    select id,
        CASE 
            when id is NULL then NULL
            else id * id * id
        END as id_cubed
    from udf_test
"""
spark.sql(query).show(truncate=False)

In [None]:
import pyspark.sql.functions as F

# Compare the column, not the string literal: `"id" > 2` is evaluated by
# Python itself (str vs int) and raises TypeError before Spark is called.
df.filter(F.col("id") > 2).show()

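Using regular Python functions as UDFs forces data to be exchanged between the Python and JVM processes one row at a time. A `pandas_udf` greatly reduces this overhead: data is transferred in batches and the function operates on a whole `pandas.Series` at once.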

In [None]:
import pandas as pd

from pyspark.sql.functions import pandas_udf, col
from pyspark.sql.types import LongType


def cubed(x: pd.Series) -> pd.Series:
    """Return the element-wise cube of ``x``.

    The cube is computed by repeated multiplication, so the result is a
    new Series aligned with the input.
    """
    squared = x * x
    return squared * x


# Wrap the Series-based function as a pandas UDF whose result type is LongType.
cubed_pandas_udf = pandas_udf(cubed, returnType=LongType())

# Apply it to the "id" column of a 100-row range and print the result.
(
    spark.range(100)
    .withColumn("id_cubed", cubed_pandas_udf(col("id")))
    .show()
)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window as W
import pyspark
import datetime
import json

# Rebinds `spark`; the application name requested here is "run-pyspark-code".
spark = (
    SparkSession.builder
    .appName("run-pyspark-code")
    .getOrCreate()
)


def etl(customers, orders, products):
    """Collapse the customer name columns into a single ``customer_name``.

    Args:
        customers: DataFrame with ``first_name`` and ``last_name`` columns.
        orders: Accepted but not used by the current implementation.
        products: Accepted but not used by the current implementation.

    Returns:
        ``customers`` with ``customer_name`` added (first name, a single
        space, last name) and the two source columns removed.
    """
    full_name = F.concat(F.col("first_name"), F.lit(" "), F.col("last_name"))
    with_name = customers.withColumn("customer_name", full_name)
    return with_name.drop("first_name", "last_name")


# customers_with_orders = customers.merge(orders, on='customer_id', how='left')