In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, lit, col
from pyspark.sql.types import DoubleType
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("SparkDFUDFs")
    .getOrCreate()
)

In [6]:
# Load the office data: the first CSV row is the header and column types are
# inferred from the values.
df = (
    spark.read
    .option("header", "True")
    .option("inferSchema", "True")
    .csv("OfficeData.csv")
)
df.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



In [7]:
# Increment rates per state: (fraction of salary, fraction of bonus).
_INCREMENT_RATES = {
    "NY": (0.10, 0.05),
    "CA": (0.12, 0.03),
}


def get_incr(state, salary, bonus):
    """Compute an employee's increment from their state, salary, and bonus.

    The increment is ``salary * salary_rate + bonus * bonus_rate``, where the
    two rates depend on the state:

    * ``"NY"``: 10% of salary plus 5% of bonus
    * ``"CA"``: 12% of salary plus 3% of bonus

    Args:
        state: State code; matched exactly (case-sensitive) against the
            supported codes.
        salary: Numeric salary, or ``None``.
        bonus: Numeric bonus, or ``None``.

    Returns:
        The increment as a ``float``, or ``None`` when the state is not one of
        the supported codes or when any input is ``None``. When used as a
        Spark UDF, ``None`` becomes a null cell instead of failing the job.
    """
    rates = _INCREMENT_RATES.get(state)
    # Unknown/missing state or null amounts: nothing sensible to compute.
    if rates is None or salary is None or bonus is None:
        return None
    salary_rate, bonus_rate = rates
    return salary * salary_rate + bonus * bonus_rate

In [8]:
# Wrap get_incr as a Spark UDF returning a double. The function is passed
# directly; a forwarding lambda adds nothing.
incrUDF = udf(get_incr, DoubleType())

In [18]:
# Add the per-row increment computed by the UDF. Column names are spelled
# exactly as in the DataFrame schema (lowercase), so the lookup does not depend
# on Spark's case-insensitive column resolution being enabled.
df = df.withColumn(
    "Increment",
    incrUDF(col("state"), col("salary"), col("bonus")),
)

In [19]:
# Mark the DataFrame (including the UDF-derived Increment column) for caching.
# As the cell's last expression, the returned DataFrame's repr is displayed.
df.cache()

DataFrame[employee_name: string, department: string, state: string, salary: int, age: int, bonus: int, Increment: double]

In [22]:
# Print the rows, now including the Increment column.
df.show()

+-------------+----------+-----+------+---+-----+---------+
|employee_name|department|state|salary|age|bonus|Increment|
+-------------+----------+-----+------+---+-----+---------+
|        James|     Sales|   NY| 90000| 34|10000|   9500.0|
|      Michael|     Sales|   NY| 86000| 56|20000|   9600.0|
|       Robert|     Sales|   CA| 81000| 30|23000|  10410.0|
|        Maria|   Finance|   CA| 90000| 24|23000|  11490.0|
|        Raman|   Finance|   CA| 99000| 40|24000|  12600.0|
|        Scott|   Finance|   NY| 83000| 36|19000|   9250.0|
|          Jen|   Finance|   NY| 79000| 53|15000|   8650.0|
|         Jeff| Marketing|   CA| 80000| 25|18000|  10140.0|
|        Kumar| Marketing|   NY| 91000| 50|21000|  10150.0|
+-------------+----------+-----+------+---+-----+---------+



In [None]:
# Shut down the Spark session once the notebook is finished.
spark.stop()