In [0]:

# Sample employee records used by the cells below. The data contains nulls in
# several columns, one row that is entirely null, and two identical rows for id 7.
emp_rows = [
    (1, 'manish', 26, 20000, 'india', 'IT'),
    (2, 'rahul', None, 40000, 'germany', 'engineering'),
    (3, 'pawan', 12, 60000, 'india', 'sales'),
    (4, 'roshini', 44, None, 'uk', 'engineering'),
    (5, 'raushan', 35, 70000, 'india', 'sales'),
    (6, None, 29, 200000, 'uk', 'IT'),
    (7, 'adam', 37, 65000, 'us', 'IT'),
    (8, 'chris', 16, 40000, 'us', 'sales'),
    (None, None, None, None, None, None),
    (7, 'adam', 37, 65000, 'us', 'IT'),
]
emp_columns = ["id", "name", "age", "salary", "country", "dept"]

# Only column names are supplied; Spark infers the column types from the rows.
emp_df = spark.createDataFrame(emp_rows, schema=emp_columns)

emp_df.show()

+----+-------+----+------+-------+-----------+
|  id|   name| age|salary|country|       dept|
+----+-------+----+------+-------+-----------+
|   1| manish|  26| 20000|  india|         IT|
|   2|  rahul|null| 40000|germany|engineering|
|   3|  pawan|  12| 60000|  india|      sales|
|   4|roshini|  44|  null|     uk|engineering|
|   5|raushan|  35| 70000|  india|      sales|
|   6|   null|  29|200000|     uk|         IT|
|   7|   adam|  37| 65000|     us|         IT|
|   8|  chris|  16| 40000|     us|      sales|
|null|   null|null|  null|   null|       null|
|   7|   adam|  37| 65000|     us|         IT|
+----+-------+----+------+-------+-----------+



In [0]:
from pyspark.sql.functions import *

# Add an "adult" flag derived from age:
#   age < 18   -> "No"
#   age >= 18  -> "Yes"
#   null age   -> "NoValue" (neither comparison is true for null, so it
#                 falls through to otherwise()).
# The second branch is inclusive (>= 18): with the previous strict `> 18`,
# an employee aged exactly 18 matched neither branch and was labelled
# "NoValue" as if the age were missing.
emp_df.withColumn(
    "adult",
    when(col("age") < 18, "No")
    .when(col("age") >= 18, "Yes")
    .otherwise("NoValue"),
).show()

+----+-------+----+------+-------+-----------+-------+
|  id|   name| age|salary|country|       dept|  adult|
+----+-------+----+------+-------+-----------+-------+
|   1| manish|  26| 20000|  india|         IT|    Yes|
|   2|  rahul|null| 40000|germany|engineering|NoValue|
|   3|  pawan|  12| 60000|  india|      sales|     No|
|   4|roshini|  44|  null|     uk|engineering|    Yes|
|   5|raushan|  35| 70000|  india|      sales|    Yes|
|   6|   null|  29|200000|     uk|         IT|    Yes|
|   7|   adam|  37| 65000|     us|         IT|    Yes|
|   8|  chris|  16| 40000|     us|      sales|     No|
|null|   null|null|  null|   null|       null|NoValue|
|   7|   adam|  37| 65000|     us|         IT|    Yes|
+----+-------+----+------+-------+-----------+-------+



In [0]:
# Replace a missing age with 18, then derive the "adult" flag from the filled column.
# The comparison is strictly `> 18`, so an age of exactly 18 (including every
# row that received the default) is flagged "No".
age_filled = when(col("age").isNull(), lit(18)).otherwise(col("age"))
adult_flag = when(col("age") > 18, "Yes").otherwise("No")

(
    emp_df
    .withColumn("age", age_filled)      # overwrite "age" with the null-free version
    .withColumn("adult", adult_flag)    # evaluated against the filled "age"
    .show()
)

+----+-------+---+------+-------+-----------+-----+
|  id|   name|age|salary|country|       dept|adult|
+----+-------+---+------+-------+-----------+-----+
|   1| manish| 26| 20000|  india|         IT|  Yes|
|   2|  rahul| 18| 40000|germany|engineering|   No|
|   3|  pawan| 12| 60000|  india|      sales|   No|
|   4|roshini| 44|  null|     uk|engineering|  Yes|
|   5|raushan| 35| 70000|  india|      sales|  Yes|
|   6|   null| 29|200000|     uk|         IT|  Yes|
|   7|   adam| 37| 65000|     us|         IT|  Yes|
|   8|  chris| 16| 40000|     us|      sales|   No|
|null|   null| 18|  null|   null|       null|   No|
|   7|   adam| 37| 65000|     us|         IT|  Yes|
+----+-------+---+------+-------+-----------+-----+



In [0]:
# Bucket employees by age:
#   0 < age < 18    -> "Minor"
#   18 <= age < 30  -> "Minor"
# There is no otherwise(), so any other age (and a null age) yields null.
# The lower bound of the second bucket is inclusive (>= 18): with the previous
# strict `> 18`, age 18 fell between the two ranges and came out as null.
# NOTE(review): both buckets return the same label "Minor"; the second one
# looks like a copy-paste slip -- confirm the intended label for 18-29.
age_category = (
    when((col("age") > 0) & (col("age") < 18), "Minor")
    .when((col("age") >= 18) & (col("age") < 30), "Minor")
)
emp_df.withColumn("age_category", age_category).show()

+----+-------+----+------+-------+-----------+------------+
|  id|   name| age|salary|country|       dept|age_category|
+----+-------+----+------+-------+-----------+------------+
|   1| manish|  26| 20000|  india|         IT|       Minor|
|   2|  rahul|null| 40000|germany|engineering|        null|
|   3|  pawan|  12| 60000|  india|      sales|       Minor|
|   4|roshini|  44|  null|     uk|engineering|        null|
|   5|raushan|  35| 70000|  india|      sales|        null|
|   6|   null|  29|200000|     uk|         IT|       Minor|
|   7|   adam|  37| 65000|     us|         IT|        null|
|   8|  chris|  16| 40000|     us|      sales|       Minor|
|null|   null|null|  null|   null|       null|        null|
|   7|   adam|  37| 65000|     us|         IT|        null|
+----+-------+----+------+-------+-----------+------------+



In [0]:
# Expose the DataFrame to Spark SQL under the name "emp_tbl".
emp_df.createOrReplaceTempView("emp_tbl")

# Same age bucketing expressed as a SQL CASE expression:
#   0 < age < 18    -> 'major'
#   18 <= age < 30  -> 'minor'
#   age is null     -> 'major'
# There is no ELSE branch, so ages of 30 and above yield null.
# The second range uses `age>=18`: with the previous strict `age>18`, age 18
# matched no branch and came out as null.
# NOTE(review): the labels read inverted (under-18 is 'major', 18-29 is
# 'minor') -- confirm the intended naming.
spark.sql("""
          select 
            *,
            case when age>0 and age<18 then 'major'
                 when age>=18 and age<30 then 'minor'
                 when age is null then 'major'
            end as age_category 
          from emp_tbl 
          """).show()


+----+-------+----+------+-------+-----------+------------+
|  id|   name| age|salary|country|       dept|age_category|
+----+-------+----+------+-------+-----------+------------+
|   1| manish|  26| 20000|  india|         IT|       minor|
|   2|  rahul|null| 40000|germany|engineering|       major|
|   3|  pawan|  12| 60000|  india|      sales|       major|
|   4|roshini|  44|  null|     uk|engineering|        null|
|   5|raushan|  35| 70000|  india|      sales|        null|
|   6|   null|  29|200000|     uk|         IT|       minor|
|   7|   adam|  37| 65000|     us|         IT|        null|
|   8|  chris|  16| 40000|     us|      sales|       major|
|null|   null|null|  null|   null|       null|       major|
|   7|   adam|  37| 65000|     us|         IT|        null|
+----+-------+----+------+-------+-----------+------------+

