In [1]:
import pyspark
from pyspark.sql import SparkSession

# Obtain (or reuse) the Spark session that backs every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Column names, in the same positional order as the fields of each row tuple below.
columns = [
    "first_name",
    "middle_name",
    "last_name",
    "dob",
    "gender",
    "salary",
]

# Sample rows. Empty strings stand in for missing values
# (middle_name, last_name, dob and gender each have at least one).
data = [
    ("James",   "",     "Smith",    "36636", "M", 60000),
    ("Michael", "Rose", "",         "40288", "M", 70000),
    ("Robert",  "",     "Williams", "42114", "",  400000),
    ("Maria",   "Anne", "Jones",    "39192", "F", 500000),
    ("Jen",     "Mary", "Brown",    "",      "F", 0),
]

# Column types are inferred from the Python values; only the names are supplied.
df = spark.createDataFrame(data, schema=columns)

# Print the inferred schema, then the full, untruncated contents.
df.printSchema()
df.show(truncate=False)

root
 |-- first_name: string (nullable = true)
 |-- middle_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+----------+-----------+---------+-----+------+------+
|first_name|middle_name|last_name|dob  |gender|salary|
+----------+-----------+---------+-----+------+------+
|James     |           |Smith    |36636|M     |60000 |
|Michael   |Rose       |         |40288|M     |70000 |
|Robert    |           |Williams |42114|      |400000|
|Maria     |Anne       |Jones    |39192|F     |500000|
|Jen       |Mary       |Brown    |     |F     |0     |
+----------+-----------+---------+-----+------+------+



In [None]:
# Derive a new column with when().otherwise()

In [2]:
from pyspark.sql.functions import col, when
# Conditional column expression shared by both variants below:
# "M" -> "Male", "F" -> "Female", anything else (e.g. the empty string) -> "Unknown".
gender_label = (
    when(col("gender") == "M", "Male")
    .when(col("gender") == "F", "Female")
    .otherwise("Unknown")
)

# Variant 1: append the derived column with withColumn().
df2 = df.withColumn("new_gender", gender_label)
df2.show(truncate=False)

# Variant 2: same result through select(), keeping every existing column ("*")
# and aliasing the expression.
# show() only prints and returns None, so the DataFrame is assigned first and
# displayed in a separate statement; chaining .show() onto the assignment would
# leave df22 bound to None instead of the DataFrame.
df22 = df.select(col("*"), gender_label.alias("new_gender"))
df22.show(truncate=False)


+----------+-----------+---------+-----+------+------+----------+
|first_name|middle_name|last_name|dob  |gender|salary|new_gender|
+----------+-----------+---------+-----+------+------+----------+
|James     |           |Smith    |36636|M     |60000 |Male      |
|Michael   |Rose       |         |40288|M     |70000 |Male      |
|Robert    |           |Williams |42114|      |400000|Unknown   |
|Maria     |Anne       |Jones    |39192|F     |500000|Female    |
|Jen       |Mary       |Brown    |     |F     |0     |Female    |
+----------+-----------+---------+-----+------+------+----------+

+----------+-----------+---------+-----+------+------+----------+
|first_name|middle_name|last_name|dob  |gender|salary|new_gender|
+----------+-----------+---------+-----+------+------+----------+
|James     |           |Smith    |36636|M     |60000 |Male      |
|Michael   |Rose       |         |40288|M     |70000 |Male      |
|Robert    |           |Williams |42114|      |400000|Unknown   |
|Maria   

In [None]:
# Derive the same column with a SQL CASE WHEN expression via expr()

In [3]:
from pyspark.sql.functions import expr
# SQL CASE expression that maps the gender code to a label.
# Adjacent string literals are joined into a single string by Python.
gender_case_sql = (
    "case when gender = 'M' then 'Male' "
    "when gender = 'F' then 'Female' "
    "else 'Unknown' end"
)

# expr() parses the SQL text into a Column, which is added as "new_gender".
df3 = df.withColumn("new_gender", expr(gender_case_sql))
df3.show(truncate=False)


+----------+-----------+---------+-----+------+------+----------+
|first_name|middle_name|last_name|dob  |gender|salary|new_gender|
+----------+-----------+---------+-----+------+------+----------+
|James     |           |Smith    |36636|M     |60000 |Male      |
|Michael   |Rose       |         |40288|M     |70000 |Male      |
|Robert    |           |Williams |42114|      |400000|Unknown   |
|Maria     |Anne       |Jones    |39192|F     |500000|Female    |
|Jen       |Mary       |Brown    |     |F     |0     |Female    |
+----------+-----------+---------+-----+------+------+----------+

