In [91]:
from pyspark.sql import SparkSession

# Build the notebook's Spark session; getOrCreate() reuses an already
# active session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("SparkApp-DF-Tranformations")
    .getOrCreate()
)

In [92]:
# Echo the session object as the cell result to confirm it was created.
spark

In [57]:
# Load the employee CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    "resources/in/employee/employee_data_1.csv",
    header=True,
    inferSchema=True,
)

In [58]:
# Preview the first five rows of the freshly loaded data.
df.show(n=5)

+---+------------+-----------+------+
| ID|        Name| Department|Salary|
+---+------------+-----------+------+
|  1|    John Doe|Engineering| 50000|
|  2|  Jane Smith|  Marketing| 45000|
|  3|   Jim Brown|      Sales| 40000|
|  4|Jackie White|         HR| 42000|
|  5| Emily Davis|Engineering| 60000|
+---+------------+-----------+------+
only showing top 5 rows



In [59]:
# Print each column's name, type and nullability.
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: integer (nullable = true)



### withColumn()

Applies a transformation to a column of a DataFrame and returns a new DataFrame (the original is left unchanged).

1. Convert the datatypes of columns
2. Create new columns or replace the existing columns
3. Transform entire columns with values
4. Concatenate columns, etc.

In [60]:
# Replace Salary with a float-typed copy of itself.
df = df.withColumn("Salary", df["Salary"].cast("float"))

In [61]:
# Salary should now be displayed with a decimal point.
df.show(n=5)

+---+------------+-----------+-------+
| ID|        Name| Department| Salary|
+---+------------+-----------+-------+
|  1|    John Doe|Engineering|50000.0|
|  2|  Jane Smith|  Marketing|45000.0|
|  3|   Jim Brown|      Sales|40000.0|
|  4|Jackie White|         HR|42000.0|
|  5| Emily Davis|Engineering|60000.0|
+---+------------+-----------+-------+
only showing top 5 rows



In [62]:
# Derive a new column: the bonus is 25% of the salary.
df = df.withColumn("Bonus_Salary", df["Salary"] * 0.25)

In [63]:
# Check the newly added Bonus_Salary column.
df.show(n=5)

+---+------------+-----------+-------+------------+
| ID|        Name| Department| Salary|Bonus_Salary|
+---+------------+-----------+-------+------------+
|  1|    John Doe|Engineering|50000.0|     12500.0|
|  2|  Jane Smith|  Marketing|45000.0|     11250.0|
|  3|   Jim Brown|      Sales|40000.0|     10000.0|
|  4|Jackie White|         HR|42000.0|     10500.0|
|  5| Emily Davis|Engineering|60000.0|     15000.0|
+---+------------+-----------+-------+------------+
only showing top 5 rows



In [64]:
# Combine two existing columns into a third one.
df = df.withColumn("Total_Salary", df["Salary"] + df["Bonus_Salary"])

In [65]:
# Check the newly added Total_Salary column.
df.show(n=5)

+---+------------+-----------+-------+------------+------------+
| ID|        Name| Department| Salary|Bonus_Salary|Total_Salary|
+---+------------+-----------+-------+------------+------------+
|  1|    John Doe|Engineering|50000.0|     12500.0|     62500.0|
|  2|  Jane Smith|  Marketing|45000.0|     11250.0|     56250.0|
|  3|   Jim Brown|      Sales|40000.0|     10000.0|     50000.0|
|  4|Jackie White|         HR|42000.0|     10500.0|     52500.0|
|  5| Emily Davis|Engineering|60000.0|     15000.0|     75000.0|
+---+------------+-----------+-------+------------+------------+
only showing top 5 rows



### withColumnRenamed()

Renames an existing column and returns a new DataFrame.

In [66]:
# Rename the Name column to Emp_Name.
df = df.withColumnRenamed("Name", "Emp_Name")

In [67]:
# The header should now read Emp_Name instead of Name.
df.show(n=3)

+---+----------+-----------+-------+------------+------------+
| ID|  Emp_Name| Department| Salary|Bonus_Salary|Total_Salary|
+---+----------+-----------+-------+------------+------------+
|  1|  John Doe|Engineering|50000.0|     12500.0|     62500.0|
|  2|Jane Smith|  Marketing|45000.0|     11250.0|     56250.0|
|  3| Jim Brown|      Sales|40000.0|     10000.0|     50000.0|
+---+----------+-----------+-------+------------+------------+
only showing top 3 rows



### drop()
Removes one or more columns and returns a new DataFrame.

In [68]:
# Remove the Bonus_Salary column from the DataFrame.
df = df.drop("Bonus_Salary")

In [69]:
# Bonus_Salary should no longer be listed.
df.show(n=3)

+---+----------+-----------+-------+------------+
| ID|  Emp_Name| Department| Salary|Total_Salary|
+---+----------+-----------+-------+------------+
|  1|  John Doe|Engineering|50000.0|     62500.0|
|  2|Jane Smith|  Marketing|45000.0|     56250.0|
|  3| Jim Brown|      Sales|40000.0|     50000.0|
+---+----------+-----------+-------+------------+
only showing top 3 rows



### Array Functions
1. explode()
2. split()
3. array()
4. array_contains()

In [70]:
# Sample orders: (order_id, list of purchased items).
data = [
    (100, ["PC", "Monitor", "Keyboard"]),
    (101, ["Laptop", "Speaker"]),
    (102, ["Mouse", "Adapter"]),
    (103, ["Headphone"]),
]

In [71]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType

# Explicit schema: a non-nullable integer id and a non-nullable array of strings.
schema = (
    StructType()
    .add("order_id", IntegerType(), False)
    .add("items", ArrayType(StringType()), False)
)

df = spark.createDataFrame(data, schema)

df.printSchema()

root
 |-- order_id: integer (nullable = false)
 |-- items: array (nullable = false)
 |    |-- element: string (containsNull = true)



In [74]:
# truncate=False prints the full array contents instead of shortening long cells.
df.show(truncate=False)

+--------+-----------------------+
|order_id|items                  |
+--------+-----------------------+
|100     |[PC, Monitor, Keyboard]|
|101     |[Laptop, Speaker]      |
|102     |[Mouse, Adapter]       |
|103     |[Headphone]            |
+--------+-----------------------+



#### explode

In [75]:
from pyspark.sql.functions import explode

# Emit one row per array element in a new "item" column, then discard
# the original array column.
df = df.withColumn("item", explode(df["items"])).drop("items")

df.show()

+--------+---------+
|order_id|     item|
+--------+---------+
|     100|       PC|
|     100|  Monitor|
|     100| Keyboard|
|     101|   Laptop|
|     101|  Speaker|
|     102|    Mouse|
|     102|  Adapter|
|     103|Headphone|
+--------+---------+



In [78]:
# Same orders, but the items are stored as one comma-separated string.
data = [
    (100, "PC,Monitor,Keyboard"),
    (101, "Laptop,Speaker"),
    (102, "Mouse,Adapter"),
    (103, "Headphone"),
]
df = spark.createDataFrame(data, ["order_id", "items"])

df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- items: string (nullable = true)



In [None]:
# Display the rows; items is still a plain string at this point.
df.show()

+--------+-------------------+
|order_id|              items|
+--------+-------------------+
|     100|PC,Monitor,Keyboard|
|     101|     Laptop,Speaker|
|     102|      Mouse,Adapter|
|     103|          Headphone|
+--------+-------------------+



#### split

In [81]:
from pyspark.sql.functions import split

# Turn the comma-separated string into an array column. The second
# argument of split() is interpreted as a regular-expression pattern.
df = df.withColumn("items", split(df["items"], ","))

In [83]:
# items should now be rendered as an array; truncate=False keeps long cells intact.
df.show(truncate=False)

+--------+-----------------------+
|order_id|items                  |
+--------+-----------------------+
|100     |[PC, Monitor, Keyboard]|
|101     |[Laptop, Speaker]      |
|102     |[Mouse, Adapter]       |
|103     |[Headphone]            |
+--------+-----------------------+



In [85]:
# Each id is paired with the list of country codes associated with it.
data = [
    (100, ["US", "UK", "AUS"]),
    (101, ["US", "UK"]),
    (102, ["IND", "UK"]),
    (103, ["IND"]),
]

df = spark.createDataFrame(data, ["id", "country_codes"])

df.printSchema()

root
 |-- id: long (nullable = true)
 |-- country_codes: array (nullable = true)
 |    |-- element: string (containsNull = true)



#### array_contains

In [None]:
from pyspark.sql.functions import array_contains, col

# Flag the rows whose country_codes array includes "US".
# The column can be referenced either as col("country_codes") or as
# df.country_codes; both forms work here.
df = df.withColumn(
    "isLocal",
    array_contains(col("country_codes"), "US"),
)

df.show()

+---+-------------+-------+
| id|country_codes|isLocal|
+---+-------------+-------+
|100|[US, UK, AUS]|   true|
|101|     [US, UK]|   true|
|102|    [IND, UK]|  false|
|103|        [IND]|  false|
+---+-------------+-------+



In [87]:
# People with separate first-name and last-name columns.
data = [
    (100, "Paul", "Brandon"),
    (101, "John", "Doe"),
    (102, "Tina", "Nailor"),
]

df = spark.createDataFrame(data, ["id", "first_name", "last_name"])

df.show()

+---+----------+---------+
| id|first_name|last_name|
+---+----------+---------+
|100|      Paul|  Brandon|
|101|      John|      Doe|
|102|      Tina|   Nailor|
+---+----------+---------+



#### array

In [88]:
from pyspark.sql.functions import array

# Pack the two name columns into a single array column.
df = df.withColumn(
    "full_name",
    array(df["first_name"], df["last_name"]),
)
df.show()

+---+----------+---------+---------------+
| id|first_name|last_name|      full_name|
+---+----------+---------+---------------+
|100|      Paul|  Brandon|[Paul, Brandon]|
|101|      John|      Doe|    [John, Doe]|
|102|      Tina|   Nailor| [Tina, Nailor]|
+---+----------+---------+---------------+



### Map Functions
1. explode()
2. map_keys()
3. map_values()

In [94]:
# StructType and StructField are imported here as well, so this cell does
# not depend on the import made earlier in the Array Functions section.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, MapType

# Each user id is paired with a dict of attributes; the dict becomes a
# MapType column under the schema below.
data = [(1, {"name": "Paul", "gender": "male"}), (2, {"name": "Tina", "gender": "female"}), (3, {"name": "John", "gender": "male"})]

# Integer id plus a string -> string map.
schema = StructType([
    StructField(name="user_id", dataType=IntegerType()),
    StructField(name="users_info", dataType=MapType(keyType=StringType(), valueType=StringType()))
])

df = spark.createDataFrame(data=data, schema=schema)
df.show(truncate=False)

+-------+--------------------------------+
|user_id|users_info                      |
+-------+--------------------------------+
|1      |{gender -> male, name -> Paul}  |
|2      |{gender -> female, name -> Tina}|
|3      |{gender -> male, name -> John}  |
+-------+--------------------------------+



In [95]:
# Look up the "gender" key of the map and store its value in a new column.
df = df.withColumn("user_gender", df["users_info"]["gender"])
df.show(truncate=False)

+-------+--------------------------------+-----------+
|user_id|users_info                      |user_gender|
+-------+--------------------------------+-----------+
|1      |{gender -> male, name -> Paul}  |male       |
|2      |{gender -> female, name -> Tina}|female     |
|3      |{gender -> male, name -> John}  |male       |
+-------+--------------------------------+-----------+



In [98]:
# Project only the id and the extracted gender.
(
    df
    .select("user_id", "user_gender")
    .show()
)

+-------+-----------+
|user_id|user_gender|
+-------+-----------+
|      1|       male|
|      2|     female|
|      3|       male|
+-------+-----------+



In [99]:
from pyspark.sql.functions import explode

# Exploding a map column yields one row per entry, in "key" and "value" columns.
df.select("user_id", explode(df["users_info"])).show()

+-------+------+------+
|user_id|   key| value|
+-------+------+------+
|      1|gender|  male|
|      1|  name|  Paul|
|      2|gender|female|
|      2|  name|  Tina|
|      3|gender|  male|
|      3|  name|  John|
+-------+------+------+



In [100]:
# df itself is unchanged: the explode above was only selected and shown, not assigned back.
df.show()

+-------+--------------------+-----------+
|user_id|          users_info|user_gender|
+-------+--------------------+-----------+
|      1|{gender -> male, ...|       male|
|      2|{gender -> female...|     female|
|      3|{gender -> male, ...|       male|
+-------+--------------------+-----------+



In [101]:
from pyspark.sql.functions import map_keys

# Show the map's keys as an array column (the result is displayed only, not stored in df).
df.withColumn("info_keys", map_keys(df["users_info"])).show()

+-------+--------------------+-----------+--------------+
|user_id|          users_info|user_gender|     info_keys|
+-------+--------------------+-----------+--------------+
|      1|{gender -> male, ...|       male|[gender, name]|
|      2|{gender -> female...|     female|[gender, name]|
|      3|{gender -> male, ...|       male|[gender, name]|
+-------+--------------------+-----------+--------------+



In [102]:
from pyspark.sql.functions import map_values

# Show the map's values as an array column (the result is displayed only, not stored in df).
df.withColumn("info_values", map_values(df["users_info"])).show()

+-------+--------------------+-----------+--------------+
|user_id|          users_info|user_gender|   info_values|
+-------+--------------------+-----------+--------------+
|      1|{gender -> male, ...|       male|  [male, Paul]|
|      2|{gender -> female...|     female|[female, Tina]|
|      3|{gender -> male, ...|       male|  [male, John]|
+-------+--------------------+-----------+--------------+



In [None]:
# Left disabled so the session stays available while experimenting;
# uncomment the line below to shut the Spark session down when finished.
#spark.stop()