In [0]:
"""
alias   filter/where    lit     add cols using withColumn    renaming cols using withColumnRenamed
cast    drop columns

NOTE: Each transformation returns a new df, because a DataFrame is immutable.
Use the DataFrame API wisely, or simply stick to spark.sql; both produce the same query plan in most cases (unless a UDF is involved).
"""

# Load the employee CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
employee_df = (
    spark.read
    .format("csv")
    .options(header="true", inferschema="true")
    .load("/FileStore/tables/employee_write11_data.csv")
)

# Print the rows, then expose the same data to spark.sql as "employee_tbl".
employee_df.show()
employee_df.createOrReplaceTempView("employee_tbl")

+---+--------+---+------+-------+------+
| id|    name|age|salary|address|gender|
+---+--------+---+------+-------+------+
|  1|  Manish| 26| 75000|  INDIA|     m|
|  2|  Nikita| 23|100000|    USA|     f|
|  3|  Pritam| 22|150000|  INDIA|     m|
|  4|Prantosh| 17|200000|  JAPAN|     m|
|  5|  Vikash| 31|300000|    USA|     m|
|  6|   Rahul| 55|300000|  INDIA|     m|
|  7|    Raju| 67|540000|    USA|     m|
|  8| Praveen| 28| 70000|  JAPAN|     m|
|  9|     Dev| 32|150000|  JAPAN|     m|
| 10|  Sherin| 16| 25000| RUSSIA|     f|
| 11|    Ragu| 12| 35000|  INDIA|     f|
| 12|   Sweta| 43|200000|  INDIA|     f|
| 13| Raushan| 48|650000|    USA|     m|
| 14|  Mukesh| 36| 95000| RUSSIA|     m|
| 15| Prakash| 52|750000|  INDIA|     m|
+---+--------+---+------+-------+------+



In [0]:
# NOTE(review): the wildcard import brings in col/lit used by the cells below, but it
# also rebinds Python builtins such as sum, min, max and round to Spark column functions.
from pyspark.sql.functions import *

# Project two columns, exposing "id" under the name "employee_id"; print the first 5 rows.
employee_df.select(
    col("id").alias("employee_id"),
    "age",
).show(n=5)

+-----------+---+
|employee_id|age|
+-----------+---+
|          1| 26|
|          2| 23|
|          3| 22|
|          4| 17|
|          5| 31|
+-----------+---+
only showing top 5 rows



In [0]:
# Keep only rows where salary is above 150000 AND age is below 18.
# Each comparison is parenthesized because & binds tighter than > and <.
employee_df.where(
    (col("salary") > 150000) & (col("age") < 18)
).show()

+---+--------+---+------+-------+------+
| id|    name|age|salary|address|gender|
+---+--------+---+------+-------+------+
|  4|Prantosh| 17|200000|  JAPAN|     m|
+---+--------+---+------+-------+------+



In [0]:
# Keep every existing column and append a constant string column named "last_name".
employee_df.select(
    "*",
    lit("kumar").alias("last_name"),
).show()

+---+--------+---+------+-------+------+---------+
| id|    name|age|salary|address|gender|last_name|
+---+--------+---+------+-------+------+---------+
|  1|  Manish| 26| 75000|  INDIA|     m|    kumar|
|  2|  Nikita| 23|100000|    USA|     f|    kumar|
|  3|  Pritam| 22|150000|  INDIA|     m|    kumar|
|  4|Prantosh| 17|200000|  JAPAN|     m|    kumar|
|  5|  Vikash| 31|300000|    USA|     m|    kumar|
|  6|   Rahul| 55|300000|  INDIA|     m|    kumar|
|  7|    Raju| 67|540000|    USA|     m|    kumar|
|  8| Praveen| 28| 70000|  JAPAN|     m|    kumar|
|  9|     Dev| 32|150000|  JAPAN|     m|    kumar|
| 10|  Sherin| 16| 25000| RUSSIA|     f|    kumar|
| 11|    Ragu| 12| 35000|  INDIA|     f|    kumar|
| 12|   Sweta| 43|200000|  INDIA|     f|    kumar|
| 13| Raushan| 48|650000|    USA|     m|    kumar|
| 14|  Mukesh| 36| 95000| RUSSIA|     m|    kumar|
| 15| Prakash| 52|750000|  INDIA|     m|    kumar|
+---+--------+---+------+-------+------+---------+



In [0]:
# withColumn counterpart of the select("*", lit(...).alias("last_name")) pattern:
# adds a constant "last_name" column to every row. The literal used here is "singh".
employee_df.withColumn(
    colName="last_name",
    col=lit("singh"),
).show()

+---+--------+---+------+-------+------+---------+
| id|    name|age|salary|address|gender|last_name|
+---+--------+---+------+-------+------+---------+
|  1|  Manish| 26| 75000|  INDIA|     m|    singh|
|  2|  Nikita| 23|100000|    USA|     f|    singh|
|  3|  Pritam| 22|150000|  INDIA|     m|    singh|
|  4|Prantosh| 17|200000|  JAPAN|     m|    singh|
|  5|  Vikash| 31|300000|    USA|     m|    singh|
|  6|   Rahul| 55|300000|  INDIA|     m|    singh|
|  7|    Raju| 67|540000|    USA|     m|    singh|
|  8| Praveen| 28| 70000|  JAPAN|     m|    singh|
|  9|     Dev| 32|150000|  JAPAN|     m|    singh|
| 10|  Sherin| 16| 25000| RUSSIA|     f|    singh|
| 11|    Ragu| 12| 35000|  INDIA|     f|    singh|
| 12|   Sweta| 43|200000|  INDIA|     f|    singh|
| 13| Raushan| 48|650000|    USA|     m|    singh|
| 14|  Mukesh| 36| 95000| RUSSIA|     m|    singh|
| 15| Prakash| 52|750000|  INDIA|     m|    singh|
+---+--------+---+------+-------+------+---------+



In [0]:
# Rename the "id" column to "employee_id". The renamed frame is bound to a new
# name; employee_df is not reassigned.
new_employee_df = employee_df.withColumnRenamed(existing="id", new="employee_id")
new_employee_df.show()

+-----------+--------+---+------+-------+------+
|employee_id|    name|age|salary|address|gender|
+-----------+--------+---+------+-------+------+
|          1|  Manish| 26| 75000|  INDIA|     m|
|          2|  Nikita| 23|100000|    USA|     f|
|          3|  Pritam| 22|150000|  INDIA|     m|
|          4|Prantosh| 17|200000|  JAPAN|     m|
|          5|  Vikash| 31|300000|    USA|     m|
|          6|   Rahul| 55|300000|  INDIA|     m|
|          7|    Raju| 67|540000|    USA|     m|
|          8| Praveen| 28| 70000|  JAPAN|     m|
|          9|     Dev| 32|150000|  JAPAN|     m|
|         10|  Sherin| 16| 25000| RUSSIA|     f|
|         11|    Ragu| 12| 35000|  INDIA|     f|
|         12|   Sweta| 43|200000|  INDIA|     f|
|         13| Raushan| 48|650000|    USA|     m|
|         14|  Mukesh| 36| 95000| RUSSIA|     m|
|         15| Prakash| 52|750000|  INDIA|     m|
+-----------+--------+---+------+-------+------+



In [0]:
# Re-type two columns by overwriting them with cast versions of themselves:
# "id" becomes a string and "salary" a long. Only the resulting schema is
# printed; the result is not assigned to any name.
(
    employee_df
    .withColumn("id", col("id").cast("string"))
    .withColumn("salary", col("salary").cast("long"))
    .printSchema()
)

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: long (nullable = true)
 |-- address: string (nullable = true)
 |-- gender: string (nullable = true)



In [0]:
# Drop the "id" and "name" columns and print the first 5 remaining rows.
# Both columns are passed as plain strings: PySpark releases before 3.4 raise
# TypeError when several arguments are given and any of them is a Column.
employee_df.drop("id", "name").show(5)

+---+------+-------+------+
|age|salary|address|gender|
+---+------+-------+------+
| 26| 75000|  INDIA|     m|
| 23|100000|    USA|     f|
| 22|150000|  INDIA|     m|
| 17|200000|  JAPAN|     m|
| 31|300000|    USA|     m|
+---+------+-------+------+
only showing top 5 rows

