In [54]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('App').getOrCreate()

data = [('James','','Smith','1991-04-01','M',3000),
  ('Michael','Rose','','2000-05-19','M',4000),
  ('Robert','','Williams','1978-09-05','M',4000),
  ('Maria','Anne','Jones','1967-12-01','F',4000),
  ('Jen','Mary','Brown','1980-02-17','F',-1)
]

columns = ["firstname","middlename","lastname","dob","gender","salary"]

df = spark.createDataFrame(data=data, schema = columns)
df.show(truncate=False)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|dob       |gender|salary|
+---------+----------+--------+----------+------+------+
|James    |          |Smith   |1991-04-01|M     |3000  |
|Michael  |Rose      |        |2000-05-19|M     |4000  |
|Robert   |          |Williams|1978-09-05|M     |4000  |
|Maria    |Anne      |Jones   |1967-12-01|F     |4000  |
|Jen      |Mary      |Brown   |1980-02-17|F     |-1    |
+---------+----------+--------+----------+------+------+



In [4]:
from pyspark.sql.types import IntegerType,BooleanType,DateType
from pyspark.sql.functions import col

df = df.withColumn("salary", df["salary"].cast(IntegerType()))
df = df.withColumn("dob",df.dob.cast(DateType()))
df = df.withColumn("gender",df.gender.cast(BooleanType()))

df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- gender: boolean (nullable = true)
 |-- salary: integer (nullable = true)



In [5]:
df.withColumn("salary",col("salary")*100).show()
df.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|  null|300000|
|  Michael|      Rose|        |2000-05-19|  null|400000|
|   Robert|          |Williams|1978-09-05|  null|400000|
|    Maria|      Anne|   Jones|1967-12-01| false|400000|
|      Jen|      Mary|   Brown|1980-02-17| false|  -100|
+---------+----------+--------+----------+------+------+

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|  null|  3000|
|  Michael|      Rose|        |2000-05-19|  null|  4000|
|   Robert|          |Williams|1978-09-05|  null|  4000|
|    Maria|      Anne|   Jones|1967-12-01| false|  4000|
|      Jen|      Mary|   Brown|1980-02-17| false|    -1|
+---------+----------+--------

In [11]:
from pyspark.sql.functions import when, col

df = df.withColumn("salary", when((col("firstname") == "Jen"), 1000).otherwise(col("salary")))
df.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|  null|  3000|
|  Michael|      Rose|        |2000-05-19|  null|  4000|
|   Robert|          |Williams|1978-09-05|  null|  4000|
|    Maria|      Anne|   Jones|1967-12-01| false|  4000|
|      Jen|      Mary|   Brown|1980-02-17| false|  1000|
+---------+----------+--------+----------+------+------+



In [13]:
df = df.withColumn("salary_10", col("salary") + 1000)
df.show()

+---------+----------+--------+----------+------+------+---------+
|firstname|middlename|lastname|       dob|gender|salary|salary_10|
+---------+----------+--------+----------+------+------+---------+
|    James|          |   Smith|1991-04-01|  null|  3000|     4000|
|  Michael|      Rose|        |2000-05-19|  null|  4000|     5000|
|   Robert|          |Williams|1978-09-05|  null|  4000|     5000|
|    Maria|      Anne|   Jones|1967-12-01| false|  4000|     5000|
|      Jen|      Mary|   Brown|1980-02-17| false|  1000|     2000|
+---------+----------+--------+----------+------+------+---------+



In [30]:
from pyspark.sql.functions import lit

df.withColumn("ID", lit(3))
df.show()

+---------+----------+--------+----------+------+------+---------+---+
|firstname|middlename|lastname|       dob|gender|salary|salary_10| ID|
+---------+----------+--------+----------+------+------+---------+---+
|    James|          |   Smith|1991-04-01|  null|  3000|     4000|  5|
|  Michael|      Rose|        |2000-05-19|  null|  4000|     5000|  5|
|   Robert|          |Williams|1978-09-05|  null|  4000|     5000|  5|
|    Maria|      Anne|   Jones|1967-12-01| false|  4000|     5000|  5|
|      Jen|      Mary|   Brown|1980-02-17| false|  1000|     2000|  5|
+---------+----------+--------+----------+------+------+---------+---+



In [34]:
from pyspark.sql.functions import col, lit

df = df.withColumn("ID", lit(0))  
for i in range(1, 6):
    df = df.withColumn("ID", (col("ID") + lit(1)))
df1.show()

+---------+----------+--------+----------+------+------+---------+---+
|firstname|middlename|lastname|       dob|gender|salary|salary_10| ID|
+---------+----------+--------+----------+------+------+---------+---+
|    James|          |   Smith|1991-04-01|  null|  3000|     4000|  5|
|  Michael|      Rose|        |2000-05-19|  null|  4000|     5000|  5|
|   Robert|          |Williams|1978-09-05|  null|  4000|     5000|  5|
|    Maria|      Anne|   Jones|1967-12-01| false|  4000|     5000|  5|
|      Jen|      Mary|   Brown|1980-02-17| false|  1000|     2000|  5|
+---------+----------+--------+----------+------+------+---------+---+



In [36]:
df = df.withColumnRenamed("salary_10","hike_Salary")
df.show()

+---------+----------+--------+----------+------+------+-----------+---+
|firstname|middlename|lastname|       dob|gender|salary|hike_Salary| ID|
+---------+----------+--------+----------+------+------+-----------+---+
|    James|          |   Smith|1991-04-01|  null|  3000|       4000|  5|
|  Michael|      Rose|        |2000-05-19|  null|  4000|       5000|  5|
|   Robert|          |Williams|1978-09-05|  null|  4000|       5000|  5|
|    Maria|      Anne|   Jones|1967-12-01| false|  4000|       5000|  5|
|      Jen|      Mary|   Brown|1980-02-17| false|  1000|       2000|  5|
+---------+----------+--------+----------+------+------+-----------+---+



In [37]:
df.drop("hike_salary").show() 

+---------+----------+--------+----------+------+------+---+
|firstname|middlename|lastname|       dob|gender|salary| ID|
+---------+----------+--------+----------+------+------+---+
|    James|          |   Smith|1991-04-01|  null|  3000|  5|
|  Michael|      Rose|        |2000-05-19|  null|  4000|  5|
|   Robert|          |Williams|1978-09-05|  null|  4000|  5|
|    Maria|      Anne|   Jones|1967-12-01| false|  4000|  5|
|      Jen|      Mary|   Brown|1980-02-17| false|  1000|  5|
+---------+----------+--------+----------+------+------+---+



In [55]:
df.select(col("firstname").alias("fname"), \
  col("middlename").alias("mname"), \
  col("lastname").alias("lname"), \
  col("dob"),col("gender"),col("salary")) \
  .show()

+-------+-----+--------+----------+------+------+
|  fname|mname|   lname|       dob|gender|salary|
+-------+-----+--------+----------+------+------+
|  James|     |   Smith|1991-04-01|     M|  3000|
|Michael| Rose|        |2000-05-19|     M|  4000|
| Robert|     |Williams|1978-09-05|     M|  4000|
|  Maria| Anne|   Jones|1967-12-01|     F|  4000|
|    Jen| Mary|   Brown|1980-02-17|     F|    -1|
+-------+-----+--------+----------+------+------+



In [60]:
df4 = df.withColumn("fname",col("name.firstname"),\
      .withColumn("mname",col("name.middlename"), \
      .withColumn("lname",col("name.lastname"))
df4.show()

SyntaxError: invalid syntax (3057354751.py, line 2)