In [0]:
# Sample records laid out as
# (id, firstname, middlename, lastname, dob, gender, salary).
# Some records carry an empty string for middlename or lastname.
data = [
    ('1', 'James', '', 'Smith', '1991-04-01', 'M', 3000),
    ('2', 'Michael', 'Rose', '', '2000-05-19', 'M', 4000),
    ('3', 'Robert', '', 'Williams', '1978-09-05', 'M', 4000),
    ('4', 'Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
    ('5', 'Jen', 'Mary', 'Brown', '1980-02-17', 'F', 5000),
]

# Column names, listed in the same order as the tuple fields above.
columns = ["id", "firstname", "middlename", "lastname", "dob", "gender", "salary"]

# Only names are passed as the schema, so the column types come from the
# Python values (id is given as a str, salary as an int).
users = spark.createDataFrame(data=data, schema=columns)
users.show()

+---+---------+----------+--------+----------+------+------+
| id|firstname|middlename|lastname|       dob|gender|salary|
+---+---------+----------+--------+----------+------+------+
|  1|    James|          |   Smith|1991-04-01|     M|  3000|
|  2|  Michael|      Rose|        |2000-05-19|     M|  4000|
|  3|   Robert|          |Williams|1978-09-05|     M|  4000|
|  4|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|  5|      Jen|      Mary|   Brown|1980-02-17|     F|  5000|
+---+---------+----------+--------+----------+------+------+



In [0]:
# Derive a fullname column by concatenating firstname, middlename and lastname.
from pyspark.sql.functions import concat, lit

# NOTE(review): the only separator is the comma placed after firstname;
# middlename and lastname are joined with nothing between them
# (e.g. "Maria,AnneJones"). Confirm whether that is the intended format.
# withColumn already keeps every existing column, so no select('*') is needed.
(
    users
    .withColumn('fullname', concat('firstname', lit(','), 'middlename', 'lastname'))
    .show()
)

+---+---------+----------+--------+----------+------+------+---------------+
| id|firstname|middlename|lastname|       dob|gender|salary|       fullname|
+---+---------+----------+--------+----------+------+------+---------------+
|  1|    James|          |   Smith|1991-04-01|     M|  3000|    James,Smith|
|  2|  Michael|      Rose|        |2000-05-19|     M|  4000|   Michael,Rose|
|  3|   Robert|          |Williams|1978-09-05|     M|  4000|Robert,Williams|
|  4|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|Maria,AnneJones|
|  5|      Jen|      Mary|   Brown|1980-02-17|     F|  5000|  Jen,MaryBrown|
+---+---------+----------+--------+----------+------+------+---------------+



In [0]:
# Inspect the column types of users as a list of (column name, type name) pairs.
users.dtypes

Out[42]: [('id', 'string'),
 ('firstname', 'string'),
 ('middlename', 'string'),
 ('lastname', 'string'),
 ('dob', 'string'),
 ('gender', 'string'),
 ('salary', 'bigint')]

In [0]:
# Changing the data type of a column
from pyspark.sql.functions import col

# Cast salary and id to integer and inspect the resulting types.
# The result is not assigned, so users itself keeps its original column types.
(
    users
    .withColumn('salary', col('salary').cast('integer'))
    .withColumn('id', col('id').cast('integer'))
    .dtypes
)

Out[45]: [('id', 'int'),
 ('firstname', 'string'),
 ('middlename', 'string'),
 ('lastname', 'string'),
 ('dob', 'string'),
 ('gender', 'string'),
 ('salary', 'int')]

In [0]:
# Updating the value of an existing column: reusing the name "salary" replaces
# that column with salary * 100 in the DataFrame that is shown.
(
    users
    .withColumn("salary", col("salary") * 100)
    .show()
)

+---+---------+----------+--------+----------+------+------+
| id|firstname|middlename|lastname|       dob|gender|salary|
+---+---------+----------+--------+----------+------+------+
|  1|    James|          |   Smith|1991-04-01|     M|300000|
|  2|  Michael|      Rose|        |2000-05-19|     M|400000|
|  3|   Robert|          |Williams|1978-09-05|     M|400000|
|  4|    Maria|      Anne|   Jones|1967-12-01|     F|400000|
|  5|      Jen|      Mary|   Brown|1980-02-17|     F|500000|
+---+---------+----------+--------+----------+------+------+



In [0]:
# Adding a new column
# NOTE(review): the wildcard import pulls the public names of
# pyspark.sql.functions into the notebook namespace; names such as sum, min and
# max would then shadow the Python builtins. Only lit is used in this cell.
from pyspark.sql.functions import *

# lit('India') supplies the same constant value for every row.
(
    users
    .withColumn('country', lit('India'))
    .show()
)

+---+---------+----------+--------+----------+------+------+-------+
| id|firstname|middlename|lastname|       dob|gender|salary|country|
+---+---------+----------+--------+----------+------+------+-------+
|  1|    James|          |   Smith|1991-04-01|     M|  3000|  India|
|  2|  Michael|      Rose|        |2000-05-19|     M|  4000|  India|
|  3|   Robert|          |Williams|1978-09-05|     M|  4000|  India|
|  4|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|  India|
|  5|      Jen|      Mary|   Brown|1980-02-17|     F|  5000|  India|
+---+---------+----------+--------+----------+------+------+-------+



In [0]:
# Display users again: the withColumn results above were only shown, never
# assigned back, so this is still the frame built by createDataFrame.
users.show()

+---+---------+----------+--------+----------+------+------+
| id|firstname|middlename|lastname|       dob|gender|salary|
+---+---------+----------+--------+----------+------+------+
|  1|    James|          |   Smith|1991-04-01|     M|  3000|
|  2|  Michael|      Rose|        |2000-05-19|     M|  4000|
|  3|   Robert|          |Williams|1978-09-05|     M|  4000|
|  4|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|  5|      Jen|      Mary|   Brown|1980-02-17|     F|  5000|
+---+---------+----------+--------+----------+------+------+



In [0]:
# Changing column names using the withColumnRenamed function.
# First show the frame as it is, for comparison ...
users.select('*').show()

# ... then the same frame with the id column renamed to user_id.
(
    users
    .withColumnRenamed('id', 'user_id')
    .show()
)

+---+---------+----------+--------+----------+------+------+
| id|firstname|middlename|lastname|       dob|gender|salary|
+---+---------+----------+--------+----------+------+------+
|  1|    James|          |   Smith|1991-04-01|     M|  3000|
|  2|  Michael|      Rose|        |2000-05-19|     M|  4000|
|  3|   Robert|          |Williams|1978-09-05|     M|  4000|
|  4|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|  5|      Jen|      Mary|   Brown|1980-02-17|     F|  5000|
+---+---------+----------+--------+----------+------+------+

+-------+---------+----------+--------+----------+------+------+
|user_id|firstname|middlename|lastname|       dob|gender|salary|
+-------+---------+----------+--------+----------+------+------+
|      1|    James|          |   Smith|1991-04-01|     M|  3000|
|      2|  Michael|      Rose|        |2000-05-19|     M|  4000|
|      3|   Robert|          |Williams|1978-09-05|     M|  4000|
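|      4|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      5|      Jen|      Mary|   Brown|1980-02-17|     F|  5000|
+-------+---------+----------+--------+----------+------+------+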

In [0]:
# Aliasing column names
# Full frame first, for comparison.
users.select('*').show()

# Project two columns under new names. The column is referenced through the
# DataFrame (users['id']) in one case and through col('dob') in the other.
(
    users
    .select(
        users['id'].alias('user_id'),
        col('dob').alias('DOB'),
    )
    .show()
)

+---+---------+----------+--------+----------+------+------+
| id|firstname|middlename|lastname|       dob|gender|salary|
+---+---------+----------+--------+----------+------+------+
|  1|    James|          |   Smith|1991-04-01|     M|  3000|
|  2|  Michael|      Rose|        |2000-05-19|     M|  4000|
|  3|   Robert|          |Williams|1978-09-05|     M|  4000|
|  4|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|  5|      Jen|      Mary|   Brown|1980-02-17|     F|  5000|
+---+---------+----------+--------+----------+------+------+

+-------+----------+
|user_id|       DOB|
+-------+----------+
|      1|1991-04-01|
|      2|2000-05-19|
|      3|1978-09-05|
|      4|1967-12-01|
|      5|1980-02-17|
+-------+----------+



In [0]:
# Replacement column names, one per column of users, in column order.
new_columns = [
    'user_id',
    'user_first_name',
    'user_middle_name',
    'user_last_name',
    'user_dob',
    'user_gender',
    'user_salary',
]

In [0]:
# Changing column names using toDF: the names in new_columns are unpacked as
# positional arguments and replace all column names of the frame at once.
(
    users
    .select('*')
    .toDF(*new_columns)
    .show()
)

+-------+---------------+----------------+--------------+----------+-----------+-----------+
|user_id|user_first_name|user_middle_name|user_last_name|  user_dob|user_gender|user_salary|
+-------+---------------+----------------+--------------+----------+-----------+-----------+
|      1|          James|                |         Smith|1991-04-01|          M|       3000|
|      2|        Michael|            Rose|              |2000-05-19|          M|       4000|
|      3|         Robert|                |      Williams|1978-09-05|          M|       4000|
|      4|          Maria|            Anne|         Jones|1967-12-01|          F|       4000|
|      5|            Jen|            Mary|         Brown|1980-02-17|          F|       5000|
+-------+---------------+----------------+--------------+----------+-----------+-----------+

