In [4]:
# findspark.init() is called before the pyspark imports below, pointing at a local Spark install.
# NOTE(review): 'c:/Spark' is a machine-specific absolute path — adjust it (or make it configurable) per environment.
import findspark
findspark.init('c:/Spark')
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Row
# NOTE(review): the two wildcard imports supply StructType/StructField/StringType and col/lit used by later cells.
# Presumably functions.* also shadows Python builtins such as sum/min/max — verify before relying on those builtins.
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [5]:
# Create the session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("withColumnLearning")
    .getOrCreate()
)


In [6]:
# Sample rows shaped as ((firstname, middlename, lastname), dob, gender, salary).
# Every leaf value is a string, including salary; missing values are empty strings.
data = [
    (("James", "", "Smith"), "36636", "M", "3000"),
    (("Michael", "Rose", ""), "40288", "M", "4000"),
    (("Robert", "", "Williams"), "42114", "M", "4000"),
    (("Maria", "Anne", "Jones"), "39192", "F", "4000"),
    (("Jen", "Mary", "Brown"), "", "F", "-1"),
]

In [7]:
# Schema matching the sample rows: a nested 'name' struct of three nullable strings,
# followed by three nullable top-level string columns.
schema = StructType(
    [
        StructField(
            'name',
            StructType([
                StructField(part, StringType(), True)
                for part in ('firstname', 'middlename', 'lastname')
            ]),
        )
    ]
    + [
        StructField(column, StringType(), True)
        for column in ('dob', 'gender', 'salary')
    ]
)

In [9]:
# Build the DataFrame from the sample rows with the explicit schema, then preview it.
df = spark.createDataFrame(data=data, schema=schema)
df.show()

+--------------------+-----+------+------+
|                name|  dob|gender|salary|
+--------------------+-----+------+------+
|    [James, , Smith]|36636|     M|  3000|
|   [Michael, Rose, ]|40288|     M|  4000|
|[Robert, , Williams]|42114|     M|  4000|
|[Maria, Anne, Jones]|39192|     F|  4000|
|  [Jen, Mary, Brown]|     |     F|    -1|
+--------------------+-----+------+------+



In [17]:
# withColumn with an existing column name replaces that column.
# Here 'salary' is re-created as an integer cast of the original string column.
salary_as_int = col('salary').cast('Integer')
df2 = df.withColumn('salary', salary_as_int)
df2.printSchema()
df2.show()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+--------------------+-----+------+------+
|                name|  dob|gender|salary|
+--------------------+-----+------+------+
|    [James, , Smith]|36636|     M|  3000|
|   [Michael, Rose, ]|40288|     M|  4000|
|[Robert, , Williams]|42114|     M|  4000|
|[Maria, Anne, Jones]|39192|     F|  4000|
|  [Jen, Mary, Brown]|     |     F|    -1|
+--------------------+-----+------+------+



In [19]:
# Overwrite an existing column with a value computed from itself (salary scaled by 100).
scaled_salary = col('salary') * 100
df3 = df2.withColumn('salary', scaled_salary)
df3.show()

+--------------------+-----+------+------+
|                name|  dob|gender|salary|
+--------------------+-----+------+------+
|    [James, , Smith]|36636|     M|300000|
|   [Michael, Rose, ]|40288|     M|400000|
|[Robert, , Williams]|42114|     M|400000|
|[Maria, Anne, Jones]|39192|     F|400000|
|  [Jen, Mary, Brown]|     |     F|  -100|
+--------------------+-----+------+------+



In [36]:
# Derive a new column from an existing column.
# Dividing the integer salary by 12 yields a double column (see the displayed values).
df4 = df3.withColumn('monthly_salary', col('salary') / 12)

# For fun: cast the derived column to a string.
# withColumn returns a NEW DataFrame and never modifies df4 in place, so the result
# has to be bound to a name; calling it as a bare statement silently discards the cast.
# A separate name is used so df4 keeps its double-typed monthly_salary for later cells.
df4_monthly_as_string = df4.withColumn('monthly_salary', col('monthly_salary').cast('String'))

df4.show()

+--------------------+-----+------+------+------------------+
|                name|  dob|gender|salary|    monthly_salary|
+--------------------+-----+------+------+------------------+
|    [James, , Smith]|36636|     M|300000|           25000.0|
|   [Michael, Rose, ]|40288|     M|400000|33333.333333333336|
|[Robert, , Williams]|42114|     M|400000|33333.333333333336|
|[Maria, Anne, Jones]|39192|     F|400000|33333.333333333336|
|  [Jen, Mary, Brown]|     |     F|  -100|-8.333333333333334|
+--------------------+-----+------+------+------------------+



In [42]:
# Add a new column — and a second one while you're at it.
# lit() turns a Python constant into a column expression, so every row gets the same value.
df5 = (
    df4
    .withColumn('country', lit('USA'))
    .withColumn('another_column', lit('anotherValue'))
)
df5.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- monthly_salary: double (nullable = true)
 |-- country: string (nullable = false)
 |-- another_column: string (nullable = false)



In [59]:
# Rename a column: 'another_column' becomes 'department' in the returned DataFrame.
df6 = df5.withColumnRenamed(existing='another_column', new='department')

In [60]:
# Drop a column from the DataFrame.
# The result is only displayed here, not assigned back to df6.
without_salary = df6.drop("salary")
without_salary.show(truncate=False)

+--------------------+-----+------+------------------+-------+------------+
|name                |dob  |gender|monthly_salary    |country|department  |
+--------------------+-----+------+------------------+-------+------------+
|[James, , Smith]    |36636|M     |25000.0           |USA    |anotherValue|
|[Michael, Rose, ]   |40288|M     |33333.333333333336|USA    |anotherValue|
|[Robert, , Williams]|42114|M     |33333.333333333336|USA    |anotherValue|
|[Maria, Anne, Jones]|39192|F     |33333.333333333336|USA    |anotherValue|
|[Jen, Mary, Brown]  |     |F     |-8.333333333333334|USA    |anotherValue|
+--------------------+-----+------+------------------+-------+------------+



In [62]:
# df5 still has 'salary' and 'another_column': the rename and drop above produced
# new DataFrames and were never assigned back to df5.
df5.show(n=20, truncate=True)

+--------------------+-----+------+------+------------------+-------+--------------+
|                name|  dob|gender|salary|    monthly_salary|country|another_column|
+--------------------+-----+------+------+------------------+-------+--------------+
|    [James, , Smith]|36636|     M|300000|           25000.0|    USA|  anotherValue|
|   [Michael, Rose, ]|40288|     M|400000|33333.333333333336|    USA|  anotherValue|
|[Robert, , Williams]|42114|     M|400000|33333.333333333336|    USA|  anotherValue|
|[Maria, Anne, Jones]|39192|     F|400000|33333.333333333336|    USA|  anotherValue|
|  [Jen, Mary, Brown]|     |     F|  -100|-8.333333333333334|    USA|  anotherValue|
+--------------------+-----+------+------+------------------+-------+--------------+

