In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as f
from pyspark.sql.functions import when

In [2]:
# Build (or reuse) a Spark session running locally on two threads, with Kryo
# configured as the serializer.
spark = (
    SparkSession.builder
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .master("local[2]")
    .appName('basic_spark')
    .getOrCreate()
)

# Restrict Spark's log output to ERROR-level messages.
spark.sparkContext.setLogLevel('ERROR')

# Two ways of obtaining an RDD that holds no elements.
sc = spark.sparkContext

# 1) via the dedicated emptyRDD() factory
emptyRDD = sc.emptyRDD()
print(emptyRDD)

# 2) by parallelizing an empty Python list
emptyRDD2 = sc.parallelize([])
print(emptyRDD2)


# Schema shared by the three empty-DataFrame examples below:
# three nullable string columns and one nullable integer column.
schema = StructType([
    StructField("firstname", StringType(), True),
    StructField("midflename", StringType(), True),
    StructField("lastname", StringType(), True),
    StructField("age", IntegerType(), True)
])

# 1) Empty DataFrame built from an empty RDD plus the explicit schema.
df = spark.createDataFrame(emptyRDD, schema=schema)
# printSchema() writes the schema tree to stdout itself and returns None, so it
# is called directly; wrapping it in print() only adds a stray "None" line.
df.printSchema()

# 2) Empty DataFrame obtained by converting the parallelized empty RDD.
df1 = emptyRDD2.toDF(schema)
df1.printSchema()

# 3) Empty DataFrame created straight from an empty Python list.
df2 = spark.createDataFrame([], schema)
df2.printSchema()


# Sample rows: (firstname, middle name, lastname, id, gender, salary).
# NOTE(review): the empty-string id and the salary of -1 in the last row look
# like placeholders for missing values - confirm before using them in analysis.
data = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

# Flat schema for the rows above; every column is nullable and only salary is
# an integer.
schema = StructType([
    StructField("firstname", StringType(), True),
    StructField("midflename", StringType(), True),
    StructField("lastname", StringType(), True),
    StructField("id", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True)
])

df3 = spark.createDataFrame(data, schema)
# show() prints the table itself and returns None, so it is not wrapped in
# print() (that would emit an extra "None" line after the table).
df3.show(5, truncate=False)

# Nested structure: the first element of each row is a
# (firstname, middle name, lastname) tuple that maps onto a struct column.
structureData = [
    (("James", "", "Smith"), "36636", "M", 3100),
    (("Michael", "Rose", ""), "40288", "M", 4300),
    (("Robert", "", "Williams"), "42114", "M", 1400),
    (("Maria", "Anne", "Jones"), "39192", "F", 5500),
    (("Jen", "Mary", "Brown"), "", "F", -1),
]

# "name" is itself a StructType, which makes it a nested column.
structureSchema = StructType([
    StructField('name', StructType([
        StructField("firstname", StringType(), True),
        StructField("midflename", StringType(), True),
        StructField("lastname", StringType(), True)
    ])),
    StructField("id", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True)
])

df4 = spark.createDataFrame(structureData, structureSchema)
df4.show(5, truncate=False)
df4.printSchema()

# Add an "OtherInfo" struct column that bundles id (renamed "Identifier"),
# gender and salary with a derived salaryGrade:
#   salary < 2000 -> "Low", salary < 4000 -> "Medium", otherwise "High".
# NOTE(review): the salary of -1 in the last row falls into "Low"; if -1 is a
# missing-value placeholder this grade is misleading - confirm.
update_df4 = df4.withColumn(
    "OtherInfo",
    f.struct(
        f.col("id").alias("Identifier"),
        f.col("gender").alias("gender"),
        f.col("salary").alias("salary"),
        f.when(f.col("salary") < 2000, "Low")
        .when(f.col("salary") < 4000, "Medium")
        .otherwise("High")
        .alias("salaryGrade"),
    ),
)
update_df4.show(3, truncate=False)
# printSchema() prints by itself and returns None; calling it inside print()
# would add a stray "None" line to the output.
update_df4.printSchema()


# Nested structure: the first element of each row is a
# (firstname, middle name, lastname) tuple that maps onto a struct column.
# NOTE(review): this section rebuilds df4 / update_df4 from the same data and
# schema as the earlier "nested structure" section of this cell; consider
# keeping only one copy.
structureData = [
    (("James", "", "Smith"), "36636", "M", 3100),
    (("Michael", "Rose", ""), "40288", "M", 4300),
    (("Robert", "", "Williams"), "42114", "M", 1400),
    (("Maria", "Anne", "Jones"), "39192", "F", 5500),
    (("Jen", "Mary", "Brown"), "", "F", -1),
]

# "name" is itself a StructType, which makes it a nested column.
structureSchema = StructType([
    StructField('name', StructType([
        StructField("firstname", StringType(), True),
        StructField("midflename", StringType(), True),
        StructField("lastname", StringType(), True)
    ])),
    StructField("id", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True)
])

df4 = spark.createDataFrame(structureData, structureSchema)
# show() and printSchema() print their output themselves and return None, so
# they are called directly; wrapping them in print() only adds "None" lines.
df4.show(5, truncate=False)
df4.printSchema()

# Add an "OtherInfo" struct column that bundles id (renamed "Identifier"),
# gender and salary with a derived salaryGrade:
#   salary < 2000 -> "Low", salary < 4000 -> "Medium", otherwise "High".
update_df4 = df4.withColumn(
    "OtherInfo",
    f.struct(
        f.col("id").alias("Identifier"),
        f.col("gender").alias("gender"),
        f.col("salary").alias("salary"),
        f.when(f.col("salary") < 2000, "Low")
        .when(f.col("salary") < 4000, "Medium")
        .otherwise("High")
        .alias("salaryGrade"),
    ),
)
update_df4.show(3, truncate=False)
update_df4.printSchema()

23/01/21 00:53:19 WARN Utils: Your hostname, Nileshs-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.89 instead (on interface en0)
23/01/21 00:53:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/01/21 00:53:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/01/21 00:53:21 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


EmptyRDD[0] at emptyRDD at NativeMethodAccessorImpl.java:0
ParallelCollectionRDD[1] at readRDDFromFile at PythonRDD.scala:274
root
 |-- firstname: string (nullable = true)
 |-- midflename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- age: integer (nullable = true)

None
root
 |-- firstname: string (nullable = true)
 |-- midflename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- age: integer (nullable = true)

None
root
 |-- firstname: string (nullable = true)
 |-- midflename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- age: integer (nullable = true)

None


                                                                                

+---------+----------+--------+-----+------+------+
|firstname|midflename|lastname|id   |gender|salary|
+---------+----------+--------+-----+------+------+
|James    |          |Smith   |36636|M     |3000  |
|Michael  |Rose      |        |40288|M     |4000  |
|Robert   |          |Williams|42114|M     |4000  |
|Maria    |Anne      |Jones   |39192|F     |4000  |
|Jen      |Mary      |Brown   |     |F     |-1    |
+---------+----------+--------+-----+------+------+

None
+--------------------+-----+------+------+
|name                |id   |gender|salary|
+--------------------+-----+------+------+
|{James, , Smith}    |36636|M     |3100  |
|{Michael, Rose, }   |40288|M     |4300  |
|{Robert, , Williams}|42114|M     |1400  |
|{Maria, Anne, Jones}|39192|F     |5500  |
|{Jen, Mary, Brown}  |     |F     |-1    |
+--------------------+-----+------+------+

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- midflename: string (nullable = true)
 |  

In [3]:
# Stop the Spark session that was started at the top of the notebook.
spark.stop()