### Custom schema defining using the StructType and StructField

Need for StructType and StructField :
1. For creating complex data structure
2. Enforcing the input data structure
3. For creating empty dataframe without schema
4. overide default config while importing data from file 
Eg: data type are infered from the imported data and nullable field are default to True 

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("learn").getOrCreate()

24/11/03 11:01:11 WARN Utils: Your hostname, padmanabhan-VirtualBox resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
24/11/03 11:01:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/03 11:01:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# creating empty df using the StructType
from pyspark.sql.types import StructType,StructField
emptyDf = spark.createDataFrame([],StructType())
print(emptyDf.isEmpty())

                                                                                

True


In [5]:
# defining data structure for the data

from pyspark.sql.types import StringType,IntegerType

simpleData = [("padhu",1000),("karthir",5000)]

schema = StructType([
            StructField("name",StringType(),False),
            StructField('amount',IntegerType(),True)
])

simpleDf = spark.createDataFrame(simpleData,schema=schema)
simpleDf.printSchema()

root
 |-- name: string (nullable = false)
 |-- amount: integer (nullable = true)



In [4]:
# Schema for complex nested data structure
from pyspark.sql.types import StructType,StructField,StringType,IntegerType
complexData = [
    (("James","","Smith"),"36636","M",3100),
    (("Michael","Rose",""),"40288","M",4300),
    (("Robert","","Williams"),"42114","M",1400),
    (("Maria","Anne","Jones"),"39192","F",5500),
    (("Jen","Mary","Brown"),"","F",-1)
  ]

complexSchema = StructType([
                StructField("name",StructType([
                    StructField("firstName",StringType(),True),
                    StructField("middleName",StringType(),True),
                    StructField("lastName",StringType(),True)
                ]),False),
                StructField("id",StringType(),True),
                StructField("gender",StringType(),True),
                StructField("salary",IntegerType(),True),
])

complexDF = spark.createDataFrame(complexData,complexSchema)
complexDF.printSchema()

root
 |-- name: struct (nullable = false)
 |    |-- firstName: string (nullable = true)
 |    |-- middleName: string (nullable = true)
 |    |-- lastName: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [9]:
# Adding and changing struct of the DF
from pyspark.sql.functions import col,struct,when

updatedComplexDF = complexDF.withColumn("OtherInfo",
                                        struct(col("id").alias("new id"),
                                               col("gender").alias("new gender"),
                                               col("salary").alias("new salary"),
                                               when(col("salary").cast(IntegerType())<2000,"low")
                                               .when(col("salary").cast(IntegerType())<4000,"medium")
                                               .otherwise("high)").alias("salary grade")
                                               )
                                        ).drop("id","salary","gender")

updatedComplexDF.printSchema()
updatedComplexDF.show(truncate=False)

root
 |-- name: struct (nullable = false)
 |    |-- firstName: string (nullable = true)
 |    |-- middleName: string (nullable = true)
 |    |-- lastName: string (nullable = true)
 |-- OtherInfo: struct (nullable = false)
 |    |-- new id: string (nullable = true)
 |    |-- new gender: string (nullable = true)
 |    |-- new salary: integer (nullable = true)
 |    |-- salary grade: string (nullable = false)

+--------------------+------------------------+
|name                |OtherInfo               |
+--------------------+------------------------+
|{James, , Smith}    |{36636, M, 3100, medium}|
|{Michael, Rose, }   |{40288, M, 4300, high)} |
|{Robert, , Williams}|{42114, M, 1400, low}   |
|{Maria, Anne, Jones}|{39192, F, 5500, high)} |
|{Jen, Mary, Brown}  |{, F, -1, low}          |
+--------------------+------------------------+



In [14]:
# ArrayType and Map Type 
from pyspark.sql.types import ArrayType,MapType,StringType,StructType,StructField

sampleData = [
    ("padmanabhan",["python","java"],{"hair":"black","company":"walmart"}),
    ("thilak",["c++","java"],{"hair":"brown","company":"walmart"})
]

schema = StructType([
            StructField("name",StringType(),False),
            StructField("languages",ArrayType(StringType()),False),
            StructField("identification",MapType(StringType(),StringType(),False))
])

sampleDF = spark.createDataFrame(sampleData,schema)
sampleDF.printSchema()
sampleDF.show(truncate=False)

root
 |-- name: string (nullable = false)
 |-- languages: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- identification: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = false)

+-----------+--------------+-----------------------------------+
|name       |languages     |identification                     |
+-----------+--------------+-----------------------------------+
|padmanabhan|[python, java]|{hair -> black, company -> walmart}|
|thilak     |[c++, java]   |{hair -> brown, company -> walmart}|
+-----------+--------------+-----------------------------------+



In [17]:
# Export Schema to Json

schema_data = sampleDF.schema.json()

In [22]:
with open("/home/padmanabhan/Desktop/development/data-engineering/pyspark/samples/schema_data_output.json","w+") as file:
    file.write(schema_data)

In [27]:
print(schema_data)

{"fields":[{"metadata":{},"name":"name","nullable":false,"type":"string"},{"metadata":{},"name":"languages","nullable":false,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"identification","nullable":true,"type":{"keyType":"string","type":"map","valueContainsNull":false,"valueType":"string"}}],"type":"struct"}


In [16]:
# Alternative way to get DF schema as simple string

sampleDF.schema.simpleString()

'struct<name:string,languages:array<string>,identification:map<string,string>>'

In [31]:
# Creating DF using the schema stored in the Json file
import json

with open("/home/padmanabhan/Desktop/development/data-engineering/pyspark/samples/schema_data_output.json", "r") as f:
    schema_json = json.load(f)

schemaFromJson  = StructType.fromJson(schema_json)

DFFromJson = spark.createDataFrame(sampleData,schema=schemaFromJson)

DFFromJson.printSchema()
DFFromJson.show(truncate=False)

root
 |-- name: string (nullable = false)
 |-- languages: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- identification: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = false)





+-----------+--------------+-----------------------------------+
|name       |languages     |identification                     |
+-----------+--------------+-----------------------------------+
|padmanabhan|[python, java]|{hair -> black, company -> walmart}|
|thilak     |[c++, java]   |{hair -> brown, company -> walmart}|
+-----------+--------------+-----------------------------------+



                                                                                

In [33]:
# Creating StrucType using the DDL

ddlSchemaStr = "`fullName` STRUCT<`first`: STRING, `last`: STRING,`middle`: STRING>,`age` INT,`gender` STRING"
ddlSchema = StructType.fromDDL(ddlSchemaStr)
ddlSchema.printTreeString()

AttributeError: type object 'StructType' has no attribute 'fromDDL'

In [45]:
# check for existence of a column in the DF

if "name" in sampleDF.columns:
    print("exist")
else:
    print("do not exist")

# alternatives to check for the existence of the column

print("name" in sampleDF.schema.fieldNames())
print(StructField("name",StringType(),False) in sampleDF.schema)

exist
True
True


True

True