In [None]:
from pyspark.sql import SparkSession

In [1]:
# Reuse the active SparkSession if there is one; otherwise start one under this app name.
spark = (
    SparkSession.builder
    .appName("Schema Enforcement in Spark")
    .getOrCreate()
)

25/02/03 13:11:25 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [None]:
# Baseline: let Spark infer the column types instead of declaring them.
# inferSchema makes Spark scan the file an extra time to guess the types, and
# the guessed types can be wrong or inconsistent, so this is not a reliable contract.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/data/customers_500mb.csv")
)

# StructType schema

In [5]:
# Print the beginning of the file on HDFS to see the header row and a few records before declaring a schema.
!hadoop fs -head /data/customers_100.csv

customer_id,name,city,state,country,registration_date,is_active
0,Customer_0,Pune,Maharashtra,India,2023-06-29,False
1,Customer_1,Bangalore,Tamil Nadu,India,2023-12-07,True
2,Customer_2,Hyderabad,Gujarat,India,2023-10-27,True
3,Customer_3,Bangalore,Karnataka,India,2023-10-17,False
4,Customer_4,Ahmedabad,Karnataka,India,2023-03-14,False
5,Customer_5,Hyderabad,Karnataka,India,2023-07-28,False
6,Customer_6,Pune,Delhi,India,2023-08-29,False
7,Customer_7,Ahmedabad,West Bengal,India,2023-12-28,True
8,Customer_8,Pune,Karnataka,India,2023-06-22,True
9,Customer_9,Mumbai,Telangana,India,2023-01-05,True
10,Customer_10,Pune,Gujarat,India,2023-08-05,True
11,Customer_11,Delhi,West Bengal,India,2023-08-02,False
12,Customer_12,Chennai,Gujarat,India,2023-11-21,False
13,Customer_13,Chennai,Karnataka,India,2023-11-06,True
14,Customer_14,Hyderabad,Tamil Nadu,India,2023-02-07,False
15,Customer_15,Mumbai,Gujarat,India,2023-03-02,True
16,Customer_16,Chennai,Karnataka,India,2023-04-05,False
17,Customer_17,Hyd

In [4]:
from pyspark.sql.types import StructType,StructField, IntegerType, FloatType, BooleanType, StringType

In [15]:
# Explicit schema for the customers CSV: one field per column, in file order,
# every field nullable. registration_date is kept as a plain string.
schema = (
    StructType()
    .add("customer_id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("city", StringType(), True)
    .add("state", StringType(), True)
    .add("country", StringType(), True)
    .add("registration_date", StringType(), True)
    .add("is_active", BooleanType(), True)
)

In [13]:
# Load the sample file with the declared StructType schema instead of asking
# Spark to infer the column types.
# NOTE(review): no `mode` option is set, so the reader's default parse mode
# applies to rows that do not fit the schema — confirm that is intended.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(schema)
    .load("/data/customers_100.csv")
)

In [10]:
# Preview the parsed rows to check the values read under the declared schema.
df.show()

+-----------+-----------+---------+-----------+-------+-----------------+---------+
|customer_id|       name|     city|      state|country|registration_date|is_active|
+-----------+-----------+---------+-----------+-------+-----------------+---------+
|          0| Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    false|
|          1| Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2| Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
|          3| Customer_3|Bangalore|  Karnataka|  India|       2023-10-17|    false|
|          4| Customer_4|Ahmedabad|  Karnataka|  India|       2023-03-14|    false|
|          5| Customer_5|Hyderabad|  Karnataka|  India|       2023-07-28|    false|
|          6| Customer_6|     Pune|      Delhi|  India|       2023-08-29|    false|
|          7| Customer_7|Ahmedabad|West Bengal|  India|       2023-12-28|     true|
|          8| Customer_8|     Pune|  Karnataka|  India|       2023-06-22|   

In [11]:
# Confirm the DataFrame carries the column types declared in `schema`.
df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- registration_date: string (nullable = true)
 |-- is_active: boolean (nullable = true)



# DDL Schema

In [26]:
# The same schema written as a DDL string, one column per line.
# Adjacent literals are concatenated into a single string at compile time.
ddl_schema = (
    'customer_id INT, '
    'name STRING, '
    'city STRING,'
    'state STRING, '
    'country STRING, '
    'registration_date STRING, '
    'is_active BOOLEAN'
)

In [27]:
# Load the sample file again, this time passing the schema as a DDL string
# rather than a StructType object.
df_ddl = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(ddl_schema)
    .load("/data/customers_100.csv")
)

In [28]:
# Confirm the DataFrame carries the column types declared in `ddl_schema`.
df_ddl.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- registration_date: string (nullable = true)



In [29]:
# Preview the rows read with the DDL-string schema.
df_ddl.show()

+-----------+-----------+---------+-----------+-------+-----------------+
|customer_id|       name|     city|      state|country|registration_date|
+-----------+-----------+---------+-----------+-------+-----------------+
|          0| Customer_0|     Pune|Maharashtra|  India|       2023-06-29|
|          1| Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|
|          2| Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|
|          3| Customer_3|Bangalore|  Karnataka|  India|       2023-10-17|
|          4| Customer_4|Ahmedabad|  Karnataka|  India|       2023-03-14|
|          5| Customer_5|Hyderabad|  Karnataka|  India|       2023-07-28|
|          6| Customer_6|     Pune|      Delhi|  India|       2023-08-29|
|          7| Customer_7|Ahmedabad|West Bengal|  India|       2023-12-28|
|          8| Customer_8|     Pune|  Karnataka|  India|       2023-06-22|
|          9| Customer_9|   Mumbai|  Telangana|  India|       2023-01-05|
|         10|Customer_10|     Pune|   