In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the notebook's Spark session, running locally on all cores.
# NOTE(review): with a local[*] master the executors presumably share the driver
# process, so spark.executor.memory may have no effect here — confirm whether
# spark.driver.memory was intended.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("read")
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

# StructType Schema

In [6]:
from pyspark.sql.types import *

In [7]:
# Explicit schema for customers.csv: one (column name, Spark type) pair per
# column, in file order. Every column is declared nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("customer_id", IntegerType()),
        ("name", StringType()),
        ("city", StringType()),
        ("state", StringType()),
        ("country", StringType()),
        ("registration_date", DateType()),
        ("is_active", BooleanType()),
    )
])

In [8]:
# Read the customers CSV with the programmatic StructType schema instead of
# letting Spark infer column types; the first line of the file is the header.
df = (
    spark.read
    .schema(schema)
    .option('header', 'true')
    .format('csv')
    .load('../Data/customers.csv')
)

In [9]:
# Print the DataFrame as a text table to eyeball the values parsed with the
# StructType schema.
df.show()

+-----------+-----------+---------+-----------+-------+-----------------+---------+
|customer_id|       name|     city|      state|country|registration_date|is_active|
+-----------+-----------+---------+-----------+-------+-----------------+---------+
|          0| Customer_0|     Pune|West Bengal|  India|       2023-10-10|     true|
|          1| Customer_1|Bangalore|    Gujarat|  India|       2023-10-19|    false|
|          2| Customer_2|Bangalore|  Karnataka|  India|       2023-02-10|     true|
|          3| Customer_3|Bangalore|  Telangana|  India|       2023-03-24|     true|
|          4| Customer_4|Hyderabad|  Telangana|  India|       2023-06-04|    false|
|          5| Customer_5|Hyderabad|West Bengal|  India|       2023-07-26|     true|
|          6| Customer_6|Hyderabad|  Karnataka|  India|       2023-08-07|    false|
|          7| Customer_7|Bangalore|  Telangana|  India|       2023-08-25|     true|
|          8| Customer_8|Bangalore|Maharashtra|  India|       2023-07-13|   

In [10]:
# Print column names, types and nullability to confirm the StructType schema
# was applied to the loaded data.
df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- registration_date: date (nullable = true)
 |-- is_active: boolean (nullable = true)



# DDL Schema

In [11]:
# The same column layout written as a DDL string ("<name> <TYPE>" entries
# separated by ", ").
# NOTE(review): registration_date is STRING here, whereas the StructType schema
# earlier in the notebook uses DateType — confirm whether that is intentional.
ddl_schema = ', '.join((
    'customer_id INT',
    'name STRING',
    'city STRING',
    'state STRING',
    'country STRING',
    'registration_date STRING',
    'is_active BOOLEAN',
))

In [12]:
# Read the same CSV again, this time passing the schema as a DDL string rather
# than a StructType object; the first line of the file is the header.
df_ddl = (
    spark.read
    .schema(ddl_schema)
    .option('header', 'true')
    .format('csv')
    .load('../Data/customers.csv')
)

In [13]:
# Print the schema produced by the DDL string; registration_date is reported
# as string because the DDL declares it STRING.
df_ddl.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- registration_date: string (nullable = true)
 |-- is_active: boolean (nullable = true)



In [14]:
# Print the DDL-schema DataFrame as a text table to eyeball the parsed values.
df_ddl.show()

+-----------+-----------+---------+-----------+-------+-----------------+---------+
|customer_id|       name|     city|      state|country|registration_date|is_active|
+-----------+-----------+---------+-----------+-------+-----------------+---------+
|          0| Customer_0|     Pune|West Bengal|  India|       2023-10-10|     true|
|          1| Customer_1|Bangalore|    Gujarat|  India|       2023-10-19|    false|
|          2| Customer_2|Bangalore|  Karnataka|  India|       2023-02-10|     true|
|          3| Customer_3|Bangalore|  Telangana|  India|       2023-03-24|     true|
|          4| Customer_4|Hyderabad|  Telangana|  India|       2023-06-04|    false|
|          5| Customer_5|Hyderabad|West Bengal|  India|       2023-07-26|     true|
|          6| Customer_6|Hyderabad|  Karnataka|  India|       2023-08-07|    false|
|          7| Customer_7|Bangalore|  Telangana|  India|       2023-08-25|     true|
|          8| Customer_8|Bangalore|Maharashtra|  India|       2023-07-13|   