In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, LongType, DateType, StringType, TimestampType

In [2]:
# Obtain the notebook's SparkSession (reuses an existing one if present,
# otherwise creates it) under the application name shown below.
spark = (
    SparkSession.builder
    .appName("Create Schema Manually")
    .getOrCreate()
)

### InferSchema

In [3]:
# Read the orders CSV, treating the first line as column names and letting
# Spark infer each column's type from the data, then show the resulting schema.
df_withSchema = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("./dataset/orders_wh.csv")
)
df_withSchema.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



### InferSchema + samplingRatio

In [4]:
# Row count of the inferred-schema DataFrame; gives a sense of how much data
# schema inference has to look at before trying samplingRatio in the next cell.
df_withSchema.count()

68883

In [5]:
# Same inferred-schema read, but with samplingRatio=0.1 passed to the reader
# so inference is based on a sample of the rows; print the schema to compare
# it against the full-inference result.
df_withSchemaSampling = (
    spark.read
    .options(header=True, inferSchema=True, samplingRatio=0.1)
    .csv("./dataset/orders_wh.csv")
)
df_withSchemaSampling.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



### Create Schema Manually

In [6]:
# Baseline: read with only the header option (no inference, no explicit
# schema) and print the schema Spark assigns in that case.
df_skipSchema = (
    spark.read
    .option("header", True)
    .csv("./dataset/orders_wh.csv")
)
df_skipSchema.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)



In [7]:
# Schema expressed as a DDL-style string: "<column> <type>" pairs separated by
# ", ". Adjacent literals are concatenated into a single string, one column
# per line for readability.
schema_text = (
    "order_id long, "
    "order_date timestamp, "
    "customer_id long, "
    "order_status string"
)

In [9]:
# Read the CSV with the string-form schema applied up front (no inference),
# then print the schema to confirm the declared column types were used.
df_1 = (
    spark.read
    .schema(schema_text)
    .option("header", True)
    .csv("./dataset/orders_wh.csv")
)
df_1.printSchema()


root
 |-- order_id: long (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [8]:
# Programmatic equivalent of the string schema: one StructField per column,
# built from (name, type) pairs in file order. Nullability is left at the
# StructField default.
schema_struct = StructType(
    [
        StructField(column_name, column_type)
        for column_name, column_type in (
            ("order_id", LongType()),
            ("order_date", TimestampType()),
            ("customer_id", LongType()),
            ("order_status", StringType()),
        )
    ]
)

In [10]:
# Read the CSV using the StructType schema object instead of the string form,
# then print the schema to confirm it matches the declared fields.
df_2 = spark.read.csv(
    path="./dataset/orders_wh.csv",
    schema=schema_struct,
    header=True,
)
df_2.printSchema()


root
 |-- order_id: long (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- order_status: string (nullable = true)

