### Standard Schema Definition

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Employee sample rows, ordered to match schema1 below:
# (firstname, middlename, lastname, id, gender, salary).
# Absent name parts and ids are written as empty strings rather than None.
data1 = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

# Flat schema assembled one field at a time. `id` is kept as a string,
# `salary` is an integer, and only `firstname` is declared non-nullable.
schema1 = (
    StructType()
    .add("firstname", StringType(), False)
    .add("middlename", StringType(), True)
    .add("lastname", StringType(), True)
    .add("id", StringType(), True)
    .add("gender", StringType(), True)
    .add("salary", IntegerType(), True)
)

df = spark.createDataFrame(data1, schema1)

# Inspect the result: schema tree, plain-text table, then the notebook's
# rich table rendering.
df.printSchema()
df.show()
df.display()

### Nested Schema Definition

In [0]:
# Same employees as a nested layout: the three name parts are grouped into an
# inner tuple that maps onto the `name` struct column of schema2.
data2 = [
    (("James", "", "Smith"), "36636", "M", 3000),
    (("Michael", "Rose", ""), "40288", "M", 4000),
    (("Robert", "", "Williams"), "42114", "M", 4000),
    (("Maria", "Anne", "Jones"), "39192", "F", 4000),
    (("Jen", "Mary", "Brown"), "", "F", -1),
]

# `name` is itself a StructType; its nullability is left at the default,
# while the inner `firstname` field is declared non-nullable.
schema2 = (
    StructType()
    .add(
        "name",
        StructType()
        .add("firstname", StringType(), False)
        .add("middlename", StringType(), True)
        .add("lastname", StringType(), True),
    )
    .add("id", StringType(), True)
    .add("gender", StringType(), True)
    .add("salary", IntegerType(), True)
)

df2 = spark.createDataFrame(data2, schema2)

# Inspect the result: schema tree, plain-text table, then the notebook's
# rich table rendering.
df2.printSchema()
df2.show()
df2.display()

### ArrayType and MapType

In [0]:
# Each row is (name struct, list of hobbies, dict of string properties).
data3 = [
    (("James", "", "Smith"), ["musing", "reading", "chess"], {"salary": "3000", "dept": "HR"}),
    (("Michael", "Rose", ""), ["musing", "playing", "chess"], {"salary": "3000", "dept": "HR"}),
    (("Robert", "", "Williams"), ["musing", "reading", "chess"], {"salary": "3000", "dept": "HR"}),
    (("Maria", "Anne", "Jones"), ["musing", "reading", "chess"], {"salary": "3000", "dept": "HR"}),
    (("Jen", "Mary", "Brown"), ["musing", "reading", "chess"], {"salary": "3000", "dept": "HR"}),
]

# Python lists map to ArrayType(StringType()) and dicts to a string-to-string
# MapType; `name` is a nested struct whose `firstname` is non-nullable.
schema3 = (
    StructType()
    .add(
        "name",
        StructType()
        .add("firstname", StringType(), False)
        .add("middlename", StringType(), True)
        .add("lastname", StringType(), True),
    )
    .add("hobbies", ArrayType(StringType()), True)
    .add("properties", MapType(StringType(), StringType()), True)
)

df3 = spark.createDataFrame(data3, schema3)

# Inspect the result: schema tree, plain-text table, then the notebook's
# rich table rendering.
df3.printSchema()
df3.show()
df3.display()

### Schema Definition for Reading Big Data Files

In [0]:
%fs
head /Volumes/workspace/default/mandy/manufacturers.csv

In [0]:
# Explicit two-column schema applied while reading the CSV, so the reader uses
# these string types instead of a schema derived from the file.
# NOTE(review): file-based sources typically load every column as nullable
# regardless of the `False` flag below — confirm with df.printSchema().
mySchema = (
    StructType()
    .add("Manufacturers", StringType(), False)
    .add("Country", StringType(), True)
)

# The first line of the file is treated as a header row, not as data.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(mySchema)
    .load("/Volumes/workspace/default/mandy/manufacturers.csv")
)
df.display()

### Second Method of Creating Schema

In [0]:
# The schema is given as a DDL-style string ("<column> <TYPE>, ...") rather
# than as a StructType object.
schemaInline = "Manufacturers STRING, Country STRING"

# The first line of the file is treated as a header row, not as data.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(schemaInline)
    .load("/Volumes/workspace/default/mandy/manufacturers.csv")
)
df.display()

### Third Method of Creating Schema

In [0]:
# Rows are ordered to match the column names listed in `schema` below.
data = [
    ("Mike", "M", 50000, 2),
    ("Nancy", "F", 45000, 3),
    ("Thomas", "M", 47000, 2),
    ("William", "M", 40000, 4),
    ("Rosy", "F", 35000, 5),
]

# Only column names are supplied; no types are declared here, so Spark
# infers each column's type from the data.
schema = ["EmployeeName", "Gender", "Salary", "YOE"]

df = spark.createDataFrame(data=data, schema=schema)
df.display()

### Create Schema Along with Dataframe

In [0]:
# Rows and column names are passed inline in a single createDataFrame call;
# no types are declared, so Spark infers them from the data.
# NOTE(review): "MGP" looks like a typo for "MPG"; the literal is kept as-is
# because other cells may reference the column by this name — confirm.
df = spark.createDataFrame(
    data=[
        ("Mazda RX4", 21, 4, 4),
        ("Hornet 4 Drive", 22, 3, 2),
        ("Merc 240D", 25, 4, 2),
        ("Lotus Europa", 31, 5, 2),
        ("Ferrari Dino", 20, 5, 6),
        ("Volvo 142 E", 22, 4, 2),
    ],
    schema=["CarName", "MGP", "Gear", "Carb"],
)

df.display()

### Schema Functions

In [0]:
# Print the schema of whichever DataFrame `df` currently holds as an indented tree.
df.printSchema()

In [0]:
# Print the schema object of the current `df` using its text representation.
print(df.schema)

In [0]:
# Print the schema of the current `df` serialized as a JSON string.
print(df.schema.json())