## 1. Pyspark

In [0]:
# Delta Lake supports creating two types of tables—tables defined in the metastore and tables defined by path.
# we can use 3 method :
# 1.    create
# 2.    createIfNotExists
# 3.    createOrReplace

In [0]:
from delta.tables import * 
# Create table in the metastore
DeltaTable.createIfNotExists(spark) \
  .tableName("default.people") \
  .addColumn("id", "INT") \
  .addColumn("firstName", "STRING") \
  .addColumn("middleName", "STRING") \
  .addColumn("lastName", "STRING", comment = "surname") \
  .addColumn("gender", "STRING") \
  .addColumn("birthDate", "TIMESTAMP") \
  .addColumn("ssn", "STRING") \
  .addColumn("salary", "INT") \
  .execute()

Out[3]: <delta.tables.DeltaTable at 0x7f821aea4280>

In [0]:
%sql 
select * from people

id,firstName,middleName,lastName,gender,birthDate,ssn,salary


In [0]:
# Create or replace table with path and add properties
DeltaTable.createOrReplace(spark) \
  .addColumn("id", "INT") \
  .addColumn("firstName", "STRING") \
  .addColumn("middleName", "STRING") \
  .addColumn("lastName", "STRING", comment = "surname") \
  .addColumn("gender", "STRING") \
  .addColumn("birthDate", "TIMESTAMP") \
  .addColumn("ssn", "STRING") \
  .addColumn("salary", "INT") \
  .property("description", "table with people data") \
  .location("/tmp/delta/people10m") \
  .execute()

Out[5]: <delta.tables.DeltaTable at 0x7f821a60c220>

##2. sql

In [0]:
%sql
CREATE TABLE IF NOT EXISTS default.people10m (
  id INT,
  firstName STRING,
  middleName STRING,
  lastName STRING,
  gender STRING,
  birthDate TIMESTAMP,
  ssn STRING,
  salary INT
) USING DELTA



In [0]:
%sql
CREATE OR REPLACE TABLE default.people10m (
  id INT,
  firstName STRING,
  middleName STRING,
  lastName STRING,
  gender STRING,
  birthDate TIMESTAMP,
  ssn STRING,
  salary INT
) USING DELTA

In [0]:
%sql
-- Create or replace table with path
CREATE OR REPLACE TABLE delta.'/tmp/delta/people10m' (
  id INT,
  firstName STRING,
  middleName STRING,
  lastName STRING,
  gender STRING,
  birthDate TIMESTAMP,
  ssn STRING,
  salary INT
) USING DELTA

In [0]:
%sql
CREATE TABLE employee_demo(
  emp_id INT,
  emp_name STRING,
  gender STRING,
  salary INT,
  dept STRING
)USING DELTA

In [0]:
%sql
CREATE TABLE IF NOT EXISTS employee_demo(
  emp_id INT,
  emp_name STRING,
  gender STRING,
  salary INT,
  dept STRING
)USING DELTA

####### Incase of Pyspark createIfNotExists and Incase of sql IF NOT EXISTS.

###3.DataFrame Api

In [0]:
employee_data = [(100,"Stephen","M",1000,"IT"),
                 (200,"Mark","M",2000,"HR"),
                 (300,"Rafel","F",3000,"Sales")
                 ]
employee_schema = ["emp_id","emp_name","gender","salary","dept"]
df = spark.createDataFrame(data= employee_data,schema=employee_schema)

df.show()

+------+--------+------+------+-----+
|emp_id|emp_name|gender|salary| dept|
+------+--------+------+------+-----+
|   100| Stephen|     M|  1000|   IT|
|   200|    Mark|     M|  2000|   HR|
|   300|   Rafel|     F|  3000|Sales|
+------+--------+------+------+-----+



In [0]:
# Create Table in the metastore using Dataframe Schema and write data to it
df.write.format("delta").saveAsTable("employee_demo_1")

In [0]:
%sql
select * from employee_demo_1

emp_id,emp_name,gender,salary,dept
300,Rafel,F,3000,Sales
100,Stephen,M,1000,IT
200,Mark,M,2000,HR


##### Control data location
###### For tables defined in the metastore, you can optionally specify the LOCATION as a path. Tables created with a specified LOCATION are considered unmanaged by the metastore. Unlike a managed table, where no path is specified, an unmanaged table’s files are not deleted when you DROP the table.

In [0]:
%sql
CREATE TABLE default.people_location
USING DELTA
LOCATION '/tmp/delta/people_location'


#### Generated columns

In [0]:
from delta.tables import *
from pyspark.sql.functions import DataType 
DeltaTable.createIfNotExists(spark) \
  .tableName("default.people_generate_column") \
  .addColumn("id", "INT") \
  .addColumn("firstName", "STRING") \
  .addColumn("middleName", "STRING") \
  .addColumn("lastName", "STRING", comment = "surname") \
  .addColumn("gender", "STRING") \
  .addColumn("birthDate", "TIMESTAMP") \
  .addColumn("dateOfBirth", "DATE", generatedAlwaysAs="CAST(birthDate AS DATE)") \
  .addColumn("ssn", "STRING") \
  .addColumn("salary", "INT") \
  .partitionedBy("gender") \
  .execute()

Out[13]: <delta.tables.DeltaTable at 0x7f11ef37df10>

In [0]:
%sql 
select * from people_generate_column

id,firstName,middleName,lastName,gender,birthDate,dateOfBirth,ssn,salary


#### Generate partition filters

In [0]:
from delta.tables import *
from pyspark.sql.functions import DataType 

DeltaTable.create(spark) \
  .tableName("default.events") \
  .addColumn("eventId", "BIGINT") \
  .addColumn("data", "STRING") \
  .addColumn("eventType", "STRING") \
  .addColumn("eventTime", "TIMESTAMP") \
  .addColumn("eventDate", "DATE", generatedAlwaysAs="CAST(eventTime AS DATE)") \
  .partitionedBy("eventType", "eventDate") \
  .execute()

Out[20]: <delta.tables.DeltaTable at 0x7f11ef366730>

In [0]:

spark.sql('SELECT * FROM default.events WHERE eventTime >= "2020-10-01 00:00:00" <= "2020-10-01 12:00:00"')

Out[21]: DataFrame[eventId: bigint, data: string, eventType: string, eventTime: timestamp, eventDate: date]

In [0]:
from delta.tables import *
from pyspark.sql.functions import DataType 

DeltaTable.create(spark) \
  .tableName("default.events_02") \
  .addColumn("eventId", "BIGINT") \
  .addColumn("data", "STRING") \
  .addColumn("eventType", "STRING") \
  .addColumn("eventTime", "TIMESTAMP") \
  .addColumn("year", "INT", generatedAlwaysAs="YEAR(eventTime)") \
  .addColumn("month", "INT", generatedAlwaysAs="MONTH(eventTime)") \
  .addColumn("day", "INT", generatedAlwaysAs="DAY(eventTime)") \
  .partitionedBy("eventType", "year", "month", "day") \
  .execute()

Out[22]: <delta.tables.DeltaTable at 0x7f11ef2782b0>

In [0]:
spark.sql('SELECT * FROM default.events WHERE eventTime >= "2020-10-01 00:00:00" <= "2020-10-01 12:00:00"')

Out[26]: DataFrame[eventId: bigint, data: string, eventType: string, eventTime: timestamp, eventDate: date]