In [5]:
%session_id_prefix native-delta-dataframe-
%glue_version 3.0
%idle_timeout 60
%%configure
{
  "--conf": "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog",
  "--datalake-formats": "delta"
}

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.4 
Setting session ID prefix to native-delta-dataframe-
Setting Glue version to: 3.0
Current idle_timeout is None minutes.
idle_timeout has been set to 60 minutes.
The following configurations have been updated: {'--conf': 'spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog', '--datalake-formats': 'delta'}


In [1]:
bucket_name = "lakehouse-deltalake"
bucket_prefix = "/data"
database_name = "tpc_h_sf1"
database_prefix = f"{bucket_prefix}/{database_name}"
database_location = f"s3://{bucket_name}/{database_prefix}/"
table_name = "customers"
table_prefix = f"{database_prefix}/{table_name}"
table_location = f"s3://{bucket_name}/{table_prefix}/"

Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 5
Session ID: e6bec54d-b749-43a9-84c7-5cff7ccc68e1
Applying the following default arguments:
--glue_kernel_version 1.0.4
--enable-glue-datacatalog true
--conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog
--datalake-formats delta
Waiting for session e6bec54d-b749-43a9-84c7-5cff7ccc68e1 to get into ready status...
Session e6bec54d-b749-43a9-84c7-5cff7ccc68e1 has been created.



In [2]:
import boto3

## Delete files in S3
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
bucket.objects.filter(Prefix=f"{table_prefix}/").delete()

## Drop tables in Glue Data Catalog
try:
    glue = boto3.client('glue')
    glue.delete_table(DatabaseName=database_name, Name=table_name)
except glue.exceptions.EntityNotFoundException:
    print(f"Table {database_name}.{table_name} does not exist")
try:
    glue = boto3.client('glue')
    glue.delete_table(DatabaseName=database_name, Name='testTable')
except glue.exceptions.EntityNotFoundException:
    print(f"Table {database_name}.{table_name} does not exist")

Table tpc_h_sf1.customers does not exist
Table tpc_h_sf1.testTable does not exist


## Create Database in Glue Catalog

In [4]:
try:
    glue = boto3.client('glue')
    res = glue.get_database(Name=database_name)
    print(f"Database {database_name} exists.")
    if 'LocationUri' not in res['Database']:
        print(f"Warning: Database {database_name} does not have Location. You need to configure location in the database.")
except glue.exceptions.EntityNotFoundException:
    print(f"Database {database_name} does not exist.")
    glue = glue.create_database(
        DatabaseInput={
            'Name': database_name,
            'LocationUri': database_location
        }
    )
    print(f"Created a new database {database_name}.")

Database tpc_h_sf1 exists.


In [5]:
from pyspark.sql import Row
import time

ut = time.time()

df_customers = spark.read.options(delimiter="|", header=False).csv("s3://tpc-h-dataset/sf1/customer/")
df_customers.show()

+---+------------------+--------------------+---+---------------+-------+----------+--------------------+----+
|_c0|               _c1|                 _c2|_c3|            _c4|    _c5|       _c6|                 _c7| _c8|
+---+------------------+--------------------+---+---------------+-------+----------+--------------------+----+
|  1|Customer#000000001|   IVhzIApeRb ot,c,E| 15|25-989-741-2988| 711.56|  BUILDING|to the even, regu...|null|
|  2|Customer#000000002|XSTf4,NCwDVaWNe6t...| 13|23-768-687-3665| 121.65|AUTOMOBILE|l accounts. blith...|null|
|  3|Customer#000000003|        MG9kdTD2WBHm|  1|11-719-748-3364|7498.12|AUTOMOBILE| deposits eat sly...|null|
|  4|Customer#000000004|         XxVSJsLAGtn|  4|14-128-190-5944|2866.83| MACHINERY| requests. final,...|null|
|  5|Customer#000000005|KvpyuHCplrB84WgAi...|  3|13-750-942-6364| 794.47| HOUSEHOLD|n accounts will h...|null|
|  6|Customer#000000006|sKZz0CsnMD7mp4Xd0...| 20|30-114-968-4951|7638.57|AUTOMOBILE|tions. even depos...|null|
|

In [6]:
# Create table in the metastore using DataFrame's schema and write data to it
df_customers.write.format("delta").mode("overwrite").option("path",table_location).saveAsTable(f"{database_name}.{table_name}")




In [7]:
# query table in the metastore
df_customers_read = spark.table(f"{database_name}.{table_name}")
df_customers_read.show()

+---+------------------+--------------------+---+---------------+-------+----------+--------------------+----+
|_c0|               _c1|                 _c2|_c3|            _c4|    _c5|       _c6|                 _c7| _c8|
+---+------------------+--------------------+---+---------------+-------+----------+--------------------+----+
|  1|Customer#000000001|   IVhzIApeRb ot,c,E| 15|25-989-741-2988| 711.56|  BUILDING|to the even, regu...|null|
|  2|Customer#000000002|XSTf4,NCwDVaWNe6t...| 13|23-768-687-3665| 121.65|AUTOMOBILE|l accounts. blith...|null|
|  3|Customer#000000003|        MG9kdTD2WBHm|  1|11-719-748-3364|7498.12|AUTOMOBILE| deposits eat sly...|null|
|  4|Customer#000000004|         XxVSJsLAGtn|  4|14-128-190-5944|2866.83| MACHINERY| requests. final,...|null|
|  5|Customer#000000005|KvpyuHCplrB84WgAi...|  3|13-750-942-6364| 794.47| HOUSEHOLD|n accounts will h...|null|
|  6|Customer#000000006|sKZz0CsnMD7mp4Xd0...| 20|30-114-968-4951|7638.57|AUTOMOBILE|tions. even depos...|null|
|

In [14]:
%%sql
SELECT COUNT(*) FROM tpc_h_sf1.customers;

+--------+
|count(1)|
+--------+
|  150000|
+--------+


In [16]:
%%sql
CREATE TABLE region
(
    R_REGIONKEY INT,
    R_NAME STRING,
    R_COMMENT STRING
) USING DELTA;

AnalysisException: Table default.region already exists


In [19]:
%%sql
CREATE TABLE nation
(
    N_NATIONKEY INT,
    N_NAME STRING,
    N_REGIONKEY INT,
    N_COMMENT STRING
) USING DELTA LOCATION "s3://lakehouse-deltalake/data/tpc_h_sf1/nation";

IllegalArgumentException: Can not create a Path from an empty string


In [52]:
query = """
CREATE TABLE tpc_h_sf1.orders (
    O_ORDERKEY INT,
    O_CUSTKEY INT,
    O_ORDERSTATUS STRING,
    O_TOTALPRICE DOUBLE,
    O_ORDERDATE DATE,
    O_ORDERPRIORITY STRING,
    O_CLERK STRING,
    O_SHIPPRIORITY INT,
    O_COMMENT STRING
) USING delta LOCATION 's3://lakehouse-deltalake/data/tpc_h_sf1/orders'
"""
spark.sql(query)

DataFrame[]


In [8]:
from delta.tables import *
deltaTable = DeltaTable.forPath("s3://lakehouse-deltalake/data/tpc_h_sf1/customers")
deltaTable.generate("symlink_format_manifest")

TypeError: forPath() missing 1 required positional argument: 'path'


In [58]:
%%sql
GENERATE symlink_format_manifest FOR TABLE delta.`s3://lakehouse-deltalake/data/tpc_h_sf1/orders`

++
||
++
++


In [53]:
df_orders = spark.read.options(delimiter="|", header=False, inferSchema=True).csv("s3://tpc-h-dataset/sf1/orders/")
existing_table_df = spark.read.format("delta").load("s3://lakehouse-deltalake/data/tpc_h_sf1/orders")
#df_orders = df_orders.drop(df_orders.columns[-1])
#df_orders = df_orders.toDF(*existing_table_df.columns)
df_orders.show()
print(df_orders.schema)
print(existing_table_df.schema)
print("---")
df_orders = df_orders.drop(df_orders.columns[-1])
df_orders = df_orders.toDF(*existing_table_df.columns)
print(df_orders.schema)
print(existing_table_df.schema)
#df_orders.write.format("delta").mode("overwrite").option("path","s3://lakehouse-deltalake/data/tpc_h_sf1/orders").saveAsTable(f"{database_name}.orders")

AnalysisException: Failed to merge fields 'O_ORDERDATE' and 'O_ORDERDATE'. Failed to merge incompatible data types DateType and StringType


In [54]:
# Read the schema of the Delta table
delta_table_schema = spark.read.format("delta").load("s3://lakehouse-deltalake/data/tpc_h_sf1/orders").schema

# Define the schema for the CSV file based on the Delta table schema
csv_schema = StructType([StructField(field.name, field.dataType, field.nullable) for field in delta_table_schema])
print(csv_schema)

# Read the CSV file into a DataFrame with defined schema
df_orders = spark.read.options(delimiter="|", header=False, inferSchema=True).schema(csv_schema).csv("s3://tpc-h-dataset/sf1/orders/")

df_orders.write.format("delta").mode("overwrite").option("path","s3://lakehouse-deltalake/data/tpc_h_sf1/orders").saveAsTable(f"{database_name}.orders")

StructType(List(StructField(O_ORDERKEY,IntegerType,true),StructField(O_CUSTKEY,IntegerType,true),StructField(O_ORDERSTATUS,StringType,true),StructField(O_TOTALPRICE,DoubleType,true),StructField(O_ORDERDATE,DateType,true),StructField(O_ORDERPRIORITY,StringType,true),StructField(O_CLERK,StringType,true),StructField(O_SHIPPRIORITY,IntegerType,true),StructField(O_COMMENT,StringType,true)))


In [55]:
df_customers_read = spark.table(f"{database_name}.orders")
df_customers_read.show()

+----------+---------+-------------+------------+-----------+---------------+---------------+--------------+--------------------+
|O_ORDERKEY|O_CUSTKEY|O_ORDERSTATUS|O_TOTALPRICE|O_ORDERDATE|O_ORDERPRIORITY|        O_CLERK|O_SHIPPRIORITY|           O_COMMENT|
+----------+---------+-------------+------------+-----------+---------------+---------------+--------------+--------------------+
|   3600001|   106660|            O|   130445.43| 1995-10-25|       3-MEDIUM|Clerk#000000776|             0|nusual pinto bean...|
|   3600002|   106873|            O|   226263.36| 1997-06-26|       1-URGENT|Clerk#000000337|             0|ctions sleep busi...|
|   3600003|   112288|            F|   110840.45| 1993-06-19|       3-MEDIUM|Clerk#000000319|             0|ts. final, unusua...|
|   3600004|    59149|            O|    45849.03| 1997-12-24|          5-LOW|Clerk#000000011|             0|s haggle against ...|
|   3600005|    42071|            O|   124317.01| 1996-10-04|          5-LOW|Clerk#0000001

In [50]:
%%sql
DROP TABLE tpc_h_sf1.orders

++
||
++
++


In [56]:
df_orders = spark.read.options(delimiter="|", header=False, inferSchema=True).csv("s3://tpc-h-dataset/sf1/orders/")
df_orders.show()

+-------+------+---+---------+----------+---------------+---------------+---+--------------------+----+
|    _c0|   _c1|_c2|      _c3|       _c4|            _c5|            _c6|_c7|                 _c8| _c9|
+-------+------+---+---------+----------+---------------+---------------+---+--------------------+----+
|1200001|121361|  F| 60106.33|1994-01-24|       1-URGENT|Clerk#000000340|  0|ourts are careful...|null|
|1200002|  1775|  O|194561.08|1996-12-06|         2-HIGH|Clerk#000000709|  0|ts. ironic sheave...|null|
|1200003|122593|  F| 10061.57|1994-01-23|         2-HIGH|Clerk#000000141|  0|bravely final acc...|null|
|1200004|  6394|  O|206408.82|1996-06-10|       3-MEDIUM|Clerk#000000545|  0|posits wake caref...|null|
|1200005| 57130|  F|234800.62|1995-01-04|         2-HIGH|Clerk#000000473|  0|tes. fluffily eve...|null|
|1200006|  7114|  F|140726.22|1993-10-30|          5-LOW|Clerk#000000634|  0|thely regular packag|null|
|1200007|146152|  O|167797.91|1998-04-18|       1-URGENT|Clerk#0

In [57]:
%%sql
SELECT * FROM tpc_h_sf1.orders WHERE O_ORDERKEY = "1200002";

+----------+---------+-------------+------------+-----------+---------------+---------------+--------------+--------------------+
|O_ORDERKEY|O_CUSTKEY|O_ORDERSTATUS|O_TOTALPRICE|O_ORDERDATE|O_ORDERPRIORITY|        O_CLERK|O_SHIPPRIORITY|           O_COMMENT|
+----------+---------+-------------+------------+-----------+---------------+---------------+--------------+--------------------+
|   1200002|     1775|            O|   194561.08| 1996-12-06|         2-HIGH|Clerk#000000709|             0|ts. ironic sheave...|
+----------+---------+-------------+------------+-----------+---------------+---------------+--------------+--------------------+
