In [5]:
%session_id_prefix native-delta-dataframe-
%glue_version 3.0
%idle_timeout 60
%%configure
{
  "--conf": "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog",
  "--datalake-formats": "delta"
}

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.4 
Setting session ID prefix to native-delta-dataframe-
Setting Glue version to: 3.0
Current idle_timeout is None minutes.
idle_timeout has been set to 60 minutes.
The following configurations have been updated: {'--conf': 'spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog', '--datalake-formats': 'delta'}


In [10]:
bucket_name = "lakehouse-deltalake"
bucket_prefix = "/data"
database_name = "tpc_h_sf1"
database_prefix = f"{bucket_prefix}/{database_name}"
database_location = f"s3://{bucket_name}/{database_prefix}/"
table_name = "customers"
table_prefix = f"{database_prefix}/{table_name}"
table_location = f"s3://{bucket_name}/{table_prefix}/"




In [2]:
import boto3

## Delete files in S3
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
bucket.objects.filter(Prefix=f"{table_prefix}/").delete()

## Drop tables in Glue Data Catalog
try:
    glue = boto3.client('glue')
    glue.delete_table(DatabaseName=database_name, Name=table_name)
except glue.exceptions.EntityNotFoundException:
    print(f"Table {database_name}.{table_name} does not exist")
try:
    glue = boto3.client('glue')
    glue.delete_table(DatabaseName=database_name, Name='testTable')
except glue.exceptions.EntityNotFoundException:
    print(f"Table {database_name}.testTable does not exist")

Table tpc-h-sf1.customers does not exist
Table tpc-h-sf1.testTable does not exist


## Create Database in Glue Catalog

In [11]:
try:
    glue = boto3.client('glue')
    res = glue.get_database(Name=database_name)
    print(f"Database {database_name} exists.")
    if 'LocationUri' not in res['Database']:
        print(f"Warning: Database {database_name} does not have Location. You need to configure location in the database.")
except glue.exceptions.EntityNotFoundException:
    print(f"Database {database_name} does not exist.")
    glue = glue.create_database(
        DatabaseInput={
            'Name': database_name,
            'LocationUri': database_location
        }
    )
    print(f"Created a new database {database_name}.")

Database tpc_h_sf1 does not exist.
Created a new database tpc_h_sf1.


In [7]:
from pyspark.sql import Row
import time

ut = time.time()

df_customers = spark.read.options(delimiter="|", header=False).csv("s3://tpc-h-dataset/sf1/customer/")
df_customers.show()

+---+------------------+--------------------+---+---------------+-------+----------+--------------------+----+
|_c0|               _c1|                 _c2|_c3|            _c4|    _c5|       _c6|                 _c7| _c8|
+---+------------------+--------------------+---+---------------+-------+----------+--------------------+----+
|  1|Customer#000000001|   IVhzIApeRb ot,c,E| 15|25-989-741-2988| 711.56|  BUILDING|to the even, regu...|null|
|  2|Customer#000000002|XSTf4,NCwDVaWNe6t...| 13|23-768-687-3665| 121.65|AUTOMOBILE|l accounts. blith...|null|
|  3|Customer#000000003|        MG9kdTD2WBHm|  1|11-719-748-3364|7498.12|AUTOMOBILE| deposits eat sly...|null|
|  4|Customer#000000004|         XxVSJsLAGtn|  4|14-128-190-5944|2866.83| MACHINERY| requests. final,...|null|
|  5|Customer#000000005|KvpyuHCplrB84WgAi...|  3|13-750-942-6364| 794.47| HOUSEHOLD|n accounts will h...|null|
|  6|Customer#000000006|sKZz0CsnMD7mp4Xd0...| 20|30-114-968-4951|7638.57|AUTOMOBILE|tions. even depos...|null|
|

In [12]:
# Create table in the metastore using DataFrame's schema and write data to it
df_customers.write.format("delta").mode("overwrite").option("path",table_location).saveAsTable(f"{database_name}.{table_name}")




In [13]:
# query table in the metastore
df_customers_read = spark.table(f"{database_name}.{table_name}")
df_customers_read.show()

+---+------------------+--------------------+---+---------------+-------+----------+--------------------+----+
|_c0|               _c1|                 _c2|_c3|            _c4|    _c5|       _c6|                 _c7| _c8|
+---+------------------+--------------------+---+---------------+-------+----------+--------------------+----+
|  1|Customer#000000001|   IVhzIApeRb ot,c,E| 15|25-989-741-2988| 711.56|  BUILDING|to the even, regu...|null|
|  2|Customer#000000002|XSTf4,NCwDVaWNe6t...| 13|23-768-687-3665| 121.65|AUTOMOBILE|l accounts. blith...|null|
|  3|Customer#000000003|        MG9kdTD2WBHm|  1|11-719-748-3364|7498.12|AUTOMOBILE| deposits eat sly...|null|
|  4|Customer#000000004|         XxVSJsLAGtn|  4|14-128-190-5944|2866.83| MACHINERY| requests. final,...|null|
|  5|Customer#000000005|KvpyuHCplrB84WgAi...|  3|13-750-942-6364| 794.47| HOUSEHOLD|n accounts will h...|null|
|  6|Customer#000000006|sKZz0CsnMD7mp4Xd0...| 20|30-114-968-4951|7638.57|AUTOMOBILE|tions. even depos...|null|
|

In [14]:
%%sql
SELECT COUNT(*) FROM tpc_h_sf1.customers;

+--------+
|count(1)|
+--------+
|  150000|
+--------+


In [16]:
%%sql
CREATE TABLE region
(
    R_REGIONKEY INT,
    R_NAME STRING,
    R_COMMENT STRING
) USING DELTA;

AnalysisException: Table default.region already exists


In [19]:
%%sql
CREATE TABLE nation
(
    N_NATIONKEY INT,
    N_NAME STRING,
    N_REGIONKEY INT,
    N_COMMENT STRING
) USING DELTA LOCATION "s3://lakehouse-deltalake/data/tpc_h_sf1/nation";

IllegalArgumentException: Can not create a Path from an empty string


In [24]:
query = """
CREATE TABLE tpc_h_sf1.orders (
    O_ORDERKEY INT,
    O_CUSTKEY INT,
    O_ORDERSTATUS INT,
    O_TOTALPRICE INT,
    O_ORDERDATE DATE,
    O_ORDERPRIORITY STRING,
    O_CLERK STRING,
    O_SHIPPRIORITY INT,
    O_COMMENT STRING
) USING delta LOCATION 's3://lakehouse-deltalake/data/tpc_h_sf1/orders'
"""
spark.sql(query)

DataFrame[]
