In [1]:
# Load the `dotenv` IPython extension and run its magic.
# NOTE(review): presumably this pulls AWS credentials from a local .env file — confirm.
%load_ext dotenv
%dotenv
# Glue interactive-session configuration: IAM role, region, idle timeout (minutes),
# Glue version, worker type and worker count for the session.
%iam_role arn:aws:iam::684969100054:role/AdminAccessGlueNotebook
%region eu-west-1
%idle_timeout 2880
%glue_version 3.0
%worker_type G.1X
%number_of_workers 10
# Glue connection attached to the session; the same name is passed as
# `connectionName` when reading TPC-DS tables in later cells.
%connections tpcds-connector-glue3

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Reuse the active SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# Glue wrapper around the SparkContext; every read/write below goes through it.
glue_context = GlueContext(sc)
# SparkSession exposed by the GlueContext.
spark = glue_context.spark_session
# Glue Job handle. NOTE(review): not referenced by the cells visible here.
job = Job(glue_context)

Current iam_role is arn:aws:iam::684969100054:role/aws-reserved/sso.amazonaws.com/eu-west-2/AWSReservedSSO_AdministratorAccess_ab408ccf26c25b37
iam_role has been set to arn:aws:iam::684969100054:role/AdminAccessGlueNotebook.
Previous region: eu-west-1
Setting new region to: eu-west-1
Reauthenticating Glue client with new region: eu-west-1
IAM role has been set to arn:aws:iam::684969100054:role/AdminAccessGlueNotebook. Reauthenticating.
Authenticating with environment variables and user-defined glue_role_arn: arn:aws:iam::684969100054:role/AdminAccessGlueNotebook
Authentication done.
Region is set to: eu-west-1
Current idle_timeout is 2880 minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 3.0
Previous worker type: G.1X
Setting new worker type to: G.1X
Previous number of workers: 5
Setting new number of workers to: 10
Connections to be included:
tpcds-connector-glue3
Trying to create a Glue session for the kernel.
Worker Type: G.1X
Number of Workers: 10
Session

In [2]:
# Smoke test for the connector: read the `time_dim` table through the
# marketplace Spark connection and print the first 10 records.
dyf = glue_context.create_dynamic_frame.from_options(
    connection_type="marketplace.spark",
    connection_options=dict(
        table="time_dim",
        scale=1,
        numPartitions=36,
        connectionName="tpcds-connector-glue3",
    ),
    transformation_ctx="dyf",
)

dyf.show(10)

{"t_time_sk": 0, "t_time_id": "AAAAAAAABAAAAAAA", "t_time": 0, "t_hour": 0, "t_minute": 0, "t_second": 0, "t_am_pm": "AM", "t_shift": "third", "t_sub_shift": "night", "t_meal_time": ""}
{"t_time_sk": 1, "t_time_id": "AAAAAAAACAAAAAAA", "t_time": 1, "t_hour": 0, "t_minute": 0, "t_second": 1, "t_am_pm": "AM", "t_shift": "third", "t_sub_shift": "night", "t_meal_time": ""}
{"t_time_sk": 2, "t_time_id": "AAAAAAAADAAAAAAA", "t_time": 2, "t_hour": 0, "t_minute": 0, "t_second": 2, "t_am_pm": "AM", "t_shift": "third", "t_sub_shift": "night", "t_meal_time": ""}
{"t_time_sk": 3, "t_time_id": "AAAAAAAAEAAAAAAA", "t_time": 3, "t_hour": 0, "t_minute": 0, "t_second": 3, "t_am_pm": "AM", "t_shift": "third", "t_sub_shift": "night", "t_meal_time": ""}
{"t_time_sk": 4, "t_time_id": "AAAAAAAAFAAAAAAA", "t_time": 4, "t_hour": 0, "t_minute": 0, "t_second": 4, "t_am_pm": "AM", "t_shift": "third", "t_sub_shift": "night", "t_meal_time": ""}
{"t_time_sk": 5, "t_time_id": "AAAAAAAAGAAAAAAA", "t_time": 5, "t_hour

In [2]:
# --- Where the generated data is written in S3 ---
bucket_name = "sb-test-bucket-ireland"
folder_name = "soumaya/tpcds"

# --- Generator settings ---
# `scale` is also appended to `database_prefix` to form the Glue database name.
scale = 1
partitions = 36
database_prefix = "sm_tpcds_"
extraction_timestamp = "2022-01-01"

# Tables to (re)generate. The other table names are kept below, commented
# out, so they can be re-enabled one by one.
tables = [
    "store_sales",
    "time_dim",
    "warehouse",
    # "call_center", "catalog_returns",
    # "catalog_page", "catalog_sales",
    # "customer", "customer_address", "customer_demographics",
    # "date_dim", "dbgen_version", "household_demographics",
    # "income_band", "inventory", "item", "promotion", "reason",
    # "ship_mode", "store", "store_returns",
    # "web_page", "web_returns", "web_sales", "web_site"
]




In [14]:
# Copy the local helper module to S3 with the AWS CLI; the next cell passes
# the resulting S3 URI to sc.addPyFile.
! aws s3 cp tpcds_functions.py s3://sb-test-bucket-ireland/tpcds/

upload: ./tpcds_functions.py to s3://sb-test-bucket-ireland/tpcds/tpcds_functions.py


In [3]:
# Register the uploaded helper module with the SparkContext.
# NOTE(review): in the recorded run the following
# `from tpcds_functions import ...` cell still raised ModuleNotFoundError, and
# the helpers are redefined inline further down. Glue's `%extra_py_files`
# session magic may be the intended mechanism — TODO confirm.
sc.addPyFile("s3://sb-test-bucket-ireland/tpcds/tpcds_functions.py")




In [3]:
from tpcds_functions import create_db_if_not_exists, generate_tables

ModuleNotFoundError: No module named 'tpcds_functions'


In [3]:
import boto3

def create_db_if_not_exists(database_prefix="tpcds_", scale=1):
    """Create the Glue Data Catalog database ``{database_prefix}{scale}``.

    Prints a message saying whether the database was created or was already
    present.

    Args:
        database_prefix: Prefix of the database name.
        scale: Value appended to ``database_prefix`` to form the name.

    Raises:
        Any error raised while building the Glue client or by
        ``create_database`` other than ``AlreadyExistsException`` propagates
        unchanged.
    """
    database_name = f"{database_prefix}{scale}"

    # Build the client *before* the ``try``. The ``except`` clause looks up
    # ``glue.exceptions``; if client creation failed inside the ``try``,
    # ``glue`` would be unbound there and the real error would be masked by
    # an UnboundLocalError.
    glue = boto3.client("glue")

    try:
        glue.create_database(DatabaseInput={"Name": database_name})
    except glue.exceptions.AlreadyExistsException:
        print(f"Database {database_name} already exist")
    else:
        print(f"New database {database_name} created")




In [4]:
# Make sure the target Glue database exists, using the notebook-level settings.
create_db_if_not_exists(
    database_prefix=database_prefix,
    scale=scale,
)

Database sm_tpcds_1 already exist


In [11]:
import pyspark.sql.functions as f
from awsglue.dynamicframe import DynamicFrame

def generate_tables(
    glue_context,
    tables=("store_sales",),
    bucket_name="sb-test-bucket-ireland",
    folder_name="tpcds",
    database_prefix="tpcds_",
    scale=1,
    partitions=36,
    extraction_timestamp="2022-01-01",
    connection_name="tpcds-connector-glue3",
):
    """Regenerate tables through the Glue connector and register them.

    For every name in ``tables`` the function:

    1. deletes the objects under
       ``{folder_name}/scale={scale}/table={table}/`` in ``bucket_name``;
    2. drops ``{database_prefix}{scale}.{table}`` from the Glue Data Catalog
       if it exists;
    3. reads the table through the ``marketplace.spark`` connection;
    4. adds an ``extraction_timestamp`` column (parsed from
       ``extraction_timestamp`` with the ``yyyy-MM-dd`` pattern) and a null
       string column ``op``; for ``store_sales`` it also adds ``pk`` =
       ``ss_ticket_number`` + ``"-"`` + ``ss_item_sk``;
    5. writes the result as ``glueparquet`` to the S3 path and updates the
       catalog table.

    Args:
        glue_context: GlueContext used for reading and writing.
        tables: Iterable of table names to generate.
        bucket_name: Destination S3 bucket.
        folder_name: Key prefix inside the bucket.
        database_prefix: Prefix of the Glue database name.
        scale: Passed to the connector as ``scale``; also part of the
            database name and of the S3 path.
        partitions: Passed to the connector as ``numPartitions``.
        extraction_timestamp: Date string in ``yyyy-MM-dd`` format.
        connection_name: Name of the Glue connection passed to the connector
            as ``connectionName``.

    Raises:
        Errors from boto3 (other than ``EntityNotFoundException`` on table
        deletion) and from Glue/Spark propagate unchanged.
    """
    database_name = f"{database_prefix}{scale}"

    # Create the AWS handles once, outside both the loop and the ``try``: the
    # ``except`` clause needs ``glue`` to be bound, otherwise a client
    # creation failure would be masked by an UnboundLocalError.
    glue = boto3.client("glue")
    bucket = boto3.resource("s3").Bucket(bucket_name)

    for table in tables:
        path = f"{folder_name}/scale={scale}/table={table}/"

        # Delete files in S3 left by a previous run.
        bucket.objects.filter(Prefix=path).delete()

        # Drop the table in the Glue Data Catalog.
        try:
            glue.delete_table(DatabaseName=database_name, Name=table)
            print(f"Table {database_name}.{table} dropped")
        except glue.exceptions.EntityNotFoundException:
            print(f"Table {database_name}.{table} does not exist")

        # Create a DynamicFrame for `table`.
        dyf = glue_context.create_dynamic_frame.from_options(
            connection_type="marketplace.spark",
            connection_options={
                "table": table,
                "scale": scale,
                "numPartitions": partitions,
                "connectionName": connection_name,
            },
            transformation_ctx="dyf",
        )

        df = (
            dyf.toDF()
            .withColumn(
                "extraction_timestamp",
                f.to_timestamp(f.lit(extraction_timestamp), "yyyy-MM-dd"),
            )
            .withColumn("op", f.lit(None).cast("string"))
        )
        if table == "store_sales":
            # Create a single pk column out of the two columns above.
            df = df.withColumn(
                "pk",
                f.concat(
                    f.col("ss_ticket_number").cast("string"),
                    f.lit("-"),
                    f.col("ss_item_sk").cast("string"),
                ),
            )
        dyf2 = DynamicFrame.fromDF(df, glue_context, "dyf2")

        # Write the DynamicFrame to S3 and register the table.
        sink = glue_context.getSink(
            connection_type="s3",
            path=f"s3://{bucket_name}/{path}",
            enableUpdateCatalog=True,
            updateBehavior="UPDATE_IN_DATABASE",
        )
        sink.setFormat("glueparquet")
        sink.setCatalogInfo(catalogDatabase=database_name, catalogTableName=table)
        sink.writeFrame(dyf2)
        print(f"Table {database_name}.{table} generated")




In [12]:
# Regenerate every table listed in `tables` using the notebook-level settings.
generate_tables(
    glue_context=glue_context, tables=tables,
    bucket_name=bucket_name, folder_name=folder_name,
    database_prefix=database_prefix, scale=scale,
    partitions=partitions, extraction_timestamp=extraction_timestamp,
)

Table sm_tpcds_1.store_sales dropped
Table sm_tpcds_1.store_sales generated
Table sm_tpcds_1.time_dim dropped
Table sm_tpcds_1.time_dim generated
Table sm_tpcds_1.warehouse dropped
Table sm_tpcds_1.warehouse generated


In [None]:
def generate_updates(
    glue_context,
    source_table="store_sales",
    updated_days=(1,),
    bucket_name="sb-test-bucket-ireland",
    folder_name="tpcds-updates",
    database_prefix="tpcds_",
    scale=1,
    extraction_timestamp="2022-01-02",
    sold_date_sks=(2450816,),
):
    """Write "update" copies of rows from a catalog table.

    For every value ``update`` in ``updated_days`` the function:

    1. deletes the objects under
       ``{folder_name}/scale={scale}/table={source_table}/updated_days={update}/``;
    2. drops the catalog table ``{source_table}_{update}`` if it exists;
    3. reads ``source_table`` from the Glue Data Catalog and keeps the rows
       whose ``ss_sold_date_sk`` is one of ``sold_date_sks``;
    4. sets ``extraction_timestamp`` (parsed with ``yyyy-MM-dd``) and
       ``op = "U"``;
    5. writes the rows as ``glueparquet`` to that same S3 prefix and
       registers them as ``{source_table}_{update}``.

    NOTE(review): the row filter does not depend on ``update``, so every
    entry of ``updated_days`` produces the same rows — confirm whether the
    selected dates were meant to vary per update.

    Args:
        glue_context: GlueContext used for reading and writing.
        source_table: Catalog table to read; the filter column
            ``ss_sold_date_sk`` must exist in it.
        updated_days: Iterable of labels; one output table per label.
        bucket_name: Destination S3 bucket.
        folder_name: Key prefix inside the bucket.
        database_prefix: Prefix of the Glue database name.
        scale: Appended to ``database_prefix``; also part of the S3 path.
        extraction_timestamp: Date string in ``yyyy-MM-dd`` format.
        sold_date_sks: ``ss_sold_date_sk`` values selecting the rows to emit.

    Raises:
        Errors from boto3 (other than ``EntityNotFoundException`` on table
        deletion) and from Glue/Spark propagate unchanged.
    """
    database_name = f"{database_prefix}{scale}"
    sold_dates = tuple(sold_date_sks)

    # Create the AWS handles once, outside both the loop and the ``try``: the
    # ``except`` clause needs ``glue`` to be bound, otherwise a client
    # creation failure would be masked by an UnboundLocalError.
    glue = boto3.client("glue")
    bucket = boto3.resource("s3").Bucket(bucket_name)

    for update in updated_days:
        path = f"{folder_name}/scale={scale}/table={source_table}/updated_days={update}/"
        table = f"{source_table}_{update}"

        # Delete files in S3 left by a previous run. This prefix also covers
        # the doubled ".../updated_days=N/updated_days=N/" location that
        # earlier versions of this function wrote to.
        bucket.objects.filter(Prefix=path).delete()

        # Drop the table in the Glue Data Catalog.
        try:
            glue.delete_table(DatabaseName=database_name, Name=table)
            print(f"Table {database_name}.{table} dropped")
        except glue.exceptions.EntityNotFoundException:
            print(f"Table {database_name}.{table} does not exist")

        # Read the source table and keep only the selected sold dates.
        dyf = glue_context.create_dynamic_frame.from_catalog(
            database=database_name, table_name=source_table
        )
        dyf_filter = dyf.filter(f=lambda x: x["ss_sold_date_sk"] in sold_dates)

        df = (
            dyf_filter.toDF()
            .withColumn(
                "extraction_timestamp",
                f.to_timestamp(f.lit(extraction_timestamp), "yyyy-MM-dd"),
            )
            .withColumn("op", f.lit("U").cast("string"))
        )
        dyf2 = DynamicFrame.fromDF(df, glue_context, "dyf2")

        # Write the DynamicFrame to S3 and register the table. `path` already
        # ends with "updated_days={update}/", so it is used as-is.
        sink = glue_context.getSink(
            connection_type="s3",
            path=f"s3://{bucket_name}/{path}",
            enableUpdateCatalog=True,
            updateBehavior="UPDATE_IN_DATABASE",
        )
        sink.setFormat("glueparquet")
        sink.setCatalogInfo(catalogDatabase=database_name, catalogTableName=table)
        sink.writeFrame(dyf2)
        print(f"Table {database_name}.{table} generated")