In [6]:
%streaming
%iam_role arn:aws:iam::484183516222:role/LabRole
%region us-east-1
%number_of_workers 2
%idle_timeout 60

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
It looks like there is a newer version of the kernel available. The latest version is 1.0.6 and you have 1.0.4 installed.
Please run `pip install --upgrade aws-glue-sessions` to upgrade your kernel
Previous session type: etl
Setting new session type to Streaming
Current iam_role is None
iam_role has been set to arn:aws:iam::484183516222:role/LabRole.
Previous region: None
Setting new region to: us-east-1
Region is set to: us-east-1
Previous number of workers: None
Setting new number of workers to: 2
Current idle_timeout is None minutes.
idle_timeout has been set to 60 minutes.


In [1]:
# Evaluating `spark` is the first statement sent to the kernel; the log below
# shows that this is what triggers creation of the Glue streaming session.
spark

Trying to create a Glue session for the kernel.
Session Type: streaming
Worker Type: G.1X
Number of Workers: 2
Session ID: 8d17a0f8-ff0a-43b7-9770-00d3455b3a99
Applying the following default arguments:
--glue_kernel_version 1.0.4
--enable-glue-datacatalog true
Waiting for session 8d17a0f8-ff0a-43b7-9770-00d3455b3a99 to get into ready status...
Session 8d17a0f8-ff0a-43b7-9770-00d3455b3a99 has been created.
<pyspark.sql.session.SparkSession object at 0x7fa0a8bc8d30>


# 1. Environment Configuration

In [2]:
import boto3
import json
import os
from uuid import uuid4
from datetime import datetime
from datetime import timedelta
import time

import pyspark.sql.types as t
import pyspark.sql.functions as f




In [8]:
# S3 bucket, top-level prefix and Kinesis stream used throughout the notebook.
BUCKET_NAME = "vrpoptimiserplatform"
ORDERS = "orders"
STREAM_NAME = "orders_stream"

# Layer names, used as path segments below.
BRONZE, SILVER, GOLD = "bronze", "silver", "gold"

ORDERS_TABLE = "orders_table"

# Every orders artifact lives under this common S3 prefix.
_ORDERS_ROOT = f"s3a://{BUCKET_NAME}/{ORDERS}"

# Checkpoint directory for the streaming write into the bronze table.
CHECKPOINT_LOCATION = f"{_ORDERS_ROOT}/checkpoints/orders"

# One table location per layer: <root>/<layer>/<table>.
BRONZE_ORDERS_PATH, SILVER_ORDERS_PATH, GOLD_ORDERS_PATH = (
    f"{_ORDERS_ROOT}/{layer}/{ORDERS_TABLE}" for layer in (BRONZE, SILVER, GOLD)
)




# 2. Stream Interaction with Kinesis

## 2.1 Read Stream

> `startingPosition` indica el punto de inicio de lectura del stream. Sus opciones son:
>  * `trim_horizon`: comienza en el primer dato disponible del stream o, si existe un checkpoint, en el primer registro incluido en el checkpoint
>  * `latest`: ignora todo lo previo al último registro y lee solo los registros nuevos

In [4]:
def _nullable(name, data_type):
    """Return a nullable StructField; every field of the order event is declared nullable."""
    return t.StructField(name, data_type, True)


def _strings(*names):
    """Return nullable string fields for the given names, preserving their order."""
    return [_nullable(name, t.StringType()) for name in names]


# A single line item inside orderDetails.items.
_item_schema = t.StructType(
    _strings("itemId", "productName")
    + [
        _nullable("quantity", t.IntegerType()),
        _nullable("price", t.DoubleType()),
        _nullable("weight", t.DoubleType()),
    ]
)

# Delivery address: `coordinates` is an array of doubles, kept alongside
# separate scalar `lat` / `lon` fields.
_address_schema = t.StructType(
    _strings("address_id", "neighborhood")
    + [_nullable("coordinates", t.ArrayType(t.DoubleType()))]
    + _strings(
        "road", "house_number", "suburb", "city_district",
        "state", "postcode", "country",
    )
    + [_nullable("lat", t.DoubleType()), _nullable("lon", t.DoubleType())]
)

_payment_schema = t.StructType(
    _strings("paymentMethod", "paymentStatus", "transactionId")
)

_order_details_schema = t.StructType(
    _strings("customerId", "orderDate")
    + [
        _nullable("items", t.ArrayType(_item_schema)),
        _nullable("totalAmount", t.DoubleType()),
        _nullable("totalWeight", t.DoubleType()),
        _nullable("status", t.StringType()),
        _nullable("destinationAddress", _address_schema),
        _nullable("paymentDetails", _payment_schema),
    ]
)

# Full schema of one order event as carried in the Kinesis record payload.
# Timestamps (eventTimestamp, orderDate) are kept as strings here.
schema = t.StructType(
    _strings("eventId", "eventType", "eventTimestamp", "orderId")
    + [_nullable("orderDetails", _order_details_schema)]
)




In [5]:
# Streaming DataFrame over the Kinesis orders stream.
# "trim_horizon" starts from the first record available in the stream (or from
# the checkpoint when one exists); switch to "latest" to skip the backlog.
kinesis_reader = spark.readStream.format("kinesis")
kinesis_order_stream = kinesis_reader.options(
    streamName=STREAM_NAME,
    startingPosition="trim_horizon",
).load()




In [9]:
# The Kinesis `data` column is binary: decode it to a string, parse the JSON
# against `schema`, and promote the parsed struct's fields to top-level columns.
raw_json = f.col("data").cast("string")
parsed_event = f.from_json(raw_json, schema).alias("views")
df_order_stream = kinesis_order_stream.select(parsed_event).select("views.*")




In [10]:
# Configure the bronze sink: append parsed orders as parquet files, in
# micro-batches triggered every 15 seconds.
# (Use `.trigger(availableNow=True)` instead for a one-off catch-up run.)
bronze_writer = (
    df_order_stream.writeStream
    .format("parquet")
    .outputMode("append")
    .options(path=BRONZE_ORDERS_PATH, checkpointLocation=CHECKPOINT_LOCATION)
    .trigger(processingTime="15 seconds")
)

# Starting the writer returns the StreamingQuery handle used by the cells below.
order_stream = bronze_writer.start()




In [44]:
# Check whether the bronze streaming query is still running.
order_stream.isActive

True


In [45]:
# Current query status: a message plus the isDataAvailable / isTriggerActive flags.
order_stream.status

{'message': 'Waiting for next trigger', 'isDataAvailable': True, 'isTriggerActive': False}


In [47]:
# Stop the streaming query that writes the bronze table.
order_stream.stop()




In [43]:
# Batch-read the bronze parquet output and report how many orders have landed.
bronze_snapshot = spark.read.parquet(BRONZE_ORDERS_PATH)
bronze_snapshot.count()

980


In [17]:
# Batch (non-streaming) view of the bronze orders table, used by the
# exploratory cells that follow.
df_orders_bronze = spark.read.parquet(BRONZE_ORDERS_PATH)




In [18]:
# Preview the bronze rows; `orderDetails` is still a nested struct at this stage.
df_orders_bronze.show()

+--------------------+-------------+-------------------+--------------------+--------------------+
|             eventId|    eventType|     eventTimestamp|             orderId|        orderDetails|
+--------------------+-------------+-------------------+--------------------+--------------------+
|ev-62e507a9-3284-...|ORDER_CREATED|2024-06-14 09:33:32|ord-5ec09f73-c069...|{cus-3aca0361-293...|
|ev-8ee7e5ea-99f3-...|ORDER_CREATED|2024-06-14 09:33:38|ord-09242d35-fdfd...|{cus-6ec54ac2-167...|
|ev-ac980014-e862-...|ORDER_CREATED|2024-06-14 09:33:44|ord-36e562c9-aa74...|{cus-abf2f169-c3f...|
|ev-c26d13b0-b0f1-...|ORDER_CREATED|2024-06-14 09:32:47|ord-6fe080fa-495e...|{cus-3770be50-7c8...|
|ev-2f2102dc-b954-...|ORDER_CREATED|2024-06-14 09:32:53|ord-015cb998-ad1a...|{cus-e178e747-9be...|
|ev-ec46118f-bdd6-...|ORDER_CREATED|2024-06-14 09:32:59|ord-1a43bcbb-74f9...|{cus-2f367720-d75...|
|ev-60fd6ae4-0164-...|ORDER_CREATED|2024-06-14 09:31:46|ord-b64e3c2c-5860...|{cus-248155a3-63d...|
|ev-63cc8a

In [24]:
# Flatten the delivery address: one row per order with its total weight and
# every destinationAddress field promoted to a top-level column.
address_fields = (
    "address_id", "neighborhood", "coordinates", "road", "house_number",
    "suburb", "city_district", "state", "postcode", "country", "lat", "lon",
)

address_columns = [
    f.col(f"orderDetails.destinationAddress.{field}").alias(field)
    for field in address_fields
]

df_orders_bronze.select(
    "orderId",
    f.col("orderDetails.totalWeight").alias("totalWeight"),
    *address_columns,
).show()

+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+-------------------+-------------------+--------+-------+------------------+-------------------+
|             orderId|totalWeight|          address_id|        neighborhood|         coordinates|                road|house_number|              suburb|      city_district|              state|postcode|country|               lat|                lon|
+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+-------------------+-------------------+--------+-------+------------------+-------------------+
|ord-5ec09f73-c069...|       54.7|cus-d8b58826-c538...|           Salamanca|[40.4315787171295...|Calle del Príncip...|          53|           Salamanca|                   |Comunidad de Madrid|   28006| España| 40.43220901489258|-3.6803221702575684|
|ord

In [29]:
# One row per order line item: explode orderDetails.items, then promote the
# item struct's fields to top-level columns.
item_fields = ("itemId", "productName", "quantity", "price", "weight")

exploded_orders = df_orders_bronze.select(
    "orderId",
    f.explode("orderDetails.items").alias("exploded_order"),
)

exploded_orders.select(
    "orderId",
    *[f.col(f"exploded_order.{field}").alias(field) for field in item_fields],
).show()

+--------------------+--------------------+-----------+--------+------+------+
|             orderId|              itemId|productName|quantity| price|weight|
+--------------------+--------------------+-----------+--------+------+------+
|ord-5ec09f73-c069...|111e2222-e33b-44d...|    armario|       1|699.99|  23.5|
|ord-5ec09f73-c069...|222e3333-e44b-55d...|      silla|       2| 49.99|  15.6|
|ord-09242d35-fdfd...|111e2222-e33b-44d...|    armario|       1|699.99|  23.5|
|ord-09242d35-fdfd...|222e3333-e44b-55d...|      silla|       2| 49.99|  15.6|
|ord-36e562c9-aa74...|111e2222-e33b-44d...|    armario|       1|699.99|  23.5|
|ord-36e562c9-aa74...|222e3333-e44b-55d...|      silla|       2| 49.99|  15.6|
|ord-6fe080fa-495e...|111e2222-e33b-44d...|    armario|       1|699.99|  23.5|
|ord-6fe080fa-495e...|222e3333-e44b-55d...|      silla|       2| 49.99|  15.6|
|ord-015cb998-ad1a...|111e2222-e33b-44d...|    armario|       1|699.99|  23.5|
|ord-015cb998-ad1a...|222e3333-e44b-55d...|      sil

In [42]:
# Weight threshold: total weight summed over every order in the bronze table.
total_weight_row = df_orders_bronze.agg(
    f.sum("orderDetails.totalWeight").alias("totalWeight")
).first()

total_weight_row["totalWeight"]

8314.399999999998


## OLD TESTS 

### Do not run

In [None]:
# Raw Kinesis source for the exploratory cells in this section (same stream and
# starting position as `kinesis_order_stream` above).
# Use startingPosition="latest" to ignore records already in the stream.
kinesis_stream = (
    spark.readStream
    .format("kinesis")
    .options(streamName=STREAM_NAME, startingPosition="trim_horizon")
    .load()
)




In [None]:
# Columns exposed by the Kinesis source: the binary payload (`data`) plus
# stream metadata (stream name, partition key, sequence number, arrival time).
kinesis_stream.printSchema()

root
 |-- data: binary (nullable = true)
 |-- streamName: string (nullable = true)
 |-- partitionKey: string (nullable = true)
 |-- sequenceNumber: string (nullable = true)
 |-- approximateArrivalTimestamp: timestamp (nullable = true)


In [None]:
# Debug-only sink: mirror the raw Kinesis records into the in-memory table
# "kinesis" so they can be inspected with spark.table("kinesis").
#
# A memory sink in the default append mode cannot resume from an existing
# checkpoint, so pointing it at a fixed path makes every re-run of this cell
# fail once the checkpoint exists. Each run therefore gets its own throwaway
# checkpoint directory (`uuid4` comes from the notebook's import cell), which
# also keeps debug metadata out of the parent of the bronze checkpoint.
debug_checkpoint = (
    f"s3://{BUCKET_NAME}/orders/checkpoints/kinesis_debug_{uuid4().hex}"
)

stream = (
    kinesis_stream.writeStream
    .format("memory")
    .queryName("kinesis")
    .option("checkpointLocation", debug_checkpoint)
    .start()
)




In [None]:
# Read back the in-memory table fed by the "kinesis" memory-sink query.
spark.table("kinesis").show()

+--------------------+-------------+--------------------+--------------------+---------------------------+
|                data|   streamName|        partitionKey|      sequenceNumber|approximateArrivalTimestamp|
+--------------------+-------------+--------------------+--------------------+---------------------------+
|[7B 22 65 76 65 6...|orders_stream|ord-26deb37f-37ab...|49652844364975988...|       2024-06-10 16:36:...|
|[7B 22 65 76 65 6...|orders_stream|ord-af26e714-634b...|49652844364975988...|       2024-06-10 16:37:...|
|[7B 22 65 76 65 6...|orders_stream|ord-af26e714-634b...|49652844364975988...|       2024-06-10 16:42:...|
|[7B 22 65 76 65 6...|orders_stream|       ORDER_CREATED|49652844364975988...|       2024-06-10 16:49:...|
|[7B 22 65 76 65 6...|orders_stream|       ORDER_CREATED|49652844364975988...|       2024-06-10 16:57:...|
|[7B 22 65 76 65 6...|orders_stream|       ORDER_CREATED|49652844364975988...|       2024-06-10 16:58:...|
|[7B 22 65 76 65 6...|orders_stream| 

In [16]:
# Decode the binary Kinesis payload, parse it as an order event using `schema`,
# and expose the event's fields as top-level columns.
order_event = f.from_json(f.col("data").cast("string"), schema)
df_orders_stream = (
    kinesis_stream
    .select(order_event.alias("views"))
    .select("views.*")
)




In [17]:
# Confirm the parsed stream exposes the order-event fields declared in `schema`.
df_orders_stream.printSchema()

root
 |-- eventId: string (nullable = true)
 |-- eventType: string (nullable = true)
 |-- eventTimestamp: string (nullable = true)
 |-- orderId: string (nullable = true)
 |-- orderDetails: struct (nullable = true)
 |    |-- customerId: string (nullable = true)
 |    |-- orderDate: string (nullable = true)
 |    |-- items: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- itemId: string (nullable = true)
 |    |    |    |-- productName: string (nullable = true)
 |    |    |    |-- quantity: integer (nullable = true)
 |    |    |    |-- price: double (nullable = true)
 |    |    |    |-- weight: double (nullable = true)
 |    |-- totalAmount: double (nullable = true)
 |    |-- totalWeight: double (nullable = true)
 |    |-- status: string (nullable = true)
 |    |-- destinationAddress: struct (nullable = true)
 |    |    |-- address_id: string (nullable = true)
 |    |    |-- neighborhood: string (nullable = true)
 |    |    |-- coordinates:

In [19]:
# Numeric suffix appended to the checkpoint path of the "orders_messages" query.
# NOTE(review): presumably bumped by hand to get a fresh checkpoint directory
# on each restart of that query — confirm.
check_num = 1




In [20]:
# Debug-only sink: expose the parsed order events as the in-memory table
# "orders_messages". The checkpoint directory is suffixed with `check_num`.
orders_messages_checkpoint = (
    f"s3://{BUCKET_NAME}/orders/checkpoints/orders_messages-processed_{check_num}"
)

orders_writer = (
    df_orders_stream.writeStream
    .queryName("orders_messages")
    .format("memory")
    .option("checkpointLocation", orders_messages_checkpoint)
)
orders_stream = orders_writer.start()




In [23]:
# Read back the in-memory table fed by the "orders_messages" memory-sink query.
# `.show()` is required to print rows: a bare DataFrame expression only yields
# its repr, and the sibling "kinesis" cell already uses `.show()`.
spark.table("orders_messages").show()

+--------------------+-------------+-------------------+--------------------+--------------------+
|             eventId|    eventType|     eventTimestamp|             orderId|        orderDetails|
+--------------------+-------------+-------------------+--------------------+--------------------+
|ev-983c9229-39cf-...|ORDER_CREATED|2024-06-10 16:35:38|ord-26deb37f-37ab...|{cus-9a6459ad-ca3...|
|ev-b5dbeaa3-5ef8-...|ORDER_CREATED|2024-06-10 16:37:29|ord-af26e714-634b...|{cus-60e5dfe0-02a...|
|ev-b5dbeaa3-5ef8-...|ORDER_CREATED|2024-06-10 16:37:29|ord-af26e714-634b...|{cus-60e5dfe0-02a...|
|ev-847168d8-c80f-...|ORDER_CREATED|2024-06-10 16:49:03|ord-54e07435-a302...|{cus-ae432b6f-bbf...|
|ev-ed47f0b3-c999-...|ORDER_CREATED|2024-06-10 16:57:46|ord-52f43dab-233d...|{cus-fc4ace38-637...|
|ev-ed47f0b3-c999-...|ORDER_CREATED|2024-06-10 16:57:46|ord-52f43dab-233d...|{cus-fc4ace38-637...|
|ev-4c91e918-8220-...|ORDER_CREATED|2024-06-10 17:47:02|ord-ba23c6fa-1857...|{cus-32814c81-4d1...|
|ev-7240bf

In [29]:
# Stop every streaming query still running in this session.
# Indexing `active[0]` raised IndexError when nothing was running and stopped
# an arbitrary query when more than one was active; iterating handles both.
for active_query in spark.streams.active:
    active_query.stop()




In [None]:
# Same payload parse as `df_orders_stream`, left as a bare expression so the
# resulting streaming DataFrame is the cell's output.
kinesis_stream.select(
    f.from_json(f.col("data").cast("string"), schema).alias("views")
).select("views.*")