In [7]:
%streaming
%iam_role arn:aws:iam::484183516222:role/LabRole
%region us-east-1
%number_of_workers 2
%idle_timeout 60
%%configure 
{
  "--conf": "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog",
  "--datalake-formats": "delta"
}

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
It looks like there is a newer version of the kernel available. The latest version is 1.0.6 and you have 1.0.4 installed.
Please run `pip install --upgrade aws-glue-sessions` to upgrade your kernel
Previous session type: etl
Setting new session type to Streaming
Current iam_role is None
iam_role has been set to arn:aws:iam::484183516222:role/LabRole.
Previous region: None
Setting new region to: us-east-1
Region is set to: us-east-1
Previous number of workers: None
Setting new number of workers to: 2
Current idle_timeout is None minutes.
idle_timeout has been set to 60 minutes.
The following configurations have been updated: {'--conf': 'spark.sql.extensions=io.delta.sql.Delt

In [1]:
# Evaluate the session handle. On a fresh kernel this first statement is what
# makes the kernel create the Glue session (see the log printed under the cell).
spark

Trying to create a Glue session for the kernel.
Session Type: streaming
Worker Type: G.1X
Number of Workers: 2
Session ID: c65af7f2-7743-4c2d-9d7d-d0ed144e3680
Applying the following default arguments:
--glue_kernel_version 1.0.4
--enable-glue-datacatalog true
--conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog
--datalake-formats delta
Waiting for session c65af7f2-7743-4c2d-9d7d-d0ed144e3680 to get into ready status...
Session c65af7f2-7743-4c2d-9d7d-d0ed144e3680 has been created.
<pyspark.sql.session.SparkSession object at 0x7fc9eb9afb80>


# 1. Environment Configuration

## 1.1 Import dependencies

In [2]:
import boto3
import json
import os
from uuid import uuid4
from datetime import datetime
from datetime import timedelta
import time

import pyspark.sql.types as t
import pyspark.sql.functions as f




## 1.2 Constants

In [3]:
# --- Storage and stream identifiers -------------------------------------------
# S3 bucket that holds every path built below.
BUCKET_NAME = "vrpoptimiserplatform"
# Top-level prefix inside the bucket for everything related to orders.
ORDERS = "orders"
# Name of the Kinesis stream the order events are read from.
STREAM_NAME = "orders_stream"

# Layer names, used as path segments.
BRONZE, SILVER, GOLD = "bronze", "silver", "gold"

# Directory name of the orders table; the same name is used in every layer.
ORDERS_TABLE = "orders_table"

# --- Derived S3 locations -----------------------------------------------------
# Checkpoint directory handed to the streaming write of the bronze table.
CHECKPOINT_LOCATION = f"s3a://{BUCKET_NAME}/{ORDERS}/checkpoints/orders"

# Table location per layer. Only the bronze path is referenced by the cells
# visible in this notebook.
BRONZE_ORDERS_PATH = f"s3a://{BUCKET_NAME}/{ORDERS}/{BRONZE}/{ORDERS_TABLE}"
SILVER_ORDERS_PATH = f"s3a://{BUCKET_NAME}/{ORDERS}/{SILVER}/{ORDERS_TABLE}"
GOLD_ORDERS_PATH = f"s3a://{BUCKET_NAME}/{ORDERS}/{GOLD}/{ORDERS_TABLE}"




# 2. Stream Interaction with Kinesis

## 2.1 Read Stream

> `startingPosition` indica el punto de inicio de lectura del stream. Sus opciones son:
>  * `trim_horizon`: primer dato del stream o el primer registro incluido en el checkpoint
>  * `latest`: ignora lo previo al último registro

In [4]:
def _nullable(name, data_type):
    """Return a StructField named ``name`` of ``data_type`` with nullable=True."""
    return t.StructField(name, data_type, True)


# One entry of ``order_details.items``.
_item_type = t.StructType([
    _nullable("product_id", t.StringType()),
    _nullable("product_name", t.StringType()),
    _nullable("price", t.DoubleType()),
    _nullable("weight", t.DoubleType()),
    _nullable("quantity", t.IntegerType()),
])

# ``order_details.destination_address``.
_destination_address_type = t.StructType([
    _nullable("address_id", t.StringType()),
    _nullable("neighborhood", t.StringType()),
    _nullable("coordinates", t.ArrayType(t.DoubleType())),
    _nullable("road", t.StringType()),
    _nullable("house_number", t.StringType()),
    _nullable("suburb", t.StringType()),
    _nullable("city_district", t.StringType()),
    _nullable("state", t.StringType()),
    _nullable("postcode", t.StringType()),
    _nullable("country", t.StringType()),
    _nullable("lat", t.DoubleType()),
    _nullable("lon", t.DoubleType()),
])

# ``order_details.payment_details``.
_payment_details_type = t.StructType([
    _nullable("payment_method", t.StringType()),
    _nullable("payment_status", t.StringType()),
    _nullable("transaction_id", t.StringType()),
])

# ``order_details``: the order body nested inside each event.
_order_details_type = t.StructType([
    _nullable("customer_id", t.StringType()),
    _nullable("order_date", t.StringType()),
    _nullable("items", t.ArrayType(_item_type)),
    _nullable("total_amount", t.DoubleType()),
    _nullable("total_weight", t.DoubleType()),
    _nullable("status", t.StringType()),
    _nullable("destination_address", _destination_address_type),
    _nullable("payment_details", _payment_details_type),
])

# Schema of one order event, passed to from_json to parse the record payload.
# Dates and timestamps are declared as strings, so they are not converted here.
schema = t.StructType([
    _nullable("event_id", t.StringType()),
    _nullable("event_type", t.StringType()),
    _nullable("event_timestamp", t.StringType()),
    _nullable("order_id", t.StringType()),
    _nullable("order_details", _order_details_type),
])




In [5]:
# Configure the Kinesis reader first, then materialise the streaming DataFrame.
# The starting position in use is "trim_horizon"; "latest" is the alternative
# value for the same option.
kinesis_source = (
    spark.readStream
    .format("kinesis")
    .option("startingPosition", "trim_horizon")
    .option("streamName", STREAM_NAME)
)
kinesis_order_stream = kinesis_source.load()




In [6]:
# Print the columns of the raw Kinesis stream before any parsing is applied.
kinesis_order_stream.printSchema()

root
 |-- data: binary (nullable = true)
 |-- streamName: string (nullable = true)
 |-- partitionKey: string (nullable = true)
 |-- sequenceNumber: string (nullable = true)
 |-- approximateArrivalTimestamp: timestamp (nullable = true)


In [7]:
# Column expressions used by the parsing steps below.
payload_as_text = f.col("data").cast("string")    # binary record payload -> string
parsed_payload = f.from_json("json_data", schema)  # JSON string -> struct following `schema`

# Add the decoded payload, parse it, then keep only the parsed fields; the
# other columns of the Kinesis frame are not carried over.
with_json_text = kinesis_order_stream.withColumn("json_data", payload_as_text)
with_parsed_event = with_json_text.withColumn("views", parsed_payload)
df_order_stream = with_parsed_event.select("views.*")




In [8]:
# Print the schema of the parsed order events to confirm the JSON mapping.
df_order_stream.printSchema()

root
 |-- event_id: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- event_timestamp: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_details: struct (nullable = true)
 |    |-- customer_id: string (nullable = true)
 |    |-- order_date: string (nullable = true)
 |    |-- items: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- product_id: string (nullable = true)
 |    |    |    |-- product_name: string (nullable = true)
 |    |    |    |-- price: double (nullable = true)
 |    |    |    |-- weight: double (nullable = true)
 |    |    |    |-- quantity: integer (nullable = true)
 |    |-- total_amount: double (nullable = true)
 |    |-- total_weight: double (nullable = true)
 |    |-- status: string (nullable = true)
 |    |-- destination_address: struct (nullable = true)
 |    |    |-- address_id: string (nullable = true)
 |    |    |-- neighborhood: string (nullable = true)
 |    |    |

In [9]:
# Describe the bronze sink: parquet files appended under BRONZE_ORDERS_PATH,
# with progress tracked in CHECKPOINT_LOCATION and a trigger every 15 seconds.
# Alternative trigger kept for reference: .trigger(availableNow=True)
bronze_writer = (
    df_order_stream.writeStream
    .format("parquet")
    .outputMode("append")
    .option("path", BRONZE_ORDERS_PATH)
    .option("checkpointLocation", CHECKPOINT_LOCATION)
    .trigger(processingTime='15 seconds')
)

# start() launches the query and returns the handle used by the cells below.
order_stream = bronze_writer.start()




In [10]:
# Check whether the bronze streaming query is currently active.
order_stream.isActive

True


In [11]:
# Current status of the query: message, data availability and trigger activity.
order_stream.status

{'message': 'Processing new data', 'isDataAvailable': True, 'isTriggerActive': True}


In [12]:
# Most recent progress report of the query. The saved cell shows no output,
# presumably because the property was still None when it ran.
order_stream.lastProgress




In [None]:
# Stop the bronze streaming query. This cell has no execution count in the
# saved notebook, so it was not part of the recorded run.
order_stream.stop()

In [13]:
# Batch read of the parquet files written so far; the row count is the cell output.
bronze_reader = spark.read.format("parquet")
bronze_reader.load(BRONZE_ORDERS_PATH).count()

288


In [14]:
# Batch DataFrame over the bronze parquet files, used by the exploration cells below.
df_orders_bronze = spark.read.load(BRONZE_ORDERS_PATH, format="parquet")




In [15]:
# Preview the bronze rows; show() truncates long values, as seen in the output.
df_orders_bronze.show()

+--------------------+-------------+-------------------+--------------------+--------------------+
|            event_id|   event_type|    event_timestamp|            order_id|       order_details|
+--------------------+-------------+-------------------+--------------------+--------------------+
|ev-6b7587c1-1cb7-...|ORDER_CREATED|2024-06-23 16:19:19|ord-5e8b03c5-f916...|{cus-fa1e4bf5-0d0...|
|ev-673ef19f-8c5c-...|ORDER_CREATED|2024-06-23 16:19:26|ord-f26ad03a-4e19...|{cus-40fae667-b78...|
|ev-abb492e3-dc0e-...|ORDER_CREATED|2024-06-23 16:19:32|ord-a595c12c-e73d...|{cus-ad7098ff-de1...|
|ev-308a173e-c3ac-...|ORDER_CREATED|2024-06-23 16:19:39|ord-b37a714b-aaa6...|{cus-311a7951-574...|
|ev-337be020-46bf-...|ORDER_CREATED|2024-06-23 16:19:46|ord-5edb868e-5481...|{cus-b5ff74ca-cb3...|
|ev-57068625-1eb9-...|ORDER_CREATED|2024-06-23 16:19:52|ord-7dc2e7c5-13ad...|{cus-aa39738d-7ee...|
|ev-86ff429a-64d3-...|ORDER_CREATED|2024-06-23 16:19:59|ord-75be955b-7659...|{cus-e45d02fb-662...|
|ev-a06c8a

In [16]:
# Fields of order_details.destination_address to surface as top-level columns,
# in display order. Each one keeps its own name as the column alias.
address_fields = [
    "address_id",
    "neighborhood",
    "coordinates",
    "road",
    "house_number",
    "suburb",
    "city_district",
    "state",
    "postcode",
    "country",
    "lat",
    "lon",
]
address_columns = [
    f.col(f"order_details.destination_address.{field_name}").alias(field_name)
    for field_name in address_fields
]

# One row per order: id, total weight and the flattened destination address.
order_addresses = df_orders_bronze.select(
    f.col("order_id"),
    f.col("order_details.total_weight").alias("total_weight"),
    *address_columns,
)
order_addresses.show()

+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+-------------------+-------------------+--------+-------+------------------+-------------------+
|            order_id|       total_weight|          address_id|        neighborhood|         coordinates|                road|house_number|              suburb|      city_district|              state|postcode|country|               lat|                lon|
+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+-------------------+-------------------+--------+-------+------------------+-------------------+
|ord-5e8b03c5-f916...|  4.140000090003014|cus-f936d114-7b08...|              Tetuán|[40.4636955817725...|  Calle de Villaamil|            |              Tetuán|                   |Comunidad de Madrid|   28039| España| 40.46369171

In [18]:
# (output column name, field of the exploded item struct), in display order.
item_column_map = [
    ("itemId", "product_id"),
    ("productName", "product_name"),
    ("quantity", "quantity"),
    ("price", "price"),
    ("weight", "weight"),
]

# explode() yields one row per element of order_details.items.
orders_by_item = df_orders_bronze.withColumn(
    "exploded_order", f.explode(f.col("order_details.items"))
)

item_columns = [
    f.col(f"exploded_order.{item_field}").alias(output_name)
    for output_name, item_field in item_column_map
]
orders_by_item.select(f.col("order_id"), *item_columns).show()

+--------------------+--------------------+--------------------+--------+------+--------------------+
|            order_id|              itemId|         productName|quantity| price|              weight|
+--------------------+--------------------+--------------------+--------+------+--------------------+
|ord-5e8b03c5-f916...|pro-0febdb7595e97...|Rubie's Costume F...|       1|  7.98| 0.10999999940395355|
|ord-5e8b03c5-f916...|pro-c2b2335e3a5b1...|Tamiya 78024 IJN ...|       2|133.45|  1.3200000524520874|
|ord-5e8b03c5-f916...|pro-32bf000c61137...|Bestway 43187E Ca...|       1|  10.5|  1.3899999856948853|
|ord-f26ad03a-4e19...|pro-935926f754951...|Dart Gun - Avenge...|       2|   7.2|  0.3199999928474426|
|ord-f26ad03a-4e19...|pro-5915fb6ac04be...|Mattel Enchantima...|       1| 13.97| 0.10000000149011612|
|ord-a595c12c-e73d...|pro-460313f815d26...|BePuzzled Origina...|       1| 10.74| 0.10999999940395355|
|ord-b37a714b-aaa6...|pro-28509105510aa...|Rubies Monster Hi...|       2| 16.66| 0

In [19]:
# Weight threshold: sum of order_details.total_weight over all bronze orders.
summed_weight = f.sum(f.col("order_details.total_weight")).alias("total_weight")
weight_totals = df_orders_bronze.select(summed_weight)

# The aggregate produces a single row; its "total_weight" value is the cell output.
weight_totals.collect()[0]["total_weight"]

2139.0200025383383


## OLD TESTS 

### Do not run

In [None]:
# Old test: the same Kinesis source as the main flow, bound to a separate name.
# "trim_horizon" is the starting position; "latest" is the alternative value.
legacy_kinesis_source = (
    spark.readStream
    .format("kinesis")
    .option("startingPosition", "trim_horizon")
    .option("streamName", STREAM_NAME)
)
kinesis_stream = legacy_kinesis_source.load()

In [None]:
# Print the schema of the raw Kinesis stream DataFrame.
kinesis_stream.printSchema()

In [None]:
# Old test: send the raw Kinesis records to an in-memory table named "kinesis"
# so they can be inspected with spark.table("kinesis").
# NOTE(review): this checkpoint directory is the parent of CHECKPOINT_LOCATION
# and uses the s3:// scheme while the constants use s3a://.
# NOTE(review): the memory sink in append mode is not expected to resume from
# an existing checkpoint, so re-running with the same location may fail - confirm.
raw_debug_writer = (
    kinesis_stream.writeStream
    .queryName("kinesis")
    .format("memory")
    .option("checkpointLocation", f"s3://{BUCKET_NAME}/orders/checkpoints")
)
stream = raw_debug_writer.start()

In [None]:
# Show the rows collected so far in the in-memory table registered by the
# query named "kinesis".
spark.table("kinesis").show()

In [None]:
# Old test: decode the binary payload, parse it with `schema`, and keep only
# the parsed fields.
legacy_payload_text = f.col("data").cast("string")
legacy_parsed_event = f.from_json("json_data", schema)

legacy_with_json = kinesis_stream.withColumn("json_data", legacy_payload_text)
legacy_with_event = legacy_with_json.withColumn("views", legacy_parsed_event)
df_orders_stream = legacy_with_event.select("views.*")

In [None]:
# Print the schema of the parsed order events.
df_orders_stream.printSchema()

In [None]:
# Suffix appended to the checkpoint directory of the "orders_messages" query.
# Presumably bumped by hand to get a fresh checkpoint on each run - confirm.
check_num = 1

In [None]:
# Old test: send the parsed order events to an in-memory table named
# "orders_messages". The checkpoint directory name ends with check_num.
parsed_debug_writer = (
    df_orders_stream.writeStream
    .queryName("orders_messages")
    .format("memory")
    .option("checkpointLocation", f"s3://{BUCKET_NAME}/orders/checkpoints/orders_messages-processed_{str(check_num)}")
)
orders_stream = parsed_debug_writer.start()

In [None]:
# Reference the in-memory table registered by the "orders_messages" query.
# As the last expression of the cell, the DataFrame object itself is displayed;
# presumably .show() is needed to see rows - confirm.
spark.table("orders_messages")

In [None]:
# Stop every streaming query that is still running in this session.
# Indexing spark.streams.active[0] had two problems: it raised IndexError when
# no query was running, and it stopped whichever query happened to be listed
# first rather than a specific one. Looping makes the teardown safe to re-run.
for running_query in spark.streams.active:
    running_query.stop()

In [None]:
# Same parsing steps as df_orders_stream, left unassigned so the resulting
# DataFrame is displayed as the cell output.
preview_payload_text = f.col("data").cast("string")
preview_parsed_event = f.from_json("json_data", schema)
(
    kinesis_stream
    .withColumn("json_data", preview_payload_text)
    .withColumn("views", preview_parsed_event)
    .select("views.*")
)