In [1]:
import sys
import os
import yaml
import configparser

# Make the project's src directory importable. The membership check keeps the
# cell idempotent: re-running it does not append the same entry again.
_SRC_DIR = os.path.abspath(os.path.join('..', 'src'))
if _SRC_DIR not in sys.path:
    sys.path.append(_SRC_DIR)

Consideraciones:
* generar schema de validación de streams
* introducir credenciales a través del entorno de Docker - las variables ENV en .ipynb no funcionan
* optimizar filtrado de id: ¿incluir partición?
* estructura de microservicios: inventory consumer - producer
* diseño de sistemas: filtrar órdenes efectuadas para los siguientes envíos
* confirmar best practices para el manejo de streams: memory table vs readStream
* confirmar best practice para la partition key del stream
* Arquitectura Lambda
* billing de S3 por fichero registrado: definir criterios de partición
* emplear memory format para lectura de streams
* best practices para gestionar checkpoints
* implementar Firehose

# 1. Environment Configuration

## 1.1 Import dependencies

In [2]:
import boto3
import json
import os
from uuid import uuid4
from datetime import datetime
from datetime import timedelta
import time
import random
import uuid
import logging
import numpy as np

from kafka import KafkaProducer
import pyspark.sql.types as t
import pyspark.sql.functions as f

In [3]:
from spark_session import create_spark_session
from schemas import *
from functions import *

## 1.2 Extract AWS credentials

In [4]:
def load_aws_credentials(profile_name="default"):
    """
    Load an AWS access key pair from the local ``../.aws/credentials`` file.

    :param profile_name: Section of the credentials file to read (default: "default").
    :return: Tuple ``(aws_access_key_id, aws_secret_access_key)``.
    :raises SystemExit: With exit code 1 when the file is missing or cannot be
        parsed, or when the profile or either key is absent or empty.
    """
    credentials_path = os.path.join('..', '.aws', 'credentials')

    # Load credentials from the .aws/credentials file (local development)
    try:
        credentials = configparser.ConfigParser()
        # read() does not raise for a missing/unreadable file; it returns the
        # list of files it actually parsed, so that list has to be checked.
        loaded_files = credentials.read(credentials_path)
    except Exception as e:
        logging.error(f"Error loading .aws file: {e}")
        sys.exit(1)

    if not loaded_files:
        logging.error(f"Error loading .aws file: {credentials_path} not found or unreadable.")
        sys.exit(1)

    logging.info("Successfully loaded credentials variables from .aws file.")

    if profile_name not in credentials:
        logging.error(f"AWS profile '{profile_name}' not found in {credentials_path}.")
        sys.exit(1)

    profile = credentials[profile_name]
    # SectionProxy.get() yields None for a missing option instead of raising KeyError.
    aws_access_key_id = profile.get("aws_access_key_id")
    aws_secret_access_key = profile.get("aws_secret_access_key")

    if not aws_access_key_id or not aws_secret_access_key:
        logging.error("AWS credentials not found.")
        sys.exit(1)

    return aws_access_key_id, aws_secret_access_key

# Read the key pair of the "default" profile; load_aws_credentials() calls
# sys.exit(1) when it cannot provide both values.
aws_access_key_id, aws_secret_access_key = load_aws_credentials()

## 1.3 Constants and variables

In [5]:
# Initialize logger
# basicConfig() does nothing when the root logger already has a handler, which
# is the case once any module-level logging.info()/error() call has run before
# this cell. Setting the level explicitly keeps INFO records visible either way.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [6]:
def load_aws_config():
    """
    Load the settings stored in the local ``../.aws/config`` file.

    :return: The populated ``configparser.ConfigParser``; callers index it by
        section and option name.
    :raises SystemExit: With exit code 1 when the file is missing, unreadable
        or cannot be parsed.
    """
    config_path = os.path.join('..', '.aws', 'config')

    try:
        config = configparser.ConfigParser()
        # read() returns the list of files it parsed and silently skips a
        # missing one, so an empty list means nothing was loaded.
        loaded_files = config.read(config_path)
    except Exception as e:
        logging.error(f"Error loading .aws file: {e}")
        sys.exit(1)

    if not loaded_files:
        logging.error(f"Error loading .aws file: {config_path} not found or unreadable.")
        sys.exit(1)

    logging.info("Successfully loaded config variables from .aws file.")
    return config

# Parsed ../.aws/config contents; the constants cell below reads its
# "paths", "raw_data" and "table_names" sections.
config = load_aws_config()

In [7]:
# --- Values read from the config file --------------------------------------
# Bucket root and top-level folders.
BUCKET_NAME, RAW, ORDERS = (
    config["paths"][option] for option in ("BUCKET_NAME", "RAW", "ORDERS")
)

# Layer folder names.
BRONZE, SILVER, GOLD = (
    config["paths"][option] for option in ("BRONZE", "SILVER", "GOLD")
)

# Raw input names.
ADDRESS_DATA, CLIENTS_DATA, PRODUCTS_DATA = (
    config["raw_data"][option]
    for option in ("ADDRESS_DATA", "CLIENTS_DATA", "PRODUCTS_DATA")
)

# Table names.
(
    ADDRESS_TABLE,
    CLIENTS_TABLE,
    CLIENTS_ADDRESS_TABLE,
    PRODUCTS_TABLE,
    PACKAGE_TABLE,
) = (
    config["table_names"][option]
    for option in (
        "ADDRESS_TABLE",
        "CLIENTS_TABLE",
        "CLIENTS_ADDRESS_TABLE",
        "PRODUCTS_TABLE",
        "PACKAGE_TABLE",
    )
)

# --- Derived locations ------------------------------------------------------
# Raw inputs: <bucket>/<raw>/<data name>
RAW_ADDRESS_PATH, RAW_CIENTS_PATH, RAW_PRODUCTS_PATH = (
    os.path.join(BUCKET_NAME, RAW, data_name)
    for data_name in (ADDRESS_DATA, CLIENTS_DATA, PRODUCTS_DATA)
)

# Bronze tables: <bucket>/<orders>/<bronze>/<table>
BRONZE_ADDRESS_PATH, BRONZE_CLIENTS_PATH, BRONZE_PRODUCTS_PATH = (
    os.path.join(BUCKET_NAME, ORDERS, BRONZE, table_name)
    for table_name in (ADDRESS_TABLE, CLIENTS_TABLE, PRODUCTS_TABLE)
)

# Silver tables: <bucket>/<orders>/<silver>/<table>
SILVER_ADDRESS_PATH, SILVER_CLIENTS_PATH, SILVER_PRODUCTS_PATH = (
    os.path.join(BUCKET_NAME, ORDERS, SILVER, table_name)
    for table_name in (ADDRESS_TABLE, CLIENTS_TABLE, PRODUCTS_TABLE)
)

# Gold tables: <bucket>/<orders>/<gold>/<table>
GOLD_CLIENTS_ADDRESS_PATH, GOLD_PRODUCTS_PATH, GOLD_PACKAGE_PATH = (
    os.path.join(BUCKET_NAME, ORDERS, GOLD, table_name)
    for table_name in (CLIENTS_ADDRESS_TABLE, PRODUCTS_TABLE, PACKAGE_TABLE)
)

In [8]:
# stream_name = "orders_stream_2" #config["default"]["STREAM_NAME"]
# Kafka connection settings. Each value can be overridden through an environment
# variable so the broker address does not have to be edited in the notebook;
# the fallbacks are the values previously hardcoded here.
TOPIC_NAME = os.environ.get("KAFKA_TOPIC_NAME", "order_stream")
BOOTSTRAP_SERVER = os.environ.get("KAFKA_BOOTSTRAP_SERVER", "51.92.77.20:9092")

# 2. Initialize Spark Session

In [None]:
# Build the Spark session through the project helper, handing it the AWS key
# pair loaded above.
spark = create_spark_session(aws_access_key_id, aws_secret_access_key)

# 2. Data generation

In [None]:
def generate_order_payload(order_details):
    """
    Wrap order details in an ORDER_CREATED event envelope.

    Fresh random UUIDs are generated for the event and the order, and the
    event is stamped with the current local time at one-second resolution.

    :param order_details: Dictionary containing order details; stored as-is
        (not copied) under "order_details".
    :return: Dictionary with the keys event_id, event_type, event_timestamp,
        order_id and order_details.
    """
    event_uuid = uuid.uuid4()
    created_at = datetime.now()
    order_uuid = uuid.uuid4()

    payload = dict(
        event_id="ev-" + str(event_uuid),
        event_type="ORDER_CREATED",
        event_timestamp=f"{created_at:%Y-%m-%d %H:%M:%S}",
        order_id="ord-" + str(order_uuid),
        order_details=order_details,
    )
    return payload

In [None]:
# Load the three gold-layer tables used to generate synthetic orders.
# NOTE(review): clients/address is read as "parquet" with an explicit schema while
# the other two gold tables are read as "delta" — confirm this is intentional.
df_clients_address = read_file(spark, GOLD_CLIENTS_ADDRESS_PATH, "parquet", gold_clients_address_schema)
df_products = read_file(spark, GOLD_PRODUCTS_PATH, "delta")  # presumably wraps spark.read.format("delta").load(...) — confirm in functions.read_file
df_packages = read_file(spark, GOLD_PACKAGE_PATH, "delta")  # presumably wraps spark.read.format("delta").load(...) — confirm in functions.read_file

In [None]:
# Build one sample event from the loaded tables and pretty-print it as JSON
# to inspect the payload structure before sending anything.
order_payload = generate_order_payload(generate_order_details(df_clients_address, df_products, df_packages))
print(json.dumps(order_payload, indent=4))

# 3. Stream Producer Kafka

In [None]:
def produce_order(bootstrap_server, payload, topic_name = "order_stream"):
    """
    Publish one order event to a Kafka topic.

    The payload is serialized as JSON and sent with its 'event_type' as the
    record key. A producer is created for the call, flushed so the buffered
    record is actually transmitted, and closed afterwards.

    :param bootstrap_server: Broker address as a "host:port" string, or an
        iterable of such strings.
    :param payload: JSON-serializable dictionary; must contain 'event_type' as a string.
    :param topic_name: Destination topic (default: "order_stream").
    :return: The object returned by ``producer.send`` on success, or None when
        validation, serialization or sending raised (the error is logged).
    """
    producer = None
    try:
        # Ensure payload is correctly formatted and partition key is a string
        if 'event_type' not in payload or not isinstance(payload['event_type'], str):
            raise ValueError("Payload must include 'event_type' as a string")

        # Use the address passed by the caller rather than a notebook global.
        if isinstance(bootstrap_server, str):
            servers = [bootstrap_server]
        else:
            servers = list(bootstrap_server)
        producer = KafkaProducer(bootstrap_servers=servers)

        data = json.dumps(payload).encode()
        put_response = producer.send(
            topic_name,
            key=payload['event_type'].encode(),
            value=data,
        )
        # send() only buffers the record; flush() blocks until it has been handed
        # to the broker, so the send is complete before the producer is closed.
        producer.flush()

        # Log response details
        logger.info(f"Put record response: {put_response}")
        return put_response
    except Exception as e:
        logger.error(f"Failed to put record to stream: {e}", exc_info=True)
        return None
    finally:
        # Release the client's connections even when sending failed.
        if producer is not None:
            producer.close()

In [None]:
# Generate a fresh order event and publish it to the configured Kafka topic.
order_payload = generate_order_payload(generate_order_details(df_clients_address, df_products, df_packages))
produce_order(BOOTSTRAP_SERVER, order_payload, TOPIC_NAME)

# 3. Stream producer Kinesis

In [None]:
# kinesis_client = boto3.client(
#     'kinesis', 
#     aws_access_key_id=aws_access_key_id,
#     aws_secret_access_key=aws_secret_access_key,
#     # aws_session_token=aws_session_token,
#     region_name='eu-south-2'
# )

In [None]:
# response = kinesis_client.describe_stream(StreamName=stream_name)
# print(response['StreamDescription']['StreamStatus'])

In [None]:
# print(response['StreamDescription']['Shards'])

In [None]:
# # def produce_order(payload):
#     try:
#         # Ensure payload is correctly formatted and partition key is a string
#         if 'event_type' not in payload or not isinstance(payload['event_type'], str):
#             raise ValueError("Payload must include 'event_type' as a string")
        
#         data = json.dumps(payload)
#         put_response = kinesis_client.put_record(
#             StreamName=stream_name,
#             Data=f"{data}\n",
#             PartitionKey=payload['event_type']
#         )
        
#         # Log response details
#         logger.info(f"Put record response: {put_response}")
#         return put_response
#     except Exception as e:
#         logger.error(f"Failed to put record to stream: {e}", exc_info=True)
#         return None

In [None]:
# order_payload = generate_order_payload(generate_order_details(df_clients_address, df_products, df_packages))
# print(json.dumps(order_payload, indent=4))

In [None]:
# order_payload = generate_order_payload(generate_order_details(df_clients_address, df_products, df_packages))
# produce_order(order_payload)

In [None]:
# Produce orders at regular intervals (for example, every second)
# while True:
#     order_payload = order_payload = generate_order_payload(generate_order_details(df_clients_address, df_products, df_packages))
#     produce_order(order_payload)
#     time.sleep(5)