# Integración de Datos en Streaming

Este cuaderno implementa el Motor de Ingesta en Streaming.

El cuaderno contiene la configuración independiente por dataset, la lista de topics, el particionado configurable y las consultas activas. Todo ello escribe en la capa bronze.


In [0]:

import time, datetime, random

# Storage account name, taken from the Spark configuration.
account = spark.conf.get("adls.account.name")

# Root folder under which the files will be stored.
organization = 'FarmIA'

# abfss URIs of the landing and lakehouse containers in that account.
landing_container = "abfss://landing-tarea@{}.dfs.core.windows.net".format(account)
lakehouse_container = "abfss://lakehouse-tarea@{}.dfs.core.windows.net".format(account)

# Landing uses the container root; raw and bronze are lakehouse subfolders.
landing_path = landing_container
raw_path = lakehouse_container + "/raw"
bronze_path = lakehouse_container + "/bronze"

# Create the raw and bronze folders.
dbutils.fs.mkdirs(raw_path)
dbutils.fs.mkdirs(bronze_path)

Definimos los parámetros lógicos del dataset.

In [0]:
# Logical identifiers of the dataset handled by this notebook.
datasource = "FarmIA"
dataset = "orders"
topic = "orders"

# Each layer stores the dataset under <layer>/<datasource>/<dataset>.
dataset_landing_path = "/".join([landing_path, datasource, dataset])
dataset_raw_path = "/".join([raw_path, datasource, dataset])
dataset_bronze_path = "/".join([bronze_path, datasource, dataset])
# Streaming checkpoint folder, a sibling of the bronze dataset folder.
dataset_bronze_checkpoint_path = dataset_bronze_path + "_checkpoint"
# Fully qualified name used when the bronze table is registered.
table_name = "hive_metastore.bronze." + datasource + "_" + dataset

# Show the resolved paths, one per line.
print(dataset_landing_path, dataset_raw_path, dataset_bronze_path, sep="\n")

Leemos ahora la configuración *client_properties* con las credenciales de Confluent Cloud.

In [0]:
def read_config(path="/dbfs/FileStore/client_properties"):
    """Parse a ``key=value`` properties file into a dict.

    Blank lines and lines starting with ``#`` are ignored. Each remaining
    line is split on its first ``=``, so values may themselves contain
    ``=``. Whitespace around both the key and the value is removed.

    Args:
        path: Location of the properties file. Defaults to the
            ``client_properties`` file read by this notebook.

    Returns:
        dict mapping each property name to its string value.

    Raises:
        ValueError: If a non-comment, non-blank line has no ``=``.
        OSError: If the file cannot be opened.
    """
    config = {}
    with open(path, encoding="utf-8") as fh:
        for line_number, raw_line in enumerate(fh, start=1):
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            parameter, separator, value = line.partition("=")
            if not separator:
                # The line content is deliberately left out of the message:
                # this file holds credentials.
                raise ValueError(
                    f"{path}: line {line_number} has no '=' separator"
                )
            # Strip the key too, so "key = value" is stored under "key".
            config[parameter.strip()] = value.strip()
    return config

# Load the client settings once; the Kafka clients in later cells read
# their connection and SASL values from `conf`.
conf = read_config()

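A continuación, nos conectamos al clúster de Kafka con un cliente de administración y listamos sus topics, como paso previo a crear un streaming DataFrame con origen Kafka. El esquema lo tomamos del ejercicio del módulo.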

In [0]:
from confluent_kafka.admin import AdminClient, NewTopic

# Kafka admin client built from the connection and SASL settings in `conf`.
admin_client = AdminClient({
    setting: conf[setting]
    for setting in (
        "bootstrap.servers",
        "security.protocol",
        "sasl.mechanisms",
        "sasl.username",
        "sasl.password",
    )
})

print("Topics en el cluster de Kafka:")
# The cluster metadata exposes its topics as a mapping.
topic_list = admin_client.list_topics().topics
# Cell result: at most the first 20 keys of that mapping.
list(topic_list)[:20]

Creamos el topic si no existe y nos aseguramos de ello.

In [0]:
topic = "orders"

# create_topics() returns a mapping whose values expose result(); walk it to
# report the outcome of each creation request.
fs = admin_client.create_topics(
    [NewTopic(topic, num_partitions=3, replication_factor=3)]
)

for topic_name, creation in fs.items():
    try:
        creation.result()
    except Exception as err:
        # Best effort: a failure is only reported, never re-raised.
        print(f"Info topic {topic_name}: {err}")
    else:
        print(f"Topic {topic_name} creado")

Definimos a continuación el esquema JSON de los mensajes del topic.

In [0]:
# Schema of the JSON message payload, written as a DDL string. It is passed
# to F.from_json when the Kafka `value` column is parsed.
json_schema_ddl = """
order_id STRING,
customer_id STRING,
items ARRAY<STRUCT<product:STRING, qty:INT, price:DOUBLE>>,
total_amount DOUBLE,
currency STRING,
ts LONG
"""

Generamos el DataFrame de streaming desde Kafka.

In [0]:
import pyspark.sql.functions as F

# Options for the Kafka streaming source. Connection and SASL values come
# from `conf`, the settings dict loaded from the client properties file.
kafka_options = {
    "kafka.bootstrap.servers": conf["bootstrap.servers"],
    "kafka.security.protocol": conf["security.protocol"],
    "kafka.sasl.mechanism": conf["sasl.mechanisms"],
    # Index `conf` directly (like the other clients in this notebook) so a
    # missing credential raises KeyError here instead of the text "None"
    # being interpolated into the JAAS string.
    "kafka.sasl.jaas.config":
        f"""kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username="{conf['sasl.username']}" password="{conf['sasl.password']}"; """,
    # Subscribe to the notebook's configured topic rather than repeating a
    # hard-coded name.
    "subscribe": topic,
    "startingOffsets": "earliest",
    "kafka.ssl.endpoint.identification.algorithm": "https",
}

# Streaming DataFrame over the topic: key and value are cast to strings and
# the record metadata columns are kept alongside them.
base = (
    spark
    .readStream
    .format("kafka")
    .options(**kafka_options)
    .load()
    .selectExpr(
        "CAST(key AS STRING)   AS key",
        "CAST(value AS STRING) AS value",
        "timestamp",
        "topic",
        "partition",
        "offset",
    )
)

In [0]:
df = (
    base
    # Replace the JSON text with a struct parsed using the DDL schema.
    .withColumn("value", F.from_json(F.col("value"), json_schema_ddl))
    .select(
        "key",
        # Promote each payload field to a top-level column of the same name.
        *[
            F.col(f"value.{field}").alias(field)
            for field in (
                "order_id",
                "customer_id",
                "items",
                "total_amount",
                "currency",
                "ts",
            )
        ],
        "timestamp",
        "topic",
        "partition",
        "offset",
    )
    .withColumn("_ingested_at", F.current_timestamp())
    # Derive the date from the record timestamp before that column is renamed.
    .withColumn("event_date", F.to_date(F.col("timestamp")))
    # Prefix the Kafka metadata columns with an underscore.
    .withColumnRenamed("timestamp", "_timestamp")
    .withColumnRenamed("topic", "_topic")
    .withColumnRenamed("partition", "_partition")
    .withColumnRenamed("offset", "_offset")
)


Escribimos a continuación en Bronze.

In [0]:
# Append the parsed stream to the bronze Delta location, partitioned by
# event date and topic, and wait for the query to terminate.
(
    df.writeStream
    .queryName("bronze-{}-{}".format(datasource, dataset))
    .format("delta")
    .outputMode("append")
    .partitionBy("event_date", "_topic")
    .options(
        mergeSchema="true",
        checkpointLocation=dataset_bronze_checkpoint_path,
    )
    # NOTE(review): once=True should run a single batch and then stop the
    # query, which is what lets awaitTermination() return — confirm against
    # the runtime's Spark version.
    .trigger(once=True)
    .start(path=dataset_bronze_path)
    .awaitTermination()
)

In [0]:
# Create the bronze database in hive_metastore if it does not exist yet.
spark.sql("CREATE DATABASE IF NOT EXISTS hive_metastore.bronze")

# Register a Delta table over the bronze path, unless the table already exists.
spark.sql(
    "\n"
    "CREATE TABLE IF NOT EXISTS {name}\n"
    "USING DELTA\n"
    "LOCATION '{location}'\n".format(name=table_name, location=dataset_bronze_path)
)

Generamos a continuación mensajes aleatorios en el topic *orders*.\
 Lo limitaremos a 100 para que la celda termine.

In [0]:
from confluent_kafka import Producer
import json, time, random, uuid

# Kafka producer built from the connection and SASL settings in `conf`.
producer = Producer({
    setting: conf[setting]
    for setting in (
        "bootstrap.servers",
        "security.protocol",
        "sasl.mechanisms",
        "sasl.username",
        "sasl.password",
    )
})

def random_item():
    """Return one random order line as a dict with product, qty and price.

    The product is drawn from a fixed catalogue, qty is an integer in 1..5
    and price is a float between 5.0 and 50.0 rounded to two decimals.
    """
    catalogue = ("fertilizers", "pesticides", "insecticides", "seeds", "gloves")
    # Dict values are evaluated top to bottom, which fixes the order in which
    # the random generator is consumed.
    return {
        "product": random.choice(catalogue),
        "qty": random.randint(1, 5),
        "price": round(random.uniform(5.0, 50.0), 2),
    }

def random_order():
    """Build one random order whose fields line up with the stream's JSON schema.

    The order holds between one and four items from random_item(), and
    total_amount is the sum of qty * price over those items, rounded to two
    decimals. ts is the current time in epoch milliseconds.
    """
    line_items = [random_item() for _ in range(random.randint(1, 4))]
    order_total = round(
        sum(line["qty"] * line["price"] for line in line_items), 2
    )

    order = {}
    # Only the first 12 characters of the UUID text are kept as the order id.
    order["order_id"] = str(uuid.uuid4())[:12]
    order["customer_id"] = "C-{:05d}".format(random.randint(1, 1000))
    order["items"] = line_items
    order["total_amount"] = order_total
    order["currency"] = random.choice(["EUR", "USD"])
    order["ts"] = int(time.time() * 1000)
    return order

# Number of messages to send and pause (in seconds) between two sends.
N = 100
delay = 0.5

print(f"üöÄ Enviando {N} mensajes aleatorios al topic '{topic}' (Ctrl+Break para parar)")

for i in range(1, N + 1):
    order = random_order()
    # Keyed by customer id; the order itself travels as JSON text.
    producer.produce(topic, key=order["customer_id"], value=json.dumps(order))
    # Non-blocking poll so the client can process pending events.
    producer.poll(0)
    print(f"[{i}/{N}] Sent: {order['order_id']} | total={order['total_amount']} {order['currency']}")
    time.sleep(delay)

# flush() returns how many messages are still waiting for delivery when the
# timeout expires, so success is only reported when nothing is left behind.
pending = producer.flush(10)
if pending:
    print(f"Aviso: {pending} de {N} mensajes siguen sin entregar tras flush(10).")
else:
    print(f"‚úÖ Generaci√≥n completada ({N} mensajes enviados).")