# 第 10 章: ストリーミング処理とスキーマ進化


In [None]:
import json
import time
from datetime import datetime
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, LongType
# Catalog name, catalog endpoint and S3 endpoint passed to the Spark session config.
CATALOG = "my_catalog"
CATALOG_URL = "http://server:8181/"
S3_ENDPOINT = "http://minio:9000"

# Versions used to build the Maven coordinates of the Spark packages.
ICEBERG_VERSION = "1.8.1"
SPARK_VERSION = pyspark.__version__
# Keep only the first two dot-separated components ("<major>.<minor>").
SPARK_MINOR_VERSION = ".".join(SPARK_VERSION.split(".")[:2])

In [None]:
# Build (or reuse) the Spark session with the Iceberg and Kafka packages and a
# catalog registered under the name held in CATALOG.
spark = (
    SparkSession.builder
        # Maven coordinates, comma-separated: Iceberg Spark runtime, Iceberg AWS
        # bundle and the Kafka source for Structured Streaming.
        # NOTE(review): the `_2.12` suffix assumes a Scala 2.12 build of Spark — confirm.
        .config(
            "spark.jars.packages",
            ",".join([
                f"org.apache.iceberg:iceberg-spark-runtime-{SPARK_MINOR_VERSION}_2.12:{ICEBERG_VERSION}",
                f"org.apache.iceberg:iceberg-aws-bundle:{ICEBERG_VERSION}",
                f"org.apache.spark:spark-sql-kafka-0-10_2.12:{SPARK_VERSION}",
            ]),
        )
        .config(f"spark.sql.catalog.{CATALOG}", "org.apache.iceberg.spark.SparkCatalog")
        .config(f"spark.sql.catalog.{CATALOG}.type", "rest")
        .config(f"spark.sql.catalog.{CATALOG}.uri", CATALOG_URL)
        .config(f"spark.sql.catalog.{CATALOG}.s3.endpoint", S3_ENDPOINT)
        .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
        # Use the CATALOG constant so the default catalog always matches the
        # catalog configured above.
        .config("spark.sql.defaultCatalog", CATALOG)
        .getOrCreate()
)

In [None]:
%sql spark

## 準備

* アクセスログを保存するための Iceberg テーブル `web_access_logs` を作成
* Kafka トピックの作成

### (Optional) データベースの作成
データベースをまだ作成していない場合は、以下のセルを実行してください。既にデータベースが存在する場合は、このステップをスキップしてください。

In [None]:
%%sql
CREATE DATABASE IF NOT EXISTS db

テーブル作成

In [None]:
%%sql
CREATE OR REPLACE TABLE db.web_access_logs (
    timestamp timestamp,
    ip_address string,
    path string,
    status_code int,
    user_agent string
) USING iceberg
PARTITIONED BY (day(timestamp))

### Kafka トピックの作成

In [None]:
from kafka.admin import KafkaAdminClient, NewTopic
from kafka.errors import TopicAlreadyExistsError
KAFKA_TOPIC = 'web-access-logs2'


kafka_client = KafkaAdminClient(bootstrap_servers='kafka:29092', client_id=None)
topic = NewTopic(name=KAFKA_TOPIC, num_partitions=1, replication_factor=1)

# Create the topic. Re-running this cell must not fail just because the topic
# was already created by an earlier run, so that specific error is tolerated.
try:
    kafka_client.create_topics(new_topics=[topic], validate_only=False)
except TopicAlreadyExistsError:
    print(f"Topic '{KAFKA_TOPIC}' already exists; skipping creation")
finally:
    # The admin client is only needed for topic creation; release its connections.
    kafka_client.close()

## Web アクセスログを Spark Structured Streaming で Iceberg テーブルに書き込む
### Kafka プロデューサーからデータを送信する

In [None]:
from kafka import KafkaProducer
import random

In [None]:
# Kafka producer for the sample web access logs; each value is serialized
# to JSON and encoded as UTF-8 bytes.
web_log_producer = KafkaProducer(
    bootstrap_servers=["kafka:29092"],
    value_serializer=lambda record: json.dumps(record).encode("utf-8"),
)

50 個のログを送信する

In [None]:
# Candidate values from which each random access-log record is assembled.
paths = ["/home", "/products", "/about", "/contact"]
ip_addresses = ["192.168.1.10", "10.0.0.5", "172.16.0.3", "192.168.1.25"]
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Mac OS X 14_7_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0"
]

# Send 50 records, one every 0.5 seconds. Any exception propagates unchanged;
# the producer is flushed and closed whether or not the loop succeeds.
try:
    for _ in range(50):
        log = {
            "timestamp": datetime.now().isoformat(),
            "ip_address": random.choice(ip_addresses),
            "path": random.choice(paths),
            # 200 is listed three times so that it is drawn more often.
            "status_code": random.choice([200, 200, 200, 404, 500]),
            "user_agent": random.choice(user_agents)
        }
        web_log_producer.send(KAFKA_TOPIC, log)
        time.sleep(0.5)
finally:
    web_log_producer.flush()
    web_log_producer.close()
# Reached only when no exception was raised, i.e. after flush() and close() returned.
print("Completed to send 50 messages")

### Structured Streaming で Kafka からデータを取得する

In [None]:
# Read from Kafka.
# Start from the earliest available offset: the sample messages are produced
# by the preceding cell, i.e. before this query starts, and "latest" would
# skip every message already in the topic.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:29092")
    .option("subscribe", KAFKA_TOPIC)
    .option("startingOffsets", "earliest")
    .load()
)

In [None]:
# Schema of the JSON payload before any schema evolution.
# `timestamp` is read as a string here; it is converted with to_timestamp
# when the parsed columns are selected.
initial_schema = StructType([
    StructField(field_name, field_type, nullable)
    for field_name, field_type, nullable in (
        ("timestamp", StringType(), False),
        ("ip_address", StringType(), False),
        ("path", StringType(), False),
        ("status_code", IntegerType(), False),
        ("user_agent", StringType(), True),
    )
])

In [None]:
# Decode the Kafka value as JSON with the initial schema, then flatten the
# resulting struct into top-level columns (timestamp converted to a timestamp type).
df_processed = (
    df.select(from_json(col("value").cast("string"), initial_schema).alias("data"))
    .select(
        to_timestamp(col("data.timestamp")).alias("timestamp"),
        "data.ip_address",
        "data.path",
        "data.status_code",
        "data.user_agent",
    )
)

### Iceberg テーブルにデータを書き込む

In [None]:
# Append the parsed stream to the Iceberg table, triggering a micro-batch
# every 10 seconds and tracking progress in the given checkpoint directory.
sq = (
    df_processed.writeStream
    .format("iceberg")
    .outputMode("append")
    .trigger(processingTime="10 seconds")
    .option("checkpointLocation", "/tmp/iceberg-checkpoint/web-logs_tomtan")
    .toTable("db.web_access_logs")
)

### 現在のアクセスログを集計する
Kafka から読み取り、Iceberg テーブルに書き込んだデータを試しに読んでみましょう。

In [None]:
%%sql
SELECT count(*) as cnt FROM db.web_access_logs

In [None]:
# Same row count as the %%sql cell above, issued through the Spark session.
spark.sql("SELECT count(*) as cnt FROM db.web_access_logs").show()

In [None]:
%%sql
SELECT path, count(*) as access_cnt 
FROM db.web_access_logs
GROUP BY path
ORDER BY access_cnt DESC

In [None]:
# Access count per path, most-accessed first (same query as the %%sql cell above).
spark.sql("""
SELECT path, count(*) as access_cnt 
FROM db.web_access_logs
GROUP BY path
ORDER BY access_cnt DESC""").show()

次のセクションでスキーマ変更が発生する前提での書き込み方法に切り替えるため、一度ストリーミングアプリケーションを停止します。

In [None]:
# Stop the first streaming query before switching to the schema-evolving writer.
sq.stop()

## スキーマ進化したウェブアクセスログを書き込む

In [None]:
# Fresh Kafka producer (the earlier one was closed); each value is serialized
# to JSON and encoded as UTF-8 bytes.
web_log_producer = KafkaProducer(
    bootstrap_servers=["kafka:29092"],
    value_serializer=lambda record: json.dumps(record).encode("utf-8"),
)

ウェブアプリケーション側の仕様変更によりスキーマが変更されたログが 50 メッセージ送信される

In [None]:
# Candidate values from which each random access-log record is assembled.
paths = ["/home", "/products", "/about", "/contact"]
ip_addresses = ["192.168.1.10", "10.0.0.5", "172.16.0.3", "192.168.1.25"]
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Mac OS X 14_7_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0"
]
user_ids = ["user123", "user456", "user789"]

# Send 50 records that carry two extra fields, one every 0.5 seconds. Any
# exception propagates unchanged; the producer is flushed and closed whether
# or not the loop succeeds.
try:
    for _ in range(50):
        log = {
            "timestamp": datetime.now().isoformat(),
            "ip_address": random.choice(ip_addresses),
            "path": random.choice(paths),
            # 200 is listed three times so that it is drawn more often.
            "status_code": random.choice([200, 200, 200, 404, 500]),
            "user_agent": random.choice(user_agents),
            "response_time_ms": random.randint(50, 2000),  # newly added column
            "user_id": random.choice(user_ids)  # newly added column
        }
        web_log_producer.send(KAFKA_TOPIC, log)
        time.sleep(0.5)
finally:
    web_log_producer.flush()
    web_log_producer.close()
# Reached only when no exception was raised, i.e. after flush() and close() returned.
print("Completed to send 50 messages")

### スキーマ進化したデータを読み込み Iceberg テーブルのスキーマを動的に変更しながら書き込む

In [None]:
%%sql
ALTER TABLE db.web_access_logs SET TBLPROPERTIES('write.spark.accept-any-schema'='true')

In [None]:
# Second Kafka source, consumed by the schema-evolving writer.
# NOTE(review): "startingOffsets" presumably only matters when the query starts
# without an existing checkpoint; the writer for this stream reuses the
# checkpoint path of the first query — confirm which offsets are picked up.
df_new_schema = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:29092")
    .option("subscribe", KAFKA_TOPIC)
    .option("startingOffsets", "latest")
    .load()
)

In [None]:
# Write each micro-batch while inferring its schema from the data
def process_evolving_schema_logs(batch_df, batch_id):
    """Append one Kafka micro-batch to the Iceberg table, evolving its schema.

    The JSON schema is inferred from the payloads of the batch itself, so
    fields that appear for the first time are parsed as columns and, through
    the ``merge-schema`` write option, appended together with the rest.

    Args:
        batch_df: Micro-batch DataFrame produced by the Kafka source; only
            its ``value`` column is read.
        batch_id: Micro-batch id supplied by ``foreachBatch`` (unused).

    Raises:
        Exception: Propagated from Spark when a payload cannot be parsed as
            JSON during schema inference (inference runs in FAILFAST mode so
            that bad input fails loudly instead of adding a corrupt-record
            column to the table).
    """
    json_df = (
        batch_df
        .select(col("value").cast("string").alias("json_str"))
        # Records without a value have nothing to parse; skip them.
        .where(col("json_str").isNotNull())
    )

    # The batch is scanned several times below (emptiness check, schema
    # inference, write); cache it so the source is not re-read for each scan.
    json_df.persist()
    try:
        if json_df.isEmpty():
            return

        # Infer the schema on the executors straight from the raw JSON strings
        # instead of collecting every record to the driver.
        inferred_schema = (
            spark.read
            .option("mode", "FAILFAST")
            .json(json_df.rdd.map(lambda row: row.json_str))
            .schema
        )

        # Parse all records with the inferred schema and flatten the struct.
        parsed_df = json_df.select(
            from_json(col("json_str"), inferred_schema).alias("data")
        ).select("data.*")

        if "timestamp" in parsed_df.columns:
            parsed_df = parsed_df.withColumn("timestamp", to_timestamp(col("timestamp")))

        # Write with schema merge enabled
        write_options = {
            "merge-schema": "true",  # enable schema merging
            "check-ordering": "false"  # disable the column-order check
        }

        # Write to Iceberg table
        parsed_df.writeTo(f"{CATALOG}.db.web_access_logs").options(**write_options).append()
    finally:
        json_df.unpersist()

In [None]:
# Run process_evolving_schema_logs on every micro-batch, triggered every 10 seconds.
# NOTE(review): this checkpoint path is the same one the earlier `sq` query
# used — confirm that sharing it between the two queries is intended.
sq_new_schema = (
    df_new_schema.writeStream
    .foreachBatch(process_evolving_schema_logs)
    .trigger(processingTime="10 seconds")
    .option("checkpointLocation", "/tmp/iceberg-checkpoint/web-logs_tomtan")
    .start()
)

### テーブルスキーマを確認する

In [None]:
%%sql
DESCRIBE db.web_access_logs

In [None]:
%%sql
SELECT * FROM db.web_access_logs WHERE user_id IS NOT NULL

### スキーマ進化したデータを読み込み Iceberg テーブルのスキーマを動的に変更しながら書き込む Part 2

In [None]:
# Fresh Kafka producer (the earlier one was closed); each value is serialized
# to JSON and encoded as UTF-8 bytes.
web_log_producer = KafkaProducer(
    bootstrap_servers=['kafka:29092'],
    value_serializer=lambda v: json.dumps(v).encode('utf-8')
)

# Candidate values from which each random access-log record is assembled.
paths = ["/home", "/products", "/about", "/contact"]
ip_addresses = ["192.168.1.10", "10.0.0.5", "172.16.0.3", "192.168.1.25"]
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Mac OS X 14_7_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0"
]
user_ids = ["user123", "user456", "user789"]
device_type = ["mobile", "desktop", "tablet"]  # the device type of the client is newly added

# Send 50 records that additionally carry `device_type`, one every 0.5 seconds.
# Any exception propagates unchanged; the producer is flushed and closed
# whether or not the loop succeeds.
try:
    for _ in range(50):
        log = {
            "timestamp": datetime.now().isoformat(),
            "ip_address": random.choice(ip_addresses),
            "path": random.choice(paths),
            # 200 is listed three times so that it is drawn more often.
            "status_code": random.choice([200, 200, 200, 404, 500]),
            "user_agent": random.choice(user_agents),
            "response_time_ms": random.randint(50, 2000),
            "user_id": random.choice(user_ids),
            "device_type": random.choice(device_type)  # newly added column
        }
        web_log_producer.send(KAFKA_TOPIC, log)
        time.sleep(0.5)
finally:
    web_log_producer.flush()
    web_log_producer.close()
# Reached only when no exception was raised, i.e. after flush() and close() returned.
print("Completed to send 50 messages")

スキーマ確認

In [None]:
%%sql
DESCRIBE db.web_access_logs

In [None]:
# Same DESCRIBE as the %%sql cell above, issued through the Spark session.
spark.sql("DESCRIBE db.web_access_logs").show()

### 集計クエリを実行する

In [None]:
%%sql
SELECT count(*) as total_count FROM db.web_access_logs

In [None]:
%%sql
SELECT path, AVG(response_time_ms) as avg_response_time_ms
FROM db.web_access_logs
WHERE response_time_ms IS NOT NULL
GROUP BY path 
ORDER BY avg_response_time_ms DESC

In [None]:
%%sql
SELECT user_id, device_type, count(*) as access_cnt 
FROM db.web_access_logs
WHERE user_id IS NOT NULL AND device_type IS NOT NULL
GROUP BY user_id , device_type
ORDER BY user_id, access_cnt DESC