# 第 9 章: Write-Audit-Publish パターンによるデータ品質管理
このノートブックでは **第 10 章: ユースケースとソリューションパターン** における **Write-Audit-Publish パターンによるデータ品質管理** のコネクティッドカーの例を実行できます。

In [None]:
!pip install pydeequ

In [None]:
import os
# Set in its own cell ahead of the cell that runs `import pydeequ`; PyDeequ
# presumably reads this variable to select the matching Deequ build -- TODO confirm.
# NOTE(review): the value is hard-coded; keep it in sync with the installed
# PySpark minor version.
os.environ['SPARK_VERSION'] = '3.5' # This is required for using Deequ

In [None]:
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import pandas as pd
import pydeequ
# Name under which the Iceberg catalog is registered in Spark; it is
# interpolated into the `spark.sql.catalog.<name>.*` settings of the session.
CATALOG = "my_catalog"
# Value passed to the catalog's `uri` setting (the catalog type is set to "rest").
CATALOG_URL = "http://server:8181/"
# Value passed to the catalog's `s3.endpoint` setting.
S3_ENDPOINT = "http://minio:9000"
# Full version string of the installed PySpark package.
SPARK_VERSION = pyspark.__version__
# First two version components ("major.minor"); used to select the matching
# iceberg-spark-runtime artifact.
SPARK_MINOR_VERSION = '.'.join(SPARK_VERSION.split('.')[:2])
# Iceberg release used for both the Spark runtime and the AWS bundle artifacts.
ICEBERG_VERSION = "1.8.1"

### 1. SparkSession オブジェクトを初期化する

In [None]:
# Build (or reuse) the SparkSession with the Iceberg catalog registered under
# CATALOG and the Iceberg / Deequ JARs on the classpath.
spark = (
    SparkSession.builder
    # Comma-separated Maven coordinates: the Iceberg Spark runtime (Scala 2.12
    # build for the local PySpark minor version), the Iceberg AWS bundle, and
    # the Deequ artifact reported by PyDeequ.
    .config(
        "spark.jars.packages",
        f"org.apache.iceberg:iceberg-spark-runtime-{SPARK_MINOR_VERSION}_2.12:{ICEBERG_VERSION},"
        f"org.apache.iceberg:iceberg-aws-bundle:{ICEBERG_VERSION},"
        f"{pydeequ.deequ_maven_coord}",
    )
    # Exclude the artifact PyDeequ exposes as f2j_maven_coord (same setting as
    # in PyDeequ's own SparkSession setup example).
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    # Register CATALOG as an Iceberg REST catalog with a custom S3 endpoint.
    .config(f"spark.sql.catalog.{CATALOG}", "org.apache.iceberg.spark.SparkCatalog")
    .config(f"spark.sql.catalog.{CATALOG}.type", "rest")
    .config(f"spark.sql.catalog.{CATALOG}.uri", CATALOG_URL)
    .config(f"spark.sql.catalog.{CATALOG}.s3.endpoint", S3_ENDPOINT)
    .config(f"spark.sql.catalog.{CATALOG}.view-endpoints-supported", "true")
    # Iceberg SQL extensions; the notebook relies on them for
    # ALTER TABLE ... CREATE BRANCH and CALL system.fast_forward.
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    # Use the CATALOG constant so the default catalog always matches the
    # catalog configured above.
    .config("spark.sql.defaultCatalog", CATALOG)
    .getOrCreate()
)

In [None]:
%sql spark

#### (Optional) データベースの作成
データベースを作成していない場合、以下のセルを実行してください。既にデータベースが存在する場合は、本ステップにつきましてはスキップしてください。

In [None]:
%%sql
CREATE DATABASE IF NOT EXISTS db

## 準備: コネクティッドカーデータ用のテーブルを作成する
コネクティッドカーデータ用のテーブルを作成する

In [None]:
%%sql
CREATE TABLE db.conn_cars_telemetry (
    vehicle_id string,           -- vehicle ID
    ts timestamp,         -- time the sample was recorded
    lat decimal(10,7),     -- latitude
    lon decimal(10,7),    -- longitude
    speed_kmh decimal(5,1),  -- speed (km/h)
    acc decimal(4,2), -- acceleration (m/s^2)
    bat_soc_percent decimal(5,2), -- battery state of charge (%)
    gps_sigq int      -- GPS signal quality (0-100)
) 
USING iceberg
PARTITIONED BY (days(ts))

品質検証が完了している正常な元データを `conn_cars_telemetry` Iceberg テーブルにロードする

In [None]:
import json
from datetime import datetime as dt

# Baseline telemetry that has already passed quality validation:
# 12 vehicles (abcdev001-abcdev012), two samples each (10:00:00 and 10:00:30).
src_data_str = json.loads('''[
{"vehicle_id":"abcdev001","ts":"2025-05-31T10:00:00","lat":35.6762,"lon":139.6503,"speed_kmh":0.0,"acc":0.0,"bat_soc_percent":85.5,"gps_sigq":95},
{"vehicle_id":"abcdev001","ts":"2025-05-31T10:00:30","lat":35.6765,"lon":139.6510,"speed_kmh":20.0,"acc":1.1,"bat_soc_percent":85.4,"gps_sigq":93},
{"vehicle_id":"abcdev002","ts":"2025-05-31T10:00:00","lat":35.0116,"lon":135.7681,"speed_kmh":30.0,"acc":0.0,"bat_soc_percent":65.0,"gps_sigq":88},
{"vehicle_id":"abcdev002","ts":"2025-05-31T10:00:30","lat":35.0120,"lon":135.7685,"speed_kmh":40.0,"acc":0.6,"bat_soc_percent":64.9,"gps_sigq":87},
{"vehicle_id":"abcdev003","ts":"2025-05-31T10:00:00","lat":34.6937,"lon":135.5023,"speed_kmh":0.0,"acc":0.0,"bat_soc_percent":98.0,"gps_sigq":100},
{"vehicle_id":"abcdev003","ts":"2025-05-31T10:00:30","lat":34.6937,"lon":135.5023,"speed_kmh":0.0,"acc":0.0,"bat_soc_percent":99.5,"gps_sigq":100},
{"vehicle_id":"abcdev004","ts":"2025-05-31T10:00:00","lat":43.0642,"lon":141.3469,"speed_kmh":80.0,"acc":0.0,"bat_soc_percent":45.5,"gps_sigq":82},
{"vehicle_id":"abcdev004","ts":"2025-05-31T10:00:30","lat":43.0650,"lon":141.3480,"speed_kmh":75.0,"acc":-0.3,"bat_soc_percent":45.4,"gps_sigq":81},
{"vehicle_id":"abcdev005","ts":"2025-05-31T10:00:00","lat":26.2124,"lon":127.6792,"speed_kmh":40.0,"acc":0.0,"bat_soc_percent":72.0,"gps_sigq":96},
{"vehicle_id":"abcdev005","ts":"2025-05-31T10:00:30","lat":26.2130,"lon":127.6800,"speed_kmh":45.0,"acc":0.3,"bat_soc_percent":71.9,"gps_sigq":95},
{"vehicle_id":"abcdev006","ts":"2025-05-31T10:00:00","lat":33.5904,"lon":130.4017,"speed_kmh":60.0,"acc":0.0,"bat_soc_percent":55.0,"gps_sigq":90},
{"vehicle_id":"abcdev006","ts":"2025-05-31T10:00:30","lat":33.5910,"lon":130.4025,"speed_kmh":65.0,"acc":0.3,"bat_soc_percent":54.9,"gps_sigq":89},
{"vehicle_id":"abcdev007","ts":"2025-05-31T10:00:00","lat":38.2682,"lon":140.8694,"speed_kmh":30.0,"acc":0.0,"bat_soc_percent":90.0,"gps_sigq":92},
{"vehicle_id":"abcdev007","ts":"2025-05-31T10:00:30","lat":38.2685,"lon":140.8700,"speed_kmh":35.0,"acc":0.3,"bat_soc_percent":89.9,"gps_sigq":91},
{"vehicle_id":"abcdev008","ts":"2025-05-31T10:00:00","lat":35.4437,"lon":139.6380,"speed_kmh":0.0,"acc":0.0,"bat_soc_percent":25.0,"gps_sigq":98},
{"vehicle_id":"abcdev008","ts":"2025-05-31T10:00:30","lat":35.4437,"lon":139.6380,"speed_kmh":0.0,"acc":0.0,"bat_soc_percent":24.9,"gps_sigq":98},
{"vehicle_id":"abcdev009","ts":"2025-05-31T10:00:00","lat":34.3853,"lon":132.4553,"speed_kmh":90.0,"acc":0.0,"bat_soc_percent":60.0,"gps_sigq":85},
{"vehicle_id":"abcdev009","ts":"2025-05-31T10:00:30","lat":34.3860,"lon":132.4565,"speed_kmh":95.0,"acc":0.3,"bat_soc_percent":59.8,"gps_sigq":84},
{"vehicle_id":"abcdev010","ts":"2025-05-31T10:00:00","lat":36.3219,"lon":139.0032,"speed_kmh":50.0,"acc":0.0,"bat_soc_percent":80.0,"gps_sigq":94},
{"vehicle_id":"abcdev010","ts":"2025-05-31T10:00:30","lat":36.3225,"lon":139.0040,"speed_kmh":55.0,"acc":0.3,"bat_soc_percent":79.9,"gps_sigq":93},
{"vehicle_id":"abcdev011","ts":"2025-05-31T10:00:00","lat":35.1815,"lon":136.9066,"speed_kmh":40.0,"acc":0.0,"bat_soc_percent":50.0,"gps_sigq":88},
{"vehicle_id":"abcdev011","ts":"2025-05-31T10:00:30","lat":35.1820,"lon":136.9072,"speed_kmh":45.0,"acc":0.3,"bat_soc_percent":49.9,"gps_sigq":87},
{"vehicle_id":"abcdev012","ts":"2025-05-31T10:00:00","lat":31.5966,"lon":130.5571,"speed_kmh":70.0,"acc":0.0,"bat_soc_percent":35.0,"gps_sigq":91},
{"vehicle_id":"abcdev012","ts":"2025-05-31T10:00:30","lat":31.5975,"lon":130.5580,"speed_kmh":75.0,"acc":0.3,"bat_soc_percent":34.8,"gps_sigq":90}
]''')

# Swap each record's ISO-8601 `ts` string for a datetime object; every other
# field is carried over unchanged.
src_data = [
    {**record, 'ts': dt.fromisoformat(record['ts'])}
    for record in src_data_str
]

In [None]:
# Build a Spark DataFrame from the parsed records (no explicit schema is passed)
# and append it to the Iceberg table.
df = spark.createDataFrame(data=src_data)
df.writeTo('db.conn_cars_telemetry').append()

Iceberg テーブルデータを確認する

In [None]:
%%sql
SELECT * FROM db.conn_cars_telemetry LIMIT 30

## Iceberg ブランチを作成する

In [None]:
%%sql
ALTER TABLE db.conn_cars_telemetry CREATE BRANCH stg

In [None]:
%%sql
ALTER TABLE db.conn_cars_telemetry CREATE BRANCH audit

### (Optional) 作成したブランチ一覧を確認する
Iceberg メタデータテーブル `refs` に対してクエリを実行することで、現在作成されているブランチ一覧を確認することができます。

In [None]:
%%sql
SELECT * FROM db.conn_cars_telemetry.refs

---

## コネクティッドカーデータに対して Iceberg ブランチを用いて WAP パターンを適用する
### Write フェーズ - 追加のレコードを `stg` ブランチに書き込む

In [None]:
# Follow-up telemetry batch (10:01:00 onwards) that deliberately contains
# anomalies: acc of 25.0 and -30.0, bat_soc_percent of 125.5 and -15.5, and
# two rows where lat is set but lon is null. Rows with both coordinates null
# carry a gps_sigq of 0, 5 or 8.
new_data_str = json.loads('''[
{"vehicle_id":"abcdev001","ts":"2025-05-31T10:01:00","lat":35.6770,"lon":139.6518,"speed_kmh":35.0,"acc":0.8,"bat_soc_percent":85.3,"gps_sigq":94},
{"vehicle_id":"abcdev001","ts":"2025-05-31T10:01:30","lat":null,"lon":null,"speed_kmh":40.0,"acc":0.3,"bat_soc_percent":85.2,"gps_sigq":0},
{"vehicle_id":"abcdev001","ts":"2025-05-31T10:02:00","lat":null,"lon":null,"speed_kmh":45.0,"acc":0.3,"bat_soc_percent":85.1,"gps_sigq":0},
{"vehicle_id":"abcdev001","ts":"2025-05-31T10:02:30","lat":35.6785,"lon":139.6540,"speed_kmh":50.0,"acc":0.3,"bat_soc_percent":85.0,"gps_sigq":91},
{"vehicle_id":"abcdev002","ts":"2025-05-31T10:01:00","lat":35.0125,"lon":135.7690,"speed_kmh":150.0,"acc":25.0,"bat_soc_percent":64.8,"gps_sigq":88},
{"vehicle_id":"abcdev002","ts":"2025-05-31T10:01:30","lat":35.0130,"lon":135.7695,"speed_kmh":60.0,"acc":-5.0,"bat_soc_percent":64.7,"gps_sigq":89},
{"vehicle_id":"abcdev003","ts":"2025-05-31T10:01:00","lat":34.6937,"lon":135.5023,"speed_kmh":0.0,"acc":0.0,"bat_soc_percent":125.5,"gps_sigq":100},
{"vehicle_id":"abcdev003","ts":"2025-05-31T10:01:30","lat":34.6937,"lon":135.5023,"speed_kmh":0.0,"acc":0.0,"bat_soc_percent":100.0,"gps_sigq":100},
{"vehicle_id":"abcdev004","ts":"2025-05-31T10:01:00","lat":null,"lon":null,"speed_kmh":80.0,"acc":-30.0,"bat_soc_percent":45.3,"gps_sigq":5},
{"vehicle_id":"abcdev004","ts":"2025-05-31T10:01:30","lat":null,"lon":null,"speed_kmh":70.0,"acc":-0.6,"bat_soc_percent":45.2,"gps_sigq":8},
{"vehicle_id":"abcdev005","ts":"2025-05-31T10:01:00","lat":26.2135,"lon":127.6808,"speed_kmh":50.0,"acc":0.3,"bat_soc_percent":71.8,"gps_sigq":94},
{"vehicle_id":"abcdev005","ts":"2025-05-31T10:01:30","lat":26.2140,"lon":127.6815,"speed_kmh":55.0,"acc":0.3,"bat_soc_percent":-15.5,"gps_sigq":93},
{"vehicle_id":"abcdev006","ts":"2025-05-31T10:01:00","lat":33.5915,"lon":130.4032,"speed_kmh":70.0,"acc":0.3,"bat_soc_percent":54.8,"gps_sigq":88},
{"vehicle_id":"abcdev006","ts":"2025-05-31T10:01:30","lat":33.5920,"lon":130.4040,"speed_kmh":65.0,"acc":-0.3,"bat_soc_percent":54.7,"gps_sigq":87},
{"vehicle_id":"abcdev007","ts":"2025-05-31T10:01:00","lat":null,"lon":null,"speed_kmh":40.0,"acc":0.3,"bat_soc_percent":89.8,"gps_sigq":0},
{"vehicle_id":"abcdev007","ts":"2025-05-31T10:01:30","lat":38.2692,"lon":140.8712,"speed_kmh":45.0,"acc":0.3,"bat_soc_percent":89.7,"gps_sigq":90},
{"vehicle_id":"abcdev008","ts":"2025-05-31T10:01:00","lat":35.4440,"lon":139.6385,"speed_kmh":15.0,"acc":0.8,"bat_soc_percent":24.8,"gps_sigq":97},
{"vehicle_id":"abcdev008","ts":"2025-05-31T10:01:30","lat":35.4445,"lon":139.6390,"speed_kmh":25.0,"acc":0.6,"bat_soc_percent":24.7,"gps_sigq":96},
{"vehicle_id":"abcdev009","ts":"2025-05-31T10:01:00","lat":34.3868,"lon":132.4578,"speed_kmh":100.0,"acc":0.3,"bat_soc_percent":59.6,"gps_sigq":83},
{"vehicle_id":"abcdev009","ts":"2025-05-31T10:01:30","lat":34.3875,"lon":null,"speed_kmh":95.0,"acc":-0.3,"bat_soc_percent":59.4,"gps_sigq":82},
{"vehicle_id":"abcdev010","ts":"2025-05-31T10:01:00","lat":36.3230,"lon":139.0048,"speed_kmh":60.0,"acc":0.3,"bat_soc_percent":79.8,"gps_sigq":92},
{"vehicle_id":"abcdev010","ts":"2025-05-31T10:01:30","lat":36.3235,"lon":null,"speed_kmh":58.0,"acc":-0.1,"bat_soc_percent":79.7,"gps_sigq":91},
{"vehicle_id":"abcdev011","ts":"2025-05-31T10:01:00","lat":35.1825,"lon":136.9078,"speed_kmh":42.0,"acc":-0.2,"bat_soc_percent":49.8,"gps_sigq":86},
{"vehicle_id":"abcdev011","ts":"2025-05-31T10:01:30","lat":35.1830,"lon":136.9084,"speed_kmh":38.0,"acc":-0.2,"bat_soc_percent":49.7,"gps_sigq":85},
{"vehicle_id":"abcdev012","ts":"2025-05-31T10:01:00","lat":31.5983,"lon":130.5590,"speed_kmh":80.0,"acc":0.3,"bat_soc_percent":34.6,"gps_sigq":89},
{"vehicle_id":"abcdev012","ts":"2025-05-31T10:01:30","lat":31.5992,"lon":130.5600,"speed_kmh":78.0,"acc":-0.1,"bat_soc_percent":34.4,"gps_sigq":88}
]''')

# Swap each record's ISO-8601 `ts` string for a datetime object; every other
# field is carried over unchanged.
new_data = [
    {**record, 'ts': dt.fromisoformat(record['ts'])}
    for record in new_data_str
]

「異常値を含む」データを Iceberg テーブルに書き込む。この時 `stg` ブランチに対してデータを書き込む。

In [None]:
# Append the new batch to the `stg` branch only: the `branch_stg` suffix on
# the table identifier targets that branch rather than `main`.
df_new = spark.createDataFrame(data=new_data)
df_new.writeTo('db.conn_cars_telemetry.branch_stg').append()

`main` ブランチを参照するコンシューマーからは、`stg` ブランチに書き込まれたデータが見えないことを確認します。以下の集計クエリは、`ts` が `2025-05-31T10:01:00` 以降のレコード、つまり直前に追加したレコードの件数を `main` ブランチに対して数えます。結果は `0` となり、`main` のレコード数が書き込み前と同じ `24` のままで、追加したデータが含まれていないことを確認できます。

In [None]:
%%sql
SELECT count(*) as record_cnt FROM db.conn_cars_telemetry WHERE ts >= '2025-05-31T10:01:00'

## Audit フェーズ - `stg` ブランチにおけるデータ品質を検証する

In [None]:
from pydeequ.checks import Check, CheckLevel
from pydeequ.verification import VerificationSuite, VerificationResult

def run_quality_checks(df):
    """Run the connected-car data quality rules against ``df`` with Deequ.

    The rules, all registered at ``CheckLevel.Error``:

    1. Battery state of charge (``bat_soc_percent``) lies within 0 - 100.
    2. Acceleration in m/s^2 (``acc``) lies within -15 - 10.
    3. The GPS coordinates (``lat`` / ``lon``) are either both NULL or both
       non-NULL.
    4. When the GPS signal quality (``gps_sigq``) is below 20, both
       coordinates are NULL.

    Rules 1 and 2 are asserted on the column minimum and maximum; rules 3 and
    4 are row-level SQL predicates.

    Uses the module-level ``spark`` session.

    Args:
        df: Spark DataFrame containing the columns named above.

    Returns:
        The object returned by ``VerificationSuite(...).run()``; pass it to
        ``VerificationResult.checkResultsAsDataFrame`` to read the status of
        each constraint.
    """
    check = (
        Check(spark, CheckLevel.Error, "Connected Car Data Quality Check")
        # Rule 1: battery state of charge range.
        .hasMin("bat_soc_percent", lambda x: x >= 0.0)
        .hasMax("bat_soc_percent", lambda x: x <= 100.0)
        # Rule 2: acceleration range.
        .hasMin("acc", lambda x: x >= -15.0)
        .hasMax("acc", lambda x: x <= 10.0)
        # Rule 3: AND binds tighter than OR, so this reads as
        # (both NULL) OR (both non-NULL).
        .satisfies(
            "lat IS NULL AND lon IS NULL OR lat IS NOT NULL AND lon IS NOT NULL",
            "GPS coordinates must be both NULL or both non-NULL")
        # Rule 4: a row passes if the signal is good enough (>= 20) or if it
        # carries no coordinates at all.
        .satisfies(
            "gps_sigq >= 20 OR (lat IS NULL AND lon IS NULL)",
            "Low GPS signal quality should result in NULL coordinates")
    )

    return VerificationSuite(spark).onData(df).addCheck(check).run()

`stg` ブランチにおけるデータについて、その品質を検証する

In [None]:
# Read the current contents of the `stg` branch and run the quality rules on them.
df_stg = spark.sql("SELECT * FROM db.conn_cars_telemetry.branch_stg")
result = run_quality_checks(df_stg)

In [None]:
# Register the check results as the temp view `result` so they can be queried with SQL.
VerificationResult.checkResultsAsDataFrame(spark, result).createOrReplaceTempView("result")

In [None]:
%%sql
SELECT constraint, constraint_status, constraint_message FROM result

データ品質の検証に失敗したデータを今回は取り除き、`audit` ブランチにデータを書き込む

In [None]:
# Keep only the `stg` rows that satisfy every rule enforced by
# run_quality_checks(): consistent coordinate NULL-ness, NULL coordinates when
# the GPS signal is weak (gps_sigq < 20), and in-range acceleration and
# battery values. Rows where a checked column is NULL are dropped as well,
# because the corresponding predicate does not evaluate to TRUE.
df_cleaned = spark.sql("""
SELECT * FROM db.conn_cars_telemetry.branch_stg
WHERE
    ((lat IS NULL AND lon IS NULL) OR (lat IS NOT NULL AND lon IS NOT NULL)) AND
    (gps_sigq >= 20 OR (lat IS NULL AND lon IS NULL)) AND
    (acc BETWEEN -15.0 AND 10.0) AND
    (bat_soc_percent BETWEEN 0.0 AND 100.0)
""")

In [None]:
# Publish the cleaned rows to the `audit` branch. overwritePartitions()
# replaces the partitions that df_cleaned has rows for (see the PySpark
# DataFrameWriterV2 docs). df_cleaned is selected from `stg`, which was
# branched after the baseline load, so the baseline rows are rewritten too.
df_cleaned.writeTo("db.conn_cars_telemetry.branch_audit").overwritePartitions()

(Optional) `stg` における異常値については、以下の SQL で確認できます

In [None]:
%%sql
SELECT * FROM db.conn_cars_telemetry.branch_stg
WHERE 
    ((lat IS NULL AND lon IS NOT NULL) OR (lat IS NOT NULL AND lon IS NULL)) OR  -- only one coordinate is present
    (gps_sigq < 20 AND (lat IS NOT NULL OR lon IS NOT NULL)) OR                  -- weak GPS signal but coordinates are present
    (acc NOT BETWEEN -15.0 AND 10.0) OR                                          -- acceleration out of range
    (bat_soc_percent NOT BETWEEN 0.0 AND 100.0)

## Publish フェーズ - `audit` における最新のスナップショットを `main` にパブリッシュする

In [None]:
%%sql
CALL system.fast_forward(table => 'db.conn_cars_telemetry', branch => 'main', to => 'audit')

最後に、`main` におけるデータに対してデータ品質の検証を行う。全てが `SUCCESS` となることを確認する

In [None]:
# Re-run the same rules on the table without a branch suffix (the data `main`
# consumers see) and expose the outcome as the temp view `result_clean`.
result = run_quality_checks(spark.sql("SELECT * FROM db.conn_cars_telemetry"))
VerificationResult.checkResultsAsDataFrame(spark, result).createOrReplaceTempView("result_clean")

In [None]:
%%sql
SELECT constraint, constraint_status, constraint_message FROM result_clean