# 第 4 章: Apache Spark - 3
このノートブックでは **第 4 章: Apache Spark** における、*Spark で Iceberg の機能を利用する* セクションで紹介されているクエリを実行できます。

In [None]:
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import pandas as pd
# Name under which the Iceberg catalog is registered in Spark SQL.
CATALOG = "my_catalog"
# URI handed to the catalog as its `uri` setting.
CATALOG_URL = "http://server:8181/"
# Endpoint handed to the catalog as its `s3.endpoint` setting.
S3_ENDPOINT = "http://minio:9000"

# Full version of the installed PySpark and its "major.minor" prefix; the
# prefix selects the matching iceberg-spark-runtime artifact.
SPARK_VERSION = pyspark.__version__
SPARK_MINOR_VERSION = ".".join(SPARK_VERSION.split(".", 2)[:2])

# Version of the Iceberg artifacts requested through spark.jars.packages.
ICEBERG_VERSION = "1.8.1"

### 1. SparkSession オブジェクトを初期化する

In [None]:
# Create (or reuse) the SparkSession used by every cell in this notebook,
# with the Iceberg catalog named by CATALOG registered as the default catalog.
spark = (
    SparkSession.builder
        # Iceberg Spark runtime for this Spark minor version (Scala 2.12 build)
        # plus the Iceberg AWS bundle, resolved as package coordinates.
        .config("spark.jars.packages", f"org.apache.iceberg:iceberg-spark-runtime-{SPARK_MINOR_VERSION}_2.12:{ICEBERG_VERSION},org.apache.iceberg:iceberg-aws-bundle:{ICEBERG_VERSION}")
        # Register CATALOG as an Iceberg catalog of type "rest" at CATALOG_URL,
        # using S3_ENDPOINT as its S3 endpoint.
        .config(f"spark.sql.catalog.{CATALOG}", "org.apache.iceberg.spark.SparkCatalog")
        .config(f"spark.sql.catalog.{CATALOG}.type", "rest")
        .config(f"spark.sql.catalog.{CATALOG}.uri", CATALOG_URL)
        .config(f"spark.sql.catalog.{CATALOG}.s3.endpoint", S3_ENDPOINT)
        # NOTE(review): presumably required for the CREATE VIEW cell later in
        # this notebook - confirm against the REST catalog server in use.
        .config(f"spark.sql.catalog.{CATALOG}.view-endpoints-supported", "true")
        # Iceberg SQL extensions (e.g. ALTER TABLE ... ADD PARTITION FIELD).
        .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
        # Use the CATALOG constant instead of repeating the literal so that the
        # default catalog always matches the catalog configured above.
        .config("spark.sql.defaultCatalog", CATALOG)
        .getOrCreate()
)

In [None]:
# Pass the SparkSession created above to the %sql magic; presumably this makes
# the following %%sql cells run against that session.
# NOTE(review): no %load_ext is visible in this notebook - assumes the SQL
# magic extension is already loaded in the kernel; confirm.
%sql spark

#### (Optional) データベースの作成
データベースをまだ作成していない場合は、以下のセルを実行してください。既にデータベースが存在する場合、このステップはスキップしてください。

In [None]:
%%sql
-- Create the db database referenced by the cells below. IF NOT EXISTS makes the statement a no-op when it already exists.
CREATE DATABASE IF NOT EXISTS db

## 基本的な機能を利用する
### テーブルの作成

In [None]:
%%sql
-- Create the Iceberg table db.sales_iceberg, replacing any existing definition.
CREATE OR REPLACE TABLE db.sales_iceberg (
    product_name STRING,
    price        DECIMAL(10, 2),
    customer_id  BIGINT,
    order_id     STRING,
    datetime     TIMESTAMP,
    category     STRING
)
USING iceberg

#### テーブルロケーションの設定

In [None]:
%%sql
-- Variant of the table definition that sets an explicit table location via the LOCATION clause.
-- NOTE(review) the bucket name looks like a placeholder - replace it with a bucket reachable from this environment.
-- NOTE(review) plain CREATE TABLE is expected to fail while db.sales_iceberg from the earlier cell exists - presumably shown as an alternative definition, confirm before running.
CREATE TABLE db.sales_iceberg (
    product_name string,
    price decimal(10, 2),
    customer_id bigint,
    order_id string,
    datetime timestamp,
    category string) 
USING iceberg
LOCATION 's3://amzn-s3-demo-bucket/custom-path'

#### テーブルプロパティの設定

In [None]:
%%sql
-- Variant of the table definition that sets the table property write.metadata.compression-codec to gzip.
-- NOTE(review) plain CREATE TABLE is expected to fail while db.sales_iceberg from the earlier cell exists - presumably shown as an alternative definition, confirm before running.
CREATE TABLE db.sales_iceberg (
    product_name string,
    price decimal(10, 2),
    customer_id bigint,
    order_id string,
    datetime timestamp,
    category string)
USING iceberg
TBLPROPERTIES (
    'write.metadata.compression-codec'='gzip')

#### テーブルパーティションの設定

In [None]:
%%sql
-- Variant of the table definition that partitions by category and by the year transform of datetime.
-- NOTE(review) plain CREATE TABLE is expected to fail while db.sales_iceberg from the earlier cell exists - presumably shown as an alternative definition, confirm before running.
CREATE TABLE db.sales_iceberg (
    product_name string,
    price decimal(10, 2),
    customer_id bigint,
    order_id string,
    datetime timestamp,
    category string)
USING iceberg
PARTITIONED BY (category, year(datetime))

### テーブルデータの読み込み
テーブルレコードの準備: 

In [None]:
%%sql
-- Load five sample rows (three in category drink, two in category grocery) into db.sales_iceberg.
-- Column order follows the table definition - product_name, price, customer_id, order_id, datetime, category.
INSERT INTO db.sales_iceberg VALUES
    ('tomato juice', 2.00, 1698, 'DRE8DLTFNX0MLCE8DLTFNX0MLC', TIMESTAMP '2023-07-18T02:20:58Z', 'drink'),
    ('cocoa', 2.00, 1652, 'DR1UNFHET81UNFHET8', TIMESTAMP '2024-08-26T11:36:48Z', 'drink'),
    ('espresso', 2.00, 1037, 'DRBFZUJWPZ9SRABFZUJWPZ9SRA', TIMESTAMP '2024-04-19T12:17:22Z', 'drink'),
    ('broccoli', 1.00, 3092, 'GRK0L8ZQK0L8ZQ', TIMESTAMP '2023-03-22T18:48:04Z', 'grocery'),
    ('nutmeg', 1.00, 3512, 'GR15U0LKA15U0LKA', TIMESTAMP '2024-02-27T15:13:31Z', 'grocery')

テーブルデータを読む:

In [None]:
%%sql
-- Read back every row and column of db.sales_iceberg.
SELECT * FROM db.sales_iceberg

### データの書き込み
`INSERT INTO` によるデータの追加

In [None]:
%%sql
-- Append rows with INSERT INTO. The values are the same five rows as in the earlier load cell,
-- so the table holds each of them twice if both cells are run.
INSERT INTO db.sales_iceberg VALUES
    ('tomato juice', 2.00, 1698, 'DRE8DLTFNX0MLCE8DLTFNX0MLC', TIMESTAMP '2023-07-18T02:20:58Z', 'drink'),
    ('cocoa', 2.00, 1652, 'DR1UNFHET81UNFHET8', TIMESTAMP '2024-08-26T11:36:48Z', 'drink'),
    ('espresso', 2.00, 1037, 'DRBFZUJWPZ9SRABFZUJWPZ9SRA', TIMESTAMP '2024-04-19T12:17:22Z', 'drink'),
    ('broccoli', 1.00, 3092, 'GRK0L8ZQK0L8ZQ', TIMESTAMP '2023-03-22T18:48:04Z', 'grocery'),
    ('nutmeg', 1.00, 3512, 'GR15U0LKA15U0LKA', TIMESTAMP '2024-02-27T15:13:31Z', 'grocery')

`UPDATE` によるデータの更新

In [None]:
%%sql
-- Rewrite every espresso row as white mocha with a new price and the current timestamp.
UPDATE db.sales_iceberg
SET
    product_name = 'white mocha',
    price = 4.0,
    datetime = CURRENT_TIMESTAMP
WHERE product_name = 'espresso'

`DELETE` によるデータの削除

In [None]:
%%sql
-- Remove every row whose datetime falls in a year earlier than 2024.
DELETE FROM db.sales_iceberg
WHERE YEAR(datetime) < 2024

## 高度な機能を利用する

### CTAS によるテーブルの新規作成

In [None]:
%%sql
-- CTAS - create the Iceberg table db.sales_iceberg_ctas from the current contents of db.sales_iceberg.
CREATE TABLE db.sales_iceberg_ctas
USING iceberg
AS
SELECT *
FROM db.sales_iceberg

テーブル作成後、テーブルデータについて確認

In [None]:
%%sql
-- Inspect the rows of the table created by the CTAS statement.
SELECT * FROM db.sales_iceberg_ctas

### スキーマ進化

スキーマ進化実行前に `sales_iceberg` テーブルのカラムを確認

In [None]:
%%sql
-- Show the current columns of db.sales_iceberg before the schema is changed.
DESCRIBE db.sales_iceberg

In [None]:
%%sql
-- Add a string column named description, positioned directly after product_name.
-- NOTE(review) the next cell adds a column with the same name - presumably the two cells are alternatives, confirm before running both.
ALTER TABLE db.sales_iceberg ADD COLUMN description string AFTER product_name

In [None]:
%%sql
-- Add a string column named description without specifying a position.
-- NOTE(review) the previous cell adds a column with the same name - this statement is expected to fail if that cell was already run, confirm which one is intended.
ALTER TABLE db.sales_iceberg ADD COLUMN description string

### パーティション進化

In [None]:
%%sql
-- Partition evolution - add category as a partition field of db.sales_iceberg.
-- NOTE(review) presumably depends on the Iceberg SQL extensions configured on the SparkSession - confirm.
ALTER TABLE db.sales_iceberg ADD PARTITION FIELD category

### View

In [None]:
%%sql
-- Define (or redefine) a view with the summed price and the row count per category and year of datetime.
CREATE OR REPLACE VIEW db.sales_iceberg_analysis_view AS
SELECT
    category,
    SUM(price) AS total_sales,
    COUNT(*) AS count_by_year,
    YEAR(datetime) AS year
FROM db.sales_iceberg
GROUP BY category, year

In [None]:
%%sql
-- Read the view, newest year first and categories in alphabetical order within a year.
SELECT *
FROM db.sales_iceberg_analysis_view
ORDER BY year DESC, category ASC

### タイムトラベルクエリ

テーブルに新たなデータを追加する

In [None]:
%%sql
-- Append two grocery rows so that the table state differs before and after this write.
INSERT INTO db.sales_iceberg VALUES
    ('broccoli', 1.00, 3092, 'GRK0L8ZQK0L8ZQ', TIMESTAMP '2023-03-22T18:48:04Z', 'grocery'),
    ('nutmeg', 1.00, 3512, 'GR15U0LKA15U0LKA', TIMESTAMP '2024-02-27T15:13:31Z', 'grocery')

現在のテーブルデータを確認する

In [None]:
%%sql
-- Show the current table contents, including the rows appended in the previous cell.
SELECT * FROM db.sales_iceberg

In [None]:
%%sql
-- Time travel - read the table as it was at a past point in time.
-- The quoted value below is a placeholder. Replace it with a timestamp taken before the preceding INSERT was run.
SELECT * FROM db.sales_iceberg TIMESTAMP AS OF '<INSERT する前の時間を入力 (例: 2025-03-28 10:00:00)>'

### メタデータテーブルクエリ

In [None]:
%%sql
-- Query the history metadata table of db.sales_iceberg.
SELECT * FROM db.sales_iceberg.history

In [None]:
%%sql
-- Query the snapshots metadata table of db.sales_iceberg.
SELECT * FROM db.sales_iceberg.snapshots

### MERGE INTO で Upsert 処理を実行する

In [None]:
%%sql
-- Upsert db.sales_logs into db.sales_iceberg_w keyed on order_id.
-- Matching rows get product_name, price and datetime from the source, all other source rows are inserted.
-- NOTE(review) neither table is created in the cells visible in this notebook - presumably prepared elsewhere, confirm.
MERGE INTO db.sales_iceberg_w t
USING db.sales_logs s
    ON t.order_id = s.order_id
WHEN MATCHED THEN UPDATE SET
    t.product_name = s.product_name,
    t.price = s.price,
    t.datetime = s.datetime
WHEN NOT MATCHED THEN INSERT *

In [None]:
%%sql
-- Inspect the merge target, sorted by category and then product_name.
SELECT *
FROM db.sales_iceberg_w
ORDER BY category, product_name

#### WHEN NOT MATCHED BY SOURCE
sales_logs へ新たに 2 レコードを追加します。`mocha` は古いレコードがたまたま流れてきてしまったことを想定しています。

In [None]:
%%sql
-- Append two rows to db.sales_logs.
-- The mocha row carries a 2013 timestamp and stands for a stale record that arrived late.
INSERT INTO db.sales_logs VALUES
    ('mocha', 4.00, 1652, 'DR1UNFHET81UNFHET8', TIMESTAMP '2013-11-26T12:49:43Z', 'drink'),
    ('egg', 1.00, 3176, 'GRVQARCD6COVQARCD6CO', TIMESTAMP '2025-02-10 11:15:31', 'grocery');

In [None]:
%%sql
-- Inspect the merge source, sorted by category and then product_name.
SELECT *
FROM db.sales_logs
ORDER BY category, product_name

In [None]:
%%sql
-- Upsert db.sales_logs into db.sales_iceberg_w keyed on order_id.
-- A matching row is updated only when the source datetime is later than the target datetime.
-- Source rows without a match are inserted, and target rows without a matching source row are deleted.
MERGE INTO db.sales_iceberg_w t
USING db.sales_logs s
    ON t.order_id = s.order_id
WHEN MATCHED AND t.datetime < s.datetime THEN UPDATE SET
    t.product_name = s.product_name,
    t.price = s.price,
    t.datetime = s.datetime
WHEN NOT MATCHED THEN INSERT *
WHEN NOT MATCHED BY SOURCE THEN DELETE

In [None]:
%%sql
-- Inspect the merge target after the conditional merge, sorted by category and then product_name.
SELECT *
FROM db.sales_iceberg_w
ORDER BY category, product_name