In [None]:
import pandas as pd
import pyarrow as pa

from pyiceberg.catalog import load_catalog
from pyiceberg.schema import Schema
from pyiceberg.types import (
    NestedField,
    StringType,
    DoubleType,
    IntegerType
)

# 1. Read CSV
df = pd.read_csv("Sample-Superstore.csv", encoding="latin-1")
table_arrow = pa.Table.from_pandas(df)

In [36]:
# 2. Load Iceberg catalog (Hive)
catalog = load_catalog(
    name="hive",
    **{
        "type": "hive",
        "uri": "thrift://hive-metastore:9083",
        "warehouse": "s3a://lakehouse/bronze",
        "s3.endpoint": "http://minio:9000",
        "s3.access-key-id": "minio",
        "s3.secret-access-key": "minio123",
        "s3.path-style-access": "true",
        "s3.region": "us-east-1",
    }
)


In [22]:
# 3. Define Iceberg schema
schema = Schema(
    NestedField(1, "Row ID", IntegerType(), required=False),
    NestedField(2, "Order ID", StringType(), required=False),
    NestedField(3, "Sales", DoubleType(), required=False),
)

# Namespace & table
namespace = "bronze"
table_name = "superstore"
identifier = f"{namespace}.{table_name}"

In [30]:
namespace = "bronze"
table_name = "superstore"
identifier = f"{namespace}.{table_name}"

# 1. Create namespace jika belum ada
namespaces = [ns[0] for ns in catalog.list_namespaces()]
if namespace not in namespaces:
    catalog.create_namespace(namespace)

# 2. Create table jika belum ada
if not catalog.table_exists(identifier):
    catalog.create_table(
        identifier=identifier,
        schema=schema,
        location="s3a://lakehouse/bronze/superstore"
    )

# 3. Load table
table = catalog.load_table(identifier)


In [34]:
# print(table.schema())
# # print(table_arrow.schema)
df = table
print(df)

superstore(
  1: Row ID: optional int,
  2: Order ID: optional string,
  3: Sales: optional double
),
partition by: [],
sort order: [],
snapshot: null


In [31]:
# 5. Append data
table.append(table_arrow)

print("✅ Data successfully written to Iceberg table in MinIO")

ValueError: PyArrow table contains more columns: Category, City, Country, Customer ID, Customer Name, Discount, Order Date, Postal Code, Product ID, Product Name, Profit, Quantity, Region, Segment, Ship Date, Ship Mode, State, Sub-Category. Update the schema first (hint, use union_by_name).