In [1]:
import faker

from rand_engine.bulk.benchmarks import Benchmark
from rand_engine.bulk.dataframe_builder import BulkRandEngine
from rand_engine.bulk.core_distincts import CoreDistincts
from rand_engine.bulk.core_numeric import CoreNumeric
from rand_engine.bulk.core_datetime import CoreDatetime
from rand_engine.bulk.templates import RandEngineTemplates

from pyspark.sql.functions import date_format, col

from utils import get_spark_session


def gen_bulk_data(spark):

  bulk_rand_engine = BulkRandEngine()
  fake = faker.Faker(locale="pt_BR")
  metadata = {
  "id": dict(method=CoreNumeric.gen_ints, parms=dict(min=0, max=10**7)),
  "name": dict(method=CoreDistincts.gen_distincts_typed, parms=dict(distinct=[fake.name() for _ in range(1000)])),
  "age": dict(method=CoreNumeric.gen_ints, parms=dict(min=0, max=100)),
  "salary": dict(method=CoreNumeric.gen_floats_normal, parms=dict(mean=10**3, std=10**1, round=2)),
  "purchase_date": dict(method=CoreDatetime.gen_timestamps, parms=dict(start="01-01-2020", end="31-12-2020", format="%d-%m-%Y"))
  }

  df = bulk_rand_engine.create_spark_df(spark, 10**2, metadata)
  return df

spark = get_spark_session("ICEBERG_HELLO_WORLD")
df_test = gen_bulk_data(spark)
df_test = (
    df_test
        .withColumn("last_purchase_date", date_format(col("purchase_date"), "yyyy-MM-dd"))
        .withColumnRenamed("purchase_date", "last_purchase"))

df_test.count()
df_test.printSchema()

:: loading settings :: url = jar:file:/opt/bitnami/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12 added as a dependency
software.amazon.awssdk#bundle added as a dependency
software.amazon.awssdk#url-connection-client added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-21b63894-4803-45fd-9c1a-b16d0dac49bc;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.6.1 in central
	found org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12;0.95.0 in central
	found software.amazon.awssdk#bundle;2.17.178 in central
	found software.amazon.eventstream#eventstream;1.0.1 in central
	found software.amazon.awssdk#url-connection-client;2.17.178 in central
	found software.amazon.awssdk#utils;2.17.178 in central
	found org.reactivestreams#reactive-streams;1.0.3 in central
	found softwa

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- salary: double (nullable = true)
 |-- last_purchase: timestamp (nullable = true)
 |-- last_purchase_date: string (nullable = true)



                                                                                

In [2]:
TABLE_NAME = "nessie.dev.table_1"

df_test.writeTo(TABLE_NAME).createOrReplace()
print(f"Numero de linhas após criação: {spark.table(TABLE_NAME).count()}")
df_test.writeTo(TABLE_NAME).append()
print(f"Numero de linhas após append: {spark.table(TABLE_NAME).count()}")

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
                                                                                

Numero de linhas após criação: 100
Numero de linhas após append: 200


In [3]:
spark.sql("""
    SELECT * FROM nessie.dev.table_1.history;
""").show()

# spark.table("nessie.dev.table_1").show()

+--------------------+-------------------+-------------------+-------------------+
|     made_current_at|        snapshot_id|          parent_id|is_current_ancestor|
+--------------------+-------------------+-------------------+-------------------+
|2024-09-29 15:37:...|5565358984773854541|               NULL|              false|
|2024-09-29 15:37:...|5484199323931666385|5565358984773854541|              false|
|2024-09-29 15:38:...| 263071444238144259|               NULL|              false|
|2024-09-29 15:38:...|2128714391020491279| 263071444238144259|              false|
|2024-09-29 16:49:...| 463466068859629968|               NULL|              false|
|2024-09-29 16:49:...| 550167812940096497| 463466068859629968|              false|
|2024-09-29 17:10:...| 718049196851715350|               NULL|              false|
|2024-09-29 17:11:...|5445461876262848214| 718049196851715350|              false|
|2024-09-29 17:26:...|3086337887324965106|               NULL|              false|
|202

# Iceberg Catalogs

### What can be used an Iceberg Catalog

Catalogs help track Iceberg Tables and provide locking mechanisms for ACID Guarantees;
Thing to keep in mind is that while many engines may support Iceberg Tables they may not support connections to all catalogs;

### Project Nessie

- Pros: Git Like functionality, Cloud Managed Service (Arctic);
- Cons: Spport from engines beyond Spark and Dremio;

### Hive Metastore

- Pros: Can use existing Hive Metastore
- Cons: You have to deploy and maintain a hive metastore;

### AWS Glue

- Pros: Interop with AWS Services;
- Cons: Support outside of AWS, Spark and Dremio;


- **Copy-on-write**: Daily batch jobs where write speed is less a priority and read time is a high priority;
- **Merge-on-read (Position Deletes)**: Streaming and higher frequency batch (hourly) where write speed is very important with minor cost to read times. Regular compaction should be scheduled;
- **Merge-on-read (Equality Deletes)**: Very write intensive jobs where position deletes still aren't fast enough, much larger cost to reading so frequent compaction jobs will be necessary to manage;
- **Write-format-default**:
- **Compression Format**:


AnalysisException: [TABLE_OR_VIEW_ALREADY_EXISTS] Cannot create table or view `dev`.`table_2` because it already exists.
Choose a different name, drop or replace the existing object, or add the IF NOT EXISTS clause to tolerate pre-existing objects.

In [None]:
spark.stop()

24/09/29 21:51:58 ERROR StandaloneSchedulerBackend: Application has been killed. Reason: Master removed our application: KILLED
24/09/29 21:51:58 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exiting due to error from cluster scheduler: Master removed our application: KILLED
	at org.apache.spark.errors.SparkCoreErrors$.clusterSchedulerError(SparkCoreErrors.scala:291)
	at org.apache.spark.scheduler.TaskSchedulerImpl.error(TaskSchedulerImpl.scala:981)
	at org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend.dead(StandaloneSchedulerBackend.scala:165)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint.markDead(StandaloneAppClient.scala:263)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint$$anonfun$receive$1.applyOrElse(StandaloneAppClient.scala:170)
	at org.apache.spark.rpc.netty.Inbox.$anonfun$process$1(Inbox.scala:115)
	at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:213)
	at org.apache.spark.rpc.netty.Inbox.proce