In [1]:
import pyspark
from pyspark.sql import SparkSession
import os

# --- Connection settings ------------------------------------------------------
# Every endpoint and credential is read from the environment. SparkConf.set()
# stores str(value), so an unset variable would otherwise be written into the
# configuration as the literal string "None" and only fail much later with an
# unrelated-looking S3/Nessie error. Fail fast with a clear message instead.
_REQUIRED_ENV_VARS = (
    "AWS_ACCESS_KEY_ID",
    "AWS_SECRET_ACCESS_KEY",
    "S3_ENDPOINT",
    "NESSIE_URI",
)
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if not os.getenv(name)]
if _missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(_missing_env_vars)
    )

AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
S3_ENDPOINT = os.getenv("S3_ENDPOINT")
NESSIE_URI = os.getenv("NESSIE_URI")

MASTER = "spark://spark-master:7077"

# Maven coordinates resolved by Spark at session start-up. No standalone AWS SDK
# artifacts are listed; the AWS classes are expected to come from
# iceberg-aws-bundle.
jar_packages = [
    "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1",
    "org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.99.0",
    "org.apache.iceberg:iceberg-aws-bundle:1.6.1",
]

# SQL extensions enabled on the session (Iceberg and Nessie).
spark_extensions = [
    "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    "org.projectnessie.spark.extensions.NessieSparkSessionExtensions",
]

conf = (
    pyspark.SparkConf()
    .setAppName("Iceberg")
    .set("spark.master", MASTER)
    .set("spark.jars.packages", ",".join(jar_packages))
    # Built from the list above so the two can never drift apart.
    .set("spark.sql.extensions", ",".join(spark_extensions))
    # Resources requested for the driver and the single executor.
    .set("spark.executor.memory", "2g")
    .set("spark.executor.cores", "2")
    .set("spark.executor.instances", "1")
    .set("spark.driver.memory", "2g")
    # "nessie" catalog: Iceberg SparkCatalog backed by NessieCatalog, with
    # table files accessed through S3FileIO.
    .set("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog")
    .set("spark.sql.catalog.nessie.s3.path-style-access", "true")
    .set("spark.sql.catalog.nessie.s3.endpoint", S3_ENDPOINT)
    .set("spark.sql.catalog.nessie.warehouse", "s3a://bronze/")
    .set("spark.sql.catalog.nessie.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog")
    .set("spark.sql.catalog.nessie.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .set("spark.sql.catalog.nessie.uri", NESSIE_URI)
    .set("spark.sql.catalog.nessie.ref", "main")
    .set("spark.sql.catalog.nessie.authentication.type", "NONE")
    .set("spark.sql.catalog.nessie.cache-enabled", "false")
    # Hadoop S3A settings, used for plain s3a:// paths read outside the catalog.
    .set("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY)
    .set("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_KEY)
    .set("spark.hadoop.fs.s3a.endpoint", S3_ENDPOINT)
    .set("spark.hadoop.fs.s3a.path.style.access", "true")
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
)

# getOrCreate() reuses an already-running session if there is one.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
print(spark)

:: loading settings :: url = jar:file:/opt/bitnami/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12 added as a dependency
org.apache.iceberg#iceberg-aws-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-79b2424d-720b-4544-9352-44c00c165c10;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.6.1 in central
	found org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12;0.99.0 in central
	found org.apache.iceberg#iceberg-aws-bundle;1.6.1 in central
:: resolution report :: resolve 221ms :: artifacts dl 8ms
	:: modules in use:
	org.apache.iceberg#iceberg-aws-bundle;1.6.1 from central in [default]
	org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.6.1 from central in [default]
	org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12;0.99.0 from cent

<pyspark.sql.session.SparkSession object at 0x7f664f52b380>


In [45]:
# Session teardown is disabled at this point in the notebook: the cells below
# still call `spark`, so stopping the session here breaks a top-to-bottom run
# (Restart Kernel -> Run All). Run spark.stop() manually once you are finished.
# spark.stop()

In [2]:
# List the catalogs the session currently reports.
catalogs = spark.sql("SHOW CATALOGS")
catalogs.show()

+-------------+
|      catalog|
+-------------+
|spark_catalog|
+-------------+



In [3]:
# Smoke-test reading the source file: first line is the header, column types
# are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("s3a://landing/marca_carro.csv")
)
df.show()

[Stage 2:>                                                          (0 + 1) / 1]

+------------+---------+
| marca_carro|cod_marca|
+------------+---------+
|       Acura|        1|
|Aston Martin|        2|
|        Audi|        3|
|      Austin|        4|
|         BMW|        5|
|     Bentley|        6|
|     Bugatti|        7|
|       Buick|        8|
|    Cadillac|        9|
|   Chevrolet|       10|
|    Chrysler|       11|
|     Citroën|       12|
|      Daewoo|       13|
|       Dodge|       14|
|       Eagle|       15|
|     Ferrari|       16|
|        Ford|       17|
|         GMC|       18|
|         Geo|       19|
|       Honda|       20|
+------------+---------+
only showing top 20 rows



                                                                                

In [4]:
# Create the `bronze` namespace in the `nessie` catalog unless it already exists.
create_namespace_sql = """
CREATE NAMESPACE IF NOT EXISTS nessie.bronze
"""
spark.sql(create_namespace_sql)


DataFrame[]

In [5]:
# Create the Iceberg table for the car-brand data (two columns: brand name and
# brand code) at an explicit storage location, unless it already exists.
create_carros_table_sql = """
CREATE TABLE IF NOT EXISTS nessie.bronze.carros (
    marca_carro STRING,
    cod_marca INT
)
USING iceberg
LOCATION 's3a://bronze/marca_carro/'
"""
spark.sql(create_carros_table_sql)


DataFrame[]

In [10]:
# Session teardown is disabled at this point in the notebook: the cells below
# still query and write through `spark`, so stopping the session here breaks a
# top-to-bottom run (Restart Kernel -> Run All). Run spark.stop() manually once
# you are finished.
# spark.stop()

In [6]:
# Re-list the catalogs now that the `nessie` catalog has been used.
catalogs_after_create = spark.sql("SHOW CATALOGS")
catalogs_after_create.show()

+-------------+
|      catalog|
+-------------+
|       nessie|
|spark_catalog|
+-------------+



In [7]:
# List the tables registered under the `nessie.bronze` namespace.
bronze_tables = spark.sql("SHOW TABLES IN nessie.bronze")
bronze_tables.show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|   bronze|   carros|      false|
+---------+---------+-----------+



In [8]:
# Write `df` to the Iceberg table in overwrite mode.
carros_writer = df.write.mode("overwrite").format("iceberg")
carros_writer.save("nessie.bronze.carros")

                                                                                

In [10]:
# Read the table back through the catalog and show its rows.
carros_rows = spark.sql("SELECT * FROM nessie.bronze.carros")
carros_rows.show()

[Stage 4:>                                                          (0 + 1) / 1]

+------------+---------+
| marca_carro|cod_marca|
+------------+---------+
|       Acura|        1|
|Aston Martin|        2|
|        Audi|        3|
|      Austin|        4|
|         BMW|        5|
|     Bentley|        6|
|     Bugatti|        7|
|       Buick|        8|
|    Cadillac|        9|
|   Chevrolet|       10|
|    Chrysler|       11|
|     Citroën|       12|
|      Daewoo|       13|
|       Dodge|       14|
|       Eagle|       15|
|     Ferrari|       16|
|        Ford|       17|
|         GMC|       18|
|         Geo|       19|
|       Honda|       20|
+------------+---------+
only showing top 20 rows



                                                                                