# Iceberg - Cidades Brasileiras

Este notebook demonstra a criação, leitura e manipulação de uma tabela Iceberg usando o dataset `cidades_brasileiras.csv`.

In [1]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("IcebergExample")
    .master("local[*]")
    .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.4.2")
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", "./output/iceberg-warehouse")
    .getOrCreate()
)



25/04/23 23:47:32 WARN Utils: Your hostname, edsatc resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
25/04/23 23:47:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/ed/.cache/pypoetry/virtualenvs/eng-dados-spark-EC3tKTXJ-py3.11/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ed/.ivy2/cache
The jars for the packages stored in: /home/ed/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.3_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-674cd7bd-365a-41c4-bc35-646d1c803938;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.3_2.12;1.4.2 in central
:: resolution report :: resolve 277ms :: artifacts dl 6ms
	:: modules in use:
	org.apache.iceberg#iceberg-spark-runtime-3.3_2.12;1.4.2 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   1   |   0   |   0   |   0   ||   1   |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spar

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
schema = StructType([
    StructField("id", IntegerType(), False),
    StructField("cidade", StringType(), True),
    StructField("estado", StringType(), True),
    StructField("sigla", StringType(), True),
    StructField("ibge", IntegerType(), True),
    StructField("latitude", DoubleType(), True),
    StructField("longitude", DoubleType(), True)
])

In [3]:
df = spark.read.csv("../data/cidades_brasileiras.csv", header=True, schema=schema)
df.show(5)

                                                                                

+---+-------------------+------------+-----+-------+----------+----------+
| id|             cidade|      estado|sigla|   ibge|  latitude| longitude|
+---+-------------------+------------+-----+-------+----------+----------+
|  1|    ABADIA DE GOIÁS|       GOIÁS|   GO|5200050|-16.757264| -49.44122|
|  2|ABADIA DOS DOURADOS|MINAS GERAIS|   MG|3100104|-18.491063|-47.406365|
|  3|          ABADIÂNIA|       GOIÁS|   GO|5200100|-16.194723|-48.706812|
|  4|             ABAETÉ|MINAS GERAIS|   MG|3100203|-19.156683|-45.448121|
|  5|         ABAETETUBA|        PARÁ|   PA|1500107| -1.721828|-48.878843|
+---+-------------------+------------+-----+-------+----------+----------+
only showing top 5 rows



In [4]:
df.writeTo("local.cidades_iceberg").using("iceberg").createOrReplace()


25/04/23 23:48:00 WARN HadoopTableOperations: Error reading version hint file output/iceberg-warehouse/cidades_iceberg/metadata/version-hint.text
java.io.FileNotFoundException: File output/iceberg-warehouse/cidades_iceberg/metadata/version-hint.text does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:779)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1100)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:769)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
	at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker.<init>(ChecksumFileSystem.java:160)
	at org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:372)
	at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:976)
	at org.apache.iceberg.hadoop.HadoopTableOperations.findVersion(HadoopTableOperations.java:318)
	at org.apache.iceberg.hadoop

In [7]:
spark.sql('''SELECT * FROM local.cidades_iceberg LIMIT 10;''')


DataFrame[id: int, cidade: string, estado: string, sigla: string, ibge: int, latitude: double, longitude: double]

In [17]:
spark.sql('''SELECT cidade, sigla FROM local.cidades_iceberg WHERE sigla = 'SC';''').show(10)


+----------------+-----+
|          cidade|sigla|
+----------------+-----+
|   ABDON BATISTA|   SC|
|    ABELARDO LUZ|   SC|
|      AGROLÂNDIA|   SC|
|      AGRONÔMICA|   SC|
|       ÁGUA DOCE|   SC|
|ÁGUAS DE CHAPECÓ|   SC|
|     ÁGUAS FRIAS|   SC|
|    ÁGUAS MORNAS|   SC|
|  ALFREDO WAGNER|   SC|
| ALTO BELA VISTA|   SC|
+----------------+-----+
only showing top 10 rows



In [8]:
spark.sql("UPDATE local.cidades_iceberg SET cidade = 'Cidade Atualizada' WHERE id = 9999")

DataFrame[]

In [9]:
spark.sql("DELETE FROM local.cidades_iceberg WHERE id = 9999")

DataFrame[]

In [10]:
df_final = spark.sql("SELECT * FROM local.cidades_iceberg")
df_final.show(5)

+---+-------------------+------------+-----+-------+----------+----------+
| id|             cidade|      estado|sigla|   ibge|  latitude| longitude|
+---+-------------------+------------+-----+-------+----------+----------+
|  1|    ABADIA DE GOIÁS|       GOIÁS|   GO|5200050|-16.757264| -49.44122|
|  2|ABADIA DOS DOURADOS|MINAS GERAIS|   MG|3100104|-18.491063|-47.406365|
|  3|          ABADIÂNIA|       GOIÁS|   GO|5200100|-16.194723|-48.706812|
|  4|             ABAETÉ|MINAS GERAIS|   MG|3100203|-19.156683|-45.448121|
|  5|         ABAETETUBA|        PARÁ|   PA|1500107| -1.721828|-48.878843|
+---+-------------------+------------+-----+-------+----------+----------+
only showing top 5 rows



                                                                                