In [3]:
# Librerías generales
#=========================
import json
from json import dumps
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, explode, from_json, col, current_date
from pyspark.sql.types import StringType, StructType, StructField, ArrayType
import datetime 
import requests
import urllib3
from datetime import datetime,date
import pytz
import io
import glob
import os
import boto3

In [4]:
CATALOG_URI = "http://nessie:19120/api/v1"  # Nessie Server URI
WAREHOUSE = "s3://gold/"               # Minio Address to Write to
STORAGE_URI = "http://172.20.0.5:9000"      # Minio IP address from docker inspect
AWS_ACCESS_KEY='admin'
AWS_SECRET_KEY='password'


In [5]:
# Configuración de Minio
minio_client = boto3.client(
    's3',
    #endpoint_url='http://172.18.0.4:9000',  # Usar el nombre del servicio de Docker
    endpoint_url='http://minio:9000',
    aws_access_key_id='admin',
    aws_secret_access_key='password',
    region_name='us-east-1'
)

In [6]:
from pyspark.sql import SparkSession
import pyspark

# Configuración combinada
conf = (
    pyspark.SparkConf()
        .setAppName('combined_spark_app')
        # Paquetes para PostgreSQL, Iceberg, Nessie, AWS SDK y Hadoop AWS
        .set('spark.jars.packages', 'org.postgresql:postgresql:42.7.3,'
                                    'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0,'
                                    'org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.77.1,'
                                    'software.amazon.awssdk:bundle:2.24.8,'
                                    'software.amazon.awssdk:url-connection-client:2.24.8,'
                                    'org.apache.hadoop:hadoop-aws:3.2.0,'
                                    'com.amazonaws:aws-java-sdk-bundle:1.11.534')
        # Extensiones de Iceberg y Nessie
        .set('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,'
                                     'org.projectnessie.spark.extensions.NessieSparkSessionExtensions')
        # Configuración de Nessie como catálogo Iceberg
        .set('spark.sql.catalog.nessie', 'org.apache.iceberg.spark.SparkCatalog')
        .set('spark.sql.catalog.nessie.uri', CATALOG_URI)
        .set('spark.sql.catalog.nessie.ref', 'main')
        .set('spark.sql.catalog.nessie.authentication.type', 'NONE')
        .set('spark.sql.catalog.nessie.catalog-impl', 'org.apache.iceberg.nessie.NessieCatalog')
        # Configuración para almacenamiento en S3 (MinIO)
        .set('spark.sql.catalog.nessie.s3.endpoint', STORAGE_URI)
        .set('spark.sql.catalog.nessie.warehouse', WAREHOUSE)
        .set('spark.sql.catalog.nessie.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO')
        # Configuración para acceso a S3 directamente desde Spark (sin Nessie)
        .set('spark.hadoop.fs.s3a.endpoint', STORAGE_URI)
        .set('spark.hadoop.fs.s3a.access.key', AWS_ACCESS_KEY)
        .set('spark.hadoop.fs.s3a.secret.key', AWS_SECRET_KEY)
        .set('spark.hadoop.fs.s3a.path.style.access', 'true')
        
)



In [7]:
# Creación de la sesión de Spark
spark = SparkSession.builder \
    .config(conf=conf) \
    .getOrCreate()
print("Spark Session Started")

Spark Session Started


In [8]:
#!/usr/bin/env python

# make sure to install these packages before running:
# pip install pandas
!pip install sodapy
from sodapy import Socrata

# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("www.datos.gov.co", None)

# Example authenticated client (needed for non-public datasets):
# client = Socrata(www.datos.gov.co,
#                  MyAppToken,
#                  username="user@example.com",
#                  password="AFakePassword")

# First 2000 results, returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get("vafm-j2df", limit=2000)

# Convert to pandas DataFrame
df_divi = pd.DataFrame.from_records(results)





In [9]:
df_divi.head()

Unnamed: 0,cod_dpto,nom_dpto,cod_mpio,nom_mpio,tipo,latitud,longitud,geo_municipio
0,5,ANTIOQUIA,5001,MEDELLÍN,Municipio,6.257590259,-75.61103107,"{'type': 'Point', 'coordinates': [-75.61103107..."
1,5,ANTIOQUIA,5002,ABEJORRAL,Municipio,5.803728154,-75.43847353,"{'type': 'Point', 'coordinates': [-75.43847353..."
2,5,ANTIOQUIA,5004,ABRIAQUÍ,Municipio,6.627569378,-76.08597756,"{'type': 'Point', 'coordinates': [-76.08597756..."
3,5,ANTIOQUIA,5021,ALEJANDRÍA,Municipio,6.365534125,-75.09059702,"{'type': 'Point', 'coordinates': [-75.09059702..."
4,5,ANTIOQUIA,5030,AMAGÁ,Municipio,6.032921994,-75.7080031,"{'type': 'Point', 'coordinates': [-75.7080031,..."


In [10]:
df_divi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1121 entries, 0 to 1120
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   cod_dpto       1121 non-null   object
 1   nom_dpto       1121 non-null   object
 2   cod_mpio       1121 non-null   object
 3   nom_mpio       1121 non-null   object
 4   tipo           1121 non-null   object
 5   latitud        1121 non-null   object
 6   longitud       1121 non-null   object
 7   geo_municipio  1121 non-null   object
dtypes: object(8)
memory usage: 70.2+ KB


In [13]:
df_divi_sparkDF=spark.createDataFrame(df_divi)


In [14]:
df_divi_sparkDF.show(7)

+--------+---------+--------+----------+---------+-----------+------------+--------------------+
|cod_dpto| nom_dpto|cod_mpio|  nom_mpio|     tipo|    latitud|    longitud|       geo_municipio|
+--------+---------+--------+----------+---------+-----------+------------+--------------------+
|       5|ANTIOQUIA|    5001|  MEDELLÍN|Municipio|6.257590259|-75.61103107|{coordinates -> [...|
|       5|ANTIOQUIA|    5002| ABEJORRAL|Municipio|5.803728154|-75.43847353|{coordinates -> [...|
|       5|ANTIOQUIA|    5004|  ABRIAQUÍ|Municipio|6.627569378|-76.08597756|{coordinates -> [...|
|       5|ANTIOQUIA|    5021|ALEJANDRÍA|Municipio|6.365534125|-75.09059702|{coordinates -> [...|
|       5|ANTIOQUIA|    5030|     AMAGÁ|Municipio|6.032921994| -75.7080031|{coordinates -> [...|
|       5|ANTIOQUIA|    5031|    AMALFI|Municipio|6.977788843| -74.9812393|{coordinates -> [...|
|       5|ANTIOQUIA|    5034|     ANDES|Municipio|5.604993248|-75.94128391|{coordinates -> [...|
+--------+---------+--------+-

In [15]:

# Eliminar espacios en los nombres de las columnas y convertirlas a mayúsculas
df_divi_sparkDF = df_divi_sparkDF.toDF(*[c.strip().replace(' ', '_').upper() for c in df_divi_sparkDF.columns])

# Eliminar la columna 'GEO_MUNICIPIO'
df_divi_sparkDF = df_divi_sparkDF.drop('GEO_MUNICIPIO')

# Convertir LATITUD y LONGITUD a tipo float
df_divi_sparkDF = df_divi_sparkDF.withColumn('LATITUD', col('LATITUD').cast('double'))
df_divi_sparkDF = df_divi_sparkDF.withColumn('LONGITUD', col('LONGITUD').cast('double'))

# Filtrar las filas donde NOM_DPTO sea 'VALLE DEL CAUCA'
df_divi_sparkDF = df_divi_sparkDF.filter(col('NOM_DPTO') == 'VALLE DEL CAUCA')

# Eliminar la columna 'TIPO'
df_divi_sparkDF = df_divi_sparkDF.drop('TIPO')

In [16]:
df_divi_sparkDF.show(5)

+--------+---------------+--------+------------+-----------+------------+
|COD_DPTO|       NOM_DPTO|COD_MPIO|    NOM_MPIO|    LATITUD|    LONGITUD|
+--------+---------------+--------+------------+-----------+------------+
|      76|VALLE DEL CAUCA|   76001|        CALI|3.399043723|-76.57649259|
|      76|VALLE DEL CAUCA|   76020|      ALCALÁ|  4.6788971|-75.78297932|
|      76|VALLE DEL CAUCA|   76036|   ANDALUCÍA|4.153314228|-76.16063341|
|      76|VALLE DEL CAUCA|   76041|ANSERMANUEVO|4.795927292|-76.02963049|
|      76|VALLE DEL CAUCA|   76054|     ARGELIA|4.704287864|-76.14164999|
+--------+---------------+--------+------------+-----------+------------+
only showing top 5 rows



In [17]:

import urllib3
from io import StringIO

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
url_instalaciones = "https://raw.githubusercontent.com/ouzka/webinar-uao/refs/heads/main/Data_EDA.csv"
response = requests.get(url_instalaciones)
print(response)
if response.status_code == 200:
    csv_data = StringIO(response.text)
    df_inst = pd.read_csv(csv_data,encoding= 'utf-8')

<Response [200]>


In [18]:
df_inst.head()

Unnamed: 0,DEPARTAMENTO,CIUDAD,SERIAL,PRODUCTO,FECHA INSTALACION,FECHA TERMINACION,ESTADO
0,VALLE DEL CAUCA,PALMIRA,3002071.0,Producto-Oro,5/03/2020,,Con servicio
1,TOLIMA,IBAGUÉ,3194094.0,Producto-Plata,15/08/2023,,Con servicio
2,VALLE DEL CAUCA,PALMIRA,3021114.0,Producto-Plata,26/02/2024,,Con servicio
3,VALLE DEL CAUCA,PALMIRA,3001881.0,Producto-Bronce,12/05/2022,,Con servicio
4,VALLE DEL CAUCA,GUADALAJARA DE BUGA,3262138.0,Producto-Plata,16/05/2024,,Con servicio


In [19]:
df_inst.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293 entries, 0 to 292
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   DEPARTAMENTO       293 non-null    object 
 1   CIUDAD             293 non-null    object 
 2   SERIAL             291 non-null    float64
 3   PRODUCTO           293 non-null    object 
 4   FECHA INSTALACION  293 non-null    object 
 5   FECHA TERMINACION  20 non-null     object 
 6   ESTADO             293 non-null    object 
dtypes: float64(1), object(6)
memory usage: 16.2+ KB


In [20]:
df_inst_sparkDF=spark.createDataFrame(df_inst)


In [21]:
df_inst_sparkDF.show(5)

+---------------+-------------------+---------+---------------+-----------------+-----------------+------------+
|   DEPARTAMENTO|             CIUDAD|   SERIAL|       PRODUCTO|FECHA INSTALACION|FECHA TERMINACION|      ESTADO|
+---------------+-------------------+---------+---------------+-----------------+-----------------+------------+
|VALLE DEL CAUCA|            PALMIRA|3002071.0|   Producto-Oro|        5/03/2020|              NaN|Con servicio|
|         TOLIMA|             IBAGUÉ|3194094.0| Producto-Plata|       15/08/2023|              NaN|Con servicio|
|VALLE DEL CAUCA|            PALMIRA|3021114.0| Producto-Plata|       26/02/2024|              NaN|Con servicio|
|VALLE DEL CAUCA|            PALMIRA|3001881.0|Producto-Bronce|       12/05/2022|              NaN|Con servicio|
|VALLE DEL CAUCA|GUADALAJARA DE BUGA|3262138.0| Producto-Plata|       16/05/2024|              NaN|Con servicio|
+---------------+-------------------+---------+---------------+-----------------+---------------

In [22]:
from pyspark.sql.functions import col, when, to_date, date_format

# Normalizar nombres de columnas
df_inst_sparkDF = df_inst_sparkDF.toDF(*[c.strip().replace(' ', '_').upper() for c in df_inst_sparkDF.columns])

# Reformatear fechas para asegurarse de que tengan dos dígitos en día y mes antes de la conversión
df_inst_sparkDF = df_inst_sparkDF.withColumn("FECHA_INSTALACION", 
    to_date(date_format(col("FECHA_INSTALACION"), "dd/MM/yyyy"), "dd/MM/yyyy"))

df_inst_sparkDF = df_inst_sparkDF.withColumn("FECHA_TERMINACION", 
    to_date(date_format(col("FECHA_TERMINACION"), "dd/MM/yyyy"), "dd/MM/yyyy"))

# Reemplazar valores nulos en PRODUCTO
df_inst_sparkDF = df_inst_sparkDF.withColumn('PRODUCTO', when(col('PRODUCTO').isNull(), 'NO_ESPECIFICA').otherwise(col('PRODUCTO')))

# Reemplazar valores nulos en SERIAL y convertir a entero
df_inst_sparkDF = df_inst_sparkDF.withColumn('SERIAL', when(col('SERIAL').isNull(), 0).otherwise(col('SERIAL').cast('int')))

# Filtrar por DEPARTAMENTO = 'VALLE DEL CAUCA'
df_inst_sparkDF = df_inst_sparkDF.filter(col('DEPARTAMENTO') == 'VALLE DEL CAUCA')

# Mostrar resultados
df_inst_sparkDF.show(10)


+---------------+-------------------+-------+---------------+-----------------+-----------------+------------+
|   DEPARTAMENTO|             CIUDAD| SERIAL|       PRODUCTO|FECHA_INSTALACION|FECHA_TERMINACION|      ESTADO|
+---------------+-------------------+-------+---------------+-----------------+-----------------+------------+
|VALLE DEL CAUCA|            PALMIRA|3002071|   Producto-Oro|             NULL|             NULL|Con servicio|
|VALLE DEL CAUCA|            PALMIRA|3021114| Producto-Plata|             NULL|             NULL|Con servicio|
|VALLE DEL CAUCA|            PALMIRA|3001881|Producto-Bronce|             NULL|             NULL|Con servicio|
|VALLE DEL CAUCA|GUADALAJARA DE BUGA|3262138| Producto-Plata|             NULL|             NULL|Con servicio|
|VALLE DEL CAUCA|         EL CERRITO|3105407|Producto-Bronce|             NULL|             NULL|Con servicio|
|VALLE DEL CAUCA|            PALMIRA|3012492|   Producto-Oro|             NULL|             NULL|Con servicio|
|

In [23]:
df_inst_sparkDF.show(10)

+---------------+-------------------+-------+---------------+-----------------+-----------------+------------+
|   DEPARTAMENTO|             CIUDAD| SERIAL|       PRODUCTO|FECHA_INSTALACION|FECHA_TERMINACION|      ESTADO|
+---------------+-------------------+-------+---------------+-----------------+-----------------+------------+
|VALLE DEL CAUCA|            PALMIRA|3002071|   Producto-Oro|             NULL|             NULL|Con servicio|
|VALLE DEL CAUCA|            PALMIRA|3021114| Producto-Plata|             NULL|             NULL|Con servicio|
|VALLE DEL CAUCA|            PALMIRA|3001881|Producto-Bronce|             NULL|             NULL|Con servicio|
|VALLE DEL CAUCA|GUADALAJARA DE BUGA|3262138| Producto-Plata|             NULL|             NULL|Con servicio|
|VALLE DEL CAUCA|         EL CERRITO|3105407|Producto-Bronce|             NULL|             NULL|Con servicio|
|VALLE DEL CAUCA|            PALMIRA|3012492|   Producto-Oro|             NULL|             NULL|Con servicio|
|

In [24]:
from pyspark.sql.functions import col

# Realizar la unión entre df_inst y df_divi
df_merge_sparkDF = df_inst_sparkDF.join(df_divi_sparkDF.select('NOM_MPIO', 'LATITUD', 'LONGITUD'), 
                        df_inst_sparkDF.CIUDAD == df_divi_sparkDF.NOM_MPIO, 
                        how='inner')

# Eliminar la columna 'NOM_MPIO'
df_merge_sparkDF = df_merge_sparkDF.drop('NOM_MPIO')


In [25]:
df_merge_sparkDF.show(6)

+---------------+------------+-------+--------------+-----------------+-----------------+------------+-----------+------------+
|   DEPARTAMENTO|      CIUDAD| SERIAL|      PRODUCTO|FECHA_INSTALACION|FECHA_TERMINACION|      ESTADO|    LATITUD|    LONGITUD|
+---------------+------------+-------+--------------+-----------------+-----------------+------------+-----------+------------+
|VALLE DEL CAUCA|     BOLÍVAR|3304183|  Producto-Oro|             NULL|             NULL|Con servicio|4.393118861|-76.34982039|
|VALLE DEL CAUCA|BUENAVENTURA|3267784|Producto-Plata|             NULL|             NULL|Con servicio|3.493340766|-77.11872832|
|VALLE DEL CAUCA|BUGALAGRANDE|3345424|Producto-Plata|             NULL|             NULL|Con servicio|4.196852991| -76.0896103|
|VALLE DEL CAUCA|  CANDELARIA|3326217|  Producto-Oro|             NULL|             NULL|Con servicio|3.382091564|-76.38317663|
|VALLE DEL CAUCA|  CANDELARIA|3092356|  Producto-Oro|             NULL|             NULL|Con servicio|3.

In [26]:
from pyspark.sql.functions import current_date

# Agregar la columna 'f_actual' con la fecha actual
df_inst_divi_sparkDF = df_merge_sparkDF.withColumn('f_actual', current_date())

In [27]:
df_inst_divi_sparkDF.show(3)

+---------------+------------+-------+--------------+-----------------+-----------------+------------+-----------+------------+----------+
|   DEPARTAMENTO|      CIUDAD| SERIAL|      PRODUCTO|FECHA_INSTALACION|FECHA_TERMINACION|      ESTADO|    LATITUD|    LONGITUD|  f_actual|
+---------------+------------+-------+--------------+-----------------+-----------------+------------+-----------+------------+----------+
|VALLE DEL CAUCA|     BOLÍVAR|3304183|  Producto-Oro|             NULL|             NULL|Con servicio|4.393118861|-76.34982039|2025-09-24|
|VALLE DEL CAUCA|BUENAVENTURA|3267784|Producto-Plata|             NULL|             NULL|Con servicio|3.493340766|-77.11872832|2025-09-24|
|VALLE DEL CAUCA|BUGALAGRANDE|3345424|Producto-Plata|             NULL|             NULL|Con servicio|4.196852991| -76.0896103|2025-09-24|
+---------------+------------+-------+--------------+-----------------+-----------------+------------+-----------+------------+----------+
only showing top 3 rows



In [28]:
import glob
import os

# Ruta temporal donde Spark guarda los archivos Parquet
temp_folder_path = "/tmp/df_inst_divi_sparkDF_parquet"

# Guardar en formato Parquet sin consolidar (múltiples archivos)
df_inst_divi_sparkDF.write.mode("overwrite").parquet(temp_folder_path)

# Buscar todos los archivos Parquet generados en la carpeta
parquet_files = glob.glob(f"{temp_folder_path}/*.parquet")

# Definir bucket y prefijo dentro del bucket
bucket_name = "bronze"  # Asegurar que el bucket no tenga /
folder_prefix = "clientes_inst/"  # Prefijo dentro del bucket

if parquet_files:
    for file_path in parquet_files:
        file_name = os.path.basename(file_path)  # Obtener solo el nombre del archivo
        object_name = folder_prefix + file_name  # Agregar el prefijo de "carpeta"

        # Subir cada archivo Parquet a MinIO con la estructura correcta
        minio_client.upload_file(file_path, bucket_name, object_name)
        print(f"✅ Archivo subido: {file_name} → {bucket_name}/{object_name}")
else:
    print("⚠️ Error: No se encontraron archivos Parquet para subir.")

✅ Archivo subido: part-00000-32d96294-ce8f-4099-8d85-f678c05a76e1-c000.snappy.parquet → bronze/clientes_inst/part-00000-32d96294-ce8f-4099-8d85-f678c05a76e1-c000.snappy.parquet


In [29]:
df_inst_divi_sparkDF.printSchema()

root
 |-- DEPARTAMENTO: string (nullable = true)
 |-- CIUDAD: string (nullable = true)
 |-- SERIAL: integer (nullable = true)
 |-- PRODUCTO: string (nullable = true)
 |-- FECHA_INSTALACION: date (nullable = true)
 |-- FECHA_TERMINACION: date (nullable = true)
 |-- ESTADO: string (nullable = true)
 |-- LATITUD: double (nullable = true)
 |-- LONGITUD: double (nullable = true)
 |-- f_actual: date (nullable = false)



In [26]:
spark.sql("CREATE NAMESPACE IF NOT EXISTS nessie.gold;").show()
#SOLO UNA VEZ

In [27]:
#spark.sql("DROP NAMESPACE IF EXISTS nessie.bronze CASCADE")


In [28]:
df_inst_divi_sparkDF.writeTo("nessie.gold.clientes_inst").createOrReplace()


Py4JJavaError: An error occurred while calling o165.createOrReplace.
: org.apache.iceberg.exceptions.NoSuchNamespaceException: Namespace does not exist: gold
	at org.apache.iceberg.nessie.NessieUtil.maybeUseSpecializedException(NessieUtil.java:302)
	at org.apache.iceberg.nessie.NessieUtil.handleExceptionsForCommits(NessieUtil.java:224)
	at org.apache.iceberg.nessie.NessieTableOperations.doCommit(NessieTableOperations.java:125)
	at org.apache.iceberg.BaseMetastoreTableOperations.commit(BaseMetastoreTableOperations.java:135)
	at org.apache.iceberg.BaseTransaction.lambda$commitReplaceTransaction$1(BaseTransaction.java:381)
	at org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:413)
	at org.apache.iceberg.util.Tasks$Builder.runSingleThreaded(Tasks.java:219)
	at org.apache.iceberg.util.Tasks$Builder.run(Tasks.java:203)
	at org.apache.iceberg.util.Tasks$Builder.run(Tasks.java:196)
	at org.apache.iceberg.BaseTransaction.commitReplaceTransaction(BaseTransaction.java:365)
	at org.apache.iceberg.BaseTransaction.commitTransaction(BaseTransaction.java:314)
	at org.apache.iceberg.CommitCallbackTransaction.commitTransaction(CommitCallbackTransaction.java:126)
	at org.apache.iceberg.spark.source.StagedSparkTable.commitStagedChanges(StagedSparkTable.java:34)
	at org.apache.spark.sql.execution.datasources.v2.V2CreateTableAsSelectBaseExec.$anonfun$writeToTable$1(WriteToDataSourceV2Exec.scala:580)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1397)
	at org.apache.spark.sql.execution.datasources.v2.V2CreateTableAsSelectBaseExec.writeToTable(WriteToDataSourceV2Exec.scala:573)
	at org.apache.spark.sql.execution.datasources.v2.V2CreateTableAsSelectBaseExec.writeToTable$(WriteToDataSourceV2Exec.scala:567)
	at org.apache.spark.sql.execution.datasources.v2.AtomicReplaceTableAsSelectExec.writeToTable(WriteToDataSourceV2Exec.scala:183)
	at org.apache.spark.sql.execution.datasources.v2.AtomicReplaceTableAsSelectExec.run(WriteToDataSourceV2Exec.scala:216)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriterV2.runCommand(DataFrameWriterV2.scala:196)
	at org.apache.spark.sql.DataFrameWriterV2.internalReplace(DataFrameWriterV2.scala:208)
	at org.apache.spark.sql.DataFrameWriterV2.createOrReplace(DataFrameWriterV2.scala:134)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: org.projectnessie.error.NessieReferenceConflictException: Namespace 'gold' must exist.
	at org.projectnessie.error.ErrorCode.lambda$asException$1(ErrorCode.java:66)
	at java.base/java.util.Optional.map(Optional.java:260)
	at org.projectnessie.error.ErrorCode.asException(ErrorCode.java:66)
	at org.projectnessie.client.rest.ResponseCheckFilter.checkResponse(ResponseCheckFilter.java:58)
	at org.projectnessie.client.rest.NessieHttpResponseFilter.filter(NessieHttpResponseFilter.java:29)
	at org.projectnessie.client.http.impl.jdk11.JavaRequest.lambda$executeRequest$1(JavaRequest.java:143)
	at java.base/java.util.ArrayList.forEach(ArrayList.java:1511)
	at java.base/java.util.Collections$UnmodifiableCollection.forEach(Collections.java:1092)
	at org.projectnessie.client.http.impl.jdk11.JavaRequest.executeRequest(JavaRequest.java:143)
	at org.projectnessie.client.http.HttpRequest.post(HttpRequest.java:116)
	at org.projectnessie.client.rest.v1.RestV1TreeClient.commitMultipleOperations(RestV1TreeClient.java:204)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at org.projectnessie.client.rest.v1.RestV1Client$ExceptionRewriter.invoke(RestV1Client.java:84)
	at jdk.proxy3/jdk.proxy3.$Proxy41.commitMultipleOperations(Unknown Source)
	at org.projectnessie.client.rest.v1.HttpCommitMultipleOperations.commit(HttpCommitMultipleOperations.java:34)
	at org.apache.iceberg.nessie.NessieIcebergClient.commitContent(NessieIcebergClient.java:687)
	at org.apache.iceberg.nessie.NessieIcebergClient.commitTable(NessieIcebergClient.java:619)
	at org.apache.iceberg.nessie.NessieTableOperations.doCommit(NessieTableOperations.java:120)
	... 55 more


In [None]:
#df_inst_divi_sparkDF.write \
#    .format("iceberg") \
#    .mode("overwrite") \
#    .save("nessie.gold.clientes_inst")

In [None]:
spark.read.table("nessie.gold.clientes_inst").show()


In [29]:
spark.stop()
