In [1]:
# Load the sparksql_magic IPython extension; the %%sparksql cell further down depends on it.
%load_ext sparksql_magic

In [2]:
import os
import socket
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

In [3]:
# Process-level environment for PySpark: interpreter names and the JVM location.
# NOTE(review): 'python3.11.8' is used verbatim as the interpreter name; confirm an
# executable with exactly that name exists wherever these variables are consumed
# (interpreters are commonly installed as `python3.11`).
os.environ.update(
    {
        "PYSPARK_PYTHON": "python3.11.8",
        "PYSPARK_DRIVER_PYTHON": "python3.11.8",
        "JAVA_HOME": "/usr/lib/jvm/java-17-openjdk-amd64",
    }
)

# Retry budget for Spark port binding (presumably the value meant for
# `spark.port.maxRetries` -- verify against the SparkConf cell).
SPARK_PORT_MAX_RETRIES = 2

In [None]:
# Build the SparkConf for this notebook, which acts as a client-mode driver
# against the in-cluster Kubernetes API server.
conf = SparkConf().setAppName("spark-app-miguel-20250701")

# KUBERNETES
conf.setMaster("k8s://https://kubernetes.default:443")
conf.set("spark.submit.deployMode", "client")
# Namespace comes from the pod environment when available, else "jupyterhub".
conf.set("spark.kubernetes.namespace", os.getenv("POD_NAMESPACE", default="jupyterhub"))
conf.set("spark.kubernetes.container.image", "miguelmanuttupa/pyspark-k8s-python3.11:3.5.0")
conf.set("spark.kubernetes.container.image.pullPolicy", "IfNotPresent")
conf.set("spark.kubernetes.pyspark.pythonVersion", "3")
conf.set("spark.kubernetes.authenticate.driver.serviceAccountName", "spark-sa-jupyterhub")
# Advertise this host's resolved IP address as the driver host.
conf.set("spark.driver.host", socket.gethostbyname(socket.gethostname()))
conf.set("spark.kubernetes.executor.deleteOnTermination", "true")
# spark.driver.port / spark.blockManager.port are not pinned here (earlier drafts
# experimented with 2222 / 7777); only the bind-retry budget is set, and it is taken
# from the constant defined in the configuration cell instead of a duplicated literal.
conf.set("spark.port.maxRetries", str(SPARK_PORT_MAX_RETRIES))

# RESOURCES
conf.set("spark.executor.instances", "1")
conf.set("spark.executor.cores", "2")
conf.set("spark.executor.memory", "4G")

# MINIO / S3
conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
conf.set("spark.hadoop.fs.s3a.endpoint", "http://minio.data-services.svc.cluster.local:9000")
conf.set("spark.hadoop.fs.s3a.path.style.access", "true")
# Credentials are read from the environment so real secrets never have to be committed
# with the notebook. The literals are only a fallback that preserves the previous
# behaviour when the variables are unset; set MINIO_ACCESS_KEY / MINIO_SECRET_KEY
# to override them.
conf.set("spark.hadoop.fs.s3a.access.key", os.getenv("MINIO_ACCESS_KEY", "spark-key"))
conf.set("spark.hadoop.fs.s3a.secret.key", os.getenv("MINIO_SECRET_KEY", "spark-secret"))

# DELTA LAKE
conf.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
conf.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# HIVE
conf.set("spark.sql.catalogImplementation", "hive")
conf.set("spark.sql.warehouse.dir", "s3a://warehouse/")
conf.set("hive.metastore.uris", "thrift://hive-metastore.data-services.svc.cluster.local:9083")

<pyspark.conf.SparkConf at 0x7f491c628090>

In [5]:
# Start the SparkContext with the configuration built above, or pick up the one already running.
sc = SparkContext.getOrCreate(conf)

In [6]:
# Obtain the session through the documented builder entry point instead of calling the
# SparkSession constructor directly. getOrCreate() attaches to the SparkContext that is
# already running and returns the existing session when this cell is re-run.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [7]:
# Read the STA_TIPO_CAMBIO Parquet dataset from the `landing` bucket.
df = spark.read.parquet("s3a://landing/STA_TIPO_CAMBIO")

In [8]:
# Mark the DataFrame for caching, then run an action so the cache is populated;
# the row count is shown as the cell output.
df.cache()
df.count()

12777

In [9]:
%%sparksql
-- List the databases (namespaces) visible through the configured catalog.
SHOW DATABASES;

0
namespace
default


In [10]:
# Persist the data as the table default.STA_TIPO_CAMBIO, replacing any existing contents.
# No format is specified, so the session's default data source is used.
df.write.saveAsTable("default.STA_TIPO_CAMBIO", mode="overwrite")

In [11]:
# Persist the same data as a Delta table, default.ANA_TIPO_CAMBIO, replacing any existing contents.
df.write.saveAsTable("default.ANA_TIPO_CAMBIO", format="delta", mode="overwrite")

In [17]:
# Shut the Spark session down once the notebook's work is finished.
spark.stop()