In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# --- Object-store (MinIO / S3A) settings ------------------------------------
# Credentials and endpoint are read from the environment when present; the
# fallbacks are the local-development values this notebook previously
# hard-coded, so existing setups keep working unchanged.
S3_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
S3_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minio")
S3_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

# s3a URIs have the form s3a://<bucket>/<key>. The MinIO host:port belongs in
# fs.s3a.endpoint only; putting it in the URI makes S3A treat it as the bucket.
WAREHOUSE_URI = "s3a://datalake/"

# Hadoop-side options for the S3A connector. They are handed to Spark through
# the documented "spark.hadoop.<key>" prefix below, so they are in place
# before the session (and its Hive client) is created.
hdp_configs = {
    "fs.s3a.endpoint": S3_ENDPOINT,
    "fs.s3a.access.key": S3_ACCESS_KEY,
    "fs.s3a.secret.key": S3_SECRET_KEY,
    "fs.s3a.connection.timeout": "600000",
    "fs.s3a.path.style.access": "true",  # MinIO is addressed as host/bucket, not bucket.host
    "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    # Keep the TLS flag in agreement with the endpoint scheme.
    "fs.s3a.connection.ssl.enabled": str(S3_ENDPOINT.startswith("https://")).lower(),
}

builder = (SparkSession.builder
             .master("spark://spark-master:7077")  # Points to the Spark cluster
             .appName('schema-test')  # Name the app
             .config("hive.metastore.uris", "thrift://hive-metastore:9083")  # External Hive Metastore
             .config("hive.metastore.warehouse.dir", WAREHOUSE_URI)  # Legacy key, kept in sync with spark.sql.warehouse.dir
             .config("spark.sql.warehouse.dir", WAREHOUSE_URI)  # Default warehouse dir
             .config("hive.metastore.schema.verification", "false")  # Prevent some errors
             .config("fs.defaultFS", WAREHOUSE_URI)  # Default file system is the datalake bucket (not HDFS)
             .config("spark.sql.debug.maxToStringFields", "100")  # Spark SQL option, so it is set on the session
             .config("spark.jars", "/opt/bitnami/spark/jars_external/hadoop-aws-3.3.4.jar,/opt/bitnami/spark/jars_external/aws-java-sdk-bundle-1.12.588.jar")
             .enableHiveSupport())

for k, v in hdp_configs.items():
    builder = builder.config(f"spark.hadoop.{k}", v)

# NOTE(review): if a SparkContext already exists in this kernel, the
# spark.hadoop.* values may not be re-applied to it -- restart the kernel
# after changing any of the settings above.
spark = builder.getOrCreate()

sc = spark.sparkContext


In [2]:
# Explicit two-column schema for the sample rows; both fields are nullable.
person_schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
])

# Sample records, keyed by column name.
person_rows = [
    {"name": "Jhon", "age": 35},
    {"name": "Eric", "age": 31},
]

df = spark.createDataFrame(person_rows, schema=person_schema)

df.show()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 7

KeyboardInterrupt: 

In [3]:
# Print the DataFrame's columns with their types and nullability as a tree.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



In [3]:
# Register a Parquet-backed table over the datalake bucket location.
# IF NOT EXISTS keeps the cell re-runnable: without it a second execution
# fails because source.persons is already registered in the metastore.
spark.sql("CREATE EXTERNAL TABLE IF NOT EXISTS source.persons (name string, age int) USING PARQUET LOCATION 's3a://datalake/source/persons/'").show()

++
||
++
++



In [4]:
# List the namespaces known to the metastore.
databases_df = spark.sql("show databases")
databases_df.show()

+---------+
|namespace|
+---------+
|  default|
|   source|
+---------+



In [6]:
# List the tables registered under the `source` namespace.
source_tables_df = spark.sql("show tables from source")
source_tables_df.show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|   source|    final|      false|
|   source|  persons|      false|
+---------+---------+-----------+



In [7]:
# insertInto() resolves columns by position, not by name, so pin the column
# order to the table definition (name, age) before writing.
# This appends: re-running the cell inserts the same rows again.
df.select("name", "age").write.insertInto("source.persons")

In [8]:
# Read the table back to confirm the inserted rows are visible.
persons_df = spark.sql("select * from source.persons")
persons_df.show()

+----+---+
|name|age|
+----+---+
|Jhon| 35|
|Eric| 31|
+----+---+



In [3]:
# Load the landed JSON file as plain text (one row per line, single `value`
# column) to inspect its raw content without parsing it.
nifi_file_path = "hdfs://hdfs-namenode:9000/hadoop/warehouse/nifi_test/72287f8a-0765-447c-b10a-ad812cc8535e.json"
a = spark.read.text(nifi_file_path)
a.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                                                    |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[{"client_id":49,"name":"a","gender":"b","birthdate":"2000-01-01","address":"c","city":"e","state":"f","event":"create","timestamp":"2023-11-13T23:16:41.215105Z","data_criacao":"2023-11-13T23:16:42Z"}]|
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [5]:
# Load the Parquet files under the clients landing directory and display
# every column untruncated.
clients_landing_path = "hdfs://hdfs-namenode:9000/hadoop/warehouse/landing/clients/"
a = spark.read.parquet(clients_landing_path)
a.show(truncate=False)

+---------+----+------+----------+-------+----+-----+------+---------------------------+
|client_id|name|gender|birthdate |address|city|state|event |timestamp                  |
+---------+----+------+----------+-------+----+-----+------+---------------------------+
|51       |a   |b     |2000-01-01|c      |e   |f    |create|2023-11-13T23:46:04.121484Z|
+---------+----+------+----------+-------+----+-----+------+---------------------------+

