In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Key/value settings handed to the session builder, applied in this order.
session_settings = [
    # External Hive Metastore endpoint
    ("hive.metastore.uris", "thrift://hive-metastore:9083"),
    # Default warehouse dir under the legacy Hive key
    ("hive.metastore.warehouse.dir", "hdfs://hdfs-namenode:9000/hadoop/warehouse/"),
    # Default warehouse dir under the Spark SQL key
    ("spark.sql.warehouse.dir", "hdfs://hdfs-namenode:9000/hadoop/warehouse/"),
    # Disabled to prevent some metastore schema errors
    ("hive.metastore.schema.verification", "false"),
    # Default file system: the HDFS namenode
    ("fs.defaultFS", "hdfs://hdfs-namenode:9000/"),
]

# Point the builder at the Spark cluster master and name the application.
session_builder = SparkSession.builder.master("spark://spark-master:7077").appName('schema-test')
for setting_key, setting_value in session_settings:
    session_builder = session_builder.config(setting_key, setting_value)

# Hive support is enabled before the session is created (or an existing one is reused).
spark = session_builder.enableHiveSupport().getOrCreate()

# Shortcut to the underlying SparkContext.
sc = spark.sparkContext

In [4]:
# List the databases visible to this session and print them as a table.
databases_listing = spark.sql("show databases")
databases_listing.show()

+---------+
|namespace|
+---------+
|   bronze|
|  default|
|     gold|
|   silver|
+---------+



In [21]:
# Create the `bronze` database at an explicit HDFS location unless it already exists.
# The DDL statement returns no rows, so .show() prints an empty table.
create_bronze_db_ddl = "CREATE DATABASE IF NOT EXISTS bronze LOCATION 'hdfs://hdfs-namenode:9000/hadoop/warehouse/bronze/'"
spark.sql(create_bronze_db_ddl).show()

++
||
++
++



In [22]:
# Create the `silver` database at an explicit HDFS location unless it already exists.
# The DDL statement returns no rows, so .show() prints an empty table.
create_silver_db_ddl = "CREATE DATABASE IF NOT EXISTS silver LOCATION 'hdfs://hdfs-namenode:9000/hadoop/warehouse/silver/'"
spark.sql(create_silver_db_ddl).show()

++
||
++
++



In [23]:
# Create the `gold` database at an explicit HDFS location unless it already exists.
# The DDL statement returns no rows, so .show() prints an empty table.
create_gold_db_ddl = "CREATE DATABASE IF NOT EXISTS gold LOCATION 'hdfs://hdfs-namenode:9000/hadoop/warehouse/gold/'"
spark.sql(create_gold_db_ddl).show()

++
||
++
++



In [30]:
# Schema of bronze.clients: every column is nullable; `client_id` is an integer
# and all remaining columns (including `birthdate` and `timestamp`) are strings.
bronze_clients_schema = StructType(
    [StructField("client_id", IntegerType(), True)]
    + [
        StructField(string_column, StringType(), True)
        for string_column in (
            "name",
            "gender",
            "birthdate",
            "address",
            "city",
            "state",
            "event",
            "timestamp",
        )
    ]
)

# Empty DataFrame carrying the schema; its .dtypes supplies (column name, type name) pairs.
bronze_clients_df = spark.createDataFrame([], schema=bronze_clients_schema)

# Column list for the DDL, e.g. "client_id int, name string, ...".
schema_str = ", ".join(
    f"{column_name} {column_type}"
    for column_name, column_type in bronze_clients_df.dtypes
)

# Uncomment to drop the table definition before re-creating it:
# spark.sql("DROP TABLE IF EXISTS bronze.clients")

# Register the Parquet-backed table at its HDFS location unless it already exists.
create_clients_table_ddl = f"CREATE EXTERNAL TABLE IF NOT EXISTS bronze.clients ({schema_str}) USING PARQUET LOCATION 'hdfs://hdfs-namenode:9000/hadoop/warehouse/bronze/clients/'"
spark.sql(create_clients_table_ddl)

DataFrame[]

In [8]:
# Print the CREATE TABLE statement recorded for bronze.clients.
# truncate=False keeps the long statement from being cut off in the output.
clients_ddl_listing = spark.sql("show create table bronze.clients")
clients_ddl_listing.show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|createtab_stmt                                                                                                                                                                                                                                                                                                                           |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|CRE