In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/TABDG8/warehouse'

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .enableHiveSupport() \
    .getOrCreate()

In [2]:
spark.sql(
    """
    SHOW DATABASES
    """
    
).show()

+---------+
|namespace|
+---------+
|    covid|
|  default|
|  economy|
| politics|
+---------+



In [3]:
spark.sql(
    """
    DROP DATABASE IF EXISTS economy CASCADE
    """
)

DataFrame[]

In [4]:
# you can choose any location in HDFS, just be organized 
# Your data lake will grow with time and will become a swamp
spark.sql(
    """
    CREATE DATABASE economy LOCATION 'hdfs://hdfs-nn:9000/TABDG8/warehouse/economy.db/'
    """
)

DataFrame[]

In [5]:
spark.sql(
    """
    SHOW DATABASES
    """
).show()

+---------+
|namespace|
+---------+
|    covid|
|  default|
|  economy|
| politics|
+---------+



In [6]:
spark.sql(
    """
    SHOW TABLES FROM economy
    """
).show() 

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [7]:
spark.sql(
    """
    DROP TABLE IF EXISTS economy.text_table
    """
)

DataFrame[]

In [8]:
spark.sql(
    """
    DROP TABLE IF EXISTS economy
    """
)

spark.sql(
    """
    CREATE TABLE economy.text_table (
        year INT,
        state VARCHAR(70),
        household_median_income FLOAT,
        min_wage_state FLOAT,
        min_wage_federal FLOAT,
        cpi_average FLOAT,
        unemployed INT,
        employed INT,
        employable_pop INT,
        gdp_state FLOAT
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ';'
    STORED AS TEXTFILE
    LOCATION 'hdfs://hdfs-nn:9000/TABDG8/warehouse/economy.db/text_table/'
    """
)

# tblproperties('skip.header.line.count'='1')
# can be used for csvs with header
# but spark sql cannot understand that at the moment, while reading the data using sql queries
# so just when creating hive tables backed up by csvs, avoid headers
# in the project we wont use hive text tables, so all good.

DataFrame[]

In [9]:
spark.sql(
    """
    DROP TABLE IF EXISTS economy.parquet_table
    """
)

spark.sql(
    """
    CREATE TABLE economy.parquet_table (
        state VARCHAR(70),
        household_median_income FLOAT,
        min_wage_state FLOAT,
        min_wage_federal FLOAT,
        cpi_average FLOAT,
        unemployed INT,
        employed INT,
        employable_pop INT,
        gdp_state FLOAT
    )
    STORED AS PARQUET
    PARTITIONED BY (
        year INT
    )
    LOCATION 'hdfs://hdfs-nn:9000/TABDG8/warehouse/economy.db/parquet_table/'
    """
)

DataFrame[]

In [4]:
spark.sql(
    """
    DROP TABLE IF EXISTS economy.parquet_economy
    """
)

spark.sql(
    """
    CREATE TABLE economy.parquet_table (
        state VARCHAR(70),
        household_median_income FLOAT,
        min_wage_state FLOAT,
        min_wage_federal FLOAT,
        cpi_average FLOAT,
        unemployed INT,
        employed INT,
        employable_pop INT,
        gdp_state FLOAT
    )
    STORED AS PARQUET
    PARTITIONED BY (
        year YEAR
    )
    LOCATION 'hdfs://hdfs-nn:9000/TABDG8/warehouse/economy.db/parquet_economy/'
    """
)

ParseException: 
DataType year is not supported.(line 15, pos 13)

== SQL ==

    CREATE TABLE economy.parquet_table (
        state VARCHAR(70),
        household_median_income FLOAT,
        min_wage_state FLOAT,
        min_wage_federal FLOAT,
        cpi_average FLOAT,
        unemployed INT,
        employed INT,
        employable_pop INT,
        gdp_state FLOAT
    )
    STORED AS PARQUET
    PARTITIONED BY (
        year YEAR
-------------^^^
    )
    LOCATION 'hdfs://hdfs-nn:9000/TABDG8/warehouse/economy.db/parquet_economy/'
    


In [10]:
spark.sql(
    """
    SHOW TABLES FROM economy
    """
).show()

+--------+-------------+-----------+
|database|    tableName|isTemporary|
+--------+-------------+-----------+
| economy|parquet_table|      false|
| economy|   text_table|      false|
+--------+-------------+-----------+



In [11]:
# Let's look into HDFS

In [12]:
spark.sql(
    """
    SELECT *
    FROM economy.text_table
    """
).show()

+----+-----+-----------------------+--------------+----------------+-----------+----------+--------+--------------+---------+
|year|state|household_median_income|min_wage_state|min_wage_federal|cpi_average|unemployed|employed|employable_pop|gdp_state|
+----+-----+-----------------------+--------------+----------------+-----------+----------+--------+--------------+---------+
+----+-----+-----------------------+--------------+----------------+-----------+----------+--------+--------------+---------+



In [13]:
spark.sql(
    """
    SELECT *
    FROM economy.parquet_table
    """
).show()

+-----+-----------------------+--------------+----------------+-----------+----------+--------+--------------+---------+----+
|state|household_median_income|min_wage_state|min_wage_federal|cpi_average|unemployed|employed|employable_pop|gdp_state|year|
+-----+-----------------------+--------------+----------------+-----------+----------+--------+--------------+---------+----+
+-----+-----------------------+--------------+----------------+-----------+----------+--------+--------------+---------+----+



In [14]:
spark.sql(
    """
    DESCRIBE FORMATTED economy.text_table
    """
).toPandas()

Unnamed: 0,col_name,data_type,comment
0,year,int,
1,state,varchar(70),
2,household_median_income,float,
3,min_wage_state,float,
4,min_wage_federal,float,
5,cpi_average,float,
6,unemployed,int,
7,employed,int,
8,employable_pop,int,
9,gdp_state,float,


In [15]:
spark.sql(
    """
    DESCRIBE FORMATTED economy.parquet_table
    """
).toPandas()

Unnamed: 0,col_name,data_type,comment
0,state,varchar(70),
1,household_median_income,float,
2,min_wage_state,float,
3,min_wage_federal,float,
4,cpi_average,float,
5,unemployed,int,
6,employed,int,
7,employable_pop,int,
8,gdp_state,float,
9,year,int,


In [None]:
# Let's put the files into HDFS

In [16]:
spark.sql(
    """
    SELECT *
    FROM economy.text_table
    """
).toPandas()

Unnamed: 0,year,state,household_median_income,min_wage_state,min_wage_federal,cpi_average,unemployed,employed,employable_pop,gdp_state


In [18]:
# recover partitions is needed so that the Hive Metastore (Catalog)
# is updated. Otherwise Hive and the querying engines do not know
# that there are new parittions in the partitioned table.
spark.catalog.recoverPartitions("economy.parquet_table")

spark.sql(
    """
    SELECT *
    FROM economy.parquet_table
    """
).toPandas()

Unnamed: 0,state,household_median_income,min_wage_state,min_wage_federal,cpi_average,unemployed,employed,employable_pop,gdp_state,year


In [20]:
spark.stop()