In [1]:
from pyspark.sql import SparkSession
import os

In [2]:
# Expand the "~" shortcut into the current user's home directory.
home_shortcut = "~"
userHome_path = os.path.expanduser(home_shortcut)
userHome_path  # bare last expression so the notebook displays the resolved path

'C:\\Users\\Manh'

In [3]:
# Relative location of the warehouse below the home directory.
# os.path.join places os.path.sep between the parts, which yields the same
# string as concatenating them with the separator by hand.
dir_path = os.path.join("Documents", "hive", "spark_warehouse")

# Absolute warehouse directory; displayed as the cell result.
warehouse_dir = os.path.join(userHome_path, dir_path)
warehouse_dir

'C:\\Users\\Manh\\Documents\\hive\\spark_warehouse'

In [4]:
# Create (or reuse) a Spark session with Hive support enabled, using
# warehouse_dir as the value of spark.sql.warehouse.dir.
spark = (
    SparkSession.builder
    .appName("Hive Integration")
    .config("spark.sql.warehouse.dir", warehouse_dir)
    .enableHiveSupport()
    .getOrCreate()
)

In [5]:
# Load the orders CSV: the first line supplies the column names and the
# column types are inferred from the data.
orders_csv_path = "./dataset/orders_wh.csv"
df = spark.read.options(header=True, inferSchema=True).csv(orders_csv_path)

# Preview the first five rows.
df.show(n=5)

+--------+-------------------+-----------+---------------+
|order_id|         order_date|customer_id|   order_status|
+--------+-------------------+-----------+---------------+
|       1|2013-07-25 00:00:00|      11599|         CLOSED|
|       2|2013-07-25 00:00:00|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|      12111|       COMPLETE|
|       4|2013-07-25 00:00:00|       8827|         CLOSED|
|       5|2013-07-25 00:00:00|      11318|       COMPLETE|
+--------+-------------------+-----------+---------------+
only showing top 5 rows



In [6]:
# Register the DataFrame as a temporary view so SQL statements can refer to it by name.
temp_view_name = "orders_temp"
df.createOrReplaceTempView(temp_view_name)

### Create Database

In [7]:
# Create the database only when it is not already present.
create_db_stmt = "CREATE DATABASE IF NOT EXISTS sparksql_db"
spark.sql(create_db_stmt)

DataFrame[]

In [8]:
# List the databases known to the catalog.
databases_df = spark.sql("SHOW DATABASES")
databases_df.show()

+-----------+
|  namespace|
+-----------+
|    default|
|sparksql_db|
+-----------+



### Create Managed Table

In [9]:
# Create the managed table from the rows of the orders_temp view (CTAS);
# IF NOT EXISTS leaves an existing table untouched.
ctas_stmt = "CREATE TABLE IF NOT EXISTS sparksql_db.orders_managed AS SELECT * FROM orders_temp"
spark.sql(ctas_stmt)

DataFrame[]

In [10]:
# List the tables of sparksql_db.
tables_df = spark.sql("SHOW TABLES in sparksql_db")
tables_df.show()

+-----------+--------------+-----------+
|  namespace|     tableName|isTemporary|
+-----------+--------------+-----------+
|sparksql_db|orders_managed|      false|
|           |   orders_temp|       true|
+-----------+--------------+-----------+



In [11]:
# Show the column names and types of the managed table.
managed_schema_df = spark.sql("DESCRIBE sparksql_db.orders_managed")
managed_schema_df.show()

+------------+---------+-------+
|    col_name|data_type|comment|
+------------+---------+-------+
|    order_id|      int|   NULL|
|  order_date|timestamp|   NULL|
| customer_id|      int|   NULL|
|order_status|   string|   NULL|
+------------+---------+-------+



In [12]:
# Show the columns plus the detailed table information of the managed table;
# print up to 30 rows without truncating long values.
managed_details_df = spark.sql("DESCRIBE EXTENDED sparksql_db.orders_managed")
managed_details_df.show(n=30, truncate=False)

+----------------------------+--------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                       |comment|
+----------------------------+--------------------------------------------------------------------------------+-------+
|order_id                    |int                                                                             |NULL   |
|order_date                  |timestamp                                                                       |NULL   |
|customer_id                 |int                                                                             |NULL   |
|order_status                |string                                                                          |NULL   |
|                            |                                                                                |       |
|# Detailed Table Information|          

### Create External Table from Managed Table

In [13]:
# Define an external table whose LOCATION is the orders_managed directory
# inside the sparksql_db.db folder of the warehouse.
# NOTE(review): LOCATION is an absolute path for one specific machine; it has
# to be edited by hand when the warehouse directory differs.
# NOTE(review): assumes the managed table's files are comma-delimited text
# that this row format can parse; confirm against the managed table's format.
# This is a regular (non-raw) string, so Python turns the \n escape into a
# real newline character before Spark parses the statement.
create_external_stmt = """
CREATE EXTERNAL TABLE IF NOT EXISTS sparksql_db.orders_external (
    order_id int,
    order_date timestamp,
    customer_id int,
    order_status string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
LOCATION 'file:/C:/Users/Manh/Documents/hive/spark_warehouse/sparksql_db.db/orders_managed'
"""
spark.sql(create_external_stmt)

DataFrame[]

In [14]:
# List the tables of sparksql_db again after creating orders_external.
tables_df = spark.sql("SHOW TABLES in sparksql_db")
tables_df.show()

+-----------+---------------+-----------+
|  namespace|      tableName|isTemporary|
+-----------+---------------+-----------+
|sparksql_db|orders_external|      false|
|sparksql_db| orders_managed|      false|
|           |    orders_temp|       true|
+-----------+---------------+-----------+



In [15]:
# Show the columns plus the detailed table information of the external table;
# print up to 30 rows without truncating long values.
external_details_df = spark.sql("DESCRIBE EXTENDED sparksql_db.orders_external")
external_details_df.show(n=30, truncate=False)

+----------------------------+--------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                       |comment|
+----------------------------+--------------------------------------------------------------------------------+-------+
|order_id                    |int                                                                             |NULL   |
|order_date                  |timestamp                                                                       |NULL   |
|customer_id                 |int                                                                             |NULL   |
|order_status                |string                                                                          |NULL   |
|                            |                                                                                |       |
|# Detailed Table Information|          

### Create External Table from CSV File

In [16]:
# Register the orders CSV file as a table backed directly by the file.
# header 'true' makes Spark skip the file's first line as column names. The
# notebook reads orders_wh.csv with header=True, so without this option the
# header line would show up in the table as a data row.
# IF NOT EXISTS keeps an already-registered table as it is: drop
# sparksql_db.orders_external_2 first if it was created without the header option.
# NOTE(review): the path is absolute and machine-specific; presumably it is the
# same file as ./dataset/orders_wh.csv loaded earlier. Confirm.
spark.sql("""
CREATE EXTERNAL TABLE IF NOT EXISTS sparksql_db.orders_external_2 (
    order_id int,
    order_date timestamp,
    customer_id int,
    order_status string
)
USING CSV OPTIONS (
    path 'C:/Users/Manh/Documents/Learning-Spark/dataset/orders_wh.csv',
    header 'true'
)
""")

DataFrame[]

In [18]:
# List the tables of sparksql_db again after creating orders_external_2.
tables_df = spark.sql("SHOW TABLES in sparksql_db")
tables_df.show()

+-----------+-----------------+-----------+
|  namespace|        tableName|isTemporary|
+-----------+-----------------+-----------+
|sparksql_db|  orders_external|      false|
|sparksql_db|orders_external_2|      false|
|sparksql_db|   orders_managed|      false|
|           |      orders_temp|       true|
+-----------+-----------------+-----------+



In [17]:
# Show the columns plus the detailed table information of the CSV-backed table;
# print up to 30 rows without truncating long values.
csv_table_details_df = spark.sql("DESCRIBE EXTENDED sparksql_db.orders_external_2")
csv_table_details_df.show(n=30, truncate=False)

+----------------------------+------------------------------------------------------------------+-------+
|col_name                    |data_type                                                         |comment|
+----------------------------+------------------------------------------------------------------+-------+
|order_id                    |int                                                               |NULL   |
|order_date                  |timestamp                                                         |NULL   |
|customer_id                 |int                                                               |NULL   |
|order_status                |string                                                            |NULL   |
|                            |                                                                  |       |
|# Detailed Table Information|                                                                  |       |
|Catalog                     |spark_catalog   