In [1]:
# Example: use a local (embedded) metastore and HDFS to store the data.
# The metastore is not usable from other notebooks, because it is embedded and locked by this JVM.


In [None]:
# Run findspark before the pyspark imports further down so that the
# pyspark package can be located and imported in this kernel.
import findspark
findspark.init()

In [None]:
"""
Spark Database 
Basic Database:
Here Spark's temp (scratch) data goes to a local directory, and the Spark data warehouse directory lives on HDFS.

However, the metastore db (the meta data) is embedded inside Spark, which means it is not usable by other applications.


In production, you will be using Hive Data Catalog
"""

In [None]:
"""
Spark HDFS Location
For dev only, not for production

3 components involved

1. meta data - database names, tables, columns, data types, and the location where the data is stored.
    It is managed by Hive; Hive internally uses a Derby db to store all meta data
2. spark temporary location  "spark.local.dir", "/home/ubuntu/spark-temp" (a local directory, not HDFS)
    where the temp data Spark uses internally is stored
    
3. "spark.sql.warehouse.dir", "hdfs://localhost:9000/spark-warehouse" is the spark data warehouse, where all the database data is stored.
    There we can see the databases, the tables and their data, whereas the table names, columns
    and data types are stored in the meta data
    
"""

In [2]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Embedded, simple, local Spark database/warehouse setup (dev only).
# Further properties can be added with config.set("property", "value").
config = (
    SparkConf()
    .setMaster("local")
    .setAppName("SparkDatabase")
    # Spark temp (scratch) files go to this directory on the local file system.
    .set("spark.local.dir", "/home/ubuntu/spark-temp")
    # Table data [not meta data] goes into the warehouse directory on HDFS.
    .set("spark.sql.warehouse.dir", "hdfs://localhost:9000/spark-warehouse")
)

# SparkSession is the entry point for Spark SQL and the DataFrame API.
#
# enableHiveSupport() is required for SQL databases. It creates the meta
# catalog using an embedded Derby database inside the current working
# directory of the notebook, so multiple notebooks cannot share it at the
# same time. Inside pyspark-notebooks you should see:
#   metastore_db - the meta data: databases, tables, columns, data types and
#                  where exactly the data is located (HDFS, file system or S3)
#   derby.log    - the Derby database log
spark = (
    SparkSession.builder
    .config(conf=config)
    .enableHiveSupport()
    .getOrCreate()
)

sc = spark.sparkContext

In [3]:
# List the databases known to the metastore (this is answered from the
# meta data, not from the warehouse directory).
df = spark.sql("SHOW DATABASES")

df.show()

+------------+
|databaseName|
+------------+
|     default|
|   productdb|
+------------+



In [11]:
# The meta data is stored locally, but the database directory is created in
# the HDFS warehouse. Expected location, given spark.sql.warehouse.dir:
# hdfs://localhost:9000/spark-warehouse/stocklocaldb.db
# IF NOT EXISTS makes this cell safe to re-run.
spark.sql("CREATE DATABASE IF NOT EXISTS stocklocaldb")

DataFrame[]

In [12]:
# Create a Spark managed table called stocks inside the stocklocaldb database.
# Data has to be added through Spark SQL statements such as INSERT
# (UPDATE and DELETE won't work on 2.x).
# Expected table location, given spark.sql.warehouse.dir:
# hdfs://localhost:9000/spark-warehouse/stocklocaldb.db/stocks
# IF NOT EXISTS makes this cell safe to re-run.
spark.sql("CREATE TABLE  IF NOT EXISTS stocklocaldb.stocks(symbol STRING, industry STRING)")

DataFrame[]

In [13]:
# Append one row to the managed table.
# NOTE: INSERT INTO always appends, so this cell is not idempotent; each
# re-run adds another ('INFY', 'IT') row. The two identical rows in the
# SELECT output below are presumably the result of running it twice.
spark.sql("""
 INSERT INTO stocklocaldb.stocks VALUES('INFY', 'IT')
"""
         )

DataFrame[]

In [14]:
# Read back every row of the table to check what has been inserted so far.
spark.sql("SELECT * FROM stocklocaldb.stocks").show()

+------+--------+
|symbol|industry|
+------+--------+
|  INFY|      IT|
|  INFY|      IT|
+------+--------+



In [13]:
# This command drops the table from the meta data store and also deletes its
# data in the Spark data warehouse directory (the table is a managed table).
# IF EXISTS makes this cell safe to re-run.

spark.sql("DROP TABLE IF EXISTS stocklocaldb.stocks")

DataFrame[]

In [14]:
# Confirm that the stocks table is gone from stocklocaldb.
# A bare SHOW TABLES lists the session's current database, and nothing
# visible in this notebook switches away from `default`, so the database
# is named explicitly here. Run this before the database itself is dropped.
spark.sql("SHOW TABLES IN stocklocaldb").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [15]:
# Drop the database and its meta data.
# Without CASCADE the drop only succeeds when the database is empty
# (no tables inside), which is why the table was dropped first.
# IF EXISTS makes this cell safe to re-run.
spark.sql("DROP DATABASE IF EXISTS stocklocaldb")

DataFrame[]

In [16]:
# List the databases again to confirm that stocklocaldb is no longer present.
spark.sql("SHOW DATABASES").show()

+------------+
|databaseName|
+------------+
|     default|
+------------+

