In [2]:
# Locate the local Spark installation and make `pyspark` importable.
# This has to run before any `pyspark` import in the cells below.
import findspark
findspark.init()

In [3]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Environment-specific locations, gathered in one place so they are easy to tune.
SPARK_LOCAL_DIR = "/home/ubuntu/spark-temp"                   # Spark scratch directory
HIVE_METASTORE_URI = "thrift://localhost:9083"                # centralized Hive metastore service
WAREHOUSE_DIR = "hdfs://localhost:9000/user/hive/warehouse"   # Hive data warehouse on HDFS

# Any further Spark property can be added with config.set("property", "value").
config = SparkConf()
config.setMaster("local").setAppName("SparkHiveDatabase")
config.set("spark.local.dir", SPARK_LOCAL_DIR)

# Use the standalone Hive metastore server as the catalog. Because it runs as
# a separate service, several notebooks can share it and work in parallel.
config.set("hive.metastore.uris", HIVE_METASTORE_URI)

# Point Spark at the same warehouse directory Hive uses so both can co-exist.
# `hive.metastore.warehouse.dir` is deprecated since Spark 2.0 and is not
# honoured when set on SparkConf; `spark.sql.warehouse.dir` is the supported
# property for the default database location.
config.set("spark.sql.warehouse.dir", WAREHOUSE_DIR)

# SparkSession is the entry point for Spark SQL and DataFrames.
# enableHiveSupport() is required to query databases kept in the metastore,
# which holds the metadata: databases, tables, columns, data types and where
# the data lives (HDFS, local file system or S3).
spark = (
    SparkSession.builder
    .config(conf=config)
    .enableHiveSupport()
    .getOrCreate()
)

sc = spark.sparkContext

In [4]:
# List every database registered in the Hive metastore.
databases_df = spark.sql("SHOW DATABASES")
databases_df.show()

+------------+
|databaseName|
+------------+
|    brandsdb|
|     default|
|     moviedb|
|  productsdb|
|    removeme|
|      testdb|
+------------+



In [5]:
# List the tables registered under the moviedb database.
moviedb_tables_df = spark.sql("SHOW TABLES IN moviedb")
moviedb_tables_df.show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
| moviedb|   movies|      false|
| moviedb|  ratings|      false|
+--------+---------+-----------+



In [6]:
# Preview the movies table; show() prints only the first rows.
# NOTE(review): the first row printed is (null, title, genres), so presumably
# the source file's header line was loaded as data -- confirm against the
# table definition.
movies_df = spark.sql("select * from moviedb.movies")
movies_df.show()

+--------+--------------------+--------------------+
|movie_id|               title|              genres|
+--------+--------------------+--------------------+
|    null|               title|              genres|
|       1|    Toy Story (1995)|Adventure|Animati...|
|       2|      Jumanji (1995)|Adventure|Childre...|
|       3|Grumpier Old Men ...|      Comedy|Romance|
|       4|Waiting to Exhale...|Comedy|Drama|Romance|
|       5|Father of the Bri...|              Comedy|
|       6|         Heat (1995)|Action|Crime|Thri...|
|       7|      Sabrina (1995)|      Comedy|Romance|
|       8| Tom and Huck (1995)|  Adventure|Children|
|       9| Sudden Death (1995)|              Action|
|      10|    GoldenEye (1995)|Action|Adventure|...|
|      11| "American President|         The (1995)"|
|      12|Dracula: Dead and...|       Comedy|Horror|
|      13|        Balto (1995)|Adventure|Animati...|
|      14|        Nixon (1995)|               Drama|
|      15|Cutthroat Island ...|Action|Adventur

In [7]:
# Create the reviews table in moviedb.
# IF NOT EXISTS keeps this cell re-runnable: without it a second run (or a
# Restart & Run All) fails because the table is already in the metastore.
# Data location: presumably <warehouse dir>/moviedb.db/reviews, e.g.
# /user/hive/warehouse/moviedb.db/reviews -- verify with
# DESCRIBE FORMATTED moviedb.reviews.
spark.sql("""
CREATE TABLE IF NOT EXISTS moviedb.reviews(user_id int, movie_id int, comments string)
"""
)

DataFrame[]

In [9]:
# Re-list the moviedb tables to check that reviews is now registered.
tables_after_create_df = spark.sql("SHOW TABLES in moviedb")
tables_after_create_df.show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
| moviedb|   movies|      false|
| moviedb|  ratings|      false|
| moviedb|  reviews|      false|
+--------+---------+-----------+



In [10]:
# Append one sample review row.
# NOTE: INSERT INTO appends, so re-running this cell adds the same row again.
insert_review_sql = """
INSERT INTO moviedb.reviews VALUES (1, 1, 'nice movie')
"""
spark.sql(insert_review_sql)

DataFrame[]

In [11]:
# Spark acts as the query engine here: the Hive metastore serves as the
# central catalog, while Spark itself reads the files from HDFS and scans
# the records.
reviews_df = spark.sql("SELECT * FROM moviedb.reviews")
reviews_df.show()

+-------+--------+----------+
|user_id|movie_id|  comments|
+-------+--------+----------+
|      1|       1|nice movie|
+-------+--------+----------+

