In [1]:
sc

In [2]:
spark

In [3]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession,HiveContext

In [4]:
sc.stop()

In [5]:
config = SparkConf().setAppName('SparkHiveSession').setMaster('local[4]')
sc = SparkContext.getOrCreate(conf=config)

In [6]:
sc

In [7]:
# Spark Integration with Hive with SPARK Session
spark = (SparkSession.builder.appName("pyspark-hive-integration")
        .config('spark.sql.warehouse.dir','/user/hive/warehouse/')
        .enableHiveSupport().getOrCreate())


In [8]:
spark

In [9]:
spark.sql("show databases").show()

+------------+
|databaseName|
+------------+
|     default|
+------------+



In [10]:
spark.sql("create database if not exists banking_db")

DataFrame[]

In [11]:
spark.sql("show databases").show()

+------------+
|databaseName|
+------------+
|  banking_db|
|     default|
+------------+



In [12]:
spark.sql("use banking_db").show()

++
||
++
++



In [13]:
spark.sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [17]:
spark.sql("drop table if exists banking")

DataFrame[]

In [18]:
spark.sql("""
CREATE TABLE if not exists banking (age int,balance double,campaign double,
contact string,day int,default string,duration int,education varchar(50),
housing varchar(10),job varchar(50),loan varchar(10),marital string,month varchar(30),
pdays double,poutcome string,previous int,y varchar(10))
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
tblProperties("skip.header.line.count" = 1)
""")

DataFrame[]

In [19]:
spark.sql("show tables").show()

+----------+---------+-----------+
|  database|tableName|isTemporary|
+----------+---------+-----------+
|banking_db|  banking|      false|
+----------+---------+-----------+



In [20]:
#there are two types of tables in hive external and manage 
spark.sql("describe formatted banking").show()

+--------------------+----------+-------+
|            col_name| data_type|comment|
+--------------------+----------+-------+
|                 age|       int|   null|
|             balance|    double|   null|
|            campaign|    double|   null|
|             contact|    string|   null|
|                 day|       int|   null|
|             default|    string|   null|
|            duration|       int|   null|
|           education|    string|   null|
|             housing|    string|   null|
|                 job|    string|   null|
|                loan|    string|   null|
|             marital|    string|   null|
|               month|    string|   null|
|               pdays|    double|   null|
|            poutcome|    string|   null|
|            previous|       int|   null|
|                   y|    string|   null|
|                    |          |       |
|# Detailed Table ...|          |       |
|            Database|banking_db|       |
+--------------------+----------+-

In [21]:
spark.sql("describe formatted banking").collect()

[Row(col_name='age', data_type='int', comment=None),
 Row(col_name='balance', data_type='double', comment=None),
 Row(col_name='campaign', data_type='double', comment=None),
 Row(col_name='contact', data_type='string', comment=None),
 Row(col_name='day', data_type='int', comment=None),
 Row(col_name='default', data_type='string', comment=None),
 Row(col_name='duration', data_type='int', comment=None),
 Row(col_name='education', data_type='string', comment=None),
 Row(col_name='housing', data_type='string', comment=None),
 Row(col_name='job', data_type='string', comment=None),
 Row(col_name='loan', data_type='string', comment=None),
 Row(col_name='marital', data_type='string', comment=None),
 Row(col_name='month', data_type='string', comment=None),
 Row(col_name='pdays', data_type='double', comment=None),
 Row(col_name='poutcome', data_type='string', comment=None),
 Row(col_name='previous', data_type='int', comment=None),
 Row(col_name='y', data_type='string', comment=None),
 Row(col_na

## DDL method to load dataset  into hive Table

In [22]:
#spark.sql("""load data local inpath '/home/hadoop/Downloads/the csv'
# overite into table banking""")--this is how to upload from local file path data into hive table using spark sql

### Create a Spark DataFrame

In [23]:
from pyspark.sql.types import *

In [24]:
bank_customer_Data = spark.read.json("file:///home/hadoop/Downloads/bank_edited.json",multiLine=True)

In [26]:
bank_customer_Data.show()

+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
|age|balance|campaign|contact|day|default|duration|education|housing|         job|loan| marital|month|pdays|poutcome|previous|  y|
+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
| 58|   2143|       1|unknown|  5|     no|     261| tertiary|    yes|  management|  no| married|  may|   -1| unknown|       0| no|
| 44|     29|       1|unknown|  5|     no|     151|secondary|    yes|  technician|  no|  single|  may|   -1| unknown|       0| no|
| 33|      2|       1|unknown|  5|     no|      76|secondary|    yes|entrepreneur| yes| married|  may|   -1| unknown|       0| no|
| 47|   1506|       1|unknown|  5|     no|      92|  unknown|    yes| blue-collar|  no| married|  may|   -1| unknown|       0| no|
| 33|      1|       1|unknown|  5|     no|     198|  unknown|     no|     unknown| 

In [27]:
bank_customer_Data.printSchema()

root
 |-- age: long (nullable = true)
 |-- balance: long (nullable = true)
 |-- campaign: long (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: long (nullable = true)
 |-- default: string (nullable = true)
 |-- duration: long (nullable = true)
 |-- education: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- job: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- month: string (nullable = true)
 |-- pdays: long (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- previous: long (nullable = true)
 |-- y: string (nullable = true)



### insert records from spark Temp table to hive table 

In [28]:
bank_customer_Data.createOrReplaceTempView('banktable')

In [29]:
spark.sql("""
insert into table banking
select * from banktable 
""")

DataFrame[]

In [31]:
spark.sql("select * from banking").show()

+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
|age|balance|campaign|contact|day|default|duration|education|housing|         job|loan| marital|month|pdays|poutcome|previous|  y|
+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
| 58| 2143.0|     1.0|unknown|  5|     no|     261| tertiary|    yes|  management|  no| married|  may| -1.0| unknown|       0| no|
| 44|   29.0|     1.0|unknown|  5|     no|     151|secondary|    yes|  technician|  no|  single|  may| -1.0| unknown|       0| no|
| 33|    2.0|     1.0|unknown|  5|     no|      76|secondary|    yes|entrepreneur| yes| married|  may| -1.0| unknown|       0| no|
| 47| 1506.0|     1.0|unknown|  5|     no|      92|  unknown|    yes| blue-collar|  no| married|  may| -1.0| unknown|       0| no|
| 33|    1.0|     1.0|unknown|  5|     no|     198|  unknown|     no|     unknown| 