In [1]:
#ch19_SparkSQL_00.ipynb,  Spark SQL, DataFrames  Guide

In [2]:
sc.master

u'local[*]'

In [3]:
#Starting Point: SparkSession
#The entry point into all functionality in Spark is the SparkSession class. 
#To create a basic SparkSession, just use SparkSession.builder:

In [4]:
#step01: Starting Point: SparkSession

In [5]:
from pyspark.sql import SparkSession #import SparkSession 

In [6]:
#create SparkSession Object "spark01"
spark01 = SparkSession.builder.master("local[*]").config("spark.driver.cores", 1).appName("ch19_SparkSQL_00").getOrCreate()

In [7]:
#step02: Creating DataFrames

In [8]:
# spark01 is an existing SparkSession
df01 = spark01.read.json("people.json") #spark.read.json(), create DataFrame df01

In [9]:
df01.show() #display DataFrame df01

+---+-------+------+
|age|   name|status|
+---+-------+------+
| 29|Michael|     1|
| 30|   Andy|     0|
| 19| Justin|     1|
| 40|   Jack|     1|
| 34|   Mary|     0|
+---+-------+------+



In [10]:
#step03: Print the schema in a tree format
df01.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)
 |-- status: long (nullable = true)



In [12]:
# Select only the "name" column
df01.select("name").show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
|   Jack|
|   Mary|
+-------+



In [13]:
# Select everybody, but increment the age by 1
df01.select(df01['name'], df01['age'] + 1).show()

+-------+---------+
|   name|(age + 1)|
+-------+---------+
|Michael|       30|
|   Andy|       31|
| Justin|       20|
|   Jack|       41|
|   Mary|       35|
+-------+---------+



In [14]:
# Select people older than 21
df01.filter(df01['age'] > 21).show()

+---+-------+------+
|age|   name|status|
+---+-------+------+
| 29|Michael|     1|
| 30|   Andy|     0|
| 40|   Jack|     1|
| 34|   Mary|     0|
+---+-------+------+



In [15]:
# Count people by age
df01.groupBy("age").count().show()

+---+-----+
|age|count|
+---+-----+
| 29|    1|
| 19|    1|
| 34|    1|
| 30|    1|
| 40|    1|
+---+-----+



In [16]:
#============================================================================================================
#Running SQL Queries Programmatically
#The sql function on a SparkSession enables applications to run SQL queries programmatically and returns the result as a DataFrame.
#============================================================================================================

In [17]:
#step01:  Register the DataFrame as a SQL temporary view, DataFrame.createOrReplaceTempView() 
df01.createOrReplaceTempView("people")   #TempView "people"

In [20]:
sqldf01 = spark01.sql("SELECT * FROM people WHERE age > 30")  #SparkSession.sql("SQL Script"), return a DataFrame "sqldf01"

In [21]:
sqldf01.show()

+---+----+------+
|age|name|status|
+---+----+------+
| 40|Jack|     1|
| 34|Mary|     0|
+---+----+------+



In [22]:
#=====================================================================================================
#Global Temporary View ; spark 2.2
#Temporary views in Spark SQL are session-scoped and will disappear if the session that creates it terminates.
#If you want to have a temporary view that is shared among all sessions and keep alive until the Spark application terminates, 
#you can create a global temporary view.
#======================================================================================================

In [24]:
#step01: Register the DataFrame as a global temporary view
df01.creat

AttributeError: 'DataFrame' object has no attribute 'createGlobalTempView'