In [21]:
from pyspark import SparkContext
# initialize a new Spark Context to use for the execution of the script
sc = SparkContext(appName="MY-APP-NAME", master="local[*]")

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=MY-APP-NAME, master=local[*]) created by __init__ at <ipython-input-1-4592087ae9ae>:3 

In [14]:
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)

In [18]:
from pyspark.sql import Row

# Load a text file and convert each line to a Row.
lines = sc.textFile("dataSQL.txt")
parts = lines.map(lambda l: l.split(","))
people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))

# Infer the schema, and register the DataFrame as a table.
schemaPeople = sqlCtx.createDataFrame(people)
schemaPeople.createOrReplaceTempView("people")

In [20]:
# SQL can be run over SchemaRDDs that have been registered # as a table.
teenagers = sqlCtx.sql("""SELECT name FROM people WHERE age >= 13 AND age <= 19""")
teenagers.show()

+----------+
|      name|
+----------+
|    Giulia|
|   Camilla|
|   Massimo|
|Giancarclo|
+----------+



In [24]:
# Register the DataFrame as a global temporary view
schemaPeople.createOrReplaceGlobalTempView("peoplegl");
sqlCtx.sql("SELECT * FROM global_temp.peoplegl").show();



+---+----------+
|age|      name|
+---+----------+
| 30|   Michael|
| 31|      Andy|
| 41|   Roberto|
| 27|   Lorenzo|
| 15|    Giulia|
| 15|   Camilla|
| 50|     Mauro|
| 18|   Massimo|
| 34|  Giovanna|
| 17|Giancarclo|
+---+----------+



In [27]:
# Global temporary view is cross-session
sqlCtx.newSession().sql("SELECT * FROM global_temp.peoplegl").show();
# Temporary view doesn't (Next line gives you an error if executed!)
# sqlCtx.newSession().sql("SELECT * FROM global_temp.people").show();

+---+----------+
|age|      name|
+---+----------+
| 30|   Michael|
| 31|      Andy|
| 41|   Roberto|
| 27|   Lorenzo|
| 15|    Giulia|
| 15|   Camilla|
| 50|     Mauro|
| 18|   Massimo|
| 34|  Giovanna|
| 17|Giancarclo|
+---+----------+



In [30]:
# The results of SQL queries are Dataframe objects.
# rdd returns the content as an :class:`pyspark.RDD` of :class:`Row`.
teenNames = teenagers.rdd.map(lambda p: "Name: " + p.name)
print (teenNames.collect())

['Name: Giulia', 'Name: Camilla', 'Name: Massimo', 'Name: Giancarclo']
