In [1]:
from pyspark import SparkContext
# initialize a new Spark Context to use for the execution of the script
sc = SparkContext(appName="MY-APP-NAME", master="local[*]")

In [2]:
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)

In [17]:
from pyspark.sql import Row

# Load a text file and convert each line to a Row.
lines = sc.textFile("dataSQL.txt")
parts = lines.map(lambda l: l.split(","))
people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))

# Infer the schema, and register the DataFrame as a table.
schemaPeople = sqlCtx.createDataFrame(people)
schemaPeople.createOrReplaceTempView("people")
schemaPeople.createOrReplaceTempView("others")

In [21]:
# SQL can be run over SchemaRDDs that have been registered # as a table.
teenagers = sqlCtx.sql("""SELECT name FROM people WHERE age >= 13 AND age <= 19""")
teenagers.show()

teenagers2 = sqlCtx.sql("""SELECT * FROM others, people WHERE others.age = people.age """)
teenagers2.show()

teenagers3 = sqlCtx.sql("""SELECT age%5, count(*) FROM people group by age%5""")
teenagers3.show()

+----------+
|      name|
+----------+
|    Giulia|
|   Camilla|
|   Massimo|
|Giancarclo|
+----------+

+---+----------+---+----------+
|age|      name|age|      name|
+---+----------+---+----------+
| 34|  Giovanna| 34|  Giovanna|
| 50|     Mauro| 50|     Mauro|
| 31|      Andy| 31|      Andy|
| 27|   Lorenzo| 27|   Lorenzo|
| 17|Giancarclo| 17|Giancarclo|
| 41|   Roberto| 41|   Roberto|
| 18|   Massimo| 18|   Massimo|
| 15|    Giulia| 15|    Giulia|
| 15|    Giulia| 15|   Camilla|
| 15|   Camilla| 15|    Giulia|
| 15|   Camilla| 15|   Camilla|
| 30|   Michael| 30|   Michael|
+---+----------+---+----------+

+-------------------------+--------+
|(age % CAST(5 AS BIGINT))|count(1)|
+-------------------------+--------+
|                        0|       4|
|                        1|       2|
|                        3|       1|
|                        2|       2|
|                        4|       1|
+-------------------------+--------+



In [10]:
# Load directly in a Dataframe
df = sqlCtx.read.load("dataSQL.txt",
                     format="csv", sep=",", inferSchema="true", header="false")
df.show()

df = df.withColumnRenamed("_c0", "name")
df = df.withColumnRenamed("_c1", "age")
df.show()

+----------+----+
|       _c0| _c1|
+----------+----+
|   Michael|30.0|
|      Andy|31.0|
|   Roberto|41.0|
|   Lorenzo|27.0|
|    Giulia|15.0|
|   Camilla|15.0|
|     Mauro|50.0|
|   Massimo|18.0|
|  Giovanna|34.0|
|Giancarclo|17.0|
+----------+----+

+----------+----+
|      name| age|
+----------+----+
|   Michael|30.0|
|      Andy|31.0|
|   Roberto|41.0|
|   Lorenzo|27.0|
|    Giulia|15.0|
|   Camilla|15.0|
|     Mauro|50.0|
|   Massimo|18.0|
|  Giovanna|34.0|
|Giancarclo|17.0|
+----------+----+



In [11]:
# Register the DataFrame as a global temporary view
schemaPeople.createOrReplaceGlobalTempView("peoplegl");
sqlCtx.sql("SELECT * FROM global_temp.peoplegl").show();



+---+----------+
|age|      name|
+---+----------+
| 30|   Michael|
| 31|      Andy|
| 41|   Roberto|
| 27|   Lorenzo|
| 15|    Giulia|
| 15|   Camilla|
| 50|     Mauro|
| 18|   Massimo|
| 34|  Giovanna|
| 17|Giancarclo|
+---+----------+



In [14]:
# Global temporary view is cross-session
sqlCtx.newSession().sql("SELECT * FROM global_temp.peoplegl").show();
# Temporary view doesn't (Next line gives you an error if executed!)
#sqlCtx.newSession().sql("SELECT * FROM people").show();

+---+----------+
|age|      name|
+---+----------+
| 30|   Michael|
| 31|      Andy|
| 41|   Roberto|
| 27|   Lorenzo|
| 15|    Giulia|
| 15|   Camilla|
| 50|     Mauro|
| 18|   Massimo|
| 34|  Giovanna|
| 17|Giancarclo|
+---+----------+



In [22]:
# The results of SQL queries are Dataframe objects.
# rdd returns the content as an :class:`pyspark.RDD` of :class:`Row`.
teenNames = teenagers.rdd.map(lambda p: "Name: " + p.name)
print (teenNames.collect())

['Name: Giulia', 'Name: Camilla', 'Name: Massimo', 'Name: Giancarclo']


In [10]:
import numpy as np
import pandas as pd

# Generate a Pandas DataFrame
pdf = pd.DataFrame(np.random.rand(100, 3))

# Create a Spark DataFrame from a Pandas DataFrame using Arrow
df = sqlCtx.createDataFrame(pdf)

# Convert the Spark DataFrame back to a Pandas DataFrame using Arrow
result_pdf = df.select("*").toPandas()

print(result_pdf)

           0         1         2
0   0.470191  0.547261  0.286403
1   0.110223  0.260530  0.975768
2   0.316238  0.697006  0.703357
3   0.728138  0.017595  0.061412
4   0.011980  0.870059  0.689020
..       ...       ...       ...
95  0.507561  0.546946  0.104638
96  0.833201  0.812468  0.782769
97  0.643570  0.820078  0.872547
98  0.962956  0.619839  0.238349
99  0.158338  0.463717  0.693046

[100 rows x 3 columns]
