# SQL Context

In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import Row

##### Create spark and sql context

In [None]:
sc = SparkContext("local", "sqlContext")
sqc = SQLContext(sc)

##### Load data

In [None]:
# hardcoded
data = [('Alice', 1), ('Martha', 2), ('Anna', 3)]
data

In [None]:
# from csv
data = sc.textFile('D:/data/csv/hello_world.csv')
data = data.filter(lambda x: x.split(',')).map(lambda line: line.split(','))

In [134]:
df = sqc.createDataFrame(data, ['name', 'age'])
res = df.collect()
res

[Row(name='Alice', age='43'),
 Row(name='Martha', age='44'),
 Row(name='Anna', age='45')]

##### Parallelize

In [133]:
rdd = sc.parallelize(res)
df1 = sqc.createDataFrame(rdd)
df1.collect()

[Row(name='Alice', age='43'),
 Row(name='Martha', age='44'),
 Row(name='Anna', age='45')]

In [137]:
df = sqc.createDataFrame(rdd, ['name : string, age : int'])
res = df.collect()
res

[Row(name : string, age : int='Alice', age='43'),
 Row(name : string, age : int='Martha', age='44'),
 Row(name : string, age : int='Anna', age='45')]

##### Row

In [None]:
Person = Row('name', 'age')
person = rdd.map(lambda r: Person(*r))
df2 = sqc.createDataFrame(person)
df2.collect()

##### Pandas

In [None]:
dfp = df1.toPandas()
dfp

In [None]:
sqc.createDataFrame(dfp).collect()  

##### SQL query

In [None]:
sqc.registerDataFrameAsTable(df1, "table1")
df2 = sqc.sql("SELECT table1._1 AS Name, table1._2 as Age FROM table1 WHERE table1._1='Martha'")
df2.collect()

In [None]:
sqc.registerDataFrameAsTable(df1, "table1")
df2 = sqc.table("table1")
#df2.collect()
df2.toPandas()

In [None]:
rdd = rdd.map(lambda row: row[1])
spark.createDataFrame(rdd, "int").collect()
[Row(value=1)]
spark.createDataFrame(rdd, "boolean").collect() 

In [138]:
sc.stop()

##### Credits & Links

https://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html#pyspark.sql.SQLContext