## Print the files

In [7]:
# Show the raw plain-text input (one "name, age" record per line) via the shell.
!cat data/people.txt

Michael, 29
Andy, 30
Justin, 19

In [1]:
# Show the raw JSON input (one JSON object per line) via the shell.
!cat data/people.json

{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}

## Start the Spark Context (Spark entry point) and the Spark Session (DataFrames)

In [1]:
import pyspark

# Create the SparkContext, or reuse the one already running in this kernel.
# Calling the SparkContext constructor a second time (e.g. when this cell is
# re-executed) raises "ValueError: Cannot run multiple SparkContexts at once";
# getOrCreate() makes the cell safe to run repeatedly.
# The 'local[*]' master runs Spark locally, using the available cores.
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setMaster('local[*]')
)

In [2]:
from pyspark.sql import SparkSession

# Build the SparkSession used for the DataFrame / SQL API. getOrCreate() hands
# back an existing session when there is one instead of building a second one.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

## JSON to SQL         
### Semi-structured Data

In [6]:
# Read the JSON file into a DataFrame and query it with SQL.
from pyspark import SQLContext

# Legacy handle, kept only so that any cell still referencing `sqlContext`
# keeps working. SQLContext is deprecated; the SparkSession `spark` is the
# entry point used for everything below.
sqlContext = SQLContext(sc)

# Read the JSON file (one object per line). Fields missing from a record,
# such as "age" for Michael, come back as null.
people = spark.read.json("data/people.json")

# Register the DataFrame as the temporary view "people" so SQL can reference it.
# createOrReplaceTempView replaces the deprecated registerTempTable and can be
# re-run without error.
people.createOrReplaceTempView("people")

# Run the SQL query through the same session that owns the view.
all_people = spark.sql("SELECT * FROM people ")
all_people.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



## Text File to SQL 
###  Unstructured Data

In [9]:
from pyspark.sql import Row

def _split_fields(line):
    """Split one comma-separated line of the text file into its raw fields."""
    return line.split(",")


def _fields_to_row(fields):
    """Build a Row from [name, age] fields, converting the age to an int."""
    return Row(name=fields[0], age=int(fields[1]))


# Read the text file as an RDD of lines, then turn every line into a Row.
lines = sc.textFile("data/people.txt")
parts = lines.map(_split_fields)
people = parts.map(_fields_to_row)

# Let Spark infer the schema from the Rows and expose the result as the
# temporary view "people" (this replaces any view already registered under
# that name).
schemaPeople = spark.createDataFrame(people)
schemaPeople.createOrReplaceTempView("people")

# Once registered, the view can be queried with plain SQL.
teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
teenagers.show()


+------+
|  name|
+------+
|Justin|
+------+

