# Spark DataFrames and SQL Basics Part I

## 1. Spark DataFrames

In [None]:
#for DataFrame we use a SparkSession instead of a SparkContext
from pyspark.sql import SparkSession

# SparkSession is configured through its fluent "builder" interface:
# run locally, name the application, then create (or reuse) the session.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Week 6')
    .getOrCreate()
)

In [None]:
import pandas as pd
#"row" storage
# One dict per record; the keys become the column names of the DataFrame.
row_table = [
    dict(col1=3, col2=6.23, col3=8.51),
    dict(col1=5, col2=4.2, col3=7.45),
    dict(col1=4, col2=6.8, col3=9.2),
    dict(col1=6, col2=7.8, col3=10.5),
]

# Build the pandas DataFrame from the records and preview its first rows.
pandas_df = pd.DataFrame(data=row_table)
pandas_df.head()

These examples are very similar in spirit to `sc.parallelize` for RDDs.  We are pushing small amounts of data up into Spark, usually for practice purposes.

### from list of `Row`s

In [None]:
from pyspark.sql import Row

#Spark DataFrames are row-based, so the data is supplied one Row at a time
df = spark.createDataFrame(
    [
        Row(col1=3, col2=6.23, col3=8.51),
        Row(col1=5, col2=4.2, col3=7.45),
        Row(col1=4, col2=6.8, col3=9.2),
        Row(col1=6, col2=7.8, col3=10.5),
    ]
)
df.collect()

### from `pandas`

We can also create from a `pandas` DataFrame

In [None]:
#build a Spark DataFrame directly from the pandas DataFrame created earlier
df = spark.createDataFrame(pandas_df)
#collect() returns every row to the driver as a list of Row objects
df.collect()

### from list of tuples

Another easy way to create a DataFrame is from a list of tuples, passing the column names explicitly:

In [None]:
#tuples carry no field names, so the column names are given as the schema argument
df = spark.createDataFrame(
    [
        (3, 6.23, 8.51),
        (5, 4.2, 7.45),
        (4, 6.8, 9.2),
        (6, 7.8, 10.5),
    ],
    schema=["col1", "col2", "col3"],
)
df.collect()

### from RDD

RDDs look very similar to a list of tuples, so it shouldn't surprise you that we can create a DataFrame from an RDD using almost exactly the same syntax.

In [None]:
#the SparkSession wraps a SparkContext; grab it so we can create and manipulate RDDs
sc = spark.sparkContext

In [None]:
#distribute a local list of tuples as an RDD (same values as the list-of-tuples example)
rdd = sc.parallelize([(3, 6.23, 8.51), (5, 4.2, 7.45), (4, 6.8, 9.2), (6, 7.8, 10.5)])

In [None]:
#an RDD of tuples has no column names, so they are passed explicitly as the second argument
df = spark.createDataFrame(rdd, ["col1", "col2", "col3"])
df.collect()

### to RDD

We can also convert a DataFrame back to an RDD (but an RDD of `Row`s).

`Row` and `Column` are classes out of which the `DataFrame` class is built.

In [None]:
#.rdd exposes the DataFrame's rows as an RDD of Row objects
rdd2 = df.rdd
rdd2.collect()

### to `pandas`

We can convert back to a `pandas` DataFrame (this brings all the data back to the driver, basically like a `.collect()`, so be careful).

In [None]:
#toPandas() gathers all rows on the driver and returns a pandas DataFrame
pandas_df2 = df.toPandas()
pandas_df2

## Look at content: take() vs head() vs show()

In [None]:
#same as the take() method for RDDs: returns the first 3 rows to the driver as a list of Row objects
df.take(3)

In [None]:
#similar to the head() method in pandas, but returns a list of Row objects rather than a DataFrame
df.head(3)

In [None]:
#show() prints the first 3 rows as a formatted text table instead of returning them
df.show(3)

In [None]:
#convert to pandas first (collects every row to the driver), then use pandas' own head()
df.toPandas().head()

### Basic DataFrame Operations

In [None]:
#rename column col3 to col3a; the result is a new DataFrame, so it is assigned back to df
df = df.withColumnRenamed("col3", "col3a")
df.show()

In [None]:
#create a new column col3b holding half of each col3a value
df = df.withColumn("col3b", df['col3a']/2)
df.show()

In [None]:
#filter data: keep only the rows where col3a is at least 8
df2 = df.filter(df['col3a'] >= 8)
df2.show()

In [None]:
#can also use where(), which is an alias for filter() and gives the same result
df2 = df.where(df['col3a'] >= 8)
df2.show()

## 2. DataFrame Schemas

In all of the examples above the schema was *inferred*.  Spark just looked at the data and decided what type it should be.  Spark might make a mistake.

In [None]:
#only column names are given here (no types), so Spark infers each column's type from the data
df = spark.createDataFrame([(3, 6.23, 8.51), (5, 4.2, 7.45), (4, 6.8, 9.2), (6, 7.8, 10.5)], ["col1", "col2", "col3"])
#check the type of columns: dtypes lists (column name, type name) pairs
df.dtypes

In [None]:
#print the schema of the dataframe as a tree (column name, type, nullability)
df.printSchema()

What if I intended for `col1` to not get very large?  It might be more space efficient for me to store it as a single `byte`.
We can specify a *schema*.

In [None]:
from pyspark.sql.types import StructType, StructField, ByteType, DoubleType

#each field is added as (name, type, nullable); nullable=True means the value is allowed to be missing (null)
schema = (
    StructType()
    .add("col1", ByteType(), True)
    .add("col2", DoubleType(), True)
    .add("col3", DoubleType(), True)
)

In [None]:
#pass the explicit schema instead of a list of column names, so the column types are not inferred
df_from_schema = spark.createDataFrame([(3, 6.23, 8.51), (5, 4.2, 7.45), (4, 6.8, 9.2), (6, 7.8, 10.5)], schema)
df_from_schema.collect()

In [None]:
#col1 should now be reported with the byte type requested in the schema rather than the inferred type
df_from_schema.dtypes

In [None]:
#print the explicit schema as a tree to confirm the column types
df_from_schema.printSchema()

## 3. DataFrame and SQL
Let's create a DataFrame from data we generate ourselves.  We'll first build an RDD of JSON strings, `JsonRDD`, and then convert it into a DataFrame by reading it with `spark.read.json`.

In [None]:
#generate our own JSON data: a tuple of four multi-line strings, one JSON object per swimmer,
#distributed as an RDD of strings with sc.parallelize
JsonRDD = sc.parallelize((""" 
  { "id": "123",
    "name": "Katie",
    "age": 19,
    "score": 33.56,
    "eyeColor": "brown"
  }""",
   """{
    "id": "234",
    "name": "Michael",
    "age": 22,
    "score": 41.21,
    "eyeColor": "green"
  }""", 
  """{
    "id": "345",
    "name": "Simone",
    "age": 23,
    "score": 28.45,
    "eyeColor": "blue"
  }""",
  """{
    "id": "456",
    "name": "Alan",
    "age": 20,
    "score": 36.53,
    "eyeColor": "brown"
  }""")
)

In [None]:
#create dataframe: read.json parses the JSON strings held in the RDD and infers the schema from them
swimmersJSON = spark.read.json(JsonRDD)

In [None]:
#create a temporary table (view) named "swimmersJSON" so the DataFrame can be queried by name with spark.sql
swimmersJSON.createOrReplaceTempView("swimmersJSON")

In [None]:
#print the parsed JSON records as a table
swimmersJSON.show()

In [None]:
#SQL Query: select every row from the temporary view and collect the result as a list of Row objects
spark.sql("select * from swimmersJSON").collect()

In [None]:
#SQL Query: select only the rows whose score is greater than 30
spark.sql("select * from swimmersJSON where score > 30").collect()

#### Inferring the Schema Using Reflection
Note that Spark is inferring the schema using reflection; i.e. it automatically determines the schema by examining the JSON data.

In [None]:
# Print the schema that Spark inferred from the JSON records
swimmersJSON.printSchema()

Notice that Spark was able to infer the schema. But now we want a different schema.

In this case, let's specify the schema explicitly for `CSV`-style data (an RDD of tuples, one tuple per record).

In [None]:
from pyspark.sql.types import *

#generate our own CSV-style data: an RDD of (id, name, age, score, eyeColor) tuples
stringCSVRDD = sc.parallelize([
    (123, 'Katie', 19, 33.56, 'brown'),
    (234, 'Michael', 22, 41.21, 'green'),
    (345, 'Simone', 23, 28.45, 'blue'),
    (456, 'Alan', 20, 36.53, 'brown'),
])

# Space-separated column names, in the same order as the tuple fields and the
# StructFields below. Kept for reference only: the schema itself is built
# explicitly with StructType, because a plain string cannot carry the types.
schemaString = "id name age score eyeColor"

# One StructField(name, type, nullable) per column; True allows missing (null) values
schema = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("age", LongType(), True),
    StructField("score", DoubleType(), True),
    StructField("eyeColor", StringType(), True)
])

#apply the schema to the RDD and create dataframe
swimmers = spark.createDataFrame(stringCSVRDD, schema)

#creates a temporary view from the dataframe so it can be queried with spark.sql
swimmers.createOrReplaceTempView("swimmers")

In [None]:
#print the schema; it should match the explicit StructType passed to createDataFrame
swimmers.printSchema()

In [None]:
#query the "swimmers" temporary view and collect all rows to the driver
spark.sql("select * from swimmers").collect()

## 4. Querying with SQL
With DataFrames, you can start writing your queries using `Spark SQL`.

In [None]:
#execute SQL Query and print the resulting rows as a table
spark.sql("select * from swimmers").show()

Let's get the row count:

In [None]:
#get count of rows in SQL; the result is a one-row, one-column DataFrame printed by show()
spark.sql("select count(*) from swimmers").show()

In [None]:
#query id and age for swimmers with age = 22 in SQL (only the id and age columns are returned)
spark.sql("select id, age from swimmers where age = 22").show()

In [None]:
#query name and eye color for swimmers with eye color starting with the letter 'b'
#('%' in a SQL LIKE pattern matches any sequence of characters)
spark.sql("select name, eyeColor from swimmers where eyeColor like 'b%'").show()

## 5. Querying with the DataFrame API
With DataFrames, you can start writing your queries using the DataFrame API.

In [None]:
# Show the values: print the rows of the swimmers DataFrame as a table
swimmers.show()

In [None]:
# Using Databricks `display` command to view the data easier
# NOTE(review): `display` is neither defined nor imported in this notebook, and the
# session here is created with master('local'); the rich table rendering presumably
# only applies on Databricks -- confirm what this cell does in the target environment.
display(swimmers)

In [None]:
# Get count of rows (DataFrame API equivalent of `select count(*)`)
swimmers.count()

In [None]:
#get the id, age where age = 22; the filter condition is given as a SQL expression string
swimmers.select("id", "age").filter("age = 22").show()

In [None]:
#same query, but with where(), which is an alias for filter()
swimmers.select("id", "age").where("age = 22").show()

In [None]:
#get the name, eyeColor where eyeColor like 'b%' (eye colors starting with the letter 'b')
swimmers.select("name", "eyeColor").filter("eyeColor like 'b%'").show()

In [None]:
#query id and age for swimmers with age = 22 via DataFrame API in another way:
#using Column objects (swimmers.id, swimmers.age) instead of SQL expression strings
swimmers.select(swimmers.id, swimmers.age).filter(swimmers.age == 22).show()

# Quick Exercise

## 1. Obtain the highest score with SQL query

## 2. Obtain players with scores over 30 and with brown eyes