<h1>Spark SQL Inferred and Explicit Schemas</h1>
<h2>Demo 3</h2>

<hr>
<h2>Catalyst Notes</h2>
<ul>
    <li>Powers Data Frame operations and spark sql</li>
</ul>

<hr>
<h2>Setting up the Notebook</h2>
<ul>
    <li>Setting up import statements</li>
    <li>Setting up the spark session</li>
    <li>Reading the lines of a text file into a dataframe</li>
    <li>Parsing each line into columns</li>
</ul>

In [25]:
from pyspark.sql import SparkSession
from pyspark.sql.types import Row
from pyspark.sql.types import StructType, StructField, StringType, LongType

In [2]:
spark = SparkSession.builder\
                    .appName("Inferred and explicit shemas")\
                    .getOrCreate()

In [3]:
lines = sc.textFile("../datasets/students.txt")

In [4]:
lines.collect()

['Emily,44,55,78', 'Andy,47,34,89', 'Rick,55,78,55', 'Aaron,66,34,98']

In [5]:
parts = lines.map(lambda l: l.split(","))
parts.collect()

[['Emily', '44', '55', '78'],
 ['Andy', '47', '34', '89'],
 ['Rick', '55', '78', '55'],
 ['Aaron', '66', '34', '98']]

<hr>
<h2>Working with Types - Implicit</h2>

In [14]:
students = parts.map(lambda p: Row(name=p[0], math=int(p[1]), english=int(p[2]), science=int(p[3])))

In [15]:
students.collect()

[Row(english=55, math=44, name='Emily', science=78),
 Row(english=34, math=47, name='Andy', science=89),
 Row(english=78, math=55, name='Rick', science=55),
 Row(english=34, math=66, name='Aaron', science=98)]

In [16]:
schemaStudents = spark.createDataFrame(students)
schemaStudents.createOrReplaceTempView("students")

In [19]:
schemaStudents.columns

['english', 'math', 'name', 'science']

In [20]:
schemaStudents.schema

StructType(List(StructField(english,LongType,true),StructField(math,LongType,true),StructField(name,StringType,true),StructField(science,LongType,true)))

In [21]:
spark.sql("SELECT * FROM students").show()

+-------+----+-----+-------+
|english|math| name|science|
+-------+----+-----+-------+
|     55|  44|Emily|     78|
|     34|  47| Andy|     89|
|     78|  55| Rick|     55|
|     34|  66|Aaron|     98|
+-------+----+-----+-------+



<hr>
<h2>Working with Types - Explicit</h2>

In [22]:
parts.collect()

[['Emily', '44', '55', '78'],
 ['Andy', '47', '34', '89'],
 ['Rick', '55', '78', '55'],
 ['Aaron', '66', '34', '98']]

In [24]:
schemaString = "name math english science"

In [26]:
fields = [
    StructField('name', StringType(), True),
    StructField('math', LongType(), True),
    StructField('english', LongType(), True),
    StructField('science', LongType(), True)
]

In [27]:
schema = StructType(fields)

In [28]:
schemaStudents = spark.createDataFrame(parts, schema)

In [29]:
schemaStudents.columns

['name', 'math', 'english', 'science']

In [30]:
schemaStudents.schema

StructType(List(StructField(name,StringType,true),StructField(math,LongType,true),StructField(english,LongType,true),StructField(science,LongType,true)))

In [31]:
spark.sql("SELECT * FROM students").show()

+-------+----+-----+-------+
|english|math| name|science|
+-------+----+-----+-------+
|     55|  44|Emily|     78|
|     34|  47| Andy|     89|
|     78|  55| Rick|     55|
|     34|  66|Aaron|     98|
+-------+----+-----+-------+

