## [API](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html)

## SparkSession

In [25]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Build the session used by every cell below. getOrCreate() hands back an
# already-running session when one exists instead of starting a second one.
# The shuffle partition count set here is the "500" that appears in the
# `Exchange hashpartitioning(..., 500)` nodes of the plans printed later.
spark = (
    SparkSession.builder
    .appName('Apache Spark test')
    .config("spark.sql.shuffle.partitions", "500")
    .getOrCreate()
)

In [26]:
# Bare last expression: lets the notebook render the session object.
spark

## Data 

In [27]:
# One row per person: (id, name, graduate_program, spark_status).
# graduate_program is later matched against graduateProgram.id in the joins;
# spark_status is a list whose values coincide with the ids in sparkStatus.
person = (
    spark.createDataFrame(
        [
            (0, "Bill Chambers", 0, [100]),
            (1, "Matei Zaharia", 1, [500, 250, 100]),
            (2, "Michael Armbrust", 1, [250, 100]),
        ]
    )
    .toDF("id", "name", "graduate_program", "spark_status")
)

# One row per graduate program: (id, degree, department, school).
# Program id 2 is not referenced by any row of `person`, which is what makes
# the outer / semi / anti join results below differ from the inner join.
graduateProgram = (
    spark.createDataFrame(
        [
            (0, "Masters", "School of Information", "UC Berkeley"),
            (2, "Masters", "EECS", "UC Berkeley"),
            (1, "Ph.D.", "EECS", "UC Berkeley"),
        ]
    )
    .toDF("id", "degree", "department", "school")
)

# Lookup table of Spark project roles: (id, status).
sparkStatus = (
    spark.createDataFrame(
        [
            (500, "Vice President"),
            (250, "PMC Member"),
            (100, "Contributor"),
        ]
    )
    .toDF("id", "status")
)


In [28]:
# Register each DataFrame under the same name as its Python variable so the
# spark.sql(...) cell further down can refer to them as tables.
# createOrReplaceTempView overwrites an existing view of that name, so this
# cell can be re-run safely.
person.createOrReplaceTempView('person')
graduateProgram.createOrReplaceTempView('graduateProgram')
sparkStatus.createOrReplaceTempView('sparkStatus')

In [29]:
# Print the column names and types Spark assigned to the `person` DataFrame.
person.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- graduate_program: long (nullable = true)
 |-- spark_status: array (nullable = true)
 |    |-- element: long (containsNull = true)



In [30]:
# Preview the first two rows; truncate=False keeps long cell values intact.
person.show(n=2, truncate=False)

+---+-------------+----------------+---------------+
|id |name         |graduate_program|spark_status   |
+---+-------------+----------------+---------------+
|0  |Bill Chambers|0               |[100]          |
|1  |Matei Zaharia|1               |[500, 250, 100]|
+---+-------------+----------------+---------------+
only showing top 2 rows



## Inner Join

In [31]:
# Join condition reused by every DataFrame join in this notebook:
# a person's graduate_program must equal the program's id.
join_expression = person["graduate_program"] == graduateProgram["id"]

# No join type is given, so this is an inner join: only people whose
# graduate_program matches a program id are returned.
(
    person
    .join(graduateProgram, on=join_expression)
    .select("name", "degree", "department", "school")
    .show(n=5, truncate=False)
)

+----------------+-------+---------------------+-----------+
|name            |degree |department           |school     |
+----------------+-------+---------------------+-----------+
|Matei Zaharia   |Ph.D.  |EECS                 |UC Berkeley|
|Michael Armbrust|Ph.D.  |EECS                 |UC Berkeley|
|Bill Chambers   |Masters|School of Information|UC Berkeley|
+----------------+-------+---------------------+-----------+



In [32]:
# Same inner join expressed in SQL against the temp views registered earlier.
inner_join_sql = None  # placeholder removed below; see single expression
del inner_join_sql
spark.sql("""
    SELECT name, degree, department, school
    FROM person JOIN graduateProgram
    ON person.graduate_program = graduateProgram.id
""").show(n=5, truncate=False)

+----------------+-------+---------------------+-----------+
|name            |degree |department           |school     |
+----------------+-------+---------------------+-----------+
|Matei Zaharia   |Ph.D.  |EECS                 |UC Berkeley|
|Michael Armbrust|Ph.D.  |EECS                 |UC Berkeley|
|Bill Chambers   |Masters|School of Information|UC Berkeley|
+----------------+-------+---------------------+-----------+



## Outer Joins

In [33]:
# Run the same join once per outer-join flavour and label each result.
# In the captured 'outer' output the program without a matching person shows
# up with a null name, whereas 'left_outer' lists only rows driven by `person`.
for join_type in ('outer', 'left_outer', 'right_outer'):
    print(join_type)
    (
        person
        .join(graduateProgram, on=join_expression, how=join_type)
        .select("name", "degree", "department", "school")
        .show(n=5, truncate=False)
    )

outer
+----------------+-------+---------------------+-----------+
|name            |degree |department           |school     |
+----------------+-------+---------------------+-----------+
|Matei Zaharia   |Ph.D.  |EECS                 |UC Berkeley|
|Michael Armbrust|Ph.D.  |EECS                 |UC Berkeley|
|null            |Masters|EECS                 |UC Berkeley|
|Bill Chambers   |Masters|School of Information|UC Berkeley|
+----------------+-------+---------------------+-----------+

left_outer
+----------------+-------+---------------------+-----------+
|name            |degree |department           |school     |
+----------------+-------+---------------------+-----------+
|Matei Zaharia   |Ph.D.  |EECS                 |UC Berkeley|
|Michael Armbrust|Ph.D.  |EECS                 |UC Berkeley|
|Bill Chambers   |Masters|School of Information|UC Berkeley|
+----------------+-------+---------------------+-----------+

right_outer
+----------------+-------+---------------------+------

## Left Semi Joins

In [34]:
# A left semi join acts as a filter on the left DataFrame: it keeps the
# graduateProgram rows that have at least one matching person and returns
# only graduateProgram's columns (nothing from `person` is added).
(
    graduateProgram
    .join(person, on=join_expression, how="left_semi")
    .select("id", "degree", "department", "school")
    .show(n=5, truncate=False)
)

+---+-------+---------------------+-----------+
|id |degree |department           |school     |
+---+-------+---------------------+-----------+
|1  |Ph.D.  |EECS                 |UC Berkeley|
|0  |Masters|School of Information|UC Berkeley|
+---+-------+---------------------+-----------+



## Left Anti Joins

In [35]:
# A left anti join is the complement of the semi join: it keeps the
# graduateProgram rows for which no person matches the join condition.
(
    graduateProgram
    .join(person, on=join_expression, how="left_anti")
    .select("id", "degree", "department", "school")
    .show(n=5, truncate=False)
)

+---+-------+----------+-----------+
|id |degree |department|school     |
+---+-------+----------+-----------+
|2  |Masters|EECS      |UC Berkeley|
+---+-------+----------+-----------+



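## Join strategies

- shuffle join (two big tables): both sides are repartitioned by the join key so that matching rows end up in the same partition
- broadcast join (one of the DataFrames is small enough to fit in the memory of each node): the small side is copied to every executor, so the large side does not need to be shuffled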

In [36]:
# Print the physical plan Spark picks for the plain join. In the captured
# output both inputs go through an Exchange (shuffle) and a Sort before a
# SortMergeJoin.
shuffle_plan_source = person.join(graduateProgram, on=join_expression)
shuffle_plan_source.explain()
del shuffle_plan_source

== Physical Plan ==
*(5) SortMergeJoin [graduate_program#707L], [id#721L], Inner
:- *(2) Sort [graduate_program#707L ASC NULLS FIRST], false, 0
:  +- Exchange hashpartitioning(graduate_program#707L, 500)
:     +- *(1) Project [_1#697L AS id#705L, _2#698 AS name#706, _3#699L AS graduate_program#707L, _4#700 AS spark_status#708]
:        +- *(1) Filter isnotnull(_3#699L)
:           +- Scan ExistingRDD[_1#697L,_2#698,_3#699L,_4#700]
+- *(4) Sort [id#721L ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(id#721L, 500)
      +- *(3) Project [_1#713L AS id#721L, _2#714 AS degree#722, _3#715 AS department#723, _4#716 AS school#724]
         +- *(3) Filter isnotnull(_1#713L)
            +- Scan ExistingRDD[_1#713L,_2#714,_3#715,_4#716]


In [37]:
# Same join, but with graduateProgram wrapped in the broadcast() hint.
# The captured plan switches to a BroadcastHashJoin: the hinted side goes
# through a BroadcastExchange and `person` is no longer shuffled or sorted.
(
    person
    .join(broadcast(graduateProgram), on=join_expression)
    .explain()
)

== Physical Plan ==
*(2) BroadcastHashJoin [graduate_program#707L], [id#721L], Inner, BuildRight
:- *(2) Project [_1#697L AS id#705L, _2#698 AS name#706, _3#699L AS graduate_program#707L, _4#700 AS spark_status#708]
:  +- *(2) Filter isnotnull(_3#699L)
:     +- Scan ExistingRDD[_1#697L,_2#698,_3#699L,_4#700]
+- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, true]))
   +- *(1) Project [_1#713L AS id#721L, _2#714 AS degree#722, _3#715 AS department#723, _4#716 AS school#724]
      +- *(1) Filter isnotnull(_1#713L)
         +- Scan ExistingRDD[_1#713L,_2#714,_3#715,_4#716]
