# PySpark SparkSQL Group By


In [1]:
%%html
<style>
table {float:left}
</style>

In [2]:
%%html
<style>
div.output_area pre {
    white-space: pre;
}
</style>

In [3]:
import os
import sys
import gc

#  Environemnt Variables

## Hadoop

In [None]:
os.environ['HADOOP_CONF_DIR'] = "/opt/hadoop/hadoop-3.2.2/etc/hadoop"

In [None]:
%%bash
export HADOOP_CONF_DIR="/opt/hadoop/hadoop-3.2.2/etc/hadoop"
ls $HADOOP_CONF_DIR | head -n 5

## PYTHONPATH

Refer to the **pyspark** modules to load from the ```$SPARK_HOME/python/lib``` in the Spark installation.

* [PySpark Getting Started](https://spark.apache.org/docs/latest/api/python/getting_started/install.html)

> Ensure the SPARK_HOME environment variable points to the directory where the tar file has been extracted. Update PYTHONPATH environment variable such that it can find the PySpark and Py4J under SPARK_HOME/python/lib. One example of doing this is shown below:

```
export PYTHONPATH=$(ZIPS=("$SPARK_HOME"/python/lib/*.zip); IFS=:; echo "${ZIPS[*]}"):$PYTHONPATH
```

Alternatively install **pyspark** with pip or conda locally which installs the Spark runtime libararies (for standalone).

* [Can PySpark work without Spark?](https://stackoverflow.com/questions/51728177/can-pyspark-work-without-spark)

> As of v2.2, executing pip install pyspark will install Spark. If you're going to use Pyspark it's clearly the simplest way to get started. On my system Spark is installed inside my virtual environment (miniconda) at lib/python3.6/site-packages/pyspark/jars  
> PySpark has a Spark installation installed. If installed through pip3, you can find it with pip3 show pyspark. Ex. for me it is at ~/.local/lib/python3.8/site-packages/pyspark. This is a standalone configuration so it can't be used for managing clusters like a full Spark installation.

In [5]:
# os.environ['PYTHONPATH'] = "/opt/spark/spark-3.1.2/python/lib/py4j-0.10.9-src.zip:/opt/spark/spark-3.1.2/python/lib/pyspark.zip"
sys.path.extend([
    "/opt/spark/spark-3.1.2/python/lib/py4j-0.10.9-src.zip",
    "/opt/spark/spark-3.1.2/python/lib/pyspark.zip"
])

## PySpark packages

Execute after the PYTHONPATH setup.

In [21]:
import pyspark.sql 
from pyspark.sql.types import *
from pyspark.sql.functions import (
    col,
    avg,
    stddev,
    isnan,
    to_date,
    to_timestamp
)

# Data

Student schema from [Oracle SQL by Example](https://learning.oreilly.com/library/view/oracle-sql-by/9780137047345/ch06.html) located in ```./data/student```. 

In [10]:
%%bash
cd ./data/student
unzip -o student.zip

Archive:  student.zip
  inflating: COURSE_DATA_TABLE.csv   
  inflating: COURSE_REVENUE_DATA_TABLE.csv  
  inflating: EMPLOYEE_DATA_TABLE.csv  
  inflating: ENROLLMENT_DATA_TABLE.csv  
  inflating: GRADE_DATA_TABLE.csv    
  inflating: GRADE_TYPE_DATA_TABLE.csv  
  inflating: INSTRUCTOR_DATA_TABLE.csv  
  inflating: SECTION_DATA_TABLE.csv  
  inflating: SECTION_HISTORY_DATA_TABLE.csv  
  inflating: STUDENT_DATA_TABLE.csv  
  inflating: ZIPCODE_DATA_TABLE.csv  
COURSE_DATA_TABLE.csv
COURSE_REVENUE_DATA_TABLE.csv
EMPLOYEE_DATA_TABLE.csv
ENROLLMENT_DATA_TABLE.csv
GRADE_DATA_TABLE.csv
GRADE_TYPE_DATA_TABLE.csv
INSTRUCTOR_DATA_TABLE.csv
SECTION_DATA_TABLE.csv
SECTION_HISTORY_DATA_TABLE.csv
STUDENT_DATA_TABLE.csv
student.zip
ZIPCODE_DATA_TABLE.csv


In [11]:
%%bash
cd data/student/
hdfs dfs -mkdir -p student
hdfs dfs -put -f *.csv student

rm -rf *.csv

---
# Spark Session


In [12]:
from pyspark.sql import SparkSession

In [13]:
spark = SparkSession.builder\
    .master('yarn') \
    .config('spark.submit.deployMode', 'client') \
    .config('spark.debug.maxToStringFields', 100) \
    .config('spark.executor.memory', '2g') \
    .getOrCreate()

2022-02-19 14:34:40,651 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-02-19 14:34:45,032 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [14]:
NUM_CORES = 4
NUM_PARTITIONS = 3

spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.default.parallelism", NUM_CORES * NUM_PARTITIONS)

# Stduent schema CSV 

* [SparkSQL CSV Files](https://spark.apache.org/docs/latest/sql-data-sources-csv.html)

> Spark SQL provides spark.read().csv("file_name") to read a file or directory of files in CSV format into Spark DataFrame, and dataframe.write().csv("path") to write to a CSV file. Function option() can be used to customize the behavior of reading or writing.

[SparkSession.read()](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/SparkSession.html#read--) returns [DataFrameReader](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/DataFrameReader.html) instance which has [option](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/DataFrameReader.html#option-java.lang.String-boolean-) method by which we can specify CSV options.

The options are listed in [Data Source Option](https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option)

In [116]:
course = spark.read\
    .option("header", True)\
    .option("nullValue", "")\
    .option("inferSchema", True)\
    .csv("student/COURSE_DATA_TABLE.csv")\
    .withColumn("CREATED_DATE", to_date(col('CREATED_DATE'), "yy-MMM-dd"))\
    .withColumn("MODIFIED_DATE", to_date(col('MODIFIED_DATE'), "yy-MMM-dd"))

course.printSchema()
course.createOrReplaceTempView("course")
course.show(3)

root
 |-- COURSE_NO: integer (nullable = true)
 |-- DESCRIPTION: string (nullable = true)
 |-- COST: integer (nullable = true)
 |-- PREREQUISITE: integer (nullable = true)
 |-- CREATED_BY: string (nullable = true)
 |-- CREATED_DATE: date (nullable = true)
 |-- MODIFIED_BY: string (nullable = true)
 |-- MODIFIED_DATE: date (nullable = true)

+---------+--------------------+----+------------+----------+------------+-----------+-------------+
|COURSE_NO|         DESCRIPTION|COST|PREREQUISITE|CREATED_BY|CREATED_DATE|MODIFIED_BY|MODIFIED_DATE|
+---------+--------------------+----+------------+----------+------------+-----------+-------------+
|       10| Technology Concepts|1195|        null|  DSCHERER|  2029-03-07|   ARISCHER|   2005-04-07|
|       20|Intro to Informat...|1195|        null|  DSCHERER|  2029-03-07|   ARISCHER|   2005-04-07|
|       25|Intro to Programming|1195|         140|  DSCHERER|  2029-03-07|   ARISCHER|   2005-04-07|
+---------+--------------------+----+------------+-

In [117]:
section = spark.read\
    .option("header", True)\
    .option("nullValue", "")\
    .option("inferSchema", True)\
    .csv("student/SECTION_DATA_TABLE.csv")\
    .withColumn("CREATED_DATE", to_date(col('CREATED_DATE'), "yy-MMM-dd"))\
    .withColumn("MODIFIED_DATE", to_date(col('MODIFIED_DATE'), "yy-MMM-dd"))

section.printSchema()
section.createOrReplaceTempView("section")
section.show(3)

root
 |-- SECTION_ID: integer (nullable = true)
 |-- COURSE_NO: integer (nullable = true)
 |-- SECTION_NO: integer (nullable = true)
 |-- START_DATE_TIME: string (nullable = true)
 |-- LOCATION: string (nullable = true)
 |-- INSTRUCTOR_ID: integer (nullable = true)
 |-- CAPACITY: integer (nullable = true)
 |-- CREATED_BY: string (nullable = true)
 |-- CREATED_DATE: date (nullable = true)
 |-- MODIFIED_BY: string (nullable = true)
 |-- MODIFIED_DATE: date (nullable = true)

+----------+---------+----------+---------------+--------+-------------+--------+----------+------------+-----------+-------------+
|SECTION_ID|COURSE_NO|SECTION_NO|START_DATE_TIME|LOCATION|INSTRUCTOR_ID|CAPACITY|CREATED_BY|CREATED_DATE|MODIFIED_BY|MODIFIED_DATE|
+----------+---------+----------+---------------+--------+-------------+--------+----------+------------+-----------+-------------+
|        79|      350|         3|      14-APR-07|    L509|          107|      25|  CBRENNAN|  2002-01-07|   CBRENNAN|   2002-0

In [118]:
instructor = spark.read\
    .option("header", True)\
    .option("nullValue", "")\
    .option("inferSchema", True)\
    .csv("student/INSTRUCTOR_DATA_TABLE.csv")\
    .withColumn("CREATED_DATE", to_date(col('CREATED_DATE'), "yy-MMM-dd"))\
    .withColumn("MODIFIED_DATE", to_date(col('MODIFIED_DATE'), "yy-MMM-dd"))

instructor.printSchema()
instructor.createOrReplaceTempView("instructor")
instructor.show(3)

root
 |-- INSTRUCTOR_ID: integer (nullable = true)
 |-- SALUTATION: string (nullable = true)
 |-- FIRST_NAME: string (nullable = true)
 |-- LAST_NAME: string (nullable = true)
 |-- STREET_ADDRESS: string (nullable = true)
 |-- ZIP: integer (nullable = true)
 |-- PHONE: integer (nullable = true)
 |-- CREATED_BY: string (nullable = true)
 |-- CREATED_DATE: date (nullable = true)
 |-- MODIFIED_BY: string (nullable = true)
 |-- MODIFIED_DATE: date (nullable = true)

+-------------+----------+----------+---------+--------------+-----+----------+----------+------------+-----------+-------------+
|INSTRUCTOR_ID|SALUTATION|FIRST_NAME|LAST_NAME|STREET_ADDRESS|  ZIP|     PHONE|CREATED_BY|CREATED_DATE|MODIFIED_BY|MODIFIED_DATE|
+-------------+----------+----------+---------+--------------+-----+----------+----------+------------+-----------+-------------+
|          101|        Mr|   Fernand|    Hanks| 100 East 87th|10015|2125551212|  ESILVEST|  2002-01-07|   ESILVEST|   2002-01-07|
|          10

In [126]:
student = spark.read\
    .option("header", True)\
    .option("nullValue", "")\
    .option("inferSchema", True)\
    .csv("student/STUDENT_DATA_TABLE.csv")\
    .withColumn("REGISTRATION_DATE", to_date(col('REGISTRATION_DATE'), "yy-MMM-dd"))\
    .withColumn("CREATED_DATE",      to_date(col('CREATED_DATE'), "yy-MMM-dd"))\
    .withColumn("MODIFIED_DATE",     to_date(col('MODIFIED_DATE'), "yy-MMM-dd"))

student.cache()
student.printSchema()
student.createOrReplaceTempView("student")
student.show(3)

root
 |-- STUDENT_ID: integer (nullable = true)
 |-- SALUTATION: string (nullable = true)
 |-- FIRST_NAME: string (nullable = true)
 |-- LAST_NAME: string (nullable = true)
 |-- STREET_ADDRESS: string (nullable = true)
 |-- ZIP: integer (nullable = true)
 |-- PHONE: string (nullable = true)
 |-- EMPLOYER: string (nullable = true)
 |-- REGISTRATION_DATE: date (nullable = true)
 |-- CREATED_BY: string (nullable = true)
 |-- CREATED_DATE: date (nullable = true)
 |-- MODIFIED_BY: string (nullable = true)
 |-- MODIFIED_DATE: date (nullable = true)

+----------+----------+----------+---------+------------------+-----+------------+---------------+-----------------+-----------+------------+-----------+-------------+
|STUDENT_ID|SALUTATION|FIRST_NAME|LAST_NAME|    STREET_ADDRESS|  ZIP|       PHONE|       EMPLOYER|REGISTRATION_DATE| CREATED_BY|CREATED_DATE|MODIFIED_BY|MODIFIED_DATE|
+----------+----------+----------+---------+------------------+-----+------------+---------------+----------------

In [125]:
enrollment = spark.read\
    .option("header", True)\
    .option("nullValue", "")\
    .option("inferSchema", True)\
    .csv("student/ENROLLMENT_DATA_TABLE.csv")\
    .withColumn("CREATED_DATE", to_date(col('CREATED_DATE'), "yy-MMM-dd"))\
    .withColumn("MODIFIED_DATE", to_date(col('MODIFIED_DATE'), "yy-MMM-dd"))

enrollment.cache()
enrollment.printSchema()
enrollment.createOrReplaceTempView("enrollment")
enrollment.show(3)

root
 |-- STUDENT_ID: integer (nullable = true)
 |-- SECTION_ID: integer (nullable = true)
 |-- ENROLL_DATE: string (nullable = true)
 |-- FINAL_GRADE: integer (nullable = true)
 |-- CREATED_BY: string (nullable = true)
 |-- CREATED_DATE: date (nullable = true)
 |-- MODIFIED_BY: string (nullable = true)
 |-- MODIFIED_DATE: date (nullable = true)

+----------+----------+-----------+-----------+----------+------------+-----------+-------------+
|STUDENT_ID|SECTION_ID|ENROLL_DATE|FINAL_GRADE|CREATED_BY|CREATED_DATE|MODIFIED_BY|MODIFIED_DATE|
+----------+----------+-----------+-----------+----------+------------+-----------+-------------+
|       215|       146|  13-FEB-07|       null|  DSCHERER|  2014-12-07|   BROSENZW|   2005-01-07|
|       215|       156|  13-FEB-07|       null|  DSCHERER|  2014-12-07|   BROSENZW|   2005-01-07|
|       216|       154|  13-FEB-07|       null|  DSCHERER|  2014-12-07|   BROSENZW|   2005-01-07|
+----------+----------+-----------+-----------+----------+-----

In [121]:
grade = spark.read\
    .option("header", True)\
    .option("nullValue", "")\
    .option("inferSchema", True)\
    .csv("student/GRADE_DATA_TABLE.csv")\
    .withColumn("CREATED_DATE", to_date(col('CREATED_DATE'), "yy-MMM-dd"))\
    .withColumn("MODIFIED_DATE", to_date(col('MODIFIED_DATE'), "yy-MMM-dd"))

grade.printSchema()
grade.createOrReplaceTempView("grade")
grade.show(3)

root
 |-- STUDENT_ID: integer (nullable = true)
 |-- SECTION_ID: integer (nullable = true)
 |-- GRADE_TYPE_CODE: string (nullable = true)
 |-- GRADE_CODE_OCCURRENCE: integer (nullable = true)
 |-- NUMERIC_GRADE: integer (nullable = true)
 |-- COMMENTS: string (nullable = true)
 |-- CREATED_BY: string (nullable = true)
 |-- CREATED_DATE: date (nullable = true)
 |-- MODIFIED_BY: string (nullable = true)
 |-- MODIFIED_DATE: date (nullable = true)

+----------+----------+---------------+---------------------+-------------+--------+----------+------------+-----------+-------------+
|STUDENT_ID|SECTION_ID|GRADE_TYPE_CODE|GRADE_CODE_OCCURRENCE|NUMERIC_GRADE|COMMENTS|CREATED_BY|CREATED_DATE|MODIFIED_BY|MODIFIED_DATE|
+----------+----------+---------------+---------------------+-------------+--------+----------+------------+-----------+-------------+
|       111|       133|             PA|                    6|           80|    null|  CBRENNAN|  2011-02-07|     JAYCAF|   2011-02-07|
|       111

Need to make sure 31-DEC-98 is converted to 1998-12-31, not 2098-12-31.

* [spark to_date function - how to convert 31-DEC-98 to 1998-12-31 not 2098-12-31](https://stackoverflow.com/questions/71182230)

In [122]:
grade_type = spark.read\
    .option("header", True)\
    .option("nullValue", "")\
    .option("inferSchema", True)\
    .csv("student/GRADE_TYPE_DATA_TABLE.csv")\
    .withColumn("CREATED_DATE", to_date(col('CREATED_DATE'), "dd-MMM-yy"))\
    .withColumn("MODIFIED_DATE", to_date(col('MODIFIED_DATE'), "dd-MMM-yy"))

grade_type.printSchema()
grade_type.createOrReplaceTempView("grade_type")
grade_type.show(3)

root
 |-- GRADE_TYPE_CODE: string (nullable = true)
 |-- DESCRIPTION: string (nullable = true)
 |-- CREATED_BY: string (nullable = true)
 |-- CREATED_DATE: date (nullable = true)
 |-- MODIFIED_BY: string (nullable = true)
 |-- MODIFIED_DATE: date (nullable = true)

+---------------+-----------+----------+------------+-----------+-------------+
|GRADE_TYPE_CODE|DESCRIPTION|CREATED_BY|CREATED_DATE|MODIFIED_BY|MODIFIED_DATE|
+---------------+-----------+----------+------------+-----------+-------------+
|             FI|      Final|  MCAFFREY|  2098-12-31|   MCAFFREY|   2098-12-31|
|             HM|   Homework|  MCAFFREY|  2098-12-31|   MCAFFREY|   2098-12-31|
|             MT|    Midterm|  MCAFFREY|  2098-12-31|   MCAFFREY|   2098-12-31|
+---------------+-----------+----------+------------+-----------+-------------+
only showing top 3 rows



In [123]:
zipcode = spark.read\
    .option("header", True)\
    .option("nullValue", "")\
    .option("inferSchema", True)\
    .csv("student/ZIPCODE_DATA_TABLE.csv")\
    .withColumn("CREATED_DATE", to_date(col('CREATED_DATE'), "dd-MMM-yy"))\
    .withColumn("MODIFIED_DATE", to_date(col('MODIFIED_DATE'), "dd-MMM-yy"))

zipcode.printSchema()
zipcode.createOrReplaceTempView("zipcode")
zipcode.show(3)

root
 |-- ZIP: integer (nullable = true)
 |-- CITY: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- CREATED_BY: string (nullable = true)
 |-- CREATED_DATE: date (nullable = true)
 |-- MODIFIED_BY: string (nullable = true)
 |-- MODIFIED_DATE: date (nullable = true)

+-----+----------------+-----+----------+------------+-----------+-------------+
|  ZIP|            CITY|STATE|CREATED_BY|CREATED_DATE|MODIFIED_BY|MODIFIED_DATE|
+-----+----------------+-----+----------+------------+-----------+-------------+
|11101|Long Island City|   NY|  AMORRISO|  2007-08-03|   AMORRISO|   2007-11-24|
|11102|         Astoria|   NY|  AMORRISO|  2007-08-03|   AMORRISO|   2007-11-24|
|11103|         Astoria|   NY|  AMORRISO|  2007-08-03|   AMORRISO|   2007-11-24|
+-----+----------------+-----+----------+------------+-----------+-------------+
only showing top 3 rows



In [124]:
employee = spark.read\
    .option("header", True)\
    .option("nullValue", "")\
    .option("inferSchema", True)\
    .csv("student/EMPLOYEE_DATA_TABLE.csv")

employee.printSchema()
employee.createOrReplaceTempView("employee")
employee.show(3)

root
 |-- EMPLOYEE_ID: integer (nullable = true)
 |-- NAME: string (nullable = true)
 |-- SALARY: integer (nullable = true)
 |-- TITLE: string (nullable = true)

+-----------+------+------+---------+
|EMPLOYEE_ID|  NAME|SALARY|    TITLE|
+-----------+------+------+---------+
|          1|  John|  1000|  Analyst|
|          2|  Mary|  2000|  Manager|
|          3|Stella|  5000|President|
+-----------+------+------+---------+
only showing top 3 rows



---
# Group BY and aggregation

## Course prerequisite counts

SparkSQL requires explicit assurance that the scalar-subquery returns only one record 

* [Correlated scalar subqueries must be Aggregated](https://stackoverflow.com/a/46271504/4281353)

> when Spark SQL Analyzer/Catalyst can't make 100% sure just by looking at the SQL statement that the sub-query only returns a single row, the exception is thrown.
> If you are sure that your subquery only gives a single row you can use one of the following aggregation standard functions, so Spark Analyzer is happy:
>
> * first
> * avg
> * max
> * min

* [SparkSQL - How to make scalar subquery work without FIRST/MIN/MAX/AVG](https://stackoverflow.com/questions/71182919)

> Couldn't find first(description) if FIRST is specified in SparkSQL

In [224]:
query="""
SELECT 
    c.*,
    (SELECT FIRST(e.description) FROM course e WHERE e.course_no = c.prerequisite) as course_name
FROM (
    SELECT
        prerequisite AS prerequisite,
        COUNT(*) as cnt
    FROM
        course c
    WHERE 
        c.prerequisite IS NOT NULL
    GROUP BY 
        c.prerequisite
    ORDER BY
        prerequisite
) c
ORDER BY
    prerequisite
"""
spark.sql(query).show(truncate=False)

+------------+---+-----------------------------+
|prerequisite|cnt|course_name                  |
+------------+---+-----------------------------+
|10          |1  |Technology Concepts          |
|20          |5  |Intro to Information Systems |
|25          |2  |Intro to Programming         |
|80          |2  |Programming Techniques       |
|120         |1  |Intro to Java Programming    |
|122         |2  |Intermediate Java Programming|
|125         |1  |Java Developer I             |
|130         |2  |Intro to Unix                |
|132         |1  |Basics of Unix Admin         |
|134         |1  |Advanced Unix Admin          |
|140         |1  |Systems Analysis             |
|204         |1  |Intro to SQL                 |
|220         |1  |PL/SQL Programming           |
|310         |2  |Operating Systems            |
|350         |2  |Java Developer II            |
|420         |1  |Database System Principles   |
+------------+---+-----------------------------+



## Course gade statistics

In [172]:
query = """
SELECT
    s.course_no AS course_no,
    c.description as course_name,
    COUNT(g.numeric_grade) AS number_of_students, 
    ROUND(AVG(g.numeric_grade),2) AS mean, 
    ROUND(STDDEV(g.numeric_grade), 2) AS std
FROM 
    course c
    INNER JOIN section s ON s.course_no = c.course_no
    INNER JOIN grade g ON g.section_id = s.section_id
GROUP BY
    s.course_no, c.description
ORDER BY
    course_no ASC
"""
spark.sql(query).show(truncate=False)

+---------+-----------------------------+------------------+-----+----+
|course_no|course_name                  |number_of_students|mean |std |
+---------+-----------------------------+------------------+-----+----+
|10       |Technology Concepts          |11                |84.27|9.59|
|20       |Intro to Information Systems |66                |85.77|8.18|
|25       |Intro to Programming         |304               |86.04|7.66|
|100      |Hands-On Windows             |128               |87.23|7.81|
|120      |Intro to Java Programming    |207               |86.62|7.47|
|122      |Intermediate Java Programming|198               |86.86|7.66|
|124      |Advanced Java Programming    |80                |84.83|6.89|
|125      |Java Developer I             |92                |86.27|8.44|
|130      |Intro to Unix                |88                |86.82|8.03|
|132      |Basics of Unix Admin         |18                |89.28|9.86|
|134      |Advanced Unix Admin          |21                |86.9

## Student grardes

In [144]:
query="""
SELECT DISTINCT
    g.student_id,
    CONCAT(t.first_name, ' ', t.last_name) AS name,
    c.course_no,
    c.description,
    s.section_id,
    g.numeric_grade
FROM
    course c
    INNER JOIN section s ON s.course_no = c.course_no
    INNER JOIN grade g ON g.section_id = s.section_id
    INNER JOIN student t on g.student_id = t.student_id
"""
spark.sql(query).show(5, truncate=False)

+----------+--------------+---------+----------------------------+----------+-------------+
|student_id|name          |course_no|description                 |section_id|numeric_grade|
+----------+--------------+---------+----------------------------+----------+-------------+
|128       |Jeff Runyan   |10       |Technology Concepts         |80        |83           |
|235       |Michael Carcia|20       |Intro to Information Systems|83        |99           |
|235       |Michael Carcia|20       |Intro to Information Systems|83        |90           |
|158       |Roy Limate    |20       |Intro to Information Systems|84        |88           |
|238       |Roger Snow    |25       |Intro to Programming        |85        |92           |
+----------+--------------+---------+----------------------------+----------+-------------+
only showing top 5 rows



## Students whose grades are avove course average

In [160]:
query="""
WITH course_grade_average AS (
    SELECT DISTINCT
        c.course_no,
        AVG(g.numeric_grade) AS grade_average
    FROM
        course c
        INNER JOIN section s ON s.course_no = c.course_no
        INNER JOIN grade g ON g.section_id = s.section_id
    GROUP BY c.course_no
    ORDER BY c.course_no ASC
)

SELECT DISTINCT
    g.student_id,
    CONCAT(t.first_name, ' ', t.last_name) AS name,
    c.course_no,
    c.description,
    s.section_id,
    g.numeric_grade as grade,
    ROUND(a.grade_average, 2) as average
FROM
    course c
    INNER JOIN section s ON s.course_no = c.course_no
    INNER JOIN grade g ON g.section_id = s.section_id
    INNER JOIN student t ON g.student_id = t.student_id
    INNER JOIN course_grade_average a ON c.course_no = a.course_no
WHERE
    g.numeric_grade > a.grade_average
    AND g.grade_type_code = 'FI'
ORDER BY 
    course_no,
    grade DESC
"""
spark.sql(query).show(truncate=False)

+----------+-------------------+---------+----------------------------+----------+-----+-------+
|student_id|name               |course_no|description                 |section_id|grade|average|
+----------+-------------------+---------+----------------------------+----------+-----+-------+
|128       |Jeff Runyan        |10       |Technology Concepts         |80        |91   |84.27  |
|124       |Daniel Wicelinski  |20       |Intro to Information Systems|83        |99   |85.77  |
|199       |J. Segall          |20       |Intro to Information Systems|84        |99   |85.77  |
|104       |Laetia Enison      |20       |Intro to Information Systems|81        |92   |85.77  |
|103       |J. Landry          |20       |Intro to Information Systems|81        |91   |85.77  |
|235       |Michael Carcia     |20       |Intro to Information Systems|83        |90   |85.77  |
|158       |Roy Limate         |20       |Intro to Information Systems|84        |88   |85.77  |
|123       |Pierre Radicola   

## Students who enrolled more than two courses

In [228]:
query = """
SELECT
    student_id,
    COUNT(section_id)
FROM 
    enrollment
GROUP BY
    student_id
HAVING 
    COUNT(section_id) > 2
ORDER BY
    student_id
"""
spark.sql(query).show(truncate=False)

+----------+-----------------+
|student_id|count(section_id)|
+----------+-----------------+
|124       |4                |
|184       |3                |
|214       |4                |
|215       |3                |
|232       |3                |
|238       |3                |
|250       |3                |
+----------+-----------------+



---
# Stop Spark Session

In [229]:
spark.stop()



# Cleanup

In [230]:
del spark
gc.collect()

11077