# Aggregations and Grouping

## Prerequisites

Install Spark and Java in VM

In [None]:
# Install Java 8 (headless JDK); the apt output is redirected to /dev/null to keep the cell quiet
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download the Spark 3.3.1 binary distribution (the "bin-hadoop2" build)
!wget -q https://apache.osuosl.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop2.tgz

In [None]:
# List the working directory to confirm the downloaded Spark archive is present
ls -l # check the .tgz is there

total 267684
drwxr-xr-x 1 root root      4096 Dec  6 14:35 [0m[01;34msample_data[0m/
-rw-r--r-- 1 root root 274099817 Oct 15 10:53 spark-3.3.1-bin-hadoop2.tgz


In [None]:
# Extract the Spark tarball into the current working directory
!tar xf spark-3.3.1-bin-hadoop2.tgz

In [None]:
!pip install -q findspark

In [None]:
!pip install py4j

# For maps
!pip install folium
!pip install plotly

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting py4j
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[K     |████████████████████████████████| 200 kB 5.1 MB/s 
[?25hInstalling collected packages: py4j
Successfully installed py4j-0.10.9.7
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Define the environment

In [None]:
import os

# Export the Java and Spark locations, plus the pyspark launch arguments,
# as environment variables in a single update() call.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.3.1-bin-hadoop2",
    "PYSPARK_SUBMIT_ARGS": "--master local[*] pyspark-shell",
})

Start Spark Session

---

In [None]:
import findspark

# Called without arguments, findspark picks up the SPARK_HOME environment
# variable set in the "Define the environment" cell. Passing the bare folder
# name here would overwrite SPARK_HOME with a relative path that only resolves
# while the working directory stays the same.
findspark.init()

from pyspark.sql import SparkSession

# Create the session (or reuse the one already running in this kernel)
spark = (
    SparkSession.builder
    .appName("Aggregations and Grouping")
    .master("local[*]")
    .getOrCreate()
)

# Display the Spark version as the cell result
spark.version

'3.3.1'

In [None]:
# Display the SparkSession object as the cell result
spark

In [None]:
# For Pandas conversion optimization.
# "spark.sql.execution.arrow.enabled" is deprecated since Spark 3.0; the
# PySpark-specific key below is its replacement.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [None]:
# Import sql functions.
# NOTE: this wildcard import replaces Python built-ins such as min, max and
# sum with the Spark column functions of the same name; the cells below rely
# on that (e.g. min(moviesDF.Production_Budget)).
from pyspark.sql.functions import *

Download datasets

In [None]:
!mkdir -p dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2022/master/datasets/movies.json -P /dataset

Read JSON file

---

In [None]:
# Load the downloaded movies file into a DataFrame
moviesDF = (
    spark.read
    .option("inferSchema", True)
    .json("/dataset/movies.json")
)

In [None]:
# Preview two rows without truncating long values
moviesDF.show(n=2, truncate=False)
# Print the list of fields that make up the DataFrame schema
print(moviesDF.schema.fields)
# Column names, displayed as the cell result
moviesDF.columns

+-------------+--------+-----------+-----------+----------+-----------+-----------+-----------------+------------+----------------------+----------------+------+----------------------+------------+--------+---------------+
|Creative_Type|Director|Distributor|IMDB_Rating|IMDB_Votes|MPAA_Rating|Major_Genre|Production_Budget|Release_Date|Rotten_Tomatoes_Rating|Running_Time_min|Source|Title                 |US_DVD_Sales|US_Gross|Worldwide_Gross|
+-------------+--------+-----------+-----------+----------+-----------+-----------+-----------------+------------+----------------------+----------------+------+----------------------+------------+--------+---------------+
|null         |null    |Gramercy   |6.1        |1071      |R          |null       |8000000          |12-Jun-98   |null                  |null            |null  |The Land Girls        |null        |146083  |146083         |
|null         |null    |Strand     |6.9        |207       |R          |Drama      |300000           |7-Aug-9

['Creative_Type',
 'Director',
 'Distributor',
 'IMDB_Rating',
 'IMDB_Votes',
 'MPAA_Rating',
 'Major_Genre',
 'Production_Budget',
 'Release_Date',
 'Rotten_Tomatoes_Rating',
 'Running_Time_min',
 'Source',
 'Title',
 'US_DVD_Sales',
 'US_Gross',
 'Worldwide_Gross']

## Examples

Count

In [None]:
# DataFrame.count() returns the number of rows; rows containing nulls are counted too
moviesDF.count()

3201

In [None]:
# The count() SQL function over a column skips the rows where that column is null
genresCountDF = moviesDF.select(
    count("Major_Genre")
)
genresCountDF.show()

+------------------+
|count(Major_Genre)|
+------------------+
|              2926|
+------------------+



In [None]:
# Count the non-null directors, referencing the column through the DataFrame
directorsCountDF = moviesDF.select(
    count(moviesDF["Director"])
)
directorsCountDF.show()

+---------------+
|count(Director)|
+---------------+
|           1870|
+---------------+



In [None]:
# Two aggregates in one select; only the first one gets an explicit name
moviesDF.select(
    count(moviesDF["Major_Genre"]).alias("countMajor"),
    count(moviesDF["Director"]),
).show()

+----------+---------------+
|countMajor|count(Director)|
+----------+---------------+
|      2926|           1870|
+----------+---------------+



In [None]:
# The same count written as SQL expression strings:
# first wrapped in expr(), then passed straight to selectExpr() with an alias
moviesDF.select(
    expr("count(Director)")
).show()
moviesDF.selectExpr(
    "count(Director) as count"
).show()

+---------------+
|count(Director)|
+---------------+
|           1870|
+---------------+

+-----+
|count|
+-----+
| 1870|
+-----+



In [None]:
# Using SQL: register the DataFrame as the temporary view "movies" so it can be queried with spark.sql()
moviesDF.createOrReplaceTempView("movies")

In [None]:
# Count the non-null directors with a SQL query on the "movies" view
spark.sql(
    "select count(Director) from movies"
).show()

+---------------+
|count(Director)|
+---------------+
|           1870|
+---------------+



In [None]:
# Two counts in a single SQL query; only the first one is aliased
spark.sql(
    "select count(Director) as countDirector, count(Major_Genre) from movies"
).show()

+-------------+------------------+
|countDirector|count(Major_Genre)|
+-------------+------------------+
|         1870|              2926|
+-------------+------------------+



Count Distinct

In [None]:
# Number of different genres
moviesDF.select(
    countDistinct("Major_Genre")
).show()

+---------------------------+
|count(DISTINCT Major_Genre)|
+---------------------------+
|                         12|
+---------------------------+



In [None]:
# Number of different genres, as a SQL query
spark.sql(
    "select count(distinct Major_Genre) from movies"
).show()

+---------------------------+
|count(DISTINCT Major_Genre)|
+---------------------------+
|                         12|
+---------------------------+



Min and max

In [None]:
# Smallest and largest production budget
# (min/max here are the Spark functions brought in by the wildcard import)
moviesDF.select(
    min("Production_Budget"),
    max("Production_Budget"),
).show()

+----------------------+----------------------+
|min(Production_Budget)|max(Production_Budget)|
+----------------------+----------------------+
|                   218|             300000000|
+----------------------+----------------------+



In [None]:
# Smallest production budget, as a SQL query
spark.sql(
    "select min(Production_Budget) from movies"
).show()

+----------------------+
|min(Production_Budget)|
+----------------------+
|                   218|
+----------------------+



Sum

In [None]:
# Total US DVD sales: once with the sum() column function, once as a SQL expression
moviesDF.select(
    sum("US_DVD_Sales").alias("salesUS")
).show()
moviesDF.selectExpr(
    "sum(US_DVD_Sales) as sales"
).show()

+-----------+
|    salesUS|
+-----------+
|19684472405|
+-----------+

+-----------+
|      sales|
+-----------+
|19684472405|
+-----------+



Average

In [None]:
# Average production budget: DataFrame API first, then the equivalent SQL query
moviesDF.select(
    avg("Production_Budget")
).show()
spark.sql(
    "select avg(Production_Budget) from movies"
).show()

+----------------------+
|avg(Production_Budget)|
+----------------------+
|    3.10691714484375E7|
+----------------------+

+----------------------+
|avg(Production_Budget)|
+----------------------+
|    3.10691714484375E7|
+----------------------+



Stats

In [None]:
# Mean and standard deviation of the Rotten Tomatoes rating, shown as two separate tables
moviesDF.select(
    mean("Rotten_Tomatoes_Rating")
).show()
moviesDF.select(
    stddev("Rotten_Tomatoes_Rating")
).show()

+---------------------------+
|avg(Rotten_Tomatoes_Rating)|
+---------------------------+
|          54.33692373976734|
+---------------------------+

+-----------------------------------+
|stddev_samp(Rotten_Tomatoes_Rating)|
+-----------------------------------+
|                  28.07659263787602|
+-----------------------------------+



### Grouping

---

In [None]:
# Number of movies per genre, smallest group first
countByGenreGF = (
    moviesDF
    .groupBy("Major_Genre")
    .count()
    .orderBy("count")
)
countByGenreGF.show()

+-------------------+-----+
|        Major_Genre|count|
+-------------------+-----+
|Concert/Performance|    5|
|       Black Comedy|   36|
|            Western|   36|
|        Documentary|   43|
|            Musical|   53|
|    Romantic Comedy|  137|
|             Horror|  219|
|  Thriller/Suspense|  239|
|          Adventure|  274|
|               null|  275|
|             Action|  420|
|             Comedy|  675|
|              Drama|  789|
+-------------------+-----+



In [None]:
# Per-genre counts in SQL; the WHERE clause leaves out the movies without a genre
spark.sql(
    "select Major_Genre, count(Major_Genre) as count from movies where Major_Genre is not null group by Major_Genre order by count"
).show()

+-------------------+-----+
|        Major_Genre|count|
+-------------------+-----+
|Concert/Performance|    5|
|       Black Comedy|   36|
|            Western|   36|
|        Documentary|   43|
|            Musical|   53|
|    Romantic Comedy|  137|
|             Horror|  219|
|  Thriller/Suspense|  239|
|          Adventure|  274|
|             Action|  420|
|             Comedy|  675|
|              Drama|  789|
+-------------------+-----+



In [None]:
# Average IMDB rating per genre, best-rated genre first
avgRatingByGenreDF = (
    moviesDF
    .groupBy("Major_Genre")
    .avg("IMDB_Rating")
    .orderBy("avg(IMDB_Rating)", ascending=False)
)
avgRatingByGenreDF.show()

+-------------------+------------------+
|        Major_Genre|  avg(IMDB_Rating)|
+-------------------+------------------+
|        Documentary| 6.997297297297298|
|            Western| 6.842857142857142|
|       Black Comedy|6.8187500000000005|
|              Drama| 6.773441734417339|
|               null|  6.50082644628099|
|            Musical|             6.448|
|  Thriller/Suspense| 6.360944206008582|
|          Adventure| 6.345019920318729|
|Concert/Performance|             6.325|
|             Action| 6.114795918367349|
|    Romantic Comedy| 5.873076923076922|
|             Comedy| 5.853858267716529|
|             Horror|5.6760765550239185|
+-------------------+------------------+



In [None]:
# Same aggregation through agg(), which lets the result column be renamed to "avg"
(
    moviesDF
    .groupBy("Major_Genre")
    .agg(avg("IMDB_Rating").alias("avg"))
    .orderBy("avg", ascending=False)
    .show()
)

+-------------------+------------------+
|        Major_Genre|               avg|
+-------------------+------------------+
|        Documentary| 6.997297297297298|
|            Western| 6.842857142857142|
|       Black Comedy|6.8187500000000005|
|              Drama| 6.773441734417339|
|               null|  6.50082644628099|
|            Musical|             6.448|
|  Thriller/Suspense| 6.360944206008582|
|          Adventure| 6.345019920318729|
|Concert/Performance|             6.325|
|             Action| 6.114795918367349|
|    Romantic Comedy| 5.873076923076922|
|             Comedy| 5.853858267716529|
|             Horror|5.6760765550239185|
+-------------------+------------------+



In [None]:
# Several aggregates per genre: number of movies and average IMDB rating,
# best-rated genre first.
# show() only prints and returns None, so it is called separately; otherwise
# aggregationsByGenreDF would hold None instead of the DataFrame.
aggregationsByGenreDF = (
    moviesDF.groupBy("Major_Genre")
    .agg(
        count("*").alias("N_Movies"),
        avg("IMDB_Rating").alias("rating"),
    )
    .orderBy(col("rating").desc())
)
aggregationsByGenreDF.show()

+-------------------+--------+------------------+
|        Major_Genre|N_Movies|            rating|
+-------------------+--------+------------------+
|        Documentary|      43| 6.997297297297298|
|            Western|      36| 6.842857142857142|
|       Black Comedy|      36|6.8187500000000005|
|              Drama|     789| 6.773441734417339|
|               null|     275|  6.50082644628099|
|            Musical|      53|             6.448|
|  Thriller/Suspense|     239| 6.360944206008582|
|          Adventure|     274| 6.345019920318729|
|Concert/Performance|       5|             6.325|
|             Action|     420| 6.114795918367349|
|    Romantic Comedy|     137| 5.873076923076922|
|             Comedy|     675| 5.853858267716529|
|             Horror|     219|5.6760765550239185|
+-------------------+--------+------------------+



## Exercises
   1. Sum up all the worldwide profits of ALL the movies in the DF. Then sum the worldwide profits per director
   2. Count how many distinct directors we have
   3. Show the mean and standard deviation of US gross revenue for the movies (all the movies)
   4. Compute the average IMDB rating and the average US gross revenue PER DIRECTOR
   5. Sum up ALL the profits of ALL the movies in the DF. Then sum ALL the profits per director. Can you see null values? Why? How can you solve it?

Exercise 1

In [None]:
# Total worldwide gross over all the movies
moviesDF.select(
    sum(col("Worldwide_Gross"))
).show()

+--------------------+
|sum(Worldwide_Gross)|
+--------------------+
|        272586820052|
+--------------------+



In [None]:
# Worldwide gross summed per director, listed alphabetically by director
(
    moviesDF
    .select("Director", "Worldwide_Gross")
    .groupBy("Director")
    .sum("Worldwide_Gross")
    .orderBy("Director")
    .show()
)

+--------------------+--------------------+
|            Director|sum(Worldwide_Gross)|
+--------------------+--------------------+
|                null|         47605151241|
|        Abel Ferrara|             1412799|
|          Adam McKay|           252379731|
|       Adam Shankman|           841310147|
|         Adrian Lyne|           648560696|
|     Adrienne Shelly|            22202180|
|      Akira Kurosawa|              320592|
|           Alan Alda|            42488161|
|      Alan J. Pakula|           140900000|
|         Alan Parker|           193906027|
|        Alan Rudolph|              178287|
|       Albert Brooks|            11614954|
|       Albert Hughes|            59329835|
|Alejandro Gonzale...|           216635372|
|       Alex Kendrick|            43629810|
|         Alex Proyas|           563716726|
|     Alexander Payne|           230472709|
|       Alexandre Aja|            69623713|
|      Alfonso Cuaron|           898639035|
|    Alfred Hitchcock|          

In [None]:
# Spot check: the individual rows of one director behind the grouped sum
(
    moviesDF
    .where(col("Director") == "Akira Kurosawa")
    .select("Director", "Worldwide_Gross")
    .show()
)

+--------------+---------------+
|      Director|Worldwide_Gross|
+--------------+---------------+
|Akira Kurosawa|          48856|
|Akira Kurosawa|         271736|
+--------------+---------------+



Exercise 2

In [None]:
# Number of different directors
moviesDF.select(
    countDistinct(col("Director"))
).show()

+------------------------+
|count(DISTINCT Director)|
+------------------------+
|                     550|
+------------------------+



In [None]:
# Column names of the DataFrame, displayed as the cell result
moviesDF.columns

['Creative_Type',
 'Director',
 'Distributor',
 'IMDB_Rating',
 'IMDB_Votes',
 'MPAA_Rating',
 'Major_Genre',
 'Production_Budget',
 'Release_Date',
 'Rotten_Tomatoes_Rating',
 'Running_Time_min',
 'Source',
 'Title',
 'US_DVD_Sales',
 'US_Gross',
 'Worldwide_Gross']

Exercise 3

In [None]:
# Mean and standard deviation of the US gross over all the movies, in one table
moviesDF.select(
    mean(col("US_Gross")),
    stddev(col("US_Gross")),
).show()

+--------------------+---------------------+
|       avg(US_Gross)|stddev_samp(US_Gross)|
+--------------------+---------------------+
|4.4002085163744524E7|  6.255531139066214E7|
+--------------------+---------------------+



Exercise 4

In [None]:
# Average US gross and average IMDB rating per director, listed alphabetically
(
    moviesDF
    .groupBy("Director")
    .agg(
        avg(col("US_Gross")),
        avg(col("IMDB_Rating")),
    )
    .orderBy("Director")
    .show()
)

+--------------------+--------------------+-----------------+
|            Director|       avg(US_Gross)| avg(IMDB_Rating)|
+--------------------+--------------------+-----------------+
|                null| 2.082582007471698E7|5.924237427864797|
|        Abel Ferrara|           1212799.0|              6.4|
|          Adam McKay|        1.16175143E8|              6.7|
|       Adam Shankman| 9.145074516666667E7|5.833333333333333|
|         Adrian Lyne|         6.1548874E7|6.219999999999999|
|     Adrienne Shelly|          1.909755E7|              7.2|
|      Akira Kurosawa|            160296.0|8.100000000000001|
|           Alan Alda|         4.2488161E7|              7.0|
|      Alan J. Pakula|         4.2885593E7|              5.8|
|         Alan Parker|2.7593491333333332E7|              6.8|
|        Alan Rudolph|            178287.0|              4.3|
|       Albert Brooks|         1.1614954E7|              5.5|
|       Albert Hughes|        2.96649175E7|              7.1|
|Alejand

Exercise 5

In [None]:
# Add the three revenue columns row by row, then sum the per-row totals
(
    moviesDF
    .select(
        (col("US_Gross") + col("Worldwide_Gross") + col("US_DVD_Sales")).alias("Total_Gross")
    )
    .select(sum("Total_Gross"))
    .show()
)

+----------------+
|sum(Total_Gross)|
+----------------+
|    139190135783|
+----------------+



In [None]:
# Same three-column total, summed per director and listed alphabetically
(
    moviesDF
    .groupBy("Director")
    .agg(
        sum(col("US_Gross") + col("Worldwide_Gross") + col("US_DVD_Sales")).alias("Total_Gross")
    )
    .orderBy("Director")
    .show()
)

+--------------------+-----------+
|            Director|Total_Gross|
+--------------------+-----------+
|                null|23161808950|
|        Abel Ferrara|       null|
|          Adam McKay|  396065126|
|       Adam Shankman|  669875496|
|         Adrian Lyne|       null|
|     Adrienne Shelly|   64427279|
|      Akira Kurosawa|       null|
|           Alan Alda|       null|
|      Alan J. Pakula|       null|
|         Alan Parker|       null|
|        Alan Rudolph|       null|
|       Albert Brooks|       null|
|       Albert Hughes|       null|
|Alejandro Gonzale...|  201064882|
|       Alex Kendrick|  139250136|
|         Alex Proyas|  291267207|
|     Alexander Payne|       null|
|       Alexandre Aja|  131979381|
|      Alfonso Cuaron|  130347856|
|    Alfred Hitchcock|       null|
+--------------------+-----------+
only showing top 20 rows



In [None]:
# Spot check: the three revenue columns for one director's movies
(
    moviesDF
    .where(col("Director") == "Akira Kurosawa")
    .select("US_Gross", "Worldwide_Gross", "US_DVD_Sales")
    .show()
)

+--------+---------------+------------+
|US_Gross|Worldwide_Gross|US_DVD_Sales|
+--------+---------------+------------+
|   48856|          48856|        null|
|  271736|         271736|        null|
+--------+---------------+------------+



In [None]:
# Per-director total again, this time calling fillna(0) before the columns are added up
(
    moviesDF
    .fillna(0)
    .groupBy("Director")
    .agg(
        sum(col("US_Gross") + col("Worldwide_Gross") + col("US_DVD_Sales")).alias("Total_Gross")
    )
    .orderBy("Director")
    .show()
)

+--------------------+-----------+
|            Director|Total_Gross|
+--------------------+-----------+
|                null|79073427920|
|        Abel Ferrara|    2625598|
|          Adam McKay|  569568389|
|       Adam Shankman| 1520657429|
|         Adrian Lyne|  956305066|
|     Adrienne Shelly|   64427279|
|      Akira Kurosawa|     641184|
|           Alan Alda|   84976322|
|      Alan J. Pakula|  183785593|
|         Alan Parker|  276686501|
|        Alan Rudolph|     356574|
|       Albert Brooks|   23229908|
|       Albert Hughes|  118659670|
|Alejandro Gonzale...|  304029952|
|       Alex Kendrick|  139250136|
|         Alex Proyas|  826361390|
|     Alexander Payne|  381923811|
|       Alexandre Aja|  131979381|
|      Alfonso Cuaron| 1222725522|
|    Alfred Hitchcock|  196529484|
+--------------------+-----------+
only showing top 20 rows



In [None]:
# Overall total without any null handling ...
(
    moviesDF
    .select(
        (col("US_Gross") + col("Worldwide_Gross") + col("US_DVD_Sales")).alias("Total_Gross")
    )
    .select(sum("Total_Gross"))
    .show()
)
# ... and the same total after fillna(0), for comparison
(
    moviesDF
    .fillna(0)
    .select(
        (col("US_Gross") + col("Worldwide_Gross") + col("US_DVD_Sales")).alias("Total_Gross")
    )
    .select(sum("Total_Gross"))
    .show()
)

+----------------+
|sum(Total_Gross)|
+----------------+
|    139190135783|
+----------------+

+----------------+
|sum(Total_Gross)|
+----------------+
|    432813952470|
+----------------+



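Remember: SQL expressions are NOT null-safe! An arithmetic expression such as `a + b + c` evaluates to null as soon as any of its operands is null.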