<a href="https://colab.research.google.com/github/mascee/Spark_Exercises/blob/main/parquet_partitions_solution_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
# Find the latest version of spark 3.x  from https://downloads.apache.org/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.5.4'
spark_version = 'spark-3.5.4'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,200 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]
Get:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,560 kB]
Get:13 http://security.ubuntu.co

In [2]:
# Import packages
from pyspark.sql import SparkSession
import time

# Build the SparkSession for the "SparkSQL" app, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .config("spark.sql.debug.maxToStringFields", 2000)
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

In [3]:
# Load the delayed-flights CSV hosted in the S3 bucket into a DataFrame
from pyspark import SparkFiles
url = "https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-classroom/v1.2/22-big-data/3/DelayedFlights.csv"
# Register the remote file with Spark, then read it back by file name
spark.sparkContext.addFile(url)
df = spark.read.options(sep=",", header=True).csv(SparkFiles.get("DelayedFlights.csv"))
# Preview the first rows
df.show()

+---+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
| id|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|
+---+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
|  0|2008|    1|         3|        4|   2003|      1955|   2211|      2225|       

In [4]:
# Register the CSV-backed DataFrame as the temporary view 'delays' so it can be queried with spark.sql
df.createOrReplaceTempView('delays')

In [5]:
# Group the 'delays' view (CSV data) by UniqueCarrier and time the query.
# Note: the timer covers both loading the data and running the query.
# We are only interested in the time it takes to run, so run this cell twice.
query = """select UniqueCarrier,sum(CRSElapsedTime), count(*) from delays group by 1"""

start_time = time.time()
spark.sql(query).show()
elapsed = time.time() - start_time

print("--- %s seconds ---" % elapsed)

+-------------+-------------------+--------+
|UniqueCarrier|sum(CRSElapsedTime)|count(1)|
+-------------+-------------------+--------+
|           UA|        1.3998834E7|   82022|
|           AA|        1.7721836E7|  103120|
|           NW|          6761017.0|   48410|
|           EV|          4284049.0|   42782|
|           B6|          4169064.0|   22868|
|           DL|          8245701.0|   48888|
|           OO|          6883377.0|   73680|
|           F9|          2338358.0|   16006|
|           YV|          3216400.0|   34890|
|           US|          8759953.0|   53873|
|           AQ|            99698.0|     750|
|           MQ|          7710479.0|   82505|
|           OH|          3318613.0|   29152|
|           HA|           345580.0|    2597|
|           XE|          7386620.0|   62539|
|           AS|          2527656.0|   16553|
|           FL|          4807695.0|   37201|
|           CO|          8693653.0|   44282|
|           WN|        2.4182455E7|  214624|
|         

In [6]:
# Save the DataFrame as parquet under 'parquet_delayed', replacing any previous output
df.write.mode("overwrite").parquet("parquet_delayed")

In [7]:
# Load the parquet files written above back into a DataFrame
p_df = spark.read.format("parquet").load("parquet_delayed")

In [8]:
# A DataFrame read from parquet has all the same methods as one read from a row-based file,
# so it can be registered as a temporary view ('p_delays') in exactly the same way.
p_df.createOrReplaceTempView('p_delays')

In [9]:
# Count rows per UniqueCarrier against the parquet view and time the query
query = """select UniqueCarrier, count(*) from p_delays group by 1"""

start_time = time.time()
spark.sql(query).show()
elapsed = time.time() - start_time

print("--- %s seconds ---" % elapsed)

+-------------+--------+
|UniqueCarrier|count(1)|
+-------------+--------+
|           UA|   82022|
|           AA|  103120|
|           NW|   48410|
|           EV|   42782|
|           B6|   22868|
|           DL|   48888|
|           OO|   73680|
|           F9|   16006|
|           YV|   34890|
|           US|   53873|
|           AQ|     750|
|           MQ|   82505|
|           OH|   29152|
|           HA|    2597|
|           XE|   62539|
|           AS|   16553|
|           FL|   37201|
|           CO|   44282|
|           WN|  214624|
|           9E|   31833|
+-------------+--------+

--- 1.3369975090026855 seconds ---


In [11]:
# Another sample: the same sum/count aggregation as run on the CSV view,
# this time against the parquet view
query = """select UniqueCarrier,sum(CRSElapsedTime), count(*) from p_delays group by 1"""

start_time = time.time()
spark.sql(query).show()
elapsed = time.time() - start_time

print("--- %s seconds ---" % elapsed)

+-------------+-------------------+--------+
|UniqueCarrier|sum(CRSElapsedTime)|count(1)|
+-------------+-------------------+--------+
|           UA|        1.3998834E7|   82022|
|           AA|        1.7721836E7|  103120|
|           NW|          6761017.0|   48410|
|           EV|          4284049.0|   42782|
|           B6|          4169064.0|   22868|
|           DL|          8245701.0|   48888|
|           OO|          6883377.0|   73680|
|           F9|          2338358.0|   16006|
|           YV|          3216400.0|   34890|
|           US|          8759953.0|   53873|
|           AQ|            99698.0|     750|
|           MQ|          7710479.0|   82505|
|           OH|          3318613.0|   29152|
|           HA|           345580.0|    2597|
|           XE|          7386620.0|   62539|
|           AS|          2527656.0|   16553|
|           FL|          4807695.0|   37201|
|           CO|          8693653.0|   44282|
|           WN|        2.4182455E7|  214624|
|         

In [17]:
# Write the data as parquet partitioned on UniqueCarrier, replacing any previous output
df.write.parquet("delayed_partitioned", mode="overwrite", partitionBy="UniqueCarrier")

In [18]:
# Load the partitioned parquet output back into a DataFrame
p_df_p = spark.read.format("parquet").load("delayed_partitioned")

In [19]:
# Register the partitioned DataFrame as the temporary view 'p_delays_p' for SQL queries.
p_df_p.createOrReplaceTempView('p_delays_p')

In [20]:
# Count rows per UniqueCarrier (the partition key) on the partitioned view and time it
query = """select UniqueCarrier, count(*) from p_delays_p group by 1"""

start_time = time.time()
spark.sql(query).show()
elapsed = time.time() - start_time

print("--- %s seconds ---" % elapsed)

+-------------+--------+
|UniqueCarrier|count(1)|
+-------------+--------+
|           UA|   82022|
|           AA|  103120|
|           NW|   48410|
|           DL|   48888|
|           OO|   73680|
|           US|   53873|
|           MQ|   82505|
|           XE|   62539|
|           CO|   44282|
|           WN|  214624|
|           EV|   42782|
|           B6|   22868|
|           F9|   16006|
|           YV|   34890|
|           AQ|     750|
|           OH|   29152|
|           HA|    2597|
|           AS|   16553|
|           FL|   37201|
|           9E|   31833|
+-------------+--------+

--- 0.5682978630065918 seconds ---


In [21]:
# Group by the partition key while aggregating a non-key column (CRSElapsedTime), timed
query = """select UniqueCarrier,sum(CRSElapsedTime) from p_delays_p group by 1"""

start_time = time.time()
spark.sql(query).show()
elapsed = time.time() - start_time

print("--- %s seconds ---" % elapsed)

+-------------+-------------------+
|UniqueCarrier|sum(CRSElapsedTime)|
+-------------+-------------------+
|           UA|        1.3998834E7|
|           AA|        1.7721836E7|
|           NW|          6761017.0|
|           DL|          8245701.0|
|           OO|          6883377.0|
|           US|          8759953.0|
|           MQ|          7710479.0|
|           XE|          7386620.0|
|           CO|          8693653.0|
|           WN|        2.4182455E7|
|           EV|          4284049.0|
|           B6|          4169064.0|
|           F9|          2338358.0|
|           YV|          3216400.0|
|           AQ|            99698.0|
|           OH|          3318613.0|
|           HA|           345580.0|
|           AS|          2527656.0|
|           FL|          4807695.0|
|           9E|          3255692.0|
+-------------+-------------------+

--- 2.0538032054901123 seconds ---


In [22]:
# Filter on the partition key (UniqueCarrier='US') against the partitioned view, timed
query = """Select UniqueCarrier, sum(DepDelay) as total_delayed from p_delays_p where UniqueCarrier='US' group by 1"""
start_time = time.time()
spark.sql(query).show()
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

+-------------+-------------+
|UniqueCarrier|total_delayed|
+-------------+-------------+
|           US|    2077273.0|
+-------------+-------------+

--- 0.812269926071167 seconds ---


In [23]:
# The same UniqueCarrier='US' query, run against the non-partitioned parquet view for comparison
query = """Select UniqueCarrier, sum(DepDelay) as total_delayed from p_delays where UniqueCarrier='US' group by 1"""
start_time = time.time()
spark.sql(query).show()
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

+-------------+-------------+
|UniqueCarrier|total_delayed|
+-------------+-------------+
|           US|    2077273.0|
+-------------+-------------+

--- 0.938183069229126 seconds ---


In [24]:
# A query that filters on TailNum only, i.e. not on the partition key (non-partitioned parquet view)
query = """Select distinct UniqueCarrier, TailNum from p_delays where TailNum='N712SW' """
start_time = time.time()
spark.sql(query).show()
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

+-------------+-------+
|UniqueCarrier|TailNum|
+-------------+-------+
|           WN| N712SW|
+-------------+-------+

--- 0.7862582206726074 seconds ---


In [25]:
# The same TailNum-only query (no partition key in the filter), against the partitioned parquet view
query = """Select distinct UniqueCarrier, TailNum from p_delays_p where TailNum='N712SW' """
start_time = time.time()
spark.sql(query).show()
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

+-------------+-------+
|UniqueCarrier|TailNum|
+-------------+-------+
|           WN| N712SW|
+-------------+-------+

--- 1.5582687854766846 seconds ---
