<a href="https://colab.research.google.com/github/mascee/Spark_Exercises/blob/main/read_and_write_parquet_solution_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
# Find the latest version of spark 3.x  from https://downloads.apache.org/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.5.4'
spark_version = 'spark-3.5.4'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Connecting to security.ubuntu.com (91.189.91.81)] [Connected to cloud.r-project.org (65.9.86.12)                                                                                                    Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
                                                                                                    Get:3 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
                                                                                                    Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [4 InRelease 14.2 kB/129 kB 11%] [Connected to cloud.r-project.org (65.9.86.12)] [Connecting to r                                                                                                    Get:5 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get

In [2]:
# Import packages
from pyspark.sql import SparkSession
# Import the time module so we can time our queries.
import time

# Build (or reuse) the SparkSession for this notebook, with 2g of driver memory
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

In [3]:
# Load the NYC building violations CSV from the S3 bucket into a DataFrame
from pyspark import SparkFiles

url = "https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-classroom/v1.2/22-big-data/3/NYC_Building_Violations.csv"

# Register the remote file with Spark, then look up where it landed locally
spark.sparkContext.addFile(url)
csv_path = SparkFiles.get("NYC_Building_Violations.csv")

# The first row of the file holds the column names
df = spark.read.csv(csv_path, sep=",", header=True)

# Preview the data
df.show()

+----------------+----+-------+-----+-----+----------+-------------------+----------------+------------+--------------------+----------------+--------------------+-------------+--------------------+----------+--------------------+--------------------+--------------------+
|ISN_DOB_BIS_VIOL|BORO|    BIN|BLOCK|  LOT|ISSUE_DATE|VIOLATION_TYPE_CODE|VIOLATION_NUMBER|HOUSE_NUMBER|              STREET|DISPOSITION_DATE|DISPOSITION_COMMENTS|DEVICE_NUMBER|         DESCRIPTION|ECB_NUMBER|              NUMBER|  VIOLATION_CATEGORY|      VIOLATION_TYPE|
+----------------+----+-------+-----+-----+----------+-------------------+----------------+------------+--------------------+----------------+--------------------+-------------+--------------------+----------+--------------------+--------------------+--------------------+
|         2286033|   1|1009713|00577|00019|  20180507|                  E|     9027/627971|          34|        WEST 14TH ST|        20220509|PPN203 AOC SUB 05...|      1P13420|    

In [4]:
# Let's create a view from our DataFrame and run SQL that sums the BORO column for each type of violation.
# We also print how long this step takes to run, in seconds.
# Because we are timing the executions, remember to run the cell twice to eliminate the "load time" from the discussion.

# Register the view before the clock starts so only the query itself is timed
df.createOrReplaceTempView('violations')

violations_query = """select VIOLATION_TYPE, sum(BORO) from violations group by 1"""

start_time = time.time()
spark.sql(violations_query).show()
elapsed_seconds = time.time() - start_time

print("--- %s seconds ---" % elapsed_seconds)

+--------------------+---------+
|      VIOLATION_TYPE|sum(BORO)|
+--------------------+---------+
|LL10/80-LOCAL LAW...|   3609.0|
|LL11/98-LOCAL LAW...|   9285.0|
|HVIOS-NYCHA ELEV ...|    969.0|
|P-PLUMBING       ...|  29480.0|
|ACH1-(NYCHA) - EL...|   4949.0|
|LANDMRK-LANDMARK ...|   5599.0|
|LL5-LOCAL LAW 5/7...|   1363.0|
|IMD-IMMEDIATE EME...|     13.0|
|B-BOILER         ...|  17042.0|
|FISP-FACADE SAFET...|   6889.0|
|EGNCY-EMERGENCY  ...|  12607.0|
|ES-ELECTRIC SIGNS...|  18378.0|
|                NULL|    148.0|
|L1198-LOCAL LAW 1...|  10656.0|
|HBLVIO-HIGH PRESS...|  14628.0|
|BENCH-FAILURE TO ...| 110285.0|
|RWNRF-RETAINING W...|   4007.0|
|FISPNRF-NO REPORT...|  21017.0|
|LL2604-PHOTOLUMIN...|    679.0|
|LL2604S-SPRINKLER...|   1513.0|
+--------------------+---------+
only showing top 20 rows

--- 15.868293046951294 seconds ---


In [5]:
# Save the DataFrame to disk in parquet format.
# Note: the call looks much like writing a csv out to your local directory.
# The 'overwrite' mode tells Spark to replace any data that already exists at this path.
df.write.mode('overwrite').parquet('parquet_violations')



*   Click the folder icon on the left of the notebook to expose the folders and files stored in your Colab environment. Notice that a new folder is present with the same name as your parquet output (parquet_violations).
*   Inside of it you will find 'part-*.parquet' files and a '_SUCCESS' file.
*   The '_SUCCESS' file is an empty marker file that Spark creates when the write to the Parquet folder completes successfully.
*   The part-* files are binary files that store your compressed data in columnar format.





In [6]:
# Load the parquet data we just wrote back into a new DataFrame
p_df = spark.read.parquet('parquet_violations')

In [7]:
# A DataFrame loaded from parquet offers the same methods as a row-based one,
# so it can be registered as a SQL view in exactly the same way.
p_df.createOrReplaceTempView(name='p_violations')

In [None]:
# Time the same aggregation as before, this time against the parquet-backed view.
# (Note: with small datasets it IS possible that the two timings are very close.)
# Run the cell twice so that "load time" does not distort the comparison.

parquet_query = """select VIOLATION_TYPE, sum(BORO) from p_violations group by 1"""

start_time = time.time()
spark.sql(parquet_query).show()
elapsed_seconds = time.time() - start_time

print("--- %s seconds ---" % elapsed_seconds)

+--------------------+---------+
|      VIOLATION_TYPE|sum(BORO)|
+--------------------+---------+
|LL10/80-LOCAL LAW...|   3609.0|
|LL11/98-LOCAL LAW...|   9285.0|
|HVIOS-NYCHA ELEV ...|    969.0|
|P-PLUMBING       ...|  29480.0|
|ACH1-(NYCHA) - EL...|   4949.0|
|LANDMRK-LANDMARK ...|   5599.0|
|LL5-LOCAL LAW 5/7...|   1363.0|
|IMD-IMMEDIATE EME...|     13.0|
|FISP-FACADE SAFET...|   6889.0|
|B-BOILER         ...|  17042.0|
|EGNCY-EMERGENCY  ...|  12607.0|
|ES-ELECTRIC SIGNS...|  18378.0|
|                NULL|    148.0|
|L1198-LOCAL LAW 1...|  10656.0|
|HBLVIO-HIGH PRESS...|  14628.0|
|BENCH-FAILURE TO ...| 110285.0|
|RWNRF-RETAINING W...|   4007.0|
|FISPNRF-NO REPORT...|  21017.0|
|LL2604-PHOTOLUMIN...|    679.0|
|LL2604S-SPRINKLER...|   1513.0|
+--------------------+---------+
only showing top 20 rows

--- 4.571583271026611 seconds ---


In [8]:
# Spark also writes csv output as a folder of "part" files.
# Unlike parquet, these parts are neither binary nor compressed: they are
# ordinary csv files, one per partition of the data.
# Look for the folder 'out_violations.csv' in your local directory.
df.write.mode('overwrite').csv('out_violations.csv')