In [1]:
# Solution using parquet and SQL
from pyspark.sql import SparkSession,Row
from pyspark.sql.functions import date_format, to_timestamp
from pyspark.sql.functions import when, col, count
from pyspark.sql.types import FloatType

import time
# Query 2, SQL API on Parquet input: for every year, rank the areas by the
# percentage of closed cases and keep the top three. The cell also records how
# long the read and the query take and appends that measurement to the shared
# results file on S3.
parquet_path = "s3://groups-bucket-dblab-905418150721/group45/single_file"
results = []

spark = SparkSession.builder.appName("query 2 parquet and sql").getOrCreate()
conf = spark.sparkContext.getConf()

# Print relevant executor settings (a setting that is not configured prints None)
print("Executor Instances:", conf.get("spark.executor.instances"))
print("Executor Memory:", conf.get("spark.executor.memory"))
print("Executor Cores:", conf.get("spark.executor.cores"))

# Start timing. The cache is cleared first so earlier runs in the same session
# cannot speed this one up. perf_counter() is used instead of time.time()
# because it is monotonic and intended for measuring elapsed intervals.
spark.catalog.clearCache()
start_time = time.perf_counter()

# Parquet files carry their own schema, so the CSV-only options `header` and
# `inferSchema` are not passed: they have no effect on a Parquet read.
# NOTE: the read is lazy. Read_time therefore covers file listing and schema
# discovery only; the actual data scan is paid for inside Execution_Time.
crime_data_df = spark.read.parquet(parquet_path)
# the code from here is the same as in the SQL-on-CSV variant
read_time = time.perf_counter()

crime_data_df.createOrReplaceTempView("crime_data")


# base_data:   per (area, year) counts of open / closed cases and the closed rate.
# ranked_data: rank of every area within its year, best rate first.
# final select: the three best-ranked areas of each year (ties share a rank).
select_query = """
                with base_data as (
                    select 
                        date_format(to_timestamp(`date occ`, 'MM/dd/yyyy hh:mm:ss a'), 'yyyy') as year,
                        `area name`,
                        count(case when `status desc` in ('UNK', 'Invest Cont') then 1 end) as open,
                        count(case when `status desc` not in ('UNK', 'Invest Cont') then 1 end) as closed,
                        (100 * count(case when `status desc` not in ('UNK', 'Invest Cont') then 1 end) / 
                         (count(case when `status desc` in ('UNK', 'Invest Cont') then 1 end) + 
                          count(case when `status desc` not in ('UNK', 'Invest Cont') then 1 end))) as rate
                    from crime_data
                    group by `area name`, year
                ),
                ranked_data as (
                    select 
                        `area name`,
                        year,
                        rate,
                        rank() over(partition by year order by rate desc) as rank
                    from base_data
                )
                select 
                    year,
                    `area name`,
                    rate,
                    rank
                from ranked_data
                where rank < 4
                order by year, rank
            """


# Execute the query. spark.sql() only builds the plan; collect() is the action
# that forces the computation, so the timer is stopped right after it.
crime_data_with_year = spark.sql(select_query)

crime_data_with_year.collect()
end_time = time.perf_counter()


# End timing: store the three durations (in seconds) for this API/format pair.
results.append(Row(
            API="SQL",
            File_Format="Parquet",
            Read_time=read_time-start_time,
            Execution_Time=end_time-read_time,
            Total_time= end_time-start_time
        ))
results_df = spark.createDataFrame(results)

# Save the DataFrame to S3 as JSON.
# NOTE: "append" mode adds a new record on every run of this cell, so re-running
# it leaves several SQL/Parquet rows in the shared results file.
results_df.write.mode("append").json("s3://groups-bucket-dblab-905418150721/group45/q2/results.json")

# Display the ranking. show() is a separate action and evaluates the query
# again, but this happens after the timed window and does not affect the
# recorded durations.
crime_data_with_year.show()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3237,application_1732639283265_3193,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executor Instances: None
Executor Memory: 4743M
Executor Cores: 2
+----+-----------+------------------+----+
|year|  area name|              rate|rank|
+----+-----------+------------------+----+
|2010|    Rampart| 32.84713448949121|   1|
|2010|    Olympic|31.515289821999087|   2|
|2010|     Harbor| 29.36028339237341|   3|
|2011|    Olympic|  35.0400600901352|   1|
|2011|    Rampart|32.496447181430604|   2|
|2011|     Harbor|28.513362463164313|   3|
|2012|    Olympic| 34.29708533302119|   1|
|2012|    Rampart| 32.46000463714352|   2|
|2012|     Harbor| 29.50958584895668|   3|
|2013|    Olympic| 33.58217940999398|   1|
|2013|    Rampart|  32.1060382916053|   2|
|2013|     Harbor|29.723638951488553|   3|
|2014|   Van Nuys|  32.0215235281705|   1|
|2014|West Valley| 31.49754809505847|   2|
|2014|    Mission| 31.22493985565357|   3|
|2015|   Van Nuys|32.265140677157845|   1|
|2015|    Mission|30.463762673676303|   2|
|2015|   Foothill|30.353001803658852|   3|
|2016|   Van Nuys|32.1945184621

In [5]:
# Load every timing record written by the query 2 runs (parts a and b combined)
# and display them with the time measurements formatted to three decimal places.
from pyspark.sql.functions import col, format_number

results_df = spark.read.json("s3://groups-bucket-dblab-905418150721/group45/q2/results.json")

# Identifying columns are shown as-is; timing columns are formatted but keep
# their original names.
label_columns = [col(name) for name in ("API", "File_Format")]
timing_columns = [
    format_number(col(name), 3).alias(name)
    for name in ("Read_time", "Execution_Time", "Total_time")
]
results_df.select(*label_columns, *timing_columns).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+-----------+---------+--------------+----------+
|      API|File_Format|Read_time|Execution_Time|Total_time|
+---------+-----------+---------+--------------+----------+
|Dataframe|        CSV|   17.656|        12.201|    29.857|
|      SQL|    Parquet|    5.730|        12.051|    17.781|
|      SQL|        CSV|   18.664|         8.087|    26.751|
+---------+-----------+---------+--------------+----------+

In [6]:
# Timing records for question 2a only: the runs that read the CSV input.
# Relies on `results_df`, `col` and `format_number` from the previous cell.
csv_report_columns = [col("API"), col("File_Format")] + [
    format_number(col(name), 3).alias(name)
    for name in ("Read_time", "Execution_Time", "Total_time")
]
results_df.select(csv_report_columns).where(col("File_Format") == "CSV").show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+-----------+---------+--------------+----------+
|      API|File_Format|Read_time|Execution_Time|Total_time|
+---------+-----------+---------+--------------+----------+
|Dataframe|        CSV|   17.656|        12.201|    29.857|
|      SQL|        CSV|   18.664|         8.087|    26.751|
+---------+-----------+---------+--------------+----------+