In [1]:
## Solution using SQL
from pyspark.sql import SparkSession,Row
from pyspark.sql.functions import date_format, to_timestamp
from pyspark.sql.functions import when, col, count
from pyspark.sql.types import FloatType

import time
# Accumulates one Row of timing measurements per benchmark run in this cell.
results = []

# Reuse the notebook's Spark session (or create one) and empty its table cache.
spark = SparkSession.builder.appName("SQL query 2").getOrCreate()
spark.catalog.clearCache()

conf = spark.sparkContext.getConf()

# Report the executor settings the timings below were measured with.
# conf.get() yields None for a key that is not set in the configuration.
for label, key in (
    ("Executor Instances:", "spark.executor.instances"),
    ("Executor Memory:", "spark.executor.memory"),
    ("Executor Cores:", "spark.executor.cores"),
):
    print(label, conf.get(key))

# The two crime-data CSV exports (2020-present and 2010-2019) read as one dataset.
csv_files = [
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv",
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
]

# Empty Spark's table cache, then start the clock before the read so that
# loading is part of the measured time.
spark.catalog.clearCache()
start_time = time.time()

# header=True takes column names from the first line of each file.
# inferSchema=True makes Spark take an extra pass over the data to work out
# column types, so the interval up to read_time is mostly that inference pass;
# the rows themselves are read again when the query is actually executed.
crime_data_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(csv_files)
)
read_time = time.time()

# Register the DataFrame under the table name the SQL query refers to.
crime_data_df.createOrReplaceTempView("crime_data")

# Spark SQL query intended to have the same effect as the DataFrame version of
# this solution. Common table expressions (WITH ... AS) hold the intermediate
# results to keep each step simple:
#
#   base_data   - one row per (`area name`, year). The year is taken from
#                 `date occ`, parsed with the 12-hour pattern
#                 'MM/dd/yyyy hh:mm:ss a'.
#                   open   = rows whose `status desc` is 'UNK' or 'Invest Cont'
#                   closed = rows with any other non-NULL `status desc`
#                   rate   = 100 * closed / (open + closed), i.e. the percentage
#                            of closed cases
#   ranked_data - ranks the areas inside each year by rate, highest first.
#
# The final select keeps ranks 1-3 for every year, ordered by year and rank.
# rank() gives tied rows the same rank, so a year with ties can return more
# than three rows.
select_query = """
                with base_data as (
                    select 
                        date_format(to_timestamp(`date occ`, 'MM/dd/yyyy hh:mm:ss a'), 'yyyy') as year,
                        `area name`,
                        count(case when `status desc` in ('UNK', 'Invest Cont') then 1 end) as open,
                        count(case when `status desc` not in ('UNK', 'Invest Cont') then 1 end) as closed,
                        (100 * count(case when `status desc` not in ('UNK', 'Invest Cont') then 1 end) / 
                         (count(case when `status desc` in ('UNK', 'Invest Cont') then 1 end) + 
                          count(case when `status desc` not in ('UNK', 'Invest Cont') then 1 end))) as rate
                    from crime_data
                    group by `area name`, year
                ),
                ranked_data as (
                    select 
                        `area name`,
                        year,
                        rate,
                        rank() over(partition by year order by rate desc) as rank
                    from base_data
                )
                select 
                    year,
                    `area name`,
                    rate,
                    rank
                from ranked_data
                where rank < 4
                order by year, rank
            """


# Build the result DataFrame. spark.sql() is lazy: nothing runs on the cluster
# until an action is called on it.
crime_data_with_year = spark.sql(select_query)

# collect() is the action that actually executes the query. Keep the returned
# rows so the result can be displayed afterwards without running the job again.
top_rows = crime_data_with_year.collect()
# End timing
end_time = time.time()

# One benchmark record for this run (all durations in seconds):
#   Read_time      - from start_time until spark.read.csv() returned
#   Execution_Time - from then until collect() finished
#   Total_time     - the whole span
results.append(Row(
            API="SQL",
            File_Format="CSV",
            Read_time=read_time-start_time,
            Execution_Time=end_time-read_time,
            Total_time= end_time-start_time
        ))

# Display the complete result. A bare crime_data_with_year.show() prints only
# the first 20 rows (the query returns at least three rows per year) and would
# also re-execute the whole query against the CSV files. Showing the rows that
# were already collected avoids both problems. max(..., 1) keeps the row count
# passed to show() positive when the result is empty.
spark.createDataFrame(top_rows, schema=crime_data_with_year.schema).show(max(len(top_rows), 1))
results_df = spark.createDataFrame(results)

# Save the DataFrame to S3 as JSON; "append" keeps the records of earlier runs.
results_df.write.mode("append").json("s3://groups-bucket-dblab-905418150721/group45/q2/results.json")


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3235,application_1732639283265_3191,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executor Instances: None
Executor Memory: 4743M
Executor Cores: 2
+----+-----------+------------------+----+
|year|  area name|              rate|rank|
+----+-----------+------------------+----+
|2010|    Rampart| 32.84713448949121|   1|
|2010|    Olympic|31.515289821999087|   2|
|2010|     Harbor| 29.36028339237341|   3|
|2011|    Olympic|  35.0400600901352|   1|
|2011|    Rampart|32.496447181430604|   2|
|2011|     Harbor|28.513362463164313|   3|
|2012|    Olympic| 34.29708533302119|   1|
|2012|    Rampart| 32.46000463714352|   2|
|2012|     Harbor| 29.50958584895668|   3|
|2013|    Olympic| 33.58217940999398|   1|
|2013|    Rampart|  32.1060382916053|   2|
|2013|     Harbor|29.723638951488553|   3|
|2014|   Van Nuys|  32.0215235281705|   1|
|2014|West Valley| 31.49754809505847|   2|
|2014|    Mission| 31.22493985565357|   3|
|2015|   Van Nuys|32.265140677157845|   1|
|2015|    Mission|30.463762673676303|   2|
|2015|   Foothill|30.353001803658852|   3|
|2016|   Van Nuys|32.1945184621

In [None]:
# Convert the crime-data CSV files into a Parquet copy of the dataset.
csv_files = [
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv",
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
]

# Column names come from the header line; column types are inferred from the data.
crime_data_df = spark.read.options(header=True, inferSchema=True).csv(csv_files)

# Destination prefix of the Parquet output
parquet_path = "s3://groups-bucket-dblab-905418150721/group45/single_file"

# Collapse to a single partition before writing and replace any existing
# output at parquet_path.
# NOTE(review): the CSV column names contain spaces (e.g. `date occ`,
# `area name`); Spark releases before 3.3 are believed to reject such names
# when writing Parquet -- confirm against the cluster's Spark version.
(
    crime_data_df
    .coalesce(1)
    .write
    .mode("overwrite")
    .parquet(parquet_path)
)

