In [11]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import time

# Initialize SparkSession
spark = SparkSession \
    .builder \
    .appName("Query2_SQL") \
    .config("spark.executor.instances", 4) \
    .getOrCreate()

# Start timing
start_time = time.time()

# Load the 2010-2019 crime data
crime_df_2010_2019 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", 
    header=True
)

# Load the 2020-present crime data
crime_df_2020_present = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv", 
    header=True
)
crime_df = crime_df_2010_2019.union(crime_df_2020_present)

# Clean up the data and classify cases as 'Open' or 'Closed'
crime_df = crime_df.withColumn(
    "Case_Status",
    F.when(F.col("Status Desc").isin("UNK", "Invest Cont"), "Open").otherwise("Closed")
)

# Create a new column 'Year' based on the 'Date Rptd' timestamp column
crime_df = crime_df.withColumn("Year", F.year(F.to_timestamp("DATE OCC", "MM/dd/yyyy hh:mm:ss a")))

# Register the DataFrame as a temporary SQL view
crime_df.createOrReplaceTempView("crime_data")

# SQL query to find the top 3 precincts with the highest closed case rates per year
query = """
SELECT 
    Year,
    Area,
    Closed_Case_Rate,
    Rank
FROM (
    SELECT 
        Year,
        `AREA NAME` AS Area,
        (SUM(CASE WHEN Case_Status = 'Closed' THEN 1 ELSE 0 END) / COUNT(*) * 100) AS Closed_Case_Rate,
        ROW_NUMBER() OVER (PARTITION BY Year ORDER BY (SUM(CASE WHEN Case_Status = 'Closed' THEN 1 ELSE 0 END) / COUNT(*) * 100) DESC) AS Rank
    FROM 
        crime_data
    GROUP BY 
        Year, `AREA NAME`
) AS subquery
WHERE Rank <= 3
ORDER BY Year, Rank
"""

# Execute the query
result_df = spark.sql(query)

# Show the results
result_df.show(truncate=False)

# Stop timing and print out the execution duration
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

# Write results to S3 in Parquet format
group_number = "24"
s3_path = "s3://groups-bucket-dblab-905418150721/group"+group_number+"/results/"

result_df.write.mode("overwrite").parquet(s3_path + "q2_sql_output")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+-----------+------------------+----+
|Year|Area       |Closed_Case_Rate  |Rank|
+----+-----------+------------------+----+
|2010|Rampart    |32.84713448949121 |1   |
|2010|Olympic    |31.515289821999087|2   |
|2010|Harbor     |29.36028339237341 |3   |
|2011|Olympic    |35.040060090135206|1   |
|2011|Rampart    |32.4964471814306  |2   |
|2011|Harbor     |28.51336246316431 |3   |
|2012|Olympic    |34.29708533302119 |1   |
|2012|Rampart    |32.46000463714352 |2   |
|2012|Harbor     |29.509585848956675|3   |
|2013|Olympic    |33.58217940999398 |1   |
|2013|Rampart    |32.1060382916053  |2   |
|2013|Harbor     |29.723638951488557|3   |
|2014|Van Nuys   |32.0215235281705  |1   |
|2014|West Valley|31.49754809505847 |2   |
|2014|Mission    |31.224939855653567|3   |
|2015|Van Nuys   |32.265140677157845|1   |
|2015|Mission    |30.463762673676303|2   |
|2015|Foothill   |30.353001803658852|3   |
|2016|Van Nuys   |32.194518462124094|1   |
|2016|West Valley|31.40146437042384 |2   |
+----+-----