In [9]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import time

# Initialize SparkSession
spark = SparkSession \
    .builder \
    .appName("Query2_SQL") \
    .config("spark.executor.instances", 4) \
    .getOrCreate()

# Start timing
start_time = time.time()

# Load crime data
# Load crime data
# Load the 2010-2019 crime data
crime_df_2010_2019 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", 
    header=True
)

# Load the 2020-present crime data
crime_df_2020_present = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv", 
    header=True
)
crime_df = crime_df_2010_2019.union(crime_df_2020_present)

# Clean up the data and classify cases as 'Open' or 'Closed'
crime_df = crime_df.withColumn(
    "Case_Status",
    F.when(F.col("Status Desc").isin("UNK", "Invest Cont"), "Open").otherwise("Closed")
)

# Create a new column 'Year' based on the 'Date Rptd' timestamp column
crime_df = crime_df.withColumn("Year", F.year(F.to_timestamp("Date Rptd", "MM/dd/yyyy hh:mm:ss a")))

# Register the DataFrame as a temporary SQL view
crime_df.createOrReplaceTempView("crime_data")

# SQL query to find the top 3 precincts with the highest closed case rates per year
query = """
SELECT 
    Year,
    Area,
    Closed_Case_Rate,
    Rank
FROM (
    SELECT 
        Year,
        `AREA NAME` AS Area,
        (SUM(CASE WHEN Case_Status = 'Closed' THEN 1 ELSE 0 END) / COUNT(*) * 100) AS Closed_Case_Rate,
        ROW_NUMBER() OVER (PARTITION BY Year ORDER BY (SUM(CASE WHEN Case_Status = 'Closed' THEN 1 ELSE 0 END) / COUNT(*) * 100) DESC) AS Rank
    FROM 
        crime_data
    GROUP BY 
        Year, `AREA NAME`
) AS subquery
WHERE Rank <= 3
ORDER BY Year, Rank
"""

# Execute the query
result_df = spark.sql(query)

# Show the results
result_df.show(truncate=False)


# Execute the query
result_df = spark.sql(query)

# Show the results
result_df.show(truncate=False)

# Stop timing and print out the execution duration
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

# Write results to S3 in Parquet format
group_number = "24"
s3_path = "s3://groups-bucket-dblab-905418150721/group"+group_number+"/results/"

result_df.write.mode("overwrite").parquet(s3_path + "q2_sql_output")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),â€¦

+----+-----------+------------------+----+
|Year|Area       |Closed_Case_Rate  |Rank|
+----+-----------+------------------+----+
|2010|Rampart    |32.947355855318136|1   |
|2010|Olympic    |31.962706191728422|2   |
|2010|Harbor     |29.63203463203463 |3   |
|2011|Olympic    |35.212167689161554|1   |
|2011|Rampart    |32.511779630300836|2   |
|2011|Harbor     |28.652205202015008|3   |
|2012|Olympic    |34.414818310523835|1   |
|2012|Rampart    |32.9464181029429  |2   |
|2012|Harbor     |29.815133276010318|3   |
|2013|Olympic    |33.52812271731191 |1   |
|2013|Rampart    |32.08287360549221 |2   |
|2013|Harbor     |29.16422459266206 |3   |
|2014|Van Nuys   |31.80567315834039 |1   |
|2014|West Valley|31.31198995605775 |2   |
|2014|Mission    |31.16279069767442 |3   |
|2015|Van Nuys   |32.64134698172773 |1   |
|2015|West Valley|30.27597402597403 |2   |
|2015|Mission    |30.179460678380153|3   |
|2016|Van Nuys   |31.880755720117726|1   |
|2016|West Valley|31.54798761609907 |2   |
+----+-----