In [6]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
import time

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("Query2_DataFrame") \
    .config("spark.executor.instances", 4) \
    .getOrCreate()

# Start timing
start_time = time.time()

# Load crime data
# Load crime data
# Load the 2010-2019 crime data
crime_df_2010_2019 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", 
    header=True
)

# Load the 2020-present crime data
crime_df_2020_present = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv", 
    header=True
)

crime_df = crime_df_2010_2019.union(crime_df_2020_present)

# parquet_path = "s3://groups-bucket-dblab-905418150721/group24/results/q2_parquet_maindata/part-00000-f07e4ac6-6590-4191-a971-857760026d74-c000.snappy.parquet"
# crime_df = spark.read.parquet(parquet_path)

# Clean up the data and filter out cases based on 'Status Desc'
# Open cases have 'Status Desc' as 'UNK' or 'InvestCont', others are considered closed
crime_df = crime_df.withColumn(
    "Case_Status",
    F.when(F.col("Status Desc").isin("UNK", "Invest Cont"), "Open").otherwise("Closed")
)
# crime_df.select("Status Desc", "Case_Status").show(10)
# Extract the year from 'Date Rptd' (timestamp column)
crime_df = crime_df.withColumn("Year", F.year(F.to_timestamp("DATE OCC", "MM/dd/yyyy hh:mm:ss a")))

# Group by year and area to calculate the closed case rate
year_area_group = crime_df.groupBy(
    F.col("Year"),
    F.col("AREA NAME").alias("Area")
).agg(
    F.sum(F.when(F.col("Case_Status") == "Closed", 1).otherwise(0)).alias("Closed_Cases"),
    F.count(F.col("Case_Status")).alias("Total_Cases")
)

# Calculate the closed case rate
year_area_group = year_area_group.withColumn(
    "Closed_Case_Rate",
    (F.col("Closed_Cases") / F.col("Total_Cases")) * 100
)

# Rank areas within each year based on the closed case rate
window_spec = Window.partitionBy("Year").orderBy(F.col("Closed_Case_Rate").desc())

ranked_df = year_area_group.withColumn(
    "Rank",
    F.row_number().over(window_spec)
)

# Filter the top 3 areas for each year
top_3_areas_df = ranked_df.filter(F.col("Rank") <= 3)

# Show the results
top_3_areas_df.select("Year", "Area", "Closed_Case_Rate", "Rank") \
    .orderBy("Year", "Rank") \
    .show(truncate=False)

# Stop timing and print out the execution duration
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

# Write results to parquet
group_number = "24"
s3_path = "s3://groups-bucket-dblab-905418150721/group"+group_number+"/results/"

top_3_areas_df.write.mode("overwrite").parquet(s3_path + "q2_dataframe_output")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+-----------+------------------+----+
|Year|Area       |Closed_Case_Rate  |Rank|
+----+-----------+------------------+----+
|2010|Rampart    |32.84713448949121 |1   |
|2010|Olympic    |31.515289821999087|2   |
|2010|Harbor     |29.36028339237341 |3   |
|2011|Olympic    |35.040060090135206|1   |
|2011|Rampart    |32.4964471814306  |2   |
|2011|Harbor     |28.51336246316431 |3   |
|2012|Olympic    |34.29708533302119 |1   |
|2012|Rampart    |32.46000463714352 |2   |
|2012|Harbor     |29.509585848956675|3   |
|2013|Olympic    |33.58217940999398 |1   |
|2013|Rampart    |32.1060382916053  |2   |
|2013|Harbor     |29.723638951488557|3   |
|2014|Van Nuys   |32.0215235281705  |1   |
|2014|West Valley|31.49754809505847 |2   |
|2014|Mission    |31.224939855653567|3   |
|2015|Van Nuys   |32.265140677157845|1   |
|2015|Mission    |30.463762673676303|2   |
|2015|Foothill   |30.353001803658852|3   |
|2016|Van Nuys   |32.194518462124094|1   |
|2016|West Valley|31.40146437042384 |2   |
+----+-----