In [1]:
## Solution using Dataframe
from pyspark.sql import SparkSession,Row
from pyspark.sql.functions import when, rank,col, count,date_format, to_timestamp,udf
from pyspark.sql.types import FloatType
from pyspark.sql.window import Window
import time
# Create (or reuse) the Spark session for this query and set up the list that
# accumulates one timing row per run.
spark = SparkSession.builder.appName("Query 2 DF").getOrCreate()
results = []
conf = spark.sparkContext.getConf()

# Report the executor settings visible in the Spark configuration, one per line.
print("\n".join(
    f"{label} {conf.get(setting)}"
    for label, setting in (
        ("Executor Instances:", "spark.executor.instances"),
        ("Executor Memory:", "spark.executor.memory"),
        ("Executor Cores:", "spark.executor.cores"),
    )
))
# S3 locations of the two crime-data CSV extracts (2020-to-present and
# 2010-to-2019); both are handed to a single read call below.
csv_files = [
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv",
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
]
# Start timing before read in order to capture read time.
# The cache is cleared first -- presumably so that data cached by an earlier
# run cannot skew the measurement (TODO confirm intent).
spark.catalog.clearCache()
start_time = time.time()
# Both files are read as one DataFrame, with a header row and inferred column types.
# NOTE(review): only the work done inside this call is counted as "read time";
# any work Spark defers until the later collect() lands in the execution time.
crime_data_df = spark.read.csv(csv_files, header=True, inferSchema=True)
# Boundary between the read phase and the execution phase: the results row
# built later reports read_time - start_time and end_time - read_time.
read_time = time.time()


# Keep only the three columns this query needs.
selected_columns = crime_data_df.select("AREA NAME", "Status Desc", "DATE OCC")

# Format the dataset and categorize crimes.
# The occurrence date is parsed with the "MM/dd/yyyy hh:mm:ss a" pattern and
# reduced to a four-digit year string. The column is referenced with the same
# spelling used in the select above ("DATE OCC") so the lookup does not depend
# on Spark's case-insensitive column resolution.
crime_data_with_year = selected_columns.withColumn(
    "Year",
    date_format(to_timestamp(col("DATE OCC"), "MM/dd/yyyy hh:mm:ss a"), "yyyy")
).select("Year", "AREA NAME", "Status Desc")

# A case is "Open" when its status is "UNK" or "Invest Cont"; every other
# status falls through to "Closed".
categorized_df = crime_data_with_year.withColumn(
    "Case Status",
    when(col("Status Desc").isin("UNK", "Invest Cont"), "Open").otherwise("Closed")
)

# Count the number of crimes in each category per (AREA NAME, Year).
# when() without otherwise() yields null for non-matching rows, and count()
# skips nulls, so each aggregate counts only the rows of its own category.
list_categorized_df = categorized_df.groupby("AREA NAME", "Year").agg(
    count(when(col("Case Status") == "Closed", 1)).alias("Closed"),
    count(when(col("Case Status") == "Open", 1)).alias("Open")
)

# Function behind the UDF that computes the closed-case rate. A UDF is not
# strictly necessary here, but it was used to try a different implementation.
def closed_rate(closed, still_open):
    """Return the percentage of cases that are closed.

    Args:
        closed: number of closed cases.
        still_open: number of cases that are still open.

    Returns:
        100 * closed / (closed + still_open) as a float, or None when either
        count is missing or when there are no cases at all. Returning None
        instead of raising keeps a single bad group from failing the whole
        job when this function runs as a UDF.
    """
    if closed is None or still_open is None:
        return None
    total = closed + still_open
    if total == 0:
        # No cases in the group: the rate is undefined rather than an error.
        return None
    return 100 * closed / total


# Wrap the Python function as a Spark UDF that produces a float column.
closed_rate_udf = udf(closed_rate, FloatType())

# Project straight to the three output columns, computing "Rate" from the
# per-group Closed and Open counts.
crime_data_with_rate = list_categorized_df.select(
    "Year",
    "AREA NAME",
    closed_rate_udf(col("Closed"), col("Open")).alias("Rate"),
)


# Window specification: within each Year, order the areas by closed-case
# rate from highest to lowest.
window_spec = Window.partitionBy("Year").orderBy(col("Rate").desc())

# Add a rank column and keep the rows ranked 1-3 in each year. Sorting by
# rank as well as Year makes the row order inside each year deterministic
# (ordering by Year alone leaves it unspecified).
top_3_rates_df = (
    crime_data_with_rate
    .withColumn("rank", rank().over(window_spec))
    .filter(col("rank") <= 3)
    .orderBy("Year", "rank")
)

# collect() forces the pipeline to execute so that end_time covers the full
# computation; the returned rows themselves are not used.
top_3_rates_df.collect()
end_time = time.time()

# Record the read / execution / total durations (in seconds) for this run.
results.append(Row(
    API="Dataframe",
    File_Format="CSV",
    Read_time=read_time-start_time,
    Execution_Time=end_time-read_time,
    Total_time=end_time-start_time
))
top_3_rates_df.show()
results_df = spark.createDataFrame(results)

# Save the DataFrame to S3 as JSON
results_df.write.mode("overwrite").json("s3://groups-bucket-dblab-905418150721/group45/q2/results.json")


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3231,application_1732639283265_3187,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executor Instances: None
Executor Memory: 4743M
Executor Cores: 2
+----+-----------+---------+----+
|Year|  AREA NAME|     Rate|rank|
+----+-----------+---------+----+
|2010|    Rampart|32.847134|   1|
|2010|    Olympic| 31.51529|   2|
|2010|     Harbor|29.360283|   3|
|2011|    Olympic|35.040062|   1|
|2011|    Rampart| 32.49645|   2|
|2011|     Harbor|28.513363|   3|
|2012|    Olympic|34.297085|   1|
|2012|    Rampart|32.460003|   2|
|2012|     Harbor|29.509586|   3|
|2013|    Olympic| 33.58218|   1|
|2013|    Rampart|32.106037|   2|
|2013|     Harbor|29.723639|   3|
|2014|   Van Nuys|32.021523|   1|
|2014|West Valley|31.497547|   2|
|2014|    Mission| 31.22494|   3|
|2015|   Van Nuys| 32.26514|   1|
|2015|    Mission|30.463762|   2|
|2015|   Foothill|30.353003|   3|
|2016|   Van Nuys| 32.19452|   1|
|2016|West Valley|31.401464|   2|
+----+-----------+---------+----+
only showing top 20 rows