In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import trim, lower, col, when, desc
import time
# Initialize SparkSession
spark = SparkSession.builder \
    .appName("Query1_DataFrame") \
    .config("spark.executor.instances", 4) \
    .getOrCreate()

# Start timing
start_time = time.time()
# Load crime data
# Load the 2010-2019 crime data
crime_df_2010_2019 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", 
    header=True
)

# Load the 2020-present crime data
crime_df_2020_present = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv", 
    header=True
)
crime_df = crime_df_2010_2019.union(crime_df_2020_present)


# crime_df.select("Crm Cd Desc", "LAT", "LON").show(10, truncate=False)

# Clean up 'Crm Cd Desc' column and apply filters
filtered_df = crime_df.filter(
    (lower(trim(col("Crm Cd Desc"))).contains("aggravated assault")) &  # Case-insensitive match
    (col("LAT").isNotNull() & col("LON").isNotNull() & (col("LAT") != 0) & (col("LON") != 0))  # Valid coordinates
)

# filtered_df.select("Crm Cd Desc", "LAT", "LON").show(10, truncate=False)
# filtered_df.count()

# Add Age Group column based on 'Vict Age'
age_grouped_df = filtered_df.withColumn(
    "Age_Group",
    when(col("Vict Age").cast("int") < 18, "Children")
    .when((col("Vict Age").cast("int") >= 18) & (col("Vict Age").cast("int") <= 24), "Young Adults")
    .when((col("Vict Age").cast("int") >= 25) & (col("Vict Age").cast("int") <= 64), "Adults")
    .when(col("Vict Age").cast("int") > 64, "Seniors")
)

# Group by Age Group and count incidents, then sort
result_df = age_grouped_df.groupBy("Age_Group").count().orderBy(desc("count"))

# Stop timing and print out the execution duration
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

# Write results
group_number = "24"

s3_path = "s3://groups-bucket-dblab-905418150721/group"+group_number+"/results/"

result_df.write.mode("overwrite").parquet(s3_path + "q1_dataframe_output")

# Show results
result_df.show()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Time taken: 7.61 seconds
+------------+------+
|   Age_Group| count|
+------------+------+
|      Adults|121052|
|Young Adults| 33588|
|    Children| 15923|
|     Seniors|  5985|
+------------+------+