In [9]:
# imports
import time

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count, desc, rank, sum as _sum
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Start Spark Session
# Request 4 executors, each with 4 GB of memory and 4 cores.
# NOTE: spark.executor.instances is an executor *count* (an integer). It was
# previously given the memory-style string "4g", which is not a valid count.
spark = SparkSession.builder \
    .appName("Query 1") \
    .config("spark.executor.instances", "4") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.cores", "4") \
    .getOrCreate()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
# Load the two crime datasets needed for query 1

# S3 locations of the source CSV files
crime_data_2010_2019 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_data_2020 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"

# Read each file with its header row and inferred column types, then stack the
# 2020-to-present rows beneath the 2010-2019 rows.
# union() pairs columns by position, not by name.
crime_2010_2019_df = spark.read.csv(crime_data_2010_2019, header=True, inferSchema=True)
crime_2020_df = spark.read.csv(crime_data_2020, header=True, inferSchema=True)
crime_data = crime_2010_2019_df.union(crime_2020_df)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame implementation

In [11]:
# Record the wall-clock start of the DataFrame run
start_time_df = time.time()

# Keep only rows whose crime description mentions aggravated assault
assault_df = crime_data.where(col("Crm Cd Desc").contains("AGGRAVATED ASSAULT"))

# Bucket each victim by age. Anything that matches none of the ranges below
# (ages <= 0, missing ages) falls through to "Unknown".
vict_age = col("Vict Age")
victim_age_group = (
    when((vict_age > 0) & (vict_age < 18), "Children")
    .when(vict_age.between(18, 24), "Young Adults")   # between() is inclusive
    .when(vict_age.between(25, 64), "Adults")
    .when(vict_age > 64, "Elderly")
    .otherwise("Unknown")
)
age_group_df = assault_df.withColumn("Victim_Age_Group", victim_age_group)

# Number of incidents per age group
age_group_counts_df = age_group_df.groupBy("Victim_Age_Group").agg(
    count("*").alias("Incident_Count")
)

# Largest groups first
sorted_age_group_counts_df = age_group_counts_df.orderBy(desc("Incident_Count"))

# Print the result table
sorted_age_group_counts_df.show()

# Record the end of the run and report the elapsed wall-clock time
end_time_df = time.time()
elapsed_time_df = end_time_df - start_time_df
print(f"DataFrame Implementation Time: {elapsed_time_df:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+--------------+
|Victim_Age_Group|Incident_Count|
+----------------+--------------+
|          Adults|        121093|
|    Young Adults|         33605|
|        Children|         10830|
|         Elderly|          5985|
|         Unknown|          5098|
+----------------+--------------+

DataFrame Implementation Time: 4.30 seconds

RDD implementation

In [12]:
def _victim_age_group(age):
    """Return the age-group label for a victim's age.

    Mirrors the buckets used by the DataFrame implementation:
    0 < age < 18 -> "Children", 18..24 -> "Young Adults",
    25..64 -> "Adults", > 64 -> "Elderly", anything else -> "Unknown".

    A missing age (None) is labelled "Unknown" instead of raising a
    TypeError on comparison, matching how the DataFrame version treats nulls.
    """
    if age is None:
        return "Unknown"
    if 0 < age < 18:
        return "Children"
    if 18 <= age <= 24:
        return "Young Adults"
    if 25 <= age <= 64:
        return "Adults"
    if age > 64:
        return "Elderly"
    return "Unknown"


# Start timing
start_time_rdd = time.time()

# Filter for aggravated assault crimes, then drop down to the RDD API
assault_rdd = crime_data.filter(col("Crm Cd Desc").contains("AGGRAVATED ASSAULT")).rdd

# Map each row to an (age_group, 1) pair
age_group_rdd = assault_rdd.map(lambda row: (_victim_age_group(row["Vict Age"]), 1))

# Reduce by key to count the occurrences of each age group
age_group_counts_rdd = age_group_rdd.reduceByKey(lambda x, y: x + y)

# Sort by count in descending order
sorted_age_group_counts_rdd = age_group_counts_rdd.sortBy(lambda x: x[1], ascending=False)

# Collect and display the results.
# The loop variable is deliberately NOT called `count`: at notebook scope that
# would overwrite pyspark.sql.functions.count and break a re-run of the
# DataFrame cell.
sorted_age_group_counts = sorted_age_group_counts_rdd.collect()
for group, incident_count in sorted_age_group_counts:
    print(f"{group}: {incident_count}")

# End timing
end_time_rdd = time.time()

# Calculate elapsed time
elapsed_time_rdd = end_time_rdd - start_time_rdd
print(f"RDD Implementation Time: {elapsed_time_rdd:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Adults: 121093
Young Adults: 33605
Children: 10830
Elderly: 5985
Unknown: 5098
RDD Implementation Time: 7.35 seconds

In [13]:
# Recap both timings side by side
print(f"DataFrame Implementation Time: {elapsed_time_df:.2f} seconds")
print(f"RDD Implementation Time: {elapsed_time_rdd:.2f} seconds")

# Announce the faster API; an exact tie is reported in favour of the RDD API
verdict = "DataFrame API is faster." if elapsed_time_df < elapsed_time_rdd else "RDD API is faster."
print(verdict)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame Implementation Time: 4.30 seconds
RDD Implementation Time: 7.35 seconds
DataFrame API is faster.