In [1]:
from pyspark.sql import SparkSession
import time
from pyspark.sql.functions import col, when

# Query 1 (DataFrame API): count aggravated-assault victims per age group and
# report how long the DataFrame pipeline takes.

# Initialize a Spark session with specified configurations.
# NOTE(review): the cell output reports "SparkSession available as 'spark'", so a
# session appears to exist before this cell runs; getOrCreate() then returns that
# session and the executor-instances setting may not be applied -- confirm in the Spark UI.
spark = SparkSession \
        .builder \
        .appName("Query_1") \
        .config("spark.executor.instances", 4) \
        .getOrCreate()

# Location of the crime CSV data, named once so the path is easy to find and change.
CRIME_DATA_PATH = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/"

# Load the dataset from an S3 bucket. No schema is supplied or inferred, so the
# numeric columns used below are cast explicitly before being compared.
df = spark.read.option("header", "true").csv(CRIME_DATA_PATH)

# Start the timer to measure execution time (the read above is not included)
start_time_df = time.time()

# Filter out rows with invalid latitude and longitude values (LAT and LON must not be 0).
# The cast to double is explicit rather than left to Spark's implicit coercion
# against the integer literal, so a fractional coordinate is compared as-is.
# Rows whose coordinate is missing or unparseable become NULL and are dropped.
filtered_df = df.filter(
    (col("LAT").cast("double") != 0) & (col("LON").cast("double") != 0)
)

# Filter the dataset to include only rows where the crime description contains "AGGRAVATED ASSAULT"
df_filtered = filtered_df.filter(col("Crm Cd Desc").contains("AGGRAVATED ASSAULT"))

# Categorize victims into age groups using the "Vict Age" column.
# The cast is written once and reused by every branch.
victim_age = col("Vict Age").cast("int")
df_with_age_groups = df_filtered.withColumn(
    "age_group",
    when(victim_age.between(1, 17), "Children")        # Ages 1-17 (0 or negative is left unclassified)
    .when(victim_age.between(18, 24), "Young adults")  # Ages 18-24
    .when(victim_age.between(25, 64), "Adults")        # Ages 25-64
    .when(victim_age > 64, "Elderly")                  # Age 65+
    # No otherwise(): ages that match no branch get a NULL age_group.
)

# Group by age group and count the number of occurrences for each group
age_group_counts = df_with_age_groups.groupBy("age_group").count()

# Order the results by the count in descending order
result_df = age_group_counts.orderBy(col("count").desc())

# Filter out rows with null age groups and display the results
result_df.filter(result_df["age_group"].isNotNull()).show()

# Calculate and print the total execution time for the DataFrame operations
execution_time_df = time.time() - start_time_df
print(f"Execution time for DataFrame API: {execution_time_df} seconds")


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1833,application_1732639283265_1794,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+------+
|   age_group| count|
+------------+------+
|      Adults|121052|
|Young adults| 33588|
|    Children| 10825|
|     Elderly|  5985|
+------------+------+

Execution time for DataFrame API: 5.672727584838867 seconds

In [2]:
# Read the crime data from S3 as an RDD of raw text lines
raw_lines_rdd = spark.sparkContext.textFile(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/"
)

# Timing for the RDD pipeline starts here, before the first action runs
start_time_rdd = time.time()

# The first line of the data is taken as the header row
header = raw_lines_rdd.first()

# Keep every line that is not identical to the header line
data_rdd = raw_lines_rdd.filter(lambda line: line != header)

# Function to split each row into columns (using csv.reader to handle commas properly)
def split_row(row):
    """Parse one CSV line into a list of field values.

    Uses csv.reader so that commas inside quoted fields do not split the field.

    Args:
        row: A single line of CSV text.

    Returns:
        The list of field strings in the line, or an empty list when the line
        is empty (csv.reader yields no record for it).
    """
    import csv
    from io import StringIO

    # next(..., []) instead of list(...)[0]: an empty line produces no record,
    # and indexing [0] would raise IndexError and fail the whole job.
    return next(csv.reader(StringIO(row)), [])

# Apply the split_row function to convert rows into columns
data_rdd = data_rdd.map(split_row)

# Parse the header with the same CSV parser as the data rows, so a quoted or
# comma-containing column name cannot shift the indices relative to the data.
columns = split_row(header)
crm_cd_desc_idx = columns.index("Crm Cd Desc")
vict_age_idx = columns.index("Vict Age")
lat_idx = columns.index("LAT")
lon_idx = columns.index("LON")

# Smallest row length for which every index used below (and by the age
# categorization step) is valid; shorter rows are skipped instead of raising.
required_width = max(crm_cd_desc_idx, vict_age_idx, lat_idx, lon_idx) + 1


def _has_valid_coordinates(row):
    """Return True when both LAT and LON parse as numbers and neither equals 0.

    The comparison is numeric so that "0", "0.0" and "0.000" are all treated as
    the invalid coordinate. Values that cannot be parsed are treated as invalid,
    which matches a numeric comparison against NULL in the DataFrame version.
    """
    try:
        return float(row[lat_idx]) != 0 and float(row[lon_idx]) != 0
    except ValueError:
        return False


# Filter the data RDD for aggravated assault crimes, and remove rows with invalid LAT/LON
filtered_rdd = data_rdd.filter(lambda row:
    len(row) >= required_width and
    "AGGRAVATED ASSAULT" in row[crm_cd_desc_idx] and
    _has_valid_coordinates(row)
)

# Function to categorize the victims based on age and return a tuple for counting
def categorize_and_count(row):
    """Map a parsed row to an (age_group, 1) pair ready for reduceByKey.

    The victim age is read from position ``vict_age_idx`` of the row.

    Args:
        row: List of field strings for one record.

    Returns:
        A tuple ``(age_group, 1)`` where age_group is "Children" (1-17),
        "Young adults" (18-24), "Adults" (25-64), "Elderly" (65+), or
        "Unknown" when the age is not an integer or is 0 or negative.
    """
    try:
        years = int(row[vict_age_idx])
    except ValueError:
        # Age text that is not a valid integer
        return ("Unknown", 1)

    # Thresholds are checked in ascending order, so each branch only needs
    # its upper bound.
    if years <= 0:
        label = "Unknown"
    elif years < 18:
        label = "Children"
    elif years <= 24:
        label = "Young adults"
    elif years <= 64:
        label = "Adults"
    else:
        label = "Elderly"
    return (label, 1)

# Tag each row with its age group, sum the per-row 1s for every group, and rank
# the groups from the largest count to the smallest
age_group_counts_rdd = (
    filtered_rdd
    .map(categorize_and_count)
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda pair: pair[1], ascending=False)
)

# Drop the 'Unknown' bucket so only the four named age groups remain
filtered_age_group_counts_rdd = age_group_counts_rdd.filter(
    lambda pair: pair[0] != "Unknown"
)

# Bring the (age_group, count) pairs back to the driver
results = filtered_age_group_counts_rdd.collect()

# Report one "group: count" line per age group
print("Age Group Counts:")
for age_group, count in results:
    print(f"{age_group}: {count}")

# Elapsed wall-clock time since the RDD timer was started
execution_time_rdd = time.time() - start_time_rdd
print(f"Execution time for RDD API: {execution_time_rdd} seconds")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Age Group Counts:
Adults: 121052
Young adults: 33588
Children: 10825
Elderly: 5985
Execution time for RDD API: 24.93943738937378 seconds