# Query 1 
## Configurations

In [1]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "4",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1790,application_1765289937462_1774,pyspark,idle,Link,Link,,
1791,application_1765289937462_1775,pyspark,idle,Link,Link,,
1792,application_1765289937462_1776,pyspark,idle,Link,Link,,
1795,application_1765289937462_1779,pyspark,idle,Link,Link,,
1799,application_1765289937462_1783,pyspark,idle,Link,Link,,
1800,application_1765289937462_1784,pyspark,idle,Link,Link,,


## DataFrame API without UDF

In [2]:
# The Spark session was configured above with %%configure; import what this cell needs
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count, desc
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import time

# Start the wall-clock timer for this cell
start_time = time.time()

# Schema applied when reading the crime CSV, built from (column name, Spark type) pairs
crime_data_full_schema = StructType([
    StructField(field_name, field_type())
    for field_name, field_type in [
        ("DR_NO", IntegerType),
        ("Date Rptd", StringType),
        ("DATE OCC", StringType),
        ("TIME OCC", IntegerType),
        ("AREA", IntegerType),
        ("AREA NAME", StringType),
        ("Rpt Dist No", IntegerType),
        ("Part 1-2", IntegerType),
        ("Crm Cd", IntegerType),
        ("Crm Cd Desc", StringType),
        ("Mocodes", StringType),
        ("Vict Age", IntegerType),
    ]
])

# Read the CSV files under the dataset prefix using the explicit schema
crime_data_full_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/",
    header=True,
    schema=crime_data_full_schema,
)

# Keep only the columns this query touches
crime_data_df = crime_data_full_df.select("DR_NO", "Crm Cd Desc", "Vict Age")

# Keep rows whose crime description mentions "AGGRAVATED ASSAULT"
crime_data_aggrassault_df = crime_data_df.where(
    col("Crm Cd Desc").contains("AGGRAVATED ASSAULT")
)

# Discard rows with a null victim age before bucketing
crime_data_nonull_df = crime_data_aggrassault_df.dropna(subset=["Vict Age"])

# Age buckets: < 18 Children, < 25 Young Adults, < 65 Adults, everything else Elderly
victim_age = col("Vict Age")
age_group_expr = (
    when(victim_age < 18, "Children")
    .when(victim_age < 25, "Young Adults")
    .when(victim_age < 65, "Adults")
    .otherwise("Elderly")
)
crime_data_aggrassault_agegroups_df = crime_data_nonull_df.withColumn(
    "Age Group", age_group_expr
)

# Count the rows in each age group
age_group_counts_df = crime_data_aggrassault_agegroups_df.groupBy("Age Group").count()

# Largest group first
result_df = age_group_counts_df.orderBy("count", ascending=False)

# show() is the action that triggers the Spark job
result_df.show()

# Stop the timer and report the elapsed wall-clock time
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} sec")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1802,application_1765289937462_1786,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+------+
|   Age Group| count|
+------------+------+
|      Adults|121660|
|Young Adults| 33758|
|    Children| 16014|
|     Elderly|  6011|
+------------+------+

Execution time: 17.812683582305908 sec

## DataFrame API with UDF

In [3]:
# The Spark session was configured above with %%configure; import what this cell needs
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, desc, udf
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import time

# Start of timing: wall-clock timestamp that the "Execution time" printed at the
# end of this cell is measured against
start_time = time.time()

# UDF
def age_category_udf_code(age):
    """Map a victim age to its age-group label.

    Args:
        age: Victim age as an integer, or None when the source value is null.

    Returns:
        "Children" for age < 18, "Young Adults" for age < 25, "Adults" for
        age < 65, "Elderly" otherwise; None when ``age`` is None.
    """
    # Spark does not guarantee that an upstream null filter is evaluated before
    # a Python UDF, and `None < 18` raises TypeError in Python 3, so the null
    # check has to live inside the function itself.
    if age is None:
        return None
    if age < 18:
        return "Children"
    elif age < 25:
        return "Young Adults"
    elif age < 65:
        return "Adults"
    else:
        return "Elderly"

# Wrap the Python function as a Spark UDF that returns a string column
age_group_udf = udf(age_category_udf_code, StringType())

# Schema applied when reading the crime CSV, built from (column name, Spark type) pairs
crime_data_full_schema = StructType([
    StructField(field_name, field_type())
    for field_name, field_type in [
        ("DR_NO", IntegerType),
        ("Date Rptd", StringType),
        ("DATE OCC", StringType),
        ("TIME OCC", IntegerType),
        ("AREA", IntegerType),
        ("AREA NAME", StringType),
        ("Rpt Dist No", IntegerType),
        ("Part 1-2", IntegerType),
        ("Crm Cd", IntegerType),
        ("Crm Cd Desc", StringType),
        ("Mocodes", StringType),
        ("Vict Age", IntegerType),
    ]
])

# Read the CSV files under the dataset prefix using the explicit schema
crime_data_full_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/",
    header=True,
    schema=crime_data_full_schema,
)

# Keep only the columns this query touches
crime_data_df = crime_data_full_df.select("DR_NO", "Crm Cd Desc", "Vict Age")

# Keep rows whose crime description mentions "AGGRAVATED ASSAULT"
crime_data_aggrassault_df = crime_data_df.where(
    col("Crm Cd Desc").contains("AGGRAVATED ASSAULT")
)

# Discard rows with a null victim age, then label each row through the UDF
crime_data_nonull_df = crime_data_aggrassault_df.dropna(subset=["Vict Age"])

age_group_column = age_group_udf(col("Vict Age"))
crime_data_aggrassault_agegroups_df = crime_data_nonull_df.withColumn(
    "Age Group", age_group_column
)

# Count the rows in each age group
age_group_counts_df = crime_data_aggrassault_agegroups_df.groupBy("Age Group").count()

# Largest group first
result_df = age_group_counts_df.orderBy("count", ascending=False)

# show() is the action that triggers the Spark job
result_df.show()

# Stop the timer and report the elapsed wall-clock time
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} sec")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+------+
|   Age Group| count|
+------------+------+
|      Adults|121660|
|Young Adults| 33758|
|    Children| 16014|
|     Elderly|  6011|
+------------+------+

Execution time: 8.670857429504395 sec

## RDD API

In [4]:
# The Spark session was configured above with %%configure; import what this cell needs
from pyspark.sql import SparkSession
import time

# Start the wall-clock timer for this cell
start_time = time.time()

# Read everything under the dataset prefix as an RDD of raw text lines
crime_data_raw_rdd = sc.textFile(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/"
)

# Treat the first line as the CSV header and drop every line identical to it
header = crime_data_raw_rdd.first()
crime_data_rdd = crime_data_raw_rdd.filter(lambda raw_line: raw_line != header)

# Split each row into columns
def split_row(row):
    """Parse one CSV-formatted text line into a list of field strings.

    Quoted fields (including ones that contain commas) are handled by the
    standard-library csv parser.

    Args:
        row: A single line of CSV text.

    Returns:
        The list of fields in the line, or an empty list when the line is
        empty (the previous implementation raised IndexError in that case).
    """
    # Imported locally so the function is self-contained when shipped to executors
    import csv
    from io import StringIO
    # csv.reader yields nothing for an empty line; fall back to [] instead of indexing
    return next(csv.reader(StringIO(row)), [])

# Parse every remaining line into a list of fields
crime_data_rdd = crime_data_rdd.map(split_row)

# Extract position of needed columns.
# The header goes through the same CSV parser as the data rows so the indices
# stay aligned even if a header field is quoted or contains a comma.
columns = split_row(header)
crm_cd_desc_idx = columns.index("Crm Cd Desc")
vict_age_idx = columns.index("Vict Age")

# Filter for "AGGRAVATED ASSAULTS".
# Rows too short to contain the description column are dropped instead of
# raising IndexError on the executors.
aggr_assaults_rdd = crime_data_rdd.filter(
    lambda row: len(row) > crm_cd_desc_idx
    and "AGGRAVATED ASSAULT" in row[crm_cd_desc_idx]
)

# Assign age groups function
def categorize_age(row, age_idx=None):
    """Turn a parsed CSV row into an ``(age_group, 1)`` pair for counting.

    Args:
        row: List of field strings for one record.
        age_idx: Position of the victim-age field in ``row``. Defaults to the
            module-level ``vict_age_idx`` when omitted, so existing
            single-argument callers (e.g. ``rdd.map(categorize_age)``) keep
            working unchanged.

    Returns:
        A tuple ``(label, 1)`` where label is "Children" (< 18),
        "Young Adults" (< 25), "Adults" (< 65) or "Elderly" (otherwise).
        The label is "Unknown" when the age field is missing from the row or
        cannot be parsed as an integer.
    """
    if age_idx is None:
        age_idx = vict_age_idx
    try:
        age = int(row[age_idx])
    except (ValueError, IndexError):
        # ValueError: empty or non-numeric age; IndexError: row too short
        return ("Unknown", 1)

    if age < 18:
        age_group = "Children"
    elif age < 25:
        age_group = "Young Adults"
    elif age < 65:
        age_group = "Adults"
    else:
        age_group = "Elderly"
    return (age_group, 1)

# Count occurrences per age group and sort by count, largest first
age_group_counts_rdd = (
    aggr_assaults_rdd.map(categorize_age)
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda x: x[1], ascending=False)
)

# Filter out the Unknown
final_results_rdd = age_group_counts_rdd.filter(lambda x: x[0] != "Unknown")

# Collect the results to the driver (this is the action that runs the job)
results = final_results_rdd.collect()

# Print the results
print("--- Results: Aggravated Assaults per Age Group ---")
print("Age Group | Number of Victims")
print("------------------------------")
# The loop variable is deliberately not called `count`: that name is imported
# from pyspark.sql.functions by other cells, and rebinding it here would leave
# an int behind in the shared kernel namespace.
for age_group, victim_count in results:
    print(f"{age_group.ljust(13)} | {victim_count}")

# End of timing
execution_time = time.time() - start_time
print(f"Execution time: {execution_time} sec")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

--- Results: Aggravated Assaults per Age Group ---
Age Group | Number of Victims
------------------------------
Adults        | 121660
Young Adults  | 33758
Children      | 16014
Elderly       | 6011
Execution time: 10.512954711914062 sec