In [1]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "4",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
900,application_1765289937462_0893,pyspark,idle,Link,Link,,
902,application_1765289937462_0895,pyspark,idle,Link,Link,,
907,application_1765289937462_0900,pyspark,idle,Link,Link,,
911,application_1765289937462_0904,pyspark,idle,Link,Link,,
918,application_1765289937462_0911,pyspark,idle,Link,Link,,
920,application_1765289937462_0913,pyspark,idle,Link,Link,,
922,application_1765289937462_0915,pyspark,idle,Link,Link,,
928,application_1765289937462_0921,pyspark,idle,Link,Link,,


In [2]:
import io
import csv
import time

from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, StringType
from pyspark.sql.functions import col, when, count, desc, udf

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
931,application_1765289937462_0924,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# S3 locations of the two LA crime CSV exports (2010-2019 and 2020-2025).
CRIMES_PATH_1 = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv"
CRIMES_PATH_2 = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv"

# NOTE(review): the kernel already reports a session available as `spark`, so
# getOrCreate() presumably returns that one -- confirm the app name is applied.
spark = (
    SparkSession.builder
    .appName("Query 1")
    .getOrCreate()
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
def load_crime_data(path):
    """Read one crime CSV and keep only the columns Query 1 needs.

    Args:
        path: Location of a CSV file whose first row is a header.

    Returns:
        A DataFrame with three columns: ``id`` (``DR_NO`` cast to integer),
        ``crime`` (``Crm Cd Desc``) and ``victim_age`` (``Vict Age`` cast to
        integer).
    """
    wanted_columns = [
        col("DR_NO").cast("integer").alias("id"),
        col("Crm Cd Desc").alias("crime"),
        col("Vict Age").cast("integer").alias("victim_age"),
    ]
    return spark.read.csv(path, header=True).select(*wanted_columns)

# One DataFrame per source file, both projected to the same three columns.
crimes1, crimes2 = (
    load_crime_data(source_path)
    for source_path in (CRIMES_PATH_1, CRIMES_PATH_2)
)

# Stack the two datasets; union() is a lazy transformation, so no job runs here.
crimes = crimes1.union(crimes2)

# Debugging aids, kept disabled:
# crimes.printSchema()
# print(crimes.count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
# Preview only -- do NOT run this cell when measuring time!
crimes.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+--------------------+----------+
|       id|               crime|victim_age|
+---------+--------------------+----------+
|  1307355|VIOLATION OF COUR...|        48|
| 11401303|VANDALISM - FELON...|         0|
| 70309629|OTHER MISCELLANEO...|         0|
| 90631215|VIOLATION OF COUR...|        47|
|100100501|     RAPE, ATTEMPTED|        47|
|100100506|SHOPLIFTING - PET...|        23|
|100100508|BURGLARY FROM VEH...|        46|
|100100509|ASSAULT WITH DEAD...|        51|
|100100510|ASSAULT WITH DEAD...|        30|
|100100511|THEFT-GRAND ($950...|        55|
+---------+--------------------+----------+
only showing top 10 rows

In [6]:
# Keep only the rows whose crime description mentions "AGGRAVATED ASSAULT".
filtered_crimes = crimes.filter(col("crime").contains("AGGRAVATED ASSAULT"))
# print(filtered_crimes.count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Query 1 DF with no UDF

# Bucket victims by age using built-in Column expressions only (no Python UDF).
# The labels are spelled exactly as get_age_group() spells them ("Young Adults",
# capital A) so this result can be compared key-for-key with the UDF and RDD
# variants of the query.
groups_no_udf = filtered_crimes.withColumn(
    "age_group",
    when(col("victim_age").isNull() | (col("victim_age") <= 0), "Invalid")
    .when(col("victim_age") < 18, "Children")
    .when(col("victim_age").between(18, 24), "Young Adults")
    .when(col("victim_age").between(25, 64), "Adults")
    .when(col("victim_age") > 64, "Seniors")
    .otherwise("Invalid"),
)

# Only show() is an action; the timer therefore brackets the whole job that
# show() triggers, plus the (cheap) construction of the aggregation plan.
df_no_udf_start = time.time()
result = (
    groups_no_udf
    .groupBy("age_group")
    .agg(count("*").alias("victim_count"))
    .orderBy(desc("victim_count"))
)
result.show()
df_no_udf_end = time.time()

# Elapsed wall-clock seconds for the no-UDF variant.
df_no_udf_time = df_no_udf_end - df_no_udf_start
print(df_no_udf_time)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+------------+
|   age_group|victim_count|
+------------+------------+
|      Adults|      121660|
|Young adults|       33758|
|    Children|       10904|
|     Seniors|        6011|
|     Invalid|        5110|
+------------+------------+

7.238358020782471

In [7]:
# UDF used in DF-UDF and RDD solutions
def get_age_group(age):
    """Map a victim age to its age-group label.

    Args:
        age: The age as an int, a numeric string (the RDD path passes raw CSV
            text), or None.

    Returns:
        "Children" for 1-17, "Young Adults" for 18-24, "Adults" for 25-64,
        "Seniors" for 65 and above, and "Invalid" for None, values <= 0, or
        anything that cannot be converted with int().
    """
    try:
        years = int(age)
    except (TypeError, ValueError, OverflowError):
        # None, empty or non-numeric text, NaN and infinities all land here.
        return "Invalid"

    if years <= 0:
        return "Invalid"
    if years < 18:
        return "Children"
    if years < 25:
        return "Young Adults"
    if years < 65:
        return "Adults"
    return "Seniors"

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# Query 1 DF with UDF

# Expose the plain-Python classifier to Spark SQL as a string-returning UDF.
age_udf = udf(get_age_group, StringType())

# The timer covers plan construction as well as the job triggered by show().
df_udf_start = time.time()
groups_udf = filtered_crimes.withColumn(
    "age_group",
    age_udf(col("victim_age")),
)

result = (
    groups_udf
    .groupBy("age_group")
    .agg(count("*").alias("victim_count"))
    .orderBy(desc("victim_count"))
)
result.show()
df_udf_end = time.time()

# Elapsed wall-clock seconds for the UDF variant.
df_udf_time = df_udf_end - df_udf_start
print(df_udf_time)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+------------+
|   age_group|victim_count|
+------------+------------+
|      Adults|      121660|
|Young Adults|       33758|
|    Children|       10904|
|     Seniors|        6011|
|     Invalid|        5110|
+------------+------------+

8.533761262893677

In [5]:
# Query 1 with RDD

sc = spark.sparkContext

# Use RDD-specific names so the `crimes1` / `crimes2` DataFrames created by the
# DataFrame cells are not silently replaced by RDDs when this cell runs.
crimes1_rdd = sc.textFile(CRIMES_PATH_1)
crimes2_rdd = sc.textFile(CRIMES_PATH_2)

# first() is an action: it reads the header line of the first file.
header = crimes1_rdd.first()

# Drop every line equal to that header.
# NOTE(review): the second file's header is removed only if it is byte-identical
# to the first file's header -- confirm both exports share the same header row.
crimes_raw = crimes1_rdd.union(crimes2_rdd).filter(lambda line: line != header)

# Zero-based positions of the fields used by Query 1 within a CSV record.
ID_INDEX = 0
CRIME_DESC_INDEX = 9
VICTIM_AGE_INDEX = 11
def parse_crimes(line):
    """Parse one CSV line into an ``(id, crime, victim_age)`` tuple of strings.

    Args:
        line: A single line of CSV text.

    Returns:
        A 3-tuple whose entries are the raw field strings, with empty fields
        replaced by None. An empty tuple ``()`` is returned when the line is
        empty, cannot be parsed as CSV, or has too few fields to contain the
        victim-age column.
    """
    try:
        row = next(csv.reader(io.StringIO(line)))
    except (csv.Error, StopIteration):
        # StopIteration: the line held no record at all (e.g. an empty line).
        # csv.Error: the csv module rejected the line as malformed.
        # Anything else (KeyboardInterrupt, programming errors) propagates.
        return ()

    if len(row) <= VICTIM_AGE_INDEX:
        return ()

    # `or None` turns an empty field into None and leaves other text as-is.
    return (
        row[ID_INDEX] or None,
        row[CRIME_DESC_INDEX] or None,
        row[VICTIM_AGE_INDEX] or None,
    )

def _mentions_aggravated_assault(record):
    """Return True when a parsed record's crime text contains "AGGRAVATED ASSAULT".

    parse_crimes() yields an empty tuple for rows it cannot use and None for an
    empty crime description; both are rejected here instead of raising
    IndexError / TypeError inside the Spark task.
    """
    return (
        len(record) > 1
        and record[1] is not None
        and "AGGRAVATED ASSAULT" in record[1]
    )

crimes_parsed = crimes_raw.map(parse_crimes)
#print(crimes_parsed.take(10))

rdd_start = time.time()

# filter -> (age_group, 1) pairs -> per-group sum -> sort by count, descending.
result_rdd = (
    crimes_parsed
    .filter(_mentions_aggravated_assault)
    .map(lambda record: (get_age_group(record[2]), 1))
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda pair: pair[1], ascending=False)
)

# collect() is the action that actually runs the job.
print(result_rdd.collect())

rdd_end = time.time()
# Elapsed wall-clock seconds for the RDD variant.
rdd_time = rdd_end - rdd_start
print(rdd_time)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[('Adults', 121660), ('Young Adults', 33758), ('Children', 10904), ('Seniors', 6011), ('Invalid', 5110)]
16.912631273269653

Method | Time(s)
-------|--------
DF-no-UDF | 7.238358020782471
DF-UDF | 8.533761262893677
RDD | 16.912631273269653

