In [1]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "4",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
867,application_1761923966900_0879,pyspark,idle,Link,Link,,
870,application_1761923966900_0882,pyspark,idle,Link,Link,,
872,application_1761923966900_0884,pyspark,idle,Link,Link,,
873,application_1761923966900_0885,pyspark,idle,Link,Link,,
877,application_1761923966900_0889,pyspark,idle,Link,Link,,
880,application_1761923966900_0892,pyspark,idle,Link,Link,,
884,application_1761923966900_0896,pyspark,idle,Link,Link,,
900,application_1761923966900_0912,pyspark,idle,Link,Link,,
904,application_1761923966900_0916,pyspark,idle,Link,Link,,
908,application_1761923966900_0920,pyspark,idle,Link,Link,,


In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, StringType
from pyspark.sql.functions import col, when, count, desc, udf

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [21]:
# S3 locations of the two crime CSV extracts (2010-2019 and 2020-2025).
# Each literal is split in two only to keep the lines readable; adjacent
# string literals are joined at compile time, so the values are unchanged.
CRIMES_PATH_1 = (
    "s3://initial-notebook-data-bucket-dblab-905418150721"
    "/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv"
)
CRIMES_PATH_2 = (
    "s3://initial-notebook-data-bucket-dblab-905418150721"
    "/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv"
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
# Reuse the active Spark session if one exists; otherwise create one under
# this application name.
spark = (
    SparkSession.builder
    .appName("Query 1 DataFrame")
    .getOrCreate()
)


def load_crime_data(path):
    """Read one crime CSV and keep only the three columns the queries use.

    Relies on the notebook-level ``spark`` session. The file is read with its
    header row and without an explicit schema; the selected columns are then
    renamed and, where needed, cast to integer.

    Args:
        path: Location of the CSV file, passed as-is to ``spark.read.csv``.

    Returns:
        A DataFrame with ``id`` (integer, from ``DR_NO``), ``crime`` (from
        ``Crm Cd Desc``) and ``victim_age`` (integer, from ``Vict Age``).
    """
    raw = spark.read.csv(path, header=True)
    wanted_columns = [
        col("DR_NO").cast("integer").alias("id"),
        col("Crm Cd Desc").alias("crime"),
        col("Vict Age").cast("integer").alias("victim_age"),
    ]
    return raw.select(*wanted_columns)


# Schema listing only the columns used by the requested queries.
# NOTE(review): crimes_schema is not referenced anywhere else in the visible
# notebook — load_crime_data reads the CSV without an explicit schema and casts
# the columns itself. Presumably a 3-field schema could not be handed to the
# CSV reader directly, since user schemas are applied by position — confirm
# before wiring it in, or drop this definition.
crimes_schema = StructType([
    StructField("DR_NO", IntegerType()),
    StructField("Crm Cd Desc", StringType()),
    StructField("Vict Age", IntegerType())
])

# Load each extract through the same helper so both frames share one layout.
crimes1 = load_crime_data(CRIMES_PATH_1)

crimes2 = load_crime_data(CRIMES_PATH_2)

# Concatenate the two datasets. union() pairs columns by position, which is
# safe here because both inputs come out of the same select in load_crime_data.
crimes = crimes1.union(crimes2)
crimes.printSchema()
# count() is an action: it triggers a full read of both files.
print(crimes.count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- id: integer (nullable = true)
 |-- crime: string (nullable = true)
 |-- victim_age: integer (nullable = true)

3138128

In [10]:
# Peek at the first 10 rows of the combined data as a sanity check.
crimes.show(n=10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+--------------------+----------+
|       id|               crime|victim_age|
+---------+--------------------+----------+
|  1307355|VIOLATION OF COUR...|        48|
| 11401303|VANDALISM - FELON...|         0|
| 70309629|OTHER MISCELLANEO...|         0|
| 90631215|VIOLATION OF COUR...|        47|
|100100501|     RAPE, ATTEMPTED|        47|
|100100506|SHOPLIFTING - PET...|        23|
|100100508|BURGLARY FROM VEH...|        46|
|100100509|ASSAULT WITH DEAD...|        51|
|100100510|ASSAULT WITH DEAD...|        30|
|100100511|THEFT-GRAND ($950...|        55|
+---------+--------------------+----------+
only showing top 10 rows

In [13]:
# Keep only the rows whose crime description mentions aggravated assault.
# where() is an alias of filter(); rows with a null description are dropped
# because the predicate evaluates to null for them.
filtered_crimes = crimes.where(col("crime").contains("AGGRAVATED ASSAULT"))
# Uncomment to check how many rows survive the filter:
#print(filtered_crimes.count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

177443

In [18]:
# Query 1 DF with no UDF

# Bucket each victim age with native column expressions (no Python UDF).
# Labels and boundaries mirror get_age_group so that the DataFrame, UDF and RDD
# variants of this query produce identical group names ("Young Adults", not
# "Young adults") and can be compared directly.
victim_age = col("victim_age")

groups_no_udf = filtered_crimes.withColumn(
    "age_group",
    # Conditions are evaluated top to bottom, so each branch only needs its
    # upper bound: missing or non-positive ages are caught first, and anything
    # that falls through every branch is 65 or older.
    when(victim_age.isNull() | (victim_age <= 0), "Invalid")
    .when(victim_age < 18, "Children")
    .when(victim_age < 25, "Young Adults")
    .when(victim_age < 65, "Adults")
    .otherwise("Seniors"),
)

# Count victims per age group, largest group first.
result = (
    groups_no_udf
    .groupBy("age_group")
    .agg(count("*").alias("victim_count"))
    .orderBy(desc("victim_count"))
)
result.show()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+------------+
|   age_group|victim_count|
+------------+------------+
|      Adults|      121660|
|Young adults|       33758|
|    Children|       10904|
|     Seniors|        6011|
|     Invalid|        5110|
+------------+------------+

In [20]:
# Query 1 DF with UDF

def get_age_group(age):
    """Map a victim age to its age-group label.

    Args:
        age: The age as an int, a numeric string, or ``None``.

    Returns:
        ``"Children"`` for 1-17, ``"Young Adults"`` for 18-24, ``"Adults"``
        for 25-64, ``"Seniors"`` for 65 and above, and ``"Invalid"`` for
        ``None``, zero, negative values, or anything that cannot be converted
        to an integer.
    """
    # Convert once. A value that cannot be converted is reported as "Invalid"
    # instead of raising, because an exception inside a UDF or RDD map fails
    # the whole Spark task.
    try:
        age = int(age)
    except (TypeError, ValueError, OverflowError):
        return "Invalid"

    if age <= 0:
        return "Invalid"
    if age < 18:
        return "Children"
    if age < 25:
        return "Young Adults"
    if age < 65:
        return "Adults"
    # Every remaining value is 65 or above.
    return "Seniors"

# Wrap the Python classifier as a Spark UDF that returns a string label.
age_udf = udf(get_age_group, StringType())

# Label every filtered row by running the UDF over its victim age.
groups_udf = filtered_crimes.withColumn(
    "age_group",
    age_udf(col("victim_age")),
)

# Count victims per label, largest group first.
result = (
    groups_udf
    .groupBy("age_group")
    .agg(count("*").alias("victim_count"))
    .orderBy(desc("victim_count"))
)
result.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+------------+
|   age_group|victim_count|
+------------+------------+
|      Adults|      121660|
|Young Adults|       33758|
|    Children|       10904|
|     Seniors|        6011|
|     Invalid|        5110|
+------------+------------+

In [30]:
# Query 1 with RDD

# Entry point for the low-level RDD API.
sc = spark.sparkContext

# Read both CSV files as RDDs of raw text lines.
# NOTE(review): this rebinds crimes1/crimes2, which the DataFrame cell earlier
# in the notebook assigned as DataFrames — running cells out of order will
# pick up whichever binding ran last.
crimes1 = sc.textFile(CRIMES_PATH_1)
crimes2 = sc.textFile(CRIMES_PATH_2)
# The first line of the first file is taken to be the CSV header.
header = crimes1.first()
#print(header)

# Debug aid: list every column name next to its positional index.
#col_names = header.split(",")
#for i in range(len(col_names)):
#    print(f"i = {i}: {col_names[i]}")

# Merge both files and drop every line that is exactly equal to that header.
# Assumes the second file's header is identical to the first's — TODO confirm.
crimes_raw = crimes1.union(crimes2).filter(lambda line: line != header)
#print(crimes_raw.collect()[0].split("\""))

def full_commas(txt):
    """Tell whether ``txt`` consists of nothing but comma characters.

    An empty ``txt`` contains no offending character and therefore yields
    ``True``.
    """
    return all(ch == ',' for ch in txt)

def line_to_correct_parts(line):
    """Split one CSV record into its fields, reporting empty fields as ``None``.

    Parsing is delegated to the standard ``csv`` reader, so quoted fields may
    contain commas or doubled quotes, and unquoted fields are split correctly.
    Every field keeps its position: an empty field yields ``None`` whether it
    is written as nothing between two commas or as ``""``, and whether it sits
    at the start, middle or end of the record.

    Args:
        line: A single line of CSV text without its trailing newline.

    Returns:
        A list with one entry per field, each either a non-empty ``str`` or
        ``None``. An empty or unparseable line yields an empty list.
    """
    # Function-scope import keeps the function self-contained when it is
    # shipped to Spark executors.
    import csv

    try:
        fields = next(csv.reader([line]), [])
    except csv.Error:
        # Treat an unparseable line as having no fields, so the caller's
        # field-count check rejects it.
        return []
    return [field if field else None for field in fields]

def _int_or_default(text, default):
    """Convert ``text`` to int, returning ``default`` if it is None or not numeric."""
    if text is None:
        return default
    try:
        return int(text)
    except ValueError:
        return default


# Positions of the fields used by the queries within a parsed record:
# 0: id
# 9: crime description
# 11: victim age
def parse_line(line):
    """Turn one raw CSV line into an ``(id, crime, victim_age)`` tuple.

    Args:
        line: One data line of the crime CSV (header already removed).

    Returns:
        ``(id, crime, victim_age)`` where ``id`` is an int or ``None``,
        ``crime`` is the description string or ``None`` if that field is empty,
        and ``victim_age`` is an int, with ``-1`` standing in for a missing or
        non-numeric age. A line that does not split into exactly 28 fields is
        returned as ``(None, None, None)``.
    """
    parts = line_to_correct_parts(line)

    # Anything other than the full 28-field record is considered malformed.
    # Passing this guard also guarantees that indexes 0, 9 and 11 exist.
    if len(parts) != 28:
        return (None, None, None)

    elem_id = _int_or_default(parts[0], None)
    # -1 is a sentinel that the age classifier reports as "Invalid".
    elem_age = _int_or_default(parts[11], -1)
    return (elem_id, parts[9], elem_age)

# Parse every raw line into an (id, crime, victim_age) tuple.
crimes_parsed = crimes_raw.map(parse_line)

# parse_line yields None as the crime for lines it cannot parse and for records
# whose description field is empty. `"..." in None` raises TypeError and would
# fail the whole job, so those tuples are excluded before the substring test.
crimes_filtered = crimes_parsed.filter(
    lambda x: x[1] is not None and "AGGRAVATED ASSAULT" in x[1]
)

# Emit (age_group, 1) per victim so the groups can be summed by key.
age_group_rdd = crimes_filtered.map(lambda x: (get_age_group(x[2]), 1))

# Total per age group, largest group first.
result_rdd = (
    age_group_rdd
    .reduceByKey(lambda x, y: x + y)
    .sortBy(lambda x: x[1], ascending=False)
)
print(result_rdd.collect())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[('Adults', 121660), ('Young Adults', 33758), ('Children', 10904), ('Seniors', 6011), ('Invalid', 5110)]