In [1]:
%%configure -f
{
    "driverMemory": "2G",
    "executorMemory": "2G",
    "executorCores": 1,
    "numExecutors": 4
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1119,application_1765289937462_1112,pyspark,idle,Link,Link,,
1123,application_1765289937462_1116,pyspark,idle,Link,Link,,
1125,application_1765289937462_1118,pyspark,idle,Link,Link,,
1128,application_1765289937462_1121,pyspark,idle,Link,Link,,
1132,application_1765289937462_1125,pyspark,idle,Link,Link,,
1133,application_1765289937462_1126,pyspark,idle,Link,Link,,


In [2]:
from pyspark.sql.functions import col, lower, when, desc, udf
import time
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType

# Get the active SparkSession (one is created if none exists) and its context.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# Print the configured executor count and the master the context is bound to.
spark_conf = sc.getConf()
print(f"Executors: {spark_conf.get('spark.executor.instances')}")
print(f"Master: {sc.master}")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1134,application_1765289937462_1127,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executors: 4
Master: yarn

## Query 1
Να ταξινομηθούν, σε φθίνουσα σειρά, οι ηλικιακές ομάδες των θυμάτων σε περιστατικά που περιλαμβάνουν οποιαδήποτε μορφή “βαριάς σωματικής βλάβης”. Θεωρήστε τις εξής ηλικιακές ομάδες:


• Παιδιά: < 18

• Νεαροί ενήλικοι: 18 – 24

• Ενήλικοι: 25 – 64

• Ηλικιωμένοι: >64


Ως εγκλήματα που περιλαμβάνουν οποιαδήποτε μορφή “βαριάς σωματικής βλάβης” θεωρούμε
όλα εκείνα τα περιστατικά που περιέχουν τον όρο “aggravated assault” στη σχετική περιγραφή.


In [3]:
crimes_old_path = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv"
crimes_new_path = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv"

# Read both extracts with a header row. inferSchema asks Spark to derive the
# column types from the data, which costs an additional pass over each file.
crimes_old_df = spark.read.csv(crimes_old_path, header=True, inferSchema=True)
crimes_new_df = spark.read.csv(crimes_new_path, header=True, inferSchema=True)

# Stack the two periods, matching columns by name rather than by position.
crimes_df = crimes_old_df.unionByName(crimes_new_df)

# Keep incidents whose description mentions "aggravated assault";
# lower-casing first makes the match case-insensitive.
assaults_df = crimes_df.filter(
    lower(col("Crm Cd Desc")).contains("aggravated assault")
)

# Count through the DataFrame API. Going through `.rdd` first would convert
# every row into a Python Row object only to count them.
assaults_df.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

177443

In [4]:
def run_transformation():
    """Build the Query 1 result: victim age groups ranked by incident count.

    Reads the notebook-level ``assaults_df``, labels each row with an
    ``Age_Group`` derived from ``Vict Age``, drops rows that match no group,
    and returns one row per group ordered by descending ``count``.

    Returns:
        A lazy DataFrame with columns ``Age_Group`` and ``count``; nothing is
        executed until an action is called on it.
    """
    victim_age = col("Vict Age")

    # `between` is inclusive on both ends, i.e. 18 <= age <= 24 and 25 <= age <= 64.
    # No `otherwise` branch: a row matching none of the conditions gets null.
    # NOTE(review): ages of 0 or below also satisfy `< 18` and are counted as
    # "Children" - confirm the dataset does not use them as an "unknown" marker.
    age_band = (
        when(victim_age < 18, "Children")
        .when(victim_age.between(18, 24), "Young Adults")
        .when(victim_age.between(25, 64), "Adults")
        .when(victim_age > 64, "Elderly")
    )

    labelled_df = assaults_df.withColumn("Age_Group", age_band)
    classified_df = labelled_df.filter(col("Age_Group").isNotNull())
    group_sizes_df = classified_df.groupBy("Age_Group").count()
    return group_sizes_df.orderBy(desc("count"))


# Materialise the ranked age groups themselves. `.rdd.count()` would only
# report how many groups exist, not the ranking the query asks for.
run_transformation().collect()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[Row(Age_Group='Adults', count=121660), Row(Age_Group='Young Adults', count=33758), Row(Age_Group='Children', count=16014), Row(Age_Group='Elderly', count=6011)]

In [5]:
# Dataframes without UDFs
runs = 10
times = []

for i in range(runs):
    # Build a fresh lazy query each run through the shared helper instead of
    # repeating the pipeline here, so the timed query and the displayed
    # query cannot drift apart. Building it is kept outside the timed region.
    result = run_transformation()

    # perf_counter is monotonic and high-resolution, so the measured interval
    # is not affected by system clock adjustments.
    start_time = time.perf_counter()
    result.collect()
    end_time = time.perf_counter()

    times.append(end_time - start_time)

    print(f"Run {i+1}: {times[i]} sec")

# Average over the runs that were actually recorded.
native_mean = sum(times) / len(times)
print(f"Mean time for native dataframe method: {native_mean} sec")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Run 1: 0.6069972515106201 sec
Run 2: 0.6571056842803955 sec
Run 3: 1.963552713394165 sec
Run 4: 0.7558772563934326 sec
Run 5: 0.9721484184265137 sec
Run 6: 0.502373456954956 sec
Run 7: 0.3817594051361084 sec
Run 8: 0.4387216567993164 sec
Run 9: 0.6803174018859863 sec
Run 10: 0.4013960361480713 sec
Mean time for native dataframe method: 0.7360249280929565 sec

In [6]:
# Dataframes with UDF

def age_group(age):
    """Map a victim age to its Query 1 age-group label.

    Uses the same explicit bounds as the native ``when`` chain of the
    DataFrame version, so a value that matches no band yields ``None``
    instead of falling through to "Elderly".

    Args:
        age: Victim age as a number, or ``None`` when the age is missing.

    Returns:
        "Children" (< 18), "Young Adults" (18-24), "Adults" (25-64) or
        "Elderly" (> 64); ``None`` for a missing age or for a value that lies
        in none of the bands (e.g. 24.5).
    """
    if age is None:
        return None
    if age < 18:
        return "Children"
    if 18 <= age <= 24:
        return "Young Adults"
    if 25 <= age <= 64:
        return "Adults"
    if age > 64:
        return "Elderly"
    # Matched no band (a value between two bands): leave it unlabelled so the
    # caller's null filter drops it, as the native pipeline does.
    return None


times = []
# Wrap the Python classifier as a Spark UDF that returns a string column.
age_group_udf = udf(age_group, StringType())


# Use the same run count as the native benchmark (`runs`, set in that cell)
# so the two means are computed over equally many measurements.
for i in range(runs):
    udf_df = assaults_df.withColumn("Age_Group", age_group_udf(col("Vict Age")))

    result_udf = udf_df \
        .filter(col("Age_Group").isNotNull()) \
        .groupBy("Age_Group") \
        .count() \
        .orderBy(desc("count"))

    # perf_counter is monotonic and high-resolution, so the measured interval
    # is not affected by system clock adjustments.
    start_time = time.perf_counter()
    result_udf_count = result_udf.collect()
    end_time = time.perf_counter()
    times.append(end_time - start_time)

    print(f"Run {i+1}: {times[i]} sec")


# Divide by the number of recorded runs, so the mean stays correct however
# many iterations the loop executed.
udf_mean = sum(times) / len(times)
print(f"Mean time for dataframes with udfs method: {udf_mean} secs")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Run 1: 1.9806561470031738 sec
Run 2: 0.8777637481689453 sec
Run 3: 1.1075935363769531 sec
Run 4: 0.7527272701263428 sec
Run 5: 0.7340250015258789 sec
Run 6: 0.8239450454711914 sec
Run 7: 0.9063384532928467 sec
Run 8: 0.7470626831054688 sec
Run 9: 0.8510458469390869 sec
Run 10: 0.6796386241912842 sec
Mean time for dataframes with udfs method: 0.9460796356201172 secs