Ζητούμενο 1:

In [1]:
#  1: rdd api implementation

from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("RDD query 1 execution") \
    .config("spark.executor.instances", "4") \
    .config("spark.executor.memory", "2g") \
    .config("spark.executor.cores", "1") \
    .getOrCreate() \

sc = spark.sparkContext
import time 

def find_age_group(age):
    try:
        age = int(age)
    except:
        return None
    if(age<18):
        return "Children"
    if(age<25):
        return "Young Adults"
    if(age<65):
        return "Adults"
    return "Seniors"

start_time = time.time()

crime_df1 = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",header=False)
crime_df2 = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",header=False)
crime_df = crime_df1.union(crime_df2)

crime_rdd = crime_df.rdd

sorted_age_rdd = crime_rdd.filter(lambda x: "aggravated assault" in x[9].lower()).map(lambda x: (find_age_group(x[11]),1)) \
    .reduceByKey(lambda x,y: x+y).sortBy(lambda value: value[1], ascending=False)

results = sorted_age_rdd.collect()
end_time = time.time()

for group,count in sorted_age_rdd.collect():
    print(f"{group}: {count}")
print(f"\nRDD implementation time: {end_time-start_time:.2f} seconds")


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1675,application_1765289937462_1659,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Adults: 121660
Young Adults: 33758
Children: 16014
Seniors: 6011

RDD implementation time: 33.77 seconds

In [1]:
# Implementation 2: DataFrame API ( no UDF )
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType
from pyspark.sql.functions import col, lower , when
import time

spark = SparkSession \
    .builder \
    .appName("Dataframe (no UDF) query 1 execution") \
    .config("spark.executor.instances", "4") \
    .config("spark.executor.memory", "2g") \
    .config("spark.executor.cores", "1") \
    .getOrCreate()

start_time = time.time()

crime_schema = StructType([
    StructField("DR_NO", StringType(), True),
    StructField("Date_Rptd", StringType(), True),
    StructField("DATE_OCC", StringType(), True),
    StructField("TIME_OCC", StringType(), True),
    StructField("AREA", StringType(), True),
    StructField("AREA_NAME", StringType(), True),
    StructField("Rpt_Dist_No", StringType(), True),
    StructField("Part_1_2", StringType(), True),
    StructField("Crm_Cd", StringType(), True),
    StructField("Crm_Cd_Desc", StringType(), True),
    StructField("Mocodes", StringType(), True),
    StructField("Vict Age", StringType(), True),
])

crime1_df= spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv", \
        header=False, schema= crime_schema)
crime2_df= spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv", \
        header=False, schema= crime_schema)
crime_df= crime1_df.union(crime2_df)

assault_df = crime_df.filter(lower(col("Crm_Cd_Desc")).contains("aggravated assault"))
assault_grouped_df = assault_df.withColumn(
    "Age Group",
    when(col("Vict Age") < 18, "Children")
    .when((col("Vict Age") >= 18) & (col("Vict Age") <= 24), "Young Adults")
    .when((col("Vict Age") >= 25) & (col("Vict Age") <= 64), "Adults")
    .when(col("Vict Age") > 64, "Elders")
    .otherwise(None)  
)
age_group_counts = assault_grouped_df.groupBy("Age Group").count().orderBy(col("count").desc())
age_group_counts.show(truncate=False)

end_time = time.time()
print(f"\nDataFrame (no UDF) Implementation time: {end_time - start_time:.2f} seconds")


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1745,application_1765289937462_1729,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+------+
|Age Group   |count |
+------------+------+
|Adults      |121660|
|Young Adults|33758 |
|Children    |16014 |
|Elders      |6011  |
+------------+------+


DataFrame (no UDF) Implementation time: 18.37 seconds

In [1]:
# Implementation 3: DataFrame API ( with UDF )
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType
from pyspark.sql.functions import col, lower, udf
from pyspark.sql.functions import when

spark = SparkSession \
    .builder \
    .appName("Dataframe (with UDF) query 1 execution") \
    .config("spark.executor.instances", "4") \
    .config("spark.executor.memory", "2g") \
    .config("spark.executor.cores", "1") \
    .getOrCreate()

import time

def find_age_group(age):
    try:
        age = int(age)
    except:
        return None
    if(age<18):
        return "Παιδιά"
    if(age<25):
        return "Νεαροί ενήλικοι"
    if(age<65):
        return "Ενήλικοι"
    return "Ηλικωμένοι"

start_time = time.time()

crime_schema = StructType([
    StructField("DR_NO", StringType(), True),
    StructField("Date_Rptd", StringType(), True),
    StructField("DATE_OCC", StringType(), True),
    StructField("TIME_OCC", StringType(), True),
    StructField("AREA", StringType(), True),
    StructField("AREA_NAME", StringType(), True),
    StructField("Rpt_Dist_No", StringType(), True),
    StructField("Part_1_2", StringType(), True),
    StructField("Crm_Cd", StringType(), True),
    StructField("Crm_Cd_Desc", StringType(), True),
    StructField("Mocodes", StringType(), True),
    StructField("Vict Age", StringType(), True),
])

crime1_df= spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv", \
        header=False, schema= crime_schema)
crime2_df= spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv", \
        header=False, schema= crime_schema)
crime_df= crime1_df.union(crime2_df)

aggr_df = crime_df.filter(lower(col("Crm_Cd_Desc")).contains("aggravated assault"))

find_age_group_udf = udf(find_age_group, StringType())

aggr_grouped_df = aggr_df.withColumn("Age Group",find_age_group_udf(col("Vict Age")))
group_counts = aggr_grouped_df.groupBy("Age Group").count().orderBy(col("count").desc())
group_counts.show(truncate=False)

end_time = time.time()
print(f"\nDataframe (with UDF) implementation time: {end_time -start_time:.2f} seconds")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1682,application_1765289937462_1666,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------+------+
|Age Group      |count |
+---------------+------+
|????????       |121660|
|?????? ????????|33758 |
|??????         |16014 |
|??????????     |6011  |
+---------------+------+


Dataframe (with UDF) implementation time: 23.44 seconds