In [1]:
import pandas as pd
import time
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lower, count

# Build (or reuse) the SparkSession for this query, requesting 4 executors.
# NOTE(review): getOrCreate() hands back an already-running session when one
# exists; confirm in the Spark UI that the executor setting actually applied.
spark = (
    SparkSession.builder
    .appName("Query 1 - Age Sorting")
    .config("spark.executor.instances", 4)
    .getOrCreate()
)

# Locations of the two crime-data CSV exports (2010-2019 and 2020-present).
file_path = 's3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv'
file_path2 = 's3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv'

# Read each file, taking column names from the header row and letting Spark
# infer the column types.
data1 = spark.read.options(header=True, inferSchema=True).csv(file_path)
data2 = spark.read.options(header=True, inferSchema=True).csv(file_path2)

# Stack both periods into a single DataFrame. union() matches columns by
# position, not by name, so both files must share the same column order.
data = data1.union(data2)

# Label every record with the victim's age bracket:
#   < 18 -> "Children", 18-24 -> "Young Adults",
#   25-64 -> "Adults",  > 64 -> "Elderly".
# There is no otherwise() branch, so a row whose "Vict Age" is null matches
# nothing and gets a null Age_Group.
vict_age = col("Vict Age")
age_group = (
    when(vict_age < 18, "Children")
    .when(vict_age.between(18, 24), "Young Adults")
    .when(vict_age.between(25, 64), "Adults")
    .when(vict_age > 64, "Elderly")
)
data = data.withColumn("Age_Group", age_group)

# Keep only incidents whose description mentions "aggravated assault"
# (the description is lower-cased first, so the match is case-insensitive).
is_aggravated_assault = lower(col("Crm Cd Desc")).contains("aggravated assault")
filtered_data = data.filter(is_aggravated_assault)

# Time the DataFrame API variant. The transformations below are lazy, so the
# job actually runs when show() is called, inside the timed window.
# perf_counter() is a monotonic clock intended for measuring elapsed time, so
# the measurement is not affected by wall-clock adjustments.
start_time_df = time.perf_counter()

# Count incidents per age group.
result_df = filtered_data.groupBy("Age_Group").agg(count("*").alias("Incident_Count"))

# Order the groups from most to fewest incidents.
sorted_result = result_df.orderBy(col("Incident_Count").desc())

# Print the DataFrame API result (this is the action that triggers execution).
sorted_result.show()

end_time_df = time.perf_counter()
execution_time_df = end_time_df - start_time_df
print(f"Execution Time DataFrame API: {execution_time_df:.4f} seconds")

# Time the RDD API variant. collect() is the action that materializes the
# result, so it sits inside the timed window.
# perf_counter() is a monotonic clock intended for measuring elapsed time, so
# the measurement is not affected by wall-clock adjustments.
start_time_rdd = time.perf_counter()

# Map each row to (age group, 1), sum the ones per group, then sort the
# (group, count) pairs by count, largest first.
rdd = (
    filtered_data.rdd
    .map(lambda row: (row["Age_Group"], 1))
    .reduceByKey(lambda x, y: x + y)
    .sortBy(lambda x: x[1], ascending=False)
)

# Print the RDD API result.
print(rdd.collect())

end_time_rdd = time.perf_counter()
execution_time_rdd = end_time_rdd - start_time_rdd
print(f"Execution Time RDD API: {execution_time_rdd:.4f} seconds")

# Report which variant finished sooner. A tie is reported as the RDD API,
# because only a strictly smaller DataFrame time selects the first message.
print(
    "DataFrame API was faster."
    if execution_time_df < execution_time_rdd
    else "RDD API was faster."
)


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1538,application_1732639283265_1499,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+--------------+
|   Age_Group|Incident_Count|
+------------+--------------+
|      Adults|        121093|
|Young Adults|         33605|
|    Children|         15928|
|     Elderly|          5985|
+------------+--------------+

Execution Time DataFrame API: 5.0830 seconds
[('Adults', 121093), ('Young Adults', 33605), ('Children', 15928), ('Elderly', 5985)]
Execution Time RDD API: 12.9648 seconds
DataFrame API was faster.