In [1]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "4",
        "spark.executor.memory": "1g",
        "spark.executor.cores": "1",
        "spark.driver.memory": "2g"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
126,application_1738075734771_0127,pyspark,idle,Link,Link,,
129,application_1738075734771_0130,pyspark,idle,Link,Link,,
145,application_1738075734771_0146,pyspark,idle,Link,Link,,
148,application_1738075734771_0149,pyspark,idle,Link,Link,,
149,application_1738075734771_0150,pyspark,idle,Link,Link,,
155,application_1738075734771_0156,pyspark,idle,Link,Link,,
156,application_1738075734771_0157,pyspark,idle,Link,Link,,
161,application_1738075734771_0162,pyspark,idle,Link,Link,,
165,application_1738075734771_0166,pyspark,idle,Link,Link,,
170,application_1738075734771_0171,pyspark,idle,Link,Link,,


In [2]:
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count

# Create (or reuse) the SparkSession
spark = SparkSession.builder \
    .appName("Query1 - Dataframe API") \
    .getOrCreate()

# Start the wall-clock timer. Everything below (schema inference, load,
# filter, aggregation, show) is included in the reported execution time.
start_time_df = time.time()

# Load the data
crime_2010_2019_path = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_2020_present_path = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"

crime_df_2010_2019 = spark.read.csv(crime_2010_2019_path, header=True, inferSchema=True)
crime_df_2020_present = spark.read.csv(crime_2020_present_path, header=True, inferSchema=True)
# union() pairs columns by position, not by name, so both files are assumed
# to share the same column order -- TODO confirm if the source files change.
crime_df = crime_df_2010_2019.union(crime_df_2020_present)

# Keep only "aggravated assault" incidents.
filtered_data = crime_df.filter(col("Crm Cd Desc").like("%AGGRAVATED ASSAULT%"))

# Build the age groups.
# The trailing otherwise() gives records with a missing victim age an explicit
# "Unknown" label instead of a null group key, matching the RDD variant.
age_grouped = filtered_data.withColumn(
    "Age_Group",
    when(col("Vict Age") < 18, "Children")
    .when((col("Vict Age") >= 18) & (col("Vict Age") <= 24), "Young Adults")
    .when((col("Vict Age") >= 25) & (col("Vict Age") <= 64), "Adults")
    .when(col("Vict Age") > 64, "Seniors")
    .otherwise("Unknown")
)

# Count the victims in each group
result_df = age_grouped.groupBy("Age_Group").agg(count("*").alias("Count"))

# Show the results in descending order of count
sorted_result_df = result_df.orderBy(col("Count").desc())
sorted_result_df.show()

end_time_df = time.time()
df_execution_time = end_time_df - start_time_df
print(f"DataFrame API Execution Time: {df_execution_time:.2f} seconds")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
217,application_1738075734771_0218,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+------+
|   Age_Group| Count|
+------------+------+
|      Adults|121093|
|Young Adults| 33605|
|    Children| 15928|
|     Seniors|  5985|
+------------+------+

DataFrame API Execution Time: 42.64 seconds

In [1]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "4",
        "spark.executor.memory": "1g",
        "spark.executor.cores": "1",
        "spark.driver.memory": "2g"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
126,application_1738075734771_0127,pyspark,idle,Link,Link,,
129,application_1738075734771_0130,pyspark,idle,Link,Link,,
145,application_1738075734771_0146,pyspark,idle,Link,Link,,
148,application_1738075734771_0149,pyspark,idle,Link,Link,,
149,application_1738075734771_0150,pyspark,idle,Link,Link,,
155,application_1738075734771_0156,pyspark,idle,Link,Link,,
156,application_1738075734771_0157,pyspark,idle,Link,Link,,
161,application_1738075734771_0162,pyspark,idle,Link,Link,,
165,application_1738075734771_0166,pyspark,idle,Link,Link,,
170,application_1738075734771_0171,pyspark,idle,Link,Link,,


In [2]:
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count

# Create (or reuse) the SparkSession
spark = SparkSession.builder \
    .appName("Query1 - RDD API") \
    .getOrCreate()

# Start the wall-clock timer; the matching stop is taken after the results
# have been collected and printed.
start_time_rdd = time.time()

# Load the data
crime_2010_2019_path = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_2020_present_path = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"

# header=True consumes each file's header line so it is not processed as a
# data record; rows are still accessed by position below.
crime_rdd_2010_2019 = spark.read.csv(crime_2010_2019_path, header=True).rdd
crime_rdd_2020_present = spark.read.csv(crime_2020_present_path, header=True).rdd
crime_rdd = crime_rdd_2010_2019.union(crime_rdd_2020_present)

# Keep only "aggravated assault" incidents (case-insensitive match on
# column 9 -- presumably the crime description; confirm against the CSV).
# Null values in that column are skipped instead of raising AttributeError
# on .lower().
filtered_rdd = crime_rdd.filter(
    lambda row: row[9] is not None and "aggravated assault" in row[9].lower()
)

# Build the age groups
def categorize_age(row):
    """Map one record to an ``(age_group, 1)`` pair ready for ``reduceByKey``.

    The age is read from ``row[11]`` (assumed to be the victim-age column --
    confirm against the CSV layout) and parsed with ``int``.

    Args:
        row: An indexable record with at least 12 fields.

    Returns:
        A ``(label, 1)`` tuple where ``label`` is one of ``"Children"``
        (< 18), ``"Young Adults"`` (18-24), ``"Adults"`` (25-64),
        ``"Seniors"`` (> 64), or ``"Unknown"`` when the age is missing or
        cannot be parsed as an integer.
    """
    # Parse first and treat failures as "Unknown": int(None) raises TypeError
    # and a blank / non-numeric string raises ValueError, so a None check
    # placed after the conversion could never be reached.
    try:
        age = int(row[11])
    except (TypeError, ValueError):
        return ("Unknown", 1)

    if age < 18:
        return ("Children", 1)
    if age <= 24:
        return ("Young Adults", 1)
    if age <= 64:
        return ("Adults", 1)
    return ("Seniors", 1)

# Turn every filtered record into an (age_group, 1) pair
categorized_rdd = filtered_rdd.map(categorize_age)

# Count the victims in each group by summing the 1s per key
result_rdd = categorized_rdd.reduceByKey(lambda a, b: a + b)

# Show the results in descending order of count
sorted_result_rdd = result_rdd.sortBy(lambda x: x[1], ascending=False)
# The loop variables deliberately avoid the name `count`, which this cell
# imports from pyspark.sql.functions; rebinding it here would break any later
# cell that calls count(...).
for age_group, victim_total in sorted_result_rdd.collect():
    print(f"{age_group}: {victim_total}")

end_time_rdd = time.time()
rdd_execution_time = end_time_rdd - start_time_rdd
print(f"\nRDD API Execution Time: {rdd_execution_time:.2f} seconds")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
229,application_1738075734771_0230,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Adults: 121093
Young Adults: 33605
Children: 15928
Seniors: 5985

RDD API Execution Time: 36.45 seconds