In [1]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "2",
        "spark.executor.memory": "8g",
        "spark.executor.cores": "4",
        "spark.driver.memory": "2g"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
255,application_1738075734771_0256,pyspark,idle,Link,Link,,
270,application_1738075734771_0271,pyspark,idle,Link,Link,,
284,application_1738075734771_0285,pyspark,idle,Link,Link,,
285,application_1738075734771_0286,pyspark,idle,Link,Link,,
300,application_1738075734771_0301,pyspark,idle,Link,Link,,
326,application_1738075734771_0327,pyspark,idle,Link,Link,,
330,application_1738075734771_0331,pyspark,idle,Link,Link,,
332,application_1738075734771_0333,pyspark,idle,Link,Link,,
334,application_1738075734771_0335,pyspark,idle,Link,Link,,
343,application_1738075734771_0344,pyspark,idle,Link,Link,,


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, min
from sedona.spark import *
import time

# Δημιουργία SparkSession
spark = SparkSession.builder \
    .appName("Query 5") \
    .getOrCreate()

# Sedona Context για GeoJSON δεδομένα
sedona = SedonaContext.create(spark)

# Μέτρηση χρόνου
start_time = time.time()

# Φόρτωση δεδομένων
crime_2010_2019_path = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_2020_present_path = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
police_stations_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv"

crime_df_2010_2019 = spark.read.csv(crime_2010_2019_path, header=True, inferSchema=True)
crime_df_2020_present = spark.read.csv(crime_2020_present_path, header=True, inferSchema=True)
crime_df = crime_df_2010_2019.union(crime_df_2020_present)
police_stations_df = spark.read.csv(police_stations_path, header=True, inferSchema=True)

# Δημιουργία γεωμετρικών αντικειμένων
crime_data = crime_df.filter(col("LAT").isNotNull() & col("LON").isNotNull()) \
    .withColumn("crime_geometry", ST_Point(col("LON"), col("LAT")))
police_stations = police_stations_df.withColumn("station_geometry", ST_Point(col("X"), col("Y")))

# Υπολογισμός αποστάσεων για κάθε έγκλημα και αστυνομικό τμήμα
crime_with_distances = crime_data.join(police_stations) \
    .withColumn("distance", ST_DistanceSphere(col("crime_geometry"), col("station_geometry"))/1000) 

# Υπολογισμός της ελάχιστης απόστασης για κάθε έγκλημα
min_distances = crime_with_distances.groupBy("DR_NO") \
    .agg(min("distance").alias("min_distance"))

# Μετονομασία της στήλης DR_NO στο crime_with_distances
crime_with_distances = crime_with_distances.withColumnRenamed("DR_NO", "crime_DR_NO")

# Join για να βρούμε το πλησιέστερο αστυνομικό τμήμα
crime_with_nearest_station = min_distances.join(
    crime_with_distances,
    (min_distances["DR_NO"] == crime_with_distances["crime_DR_NO"]) &
    (min_distances["min_distance"] == crime_with_distances["distance"])
).select(
    col("DR_NO").alias("crime_id"),
    col("min_distance"),
    col("DIVISION").alias("division")
)

# Υπολογισμός του αριθμού εγκλημάτων και της μέσης απόστασης ανά τμήμα
result = crime_with_nearest_station.groupBy("division") \
    .agg(
        avg("min_distance").alias("average_distance"),
        count("crime_id").alias("#")
    ) \
    .orderBy("#", ascending=False)

# Εμφάνιση αποτελεσμάτων
result.show(21)

end_time = time.time()

# Εμφάνιση χρόνου εκτέλεσης
print(f"Configuration 1 (2 executors * 4 cores/8GB memory) Execution Time: {(end_time - start_time):.2f} seconds")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
353,application_1738075734771_0354,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+------------------+------+
|        division|  average_distance|     #|
+----------------+------------------+------+
|       HOLLYWOOD| 2.076263960178723|224340|
|        VAN NUYS| 2.953369742819782|210134|
|       SOUTHWEST|2.1913988057808855|188901|
|        WILSHIRE| 2.592665532978777|185996|
|     77TH STREET|1.7165449719701003|171827|
|         OLYMPIC|1.7236036971780941|170897|
| NORTH HOLLYWOOD| 2.643006094141568|167854|
|         PACIFIC|3.8500706553079005|161359|
|         CENTRAL|0.9924764374568917|153871|
|         RAMPART|1.5345341879190049|152736|
|       SOUTHEAST|2.4218662158881825|152176|
|     WEST VALLEY|3.0356712163140793|138643|
|         TOPANGA|3.2969548417555568|138217|
|        FOOTHILL| 4.250921708424992|134896|
|          HARBOR|3.7025615993565073|126747|
|      HOLLENBECK| 366.9213044614892|119294|
|WEST LOS ANGELES|2.7924572890341137|115781|
|          NEWTON|1.6346357397097446|111110|
|       NORTHEAST|3.6236655246040765|108109|
|         

In [1]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "4",
        "spark.executor.memory": "4g",
        "spark.executor.cores": "2",
        "spark.driver.memory": "2g"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
255,application_1738075734771_0256,pyspark,idle,Link,Link,,
270,application_1738075734771_0271,pyspark,idle,Link,Link,,
284,application_1738075734771_0285,pyspark,idle,Link,Link,,
285,application_1738075734771_0286,pyspark,idle,Link,Link,,
300,application_1738075734771_0301,pyspark,idle,Link,Link,,
326,application_1738075734771_0327,pyspark,idle,Link,Link,,
330,application_1738075734771_0331,pyspark,idle,Link,Link,,
332,application_1738075734771_0333,pyspark,idle,Link,Link,,
334,application_1738075734771_0335,pyspark,idle,Link,Link,,
343,application_1738075734771_0344,pyspark,idle,Link,Link,,


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, min
from sedona.spark import *
import time

# Δημιουργία SparkSession
spark = SparkSession.builder \
    .appName("Query 5") \
    .getOrCreate()

# Sedona Context για GeoJSON δεδομένα
sedona = SedonaContext.create(spark)

# Μέτρηση χρόνου
start_time = time.time()

# Φόρτωση δεδομένων
crime_2010_2019_path = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_2020_present_path = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
police_stations_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv"

crime_df_2010_2019 = spark.read.csv(crime_2010_2019_path, header=True, inferSchema=True)
crime_df_2020_present = spark.read.csv(crime_2020_present_path, header=True, inferSchema=True)
crime_df = crime_df_2010_2019.union(crime_df_2020_present)
police_stations_df = spark.read.csv(police_stations_path, header=True, inferSchema=True)

# Δημιουργία γεωμετρικών αντικειμένων
crime_data = crime_df.filter(col("LAT").isNotNull() & col("LON").isNotNull()) \
    .withColumn("crime_geometry", ST_Point(col("LON"), col("LAT")))
police_stations = police_stations_df.withColumn("station_geometry", ST_Point(col("X"), col("Y")))

# Υπολογισμός αποστάσεων για κάθε έγκλημα και αστυνομικό τμήμα
crime_with_distances = crime_data.join(police_stations) \
    .withColumn("distance", ST_DistanceSphere(col("crime_geometry"), col("station_geometry"))/1000) 

# Υπολογισμός της ελάχιστης απόστασης για κάθε έγκλημα
min_distances = crime_with_distances.groupBy("DR_NO") \
    .agg(min("distance").alias("min_distance"))

# Μετονομασία της στήλης DR_NO στο crime_with_distances
crime_with_distances = crime_with_distances.withColumnRenamed("DR_NO", "crime_DR_NO")

# Join για να βρούμε το πλησιέστερο αστυνομικό τμήμα
crime_with_nearest_station = min_distances.join(
    crime_with_distances,
    (min_distances["DR_NO"] == crime_with_distances["crime_DR_NO"]) &
    (min_distances["min_distance"] == crime_with_distances["distance"])
).select(
    col("DR_NO").alias("crime_id"),
    col("min_distance"),
    col("DIVISION").alias("division")
)

# Υπολογισμός του αριθμού εγκλημάτων και της μέσης απόστασης ανά τμήμα
result = crime_with_nearest_station.groupBy("division") \
    .agg(
        avg("min_distance").alias("average_distance"),
        count("crime_id").alias("#")
    ) \
    .orderBy("#", ascending=False)

# Εμφάνιση αποτελεσμάτων
result.show(21)

end_time = time.time()

# Εμφάνιση χρόνου εκτέλεσης
print(f"Configuration 2 (4 executors * 2 cores/4GB memory) Execution Time: {(end_time - start_time):.2f} seconds")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
348,application_1738075734771_0349,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+------------------+------+
|        division|  average_distance|     #|
+----------------+------------------+------+
|       HOLLYWOOD| 2.076263960178723|224340|
|        VAN NUYS|2.9533697428197825|210134|
|       SOUTHWEST| 2.191398805780886|188901|
|        WILSHIRE| 2.592665532978777|185996|
|     77TH STREET|1.7165449719701007|171827|
|         OLYMPIC|1.7236036971780941|170897|
| NORTH HOLLYWOOD|2.6430060941415667|167854|
|         PACIFIC|3.8500706553079014|161359|
|         CENTRAL|0.9924764374568917|153871|
|         RAMPART| 1.534534187919005|152736|
|       SOUTHEAST| 2.421866215888182|152176|
|     WEST VALLEY|3.0356712163140815|138643|
|         TOPANGA|3.2969548417555563|138217|
|        FOOTHILL| 4.250921708424992|134896|
|          HARBOR|3.7025615993565073|126747|
|      HOLLENBECK| 366.9213044614893|119294|
|WEST LOS ANGELES| 2.792457289034113|115781|
|          NEWTON|1.6346357397097453|111110|
|       NORTHEAST|3.6236655246040748|108109|
|         

In [1]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "8",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1",
        "spark.driver.memory": "2g"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
255,application_1738075734771_0256,pyspark,idle,Link,Link,,
270,application_1738075734771_0271,pyspark,idle,Link,Link,,
284,application_1738075734771_0285,pyspark,idle,Link,Link,,
285,application_1738075734771_0286,pyspark,idle,Link,Link,,
300,application_1738075734771_0301,pyspark,idle,Link,Link,,
326,application_1738075734771_0327,pyspark,idle,Link,Link,,
330,application_1738075734771_0331,pyspark,idle,Link,Link,,
332,application_1738075734771_0333,pyspark,idle,Link,Link,,
334,application_1738075734771_0335,pyspark,idle,Link,Link,,
343,application_1738075734771_0344,pyspark,idle,Link,Link,,


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, min
from sedona.spark import *
import time

# Δημιουργία SparkSession
spark = SparkSession.builder \
    .appName("Query 5") \
    .getOrCreate()

# Sedona Context για GeoJSON δεδομένα
sedona = SedonaContext.create(spark)

# Μέτρηση χρόνου
start_time = time.time()

# Φόρτωση δεδομένων
crime_2010_2019_path = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_2020_present_path = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
police_stations_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv"

crime_df_2010_2019 = spark.read.csv(crime_2010_2019_path, header=True, inferSchema=True)
crime_df_2020_present = spark.read.csv(crime_2020_present_path, header=True, inferSchema=True)
crime_df = crime_df_2010_2019.union(crime_df_2020_present)
police_stations_df = spark.read.csv(police_stations_path, header=True, inferSchema=True)

# Δημιουργία γεωμετρικών αντικειμένων
crime_data = crime_df.filter(col("LAT").isNotNull() & col("LON").isNotNull()) \
    .withColumn("crime_geometry", ST_Point(col("LON"), col("LAT")))
police_stations = police_stations_df.withColumn("station_geometry", ST_Point(col("X"), col("Y")))

# Υπολογισμός αποστάσεων για κάθε έγκλημα και αστυνομικό τμήμα
crime_with_distances = crime_data.join(police_stations) \
    .withColumn("distance", ST_DistanceSphere(col("crime_geometry"), col("station_geometry"))/1000) 

# Υπολογισμός της ελάχιστης απόστασης για κάθε έγκλημα
min_distances = crime_with_distances.groupBy("DR_NO") \
    .agg(min("distance").alias("min_distance"))

# Μετονομασία της στήλης DR_NO στο crime_with_distances
crime_with_distances = crime_with_distances.withColumnRenamed("DR_NO", "crime_DR_NO")

# Join για να βρούμε το πλησιέστερο αστυνομικό τμήμα
crime_with_nearest_station = min_distances.join(
    crime_with_distances,
    (min_distances["DR_NO"] == crime_with_distances["crime_DR_NO"]) &
    (min_distances["min_distance"] == crime_with_distances["distance"])
).select(
    col("DR_NO").alias("crime_id"),
    col("min_distance"),
    col("DIVISION").alias("division")
)

# Υπολογισμός του αριθμού εγκλημάτων και της μέσης απόστασης ανά τμήμα
result = crime_with_nearest_station.groupBy("division") \
    .agg(
        avg("min_distance").alias("average_distance"),
        count("crime_id").alias("#")
    ) \
    .orderBy("#", ascending=False)

# Εμφάνιση αποτελεσμάτων
result.show(21)

end_time = time.time()

# Εμφάνιση χρόνου εκτέλεσης
print(f"Configuration 3 (8 executors * 1 cores/2GB memory) Execution Time: {(end_time - start_time):.2f} seconds")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
351,application_1738075734771_0352,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+------------------+------+
|        division|  average_distance|     #|
+----------------+------------------+------+
|       HOLLYWOOD|2.0762639601787227|224340|
|        VAN NUYS|2.9533697428197816|210134|
|       SOUTHWEST| 2.191398805780886|188901|
|        WILSHIRE| 2.592665532978776|185996|
|     77TH STREET|   1.7165449719701|171827|
|         OLYMPIC|1.7236036971780937|170897|
| NORTH HOLLYWOOD|2.6430060941415694|167854|
|         PACIFIC| 3.850070655307899|161359|
|         CENTRAL|0.9924764374568924|153871|
|         RAMPART|1.5345341879190053|152736|
|       SOUTHEAST| 2.421866215888183|152176|
|     WEST VALLEY|3.0356712163140815|138643|
|         TOPANGA| 3.296954841755555|138217|
|        FOOTHILL| 4.250921708424992|134896|
|          HARBOR|3.7025615993565077|126747|
|      HOLLENBECK| 366.9213044614894|119294|
|WEST LOS ANGELES|2.7924572890341124|115781|
|          NEWTON|1.6346357397097449|111110|
|       NORTHEAST|3.6236655246040748|108109|
|         