In [1]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
import time

# Initialize Spark Session and Sedona Context
spark = SparkSession.builder \
    .appName("CrimeAnalysisQuery5") \
    .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions") \
    .config("sedona.global.charset", "utf8") \
    .config("spark.executor.instances", 2) \
    .config("spark.executor.memory", "8g") \
    .config("spark.executor.cores", 4) \
    .getOrCreate()
    # .config("spark.driver.memory", "4g") \

# Start timing
start_time = time.time()
# Load Police Stations Data (CSV format)
police_station_data_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv"
police_df = spark.read.csv(police_station_data_path, header=True, inferSchema=True)

# Create a geometry column for police stations using ST_Point
police_df = police_df.withColumn("station_point", F.expr("ST_Point(X, Y)"))

crime_data_path = "s3://groups-bucket-dblab-905418150721/group24/results/q2_parquet_maindata/"
crime_df = spark.read.parquet(crime_data_path)

# Create geometry point from latitude and longitude
crime_df = crime_df.withColumn("crime_point", F.expr("ST_Point(LON, LAT)"))


# Add unique ID to each crime record
crime_df = crime_df.withColumn("crime_id", F.row_number().over(Window.orderBy(F.lit(1))))

# Cross join crime and police station data
cross_df = crime_df.crossJoin(police_df)

# Calculate distance
cross_df = cross_df.withColumn("distance", F.expr("ST_DistanceSphere(crime_point, station_point)/1000"))

# Find the nearest police station for each crime
window_spec = Window.partitionBy("crime_id").orderBy("distance")
nearest_station_df = cross_df.withColumn("rank", F.row_number().over(window_spec)).filter(F.col("rank") == 1)

results_df = nearest_station_df.groupBy("DIVISION").agg(
    F.count("*").alias("crime_count"),
    F.avg("distance").alias("average_distance")
).orderBy(F.col("crime_count").desc())

results_df.show()

# Stop timing and print out the execution duration
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1943,application_1732639283265_1904,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+-----------+------------------+
|        DIVISION|crime_count|  average_distance|
+----------------+-----------+------------------+
|       HOLLYWOOD|     224340| 2.076263960178773|
|        VAN NUYS|     210134|2.9533697428196213|
|       SOUTHWEST|     188901|2.1913988057806097|
|        WILSHIRE|     185996|2.5926655329788635|
|     77TH STREET|     171827|1.7165449719700194|
|         OLYMPIC|     170897| 1.723603697177835|
| NORTH HOLLYWOOD|     167854| 2.643006094141679|
|         PACIFIC|     161359|  3.85007065530769|
|         CENTRAL|     153871| 0.992476437456772|
|         RAMPART|     152736|1.5345341879189556|
|       SOUTHEAST|     152176|2.4218662158882407|
|     WEST VALLEY|     138643|3.0356712163141375|
|         TOPANGA|     138217| 3.296954841755896|
|        FOOTHILL|     134896| 4.250921708425352|
|          HARBOR|     126747|3.7025615993562746|
|      HOLLENBECK|     119294|366.92130446149946|
|WEST LOS ANGELES|     115781| 2.792457289033499|


In [2]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
import time

# Initialize Spark Session and Sedona Context
spark = SparkSession.builder \
    .appName("CrimeAnalysisQuery5") \
    .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions") \
    .config("sedona.global.charset", "utf8") \
    .config("spark.executor.instances", 4) \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.cores", 2) \
    .getOrCreate()
    # .config("spark.driver.memory", "4g") \

# Start timing
start_time = time.time()
# Load Police Stations Data (CSV format)
police_station_data_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv"
police_df = spark.read.csv(police_station_data_path, header=True, inferSchema=True)

# Create a geometry column for police stations using ST_Point
police_df = police_df.withColumn("station_point", F.expr("ST_Point(X, Y)"))

crime_data_path = "s3://groups-bucket-dblab-905418150721/group24/results/q2_parquet_maindata/"
crime_df = spark.read.parquet(crime_data_path)

# Create geometry point from latitude and longitude
crime_df = crime_df.withColumn("crime_point", F.expr("ST_Point(LON, LAT)"))


# Add unique ID to each crime record
crime_df = crime_df.withColumn("crime_id", F.row_number().over(Window.orderBy(F.lit(1))))

# Cross join crime and police station data
cross_df = crime_df.crossJoin(police_df)

# Calculate distance
cross_df = cross_df.withColumn("distance", F.expr("ST_DistanceSphere(crime_point, station_point)/1000"))

# Find the nearest police station for each crime
window_spec = Window.partitionBy("crime_id").orderBy("distance")
nearest_station_df = cross_df.withColumn("rank", F.row_number().over(window_spec)).filter(F.col("rank") == 1)

results_df = nearest_station_df.groupBy("DIVISION").agg(
    F.count("*").alias("crime_count"),
    F.avg("distance").alias("average_distance")
).orderBy(F.col("crime_count").desc())

results_df.show()

# Stop timing and print out the execution duration
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+-----------+------------------+
|        DIVISION|crime_count|  average_distance|
+----------------+-----------+------------------+
|       HOLLYWOOD|     224340| 2.076263960178773|
|        VAN NUYS|     210134|2.9533697428196213|
|       SOUTHWEST|     188901|2.1913988057806097|
|        WILSHIRE|     185996|2.5926655329788635|
|     77TH STREET|     171827|1.7165449719700194|
|         OLYMPIC|     170897| 1.723603697177835|
| NORTH HOLLYWOOD|     167854| 2.643006094141679|
|         PACIFIC|     161359|  3.85007065530769|
|         CENTRAL|     153871| 0.992476437456772|
|         RAMPART|     152736|1.5345341879189556|
|       SOUTHEAST|     152176|2.4218662158882407|
|     WEST VALLEY|     138643|3.0356712163141375|
|         TOPANGA|     138217| 3.296954841755896|
|        FOOTHILL|     134896| 4.250921708425352|
|          HARBOR|     126747|3.7025615993562746|
|      HOLLENBECK|     119294|366.92130446149946|
|WEST LOS ANGELES|     115781| 2.792457289033499|


In [3]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
import time

# Initialize Spark Session and Sedona Context
spark = SparkSession.builder \
    .appName("CrimeAnalysisQuery5") \
    .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions") \
    .config("sedona.global.charset", "utf8") \
    .config("spark.executor.instances", 8) \
    .config("spark.executor.memory", "2g") \
    .config("spark.executor.cores", 1) \
    .getOrCreate()
    # .config("spark.driver.memory", "4g") \

# Start timing
start_time = time.time()
# Load Police Stations Data (CSV format)
police_station_data_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv"
police_df = spark.read.csv(police_station_data_path, header=True, inferSchema=True)

# Create a geometry column for police stations using ST_Point
police_df = police_df.withColumn("station_point", F.expr("ST_Point(X, Y)"))

crime_data_path = "s3://groups-bucket-dblab-905418150721/group24/results/q2_parquet_maindata/"
crime_df = spark.read.parquet(crime_data_path)

# Create geometry point from latitude and longitude
crime_df = crime_df.withColumn("crime_point", F.expr("ST_Point(LON, LAT)"))


# Add unique ID to each crime record
crime_df = crime_df.withColumn("crime_id", F.row_number().over(Window.orderBy(F.lit(1))))

# Cross join crime and police station data
cross_df = crime_df.crossJoin(police_df)

# Calculate distance
cross_df = cross_df.withColumn("distance", F.expr("ST_DistanceSphere(crime_point, station_point)/1000"))

# Find the nearest police station for each crime
window_spec = Window.partitionBy("crime_id").orderBy("distance")
nearest_station_df = cross_df.withColumn("rank", F.row_number().over(window_spec)).filter(F.col("rank") == 1)

results_df = nearest_station_df.groupBy("DIVISION").agg(
    F.count("*").alias("crime_count"),
    F.avg("distance").alias("average_distance")
).orderBy(F.col("crime_count").desc())

results_df.show()

# Stop timing and print out the execution duration
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+-----------+------------------+
|        DIVISION|crime_count|  average_distance|
+----------------+-----------+------------------+
|       HOLLYWOOD|     224340| 2.076263960178773|
|        VAN NUYS|     210134|2.9533697428196213|
|       SOUTHWEST|     188901|2.1913988057806097|
|        WILSHIRE|     185996|2.5926655329788635|
|     77TH STREET|     171827|1.7165449719700194|
|         OLYMPIC|     170897| 1.723603697177835|
| NORTH HOLLYWOOD|     167854| 2.643006094141679|
|         PACIFIC|     161359|  3.85007065530769|
|         CENTRAL|     153871| 0.992476437456772|
|         RAMPART|     152736|1.5345341879189556|
|       SOUTHEAST|     152176|2.4218662158882407|
|     WEST VALLEY|     138643|3.0356712163141375|
|         TOPANGA|     138217| 3.296954841755896|
|        FOOTHILL|     134896| 4.250921708425352|
|          HARBOR|     126747|3.7025615993562746|
|      HOLLENBECK|     119294|366.92130446149946|
|WEST LOS ANGELES|     115781| 2.792457289033499|


In [None]:
# The results indicate that increasing parallelism (more executors) slightly improves the runtime, but the differences are not substantial. Here's a breakdown of the observations:

# ---

# ### **1. Interpretation of the Results**
# - **2 executors × 4 cores/8GB memory:**
#   - Total Resources: 8 cores, 16GB memory.
#   - Time: 103.90 seconds.
#   - Performance: This configuration had fewer executors but more resources per executor, resulting in fewer parallel tasks.

# - **4 executors × 2 cores/4GB memory:**
#   - Total Resources: 8 cores, 16GB memory.
#   - Time: 101.33 seconds.
#   - Performance: Increasing the number of executors reduced the workload per executor, leading to better utilization of parallelism.

# - **8 executors × 1 core/2GB memory:**
#   - Total Resources: 8 cores, 16GB memory.
#   - Time: 99.94 seconds.
#   - Performance: Further increasing the number of executors maximized parallelism and task distribution, resulting in the best time.

# ---

# ### **2. Key Insights**
# 1. **Marginal Improvements:**  
#    - The runtime improvements are marginal because the workload might not fully benefit from higher levels of parallelism. This suggests the task's bottleneck could be in I/O operations, data shuffling, or serialization/deserialization rather than computation.

# 2. **Overhead of More Executors:**  
#    - Adding more executors introduces some overhead in managing communication between them. However, the 8-executor configuration balanced this overhead with better task distribution, resulting in the fastest runtime.

# 3. **Resource Allocation Trade-offs:**  
#    - More cores per executor (2 executors × 4 cores) provide higher computational power per task, but it limits the number of tasks running in parallel.
#    - Fewer cores per executor (8 executors × 1 core) increase the number of parallel tasks, which is beneficial if the workload can be split efficiently.

# ---

# ### **3. Recommendations**
# - **Optimal Configuration:** Based on these results, the **8 executors × 1 core/2GB memory** setup is optimal for this query as it achieves the fastest time.
# - **Further Tuning:**
#   - If possible, reduce the data size being shuffled by optimizing the data partitioning strategy.
#   - Experiment with Spark's default parallelism (`spark.default.parallelism`) or the number of shuffle partitions (`spark.sql.shuffle.partitions`) to further improve performance.
# - **Monitor Resource Usage:** Use Spark's UI to monitor resource utilization (CPU, memory, disk, network). If utilization is low, further tuning may be necessary.

# By balancing parallelism with resource allocation, you can achieve the best performance for workloads like this one.