In [1]:
%%configure -f
{
    "driverMemory": "2G",
    "executorMemory": "2G",
    "executorCores": 1,
    "numExecutors": 4
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1136,application_1765289937462_1129,pyspark,idle,Link,Link,,
1192,application_1765289937462_1185,pyspark,idle,Link,Link,,
1196,application_1765289937462_1189,pyspark,idle,Link,Link,,
1197,application_1765289937462_1190,pyspark,idle,Link,Link,,
1198,application_1765289937462_1191,pyspark,idle,Link,Link,,


In [2]:
from pyspark.sql import SparkSession
import time
from pyspark.sql.functions import col, desc, explode, split

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

print(f"Executors: {sc.getConf().get('spark.executor.instances')}")
print(f"Master: {sc.master}")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1199,application_1765289937462_1192,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executors: 4
Master: yarn

In [3]:
path_crimes_1 = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv"
path_crimes_2 = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv"
path_mocodes = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/MO_codes.txt"

df_crimes = spark.read.option("header", "true")\
    .csv([path_crimes_1, path_crimes_2])

mo_codes = spark.read.text(path_mocodes) \
                .select(split(col("value"), " ", 2).alias("parts")) \
                .select(
                    col("parts")[0].alias("code"),
                    col("parts")[1].alias("description")
                )

df_crimes.rdd.count()
mo_codes.rdd.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

615

In [4]:
crime_counts = df_crimes.filter(col("Mocodes").isNotNull()) \
                        .select(explode(split(col("Mocodes"), " ")).alias("code")) \
                        .filter(col("code") != "") \
                        .groupBy("code") \
                        .count()

crime_counts.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+------+
|code| count|
+----+------+
|0543|   221|
|0401| 13049|
|1280|    29|
|0201|   608|
|1008|  1071|
|0371| 15199|
|0385| 41926|
|0908|  1837|
|0535|    82|
|0514|  1034|
|1805|   910|
|1314|   449|
|1300|219082|
|0394| 25369|
|0409|  1408|
|1501|115589|
|1013|   302|
|1405|   280|
|1303|    81|
|0379|   294|
+----+------+
only showing top 20 rows

In [5]:
# times = []
# strategies = ["BROADCAST", "MERGE", "SHUFFLE_HASH", "SHUFFLE_REPLICATE_NL"]

# for strategy in strategies:
#     joined_df = crime_counts.join(mo_codes.hint(strategy), "code") \
#         .orderBy(desc("count"))

#     print(f"\n--- Plan for {strategy} ---")
#     joined_df.explain()

#     start = time.time()
#     joined_df.rdd.count()
#     end = time.time()

#     times.append(end - start)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [19]:
times = []
strategies = [
    "BROADCAST",
    "MERGE",
    "SHUFFLE_HASH",
    "SHUFFLE_REPLICATE_NL"
]

for strategy in strategies:

    # Create fresh DataFrames for this strategy
    left_df = crime_counts
    right_df = mo_codes.hint(strategy)

    joined_df = (
        left_df
        .join(right_df, "code")
        .orderBy(desc("count"))
    )

    print(f"\n--- Plan for {strategy} ---")
    joined_df.explain()

    start_time = time.time()
    joined_df.foreach(lambda _: None)
    end_time = time.time()

    times.append(end_time - start_time)

    joined_df.unpersist(blocking=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…


--- Plan for BROADCAST ---
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#129L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(count#129L DESC NULLS LAST, 1000), ENSURE_REQUIREMENTS, [plan_id=6898]
      +- Project [code#125, count#129L, description#91]
         +- BroadcastHashJoin [code#125], [code#90], Inner, BuildRight, false
            :- HashAggregate(keys=[code#125], functions=[count(1)], schema specialized)
            :  +- Exchange hashpartitioning(code#125, 1000), ENSURE_REQUIREMENTS, [plan_id=6891]
            :     +- HashAggregate(keys=[code#125], functions=[partial_count(1)], schema specialized)
            :        +- Filter NOT (code#125 = )
            :           +- Generate explode(split(Mocodes#39,  , -1)), false, [code#125]
            :              +- Filter isnotnull(Mocodes#39)
            :                 +- FileScan csv [Mocodes#39] Batched: false, DataFilters: [isnotnull(Mocodes#39)], Format: CSV, Location: InMemoryFile

In [20]:
for i, strategy in enumerate(strategies):
    print(f"Strategy {strategy} : {times[i]} sec")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Strategy BROADCAST : 16.31821346282959 sec
Strategy MERGE : 9.290767669677734 sec
Strategy SHUFFLE_HASH : 7.342074632644653 sec
Strategy SHUFFLE_REPLICATE_NL : 5.557344436645508 sec

In [8]:
dataframe_mean = sum(times) / len(strategies)
print(f"mean time for dataframes {dataframe_mean}")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
name 'times' is not defined
Traceback (most recent call last):
NameError: name 'times' is not defined

