In [1]:
# Fixed benchmarking approach

crimes_old_path = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv"
crimes_new_path = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv"

crimes_old_df = spark.read.csv(crimes_old_path, header=True, inferSchema=True)
crimes_new_df = spark.read.csv(crimes_new_path, header=True, inferSchema=True)

times = {}
results = {}

# Helper function to run benchmark multiple times
def run_benchmark(name, func, warmup=1, runs=3):
    """Run benchmark with warmup and multiple iterations"""
    print(f"\n{'='*60}")
    print(f"Benchmarking: {name}")
    print(f"{'='*60}")
    
    # Warmup runs
    for i in range(warmup):
        print(f"Warmup run {i+1}/{warmup}...")
        spark.catalog.clearCache()
        func()
    
    # Actual benchmark runs
    run_times = []
    for i in range(runs):
        print(f"Benchmark run {i+1}/{runs}...")
        spark.catalog.clearCache()
        
        start_time = time.time()
        result = func()
        end_time = time.time()
        
        elapsed = end_time - start_time
        run_times.append(elapsed)
        print(f"  Time: {elapsed:.4f} sec")
    
    avg_time = sum(run_times) / len(run_times)
    print(f"\nAverage time: {avg_time:.4f} sec")
    print(f"Min time: {min(run_times):.4f} sec")
    print(f"Max time: {max(run_times):.4f} sec")
    
    return result, avg_time

# =============================================================================
# 1. Native DataFrame (without UDF)
# =============================================================================
def native_df_query():
    crimes_df = crimes_old_df.unionByName(crimes_new_df)
    assaults_df = crimes_df.filter(
        lower(col("Crm Cd Desc")).contains("aggravated assault")
    )
    
    assaults_grouped_df = assaults_df.withColumn(
        "Age_Group",
        when(col("Vict Age") < 18, "Children")
        .when((col("Vict Age") >= 18) & (col("Vict Age") <= 24), "Young Adults")
        .when((col("Vict Age") >= 25) & (col("Vict Age") <= 64), "Adults")
        .when(col("Vict Age") > 64, "Elderly")
    )
    
    result = assaults_grouped_df \
        .filter(col("Age_Group").isNotNull()) \
        .groupBy("Age_Group") \
        .count() \
        .orderBy(desc("count"))
    
    # IMPORTANT: Collect to fully materialize results
    return result.collect()

results['native_df'], times['native_df'] = run_benchmark(
    "Native DataFrame", 
    native_df_query,
    warmup=1,
    runs=3
)

# =============================================================================
# 2. DataFrame with UDF
# =============================================================================
def age_group(age):
    if age is None:
        return None
    if age < 18:
        return "Children"
    elif 18 <= age <= 24:
        return "Young Adults"
    elif 25 <= age <= 64:
        return "Adults"
    else:
        return "Elderly"

age_group_udf = udf(age_group, StringType())

def udf_query():
    crimes_df_udf = crimes_old_df.unionByName(crimes_new_df)
    assaults_df_udf = crimes_df_udf.filter(
        lower(col("Crm Cd Desc")).contains("aggravated assault")
    )
    
    udf_df = assaults_df_udf.withColumn(
        "Age_Group", 
        age_group_udf(col("Vict Age"))
    )
    
    result_udf = udf_df \
        .filter(col("Age_Group").isNotNull()) \
        .groupBy("Age_Group") \
        .count() \
        .orderBy(desc("count"))
    
    return result_udf.collect()

results['udf'], times['udf'] = run_benchmark(
    "DataFrame with UDF",
    udf_query,
    warmup=1,
    runs=3
)

# =============================================================================
# 3. RDD
# =============================================================================
def map_to_age_groups(row):
    age = row['Vict Age']
    if age is None:
        return None
    if age < 18:
        age_group = "Children"
    elif 18 <= age <= 24:
        age_group = "Young Adults"
    elif 25 <= age <= 64:
        age_group = "Adults"
    else:
        age_group = "Elderly"
    return (age_group, 1)

def rdd_query():
    crimes_old_rdd = crimes_old_df.rdd
    crimes_new_rdd = crimes_new_df.rdd
    crimes_rdd = crimes_old_rdd.union(crimes_new_rdd)
    
    assaults_rdd = crimes_rdd.filter(
        lambda row: row['Crm Cd Desc'] and 
                   'aggravated assault' in row['Crm Cd Desc'].lower()
    )
    
    counts_rdd = assaults_rdd \
        .map(map_to_age_groups) \
        .filter(lambda x: x is not None) \
        .reduceByKey(lambda a, b: a + b)
    
    sorted_rdd = counts_rdd.map(lambda x: (x[1], x[0])) \
                           .sortByKey(ascending=False)
    
    result_rdd = sorted_rdd.map(lambda x: (x[1], x[0]))
    
    return result_rdd.collect()

results['rdd'], times['rdd'] = run_benchmark(
    "RDD",
    rdd_query,
    warmup=1,
    runs=3
)

# =============================================================================
# Verification and Summary
# =============================================================================
print(f"\n{'='*60}")
print("RESULTS VERIFICATION")
print(f"{'='*60}")

# Convert results to sorted dictionaries for comparison
def to_dict(result_list):
    return {row['Age_Group'] if isinstance(row, dict) else row[0]: 
            row['count'] if isinstance(row, dict) else row[1] 
            for row in result_list}

native_dict = to_dict(results['native_df'])
udf_dict = to_dict(results['udf'])
rdd_dict = dict(results['rdd'])

print("\nNative DF:", native_dict)
print("UDF DF:   ", udf_dict)
print("RDD:      ", rdd_dict)

# Check if results match
if native_dict == udf_dict == rdd_dict:
    print("\n✓ All results match!")
else:
    print("\n✗ WARNING: Results do not match!")

print(f"\n{'='*60}")
print("PERFORMANCE SUMMARY")
print(f"{'='*60}")
print(f"Native DataFrame: {times['native_df']:.4f} sec")
print(f"DataFrame + UDF:  {times['udf']:.4f} sec")
print(f"RDD:              {times['rdd']:.4f} sec")

# Calculate speedup
baseline = times['native_df']
print(f"\nSpeedup vs Native DataFrame:")
print(f"  UDF: {baseline/times['udf']:.2f}x")
print(f"  RDD: {baseline/times['rdd']:.2f}x")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
397,application_1765289937462_0393,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
name 'lower' is not defined
Traceback (most recent call last):
  File "<stdin>", line 23, in run_benchmark
  File "<stdin>", line 52, in native_df_query
NameError: name 'lower' is not defined

