In [1]:
from pyspark.sql import SparkSession, functions as F
from sedona.register.geo_registrator import SedonaRegistrator
from sedona.spark import SedonaContext
from pyspark.sql.types import StringType


import time

# Initialize Spark Session and Sedona Context
spark = SparkSession.builder \
    .appName("CrimeAnalysisSpatialJoinSQL1") \
    .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions") \
    .config("sedona.global.charset", "utf8") \
    .config("spark.executor.instances", 2)   \
    .config("spark.executor.memory", "2g") \
    .config("spark.executor.cores", 1)  \
    .getOrCreate()
    # .config("spark.driver.memory", "4g") \


sedona = SedonaContext.create(spark)

# Start timing
start_time = time.time()
# Load datasets
census_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
income_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv"
crime_path = "s3://groups-bucket-dblab-905418150721/group24/results/q2_parquet_maindata/"

# Load Census Data (GeoJSON format)
census_df = sedona.read.format("geojson") \
    .option("multiLine", "true").load(census_path) \
    .selectExpr("explode(features) as features") \
    .select("features.*")

# Flatten the GeoJSON structure and filter valid populations
census_df = census_df.select(
    F.col("properties.ZCTA10").alias("ZCTA10"),
    F.col("properties.POP_2010").alias("Population"),
    F.col("properties.COMM").alias("COMM"),
    F.col("geometry").alias("geometry"),
    F.col("properties.HOUSING10").alias("HOUSING10"),
    F.col("properties.CITY").alias("CITY"),
# ).filter(F.col("Population") > 0)
).filter((F.col("CITY") == "Los Angeles") & (F.col("Population") > 0) & (F.col("HOUSING10") > 0))
# Load Income Data (CSV format)
income_df = spark.read.csv(income_path, header=True, inferSchema=True)
income_df = income_df.withColumn(
    "Income",
    F.regexp_replace(F.col("Estimated Median Income"), "[$,]", "").cast("double")
).withColumnRenamed("Zip Code","Zip").drop("Estimated Median Income")

# Calculate total income per community (sum of income contributions per zip)
income_total = census_df.join(income_df, census_df.ZCTA10 == income_df.Zip, "inner") \
    .groupBy("COMM").agg(
        F.sum(F.col("Income") * F.col("HOUSING10")).alias("Total_Income")
    )

# Aggregate census data by community
census_agg = census_df.groupBy("COMM").agg(
    F.sum("Population").alias("Total_Population"),
    F.sum("HOUSING10").alias("Total_Households")
)

# Join census and income data
census_income = census_agg.join(income_total, "COMM", "inner")

# Calculate Mean Income Per Person
census_income = census_income.withColumn(
    "Mean_Income_Per_Person",
    F.col("Total_Income") / F.col("Total_Population")
)

# Step 1: Get the top 3 and bottom 3 communities by Mean Income Per Person
top_communities = census_income.orderBy(F.col("Mean_Income_Per_Person"), ascending=False).limit(3)
bottom_communities = census_income.orderBy(F.col("Mean_Income_Per_Person"), ascending=True).limit(3)

# Step 2: Filter crime data for top and bottom communities
top_community_codes = [row['COMM'] for row in top_communities.collect()]
bottom_community_codes = [row['COMM'] for row in bottom_communities.collect()]

# Load Crime Data (Parquet format)
crime_df = spark.read.parquet(crime_path)
# Convert DATE OCC to a timestamp format
crime_df = crime_df.withColumn("DATE_OCC_TS", F.to_timestamp("DATE OCC", "MM/dd/yyyy hh:mm:ss a"))

# Filter rows where the year is 2015
crime_df = crime_df.filter(F.year("DATE_OCC_TS") == 2015)

# Create geometry column using ST_Point
crime_df = crime_df.withColumn("geometry", F.expr("ST_Point(LON, LAT)"))

# Step 3: Join the crime data with census data to include 'COMM' (Community) info
crime_with_comm_df = crime_df.alias("cr").join(
    census_df.alias("c"),
    F.expr("ST_Within(cr.geometry, c.geometry)"),
    "inner"
)

# Filter crime data for top communities (those with highest income)
top_crime_df = crime_with_comm_df.filter(F.col("COMM").isin(top_community_codes))

# Filter crime data for bottom communities (those with lowest income)
bottom_crime_df = crime_with_comm_df.filter(F.col("COMM").isin(bottom_community_codes))

# Step 4: Load race codes dataset to translate 'Vict Descent'
race_codes_path = "s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv"
race_codes_df = spark.read.csv(race_codes_path, header=True, inferSchema=True)

# Join the crime data with the race codes to get the full description of 'Vict Descent'
top_crime_with_race_df = top_crime_df.join(race_codes_df, top_crime_df["Vict Descent"] == race_codes_df["Vict Descent"]) #, "left_outer")

# Aggregate by 'Vict Descent Full' and count victims for top communities
top_crime_race_count = top_crime_with_race_df.groupBy("Vict Descent Full").agg(
    F.count("*").alias("Victim_Count")
).orderBy(F.col("Victim_Count"), ascending=False)

# Join the crime data with the race codes for bottom communities
bottom_crime_with_race_df = bottom_crime_df.join(race_codes_df, bottom_crime_df["Vict Descent"] == race_codes_df["Vict Descent"]) #, "left_outer")

# Aggregate by 'Vict Descent Full' and count victims for bottom communities
bottom_crime_race_count = bottom_crime_with_race_df.groupBy("Vict Descent Full").agg(
    F.count("*").alias("Victim_Count")
).orderBy(F.col("Victim_Count"), ascending=False)


top_crime_race_count.show(truncate=False)
bottom_crime_race_count.show(truncate=False)

# Stop timing and print out the execution duration
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1942,application_1732639283265_1903,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------------------------+------------+
|Vict Descent Full             |Victim_Count|
+------------------------------+------------+
|White                         |649         |
|Other                         |72          |
|Hispanic/Latin/Mexican        |66          |
|Unknown                       |38          |
|Black                         |37          |
|Other Asian                   |21          |
|Chinese                       |1           |
|American Indian/Alaskan Native|1           |
+------------------------------+------------+

+------------------------------+------------+
|Vict Descent Full             |Victim_Count|
+------------------------------+------------+
|Hispanic/Latin/Mexican        |2815        |
|Black                         |761         |
|White                         |330         |
|Other                         |187         |
|Other Asian                   |113         |
|Unknown                       |22          |
|American Indian/Alaskan Native|2

In [6]:
from pyspark.sql import SparkSession, functions as F
from sedona.register.geo_registrator import SedonaRegistrator
from sedona.spark import SedonaContext
from pyspark.sql.types import StringType
import time

# Initialize Spark Session and Sedona Context
spark = SparkSession.builder \
    .appName("CrimeAnalysisSpatialJoinSQL2") \
    .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions") \
    .config("sedona.global.charset", "utf8") \
    .config("spark.executor.instances", 2)   \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.cores", 2)  \
    .getOrCreate()
    # .config("spark.driver.memory", "8g") \


sedona = SedonaContext.create(spark)
# Start timing
start_time = time.time()
# Load datasets
census_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
income_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv"
crime_path = "s3://groups-bucket-dblab-905418150721/group24/results/q2_parquet_maindata/"

# Load Census Data (GeoJSON format)
census_df = sedona.read.format("geojson") \
    .option("multiLine", "true").load(census_path) \
    .selectExpr("explode(features) as features") \
    .select("features.*")

# Flatten the GeoJSON structure and filter valid populations
census_df = census_df.select(
    F.col("properties.ZCTA10").alias("ZCTA10"),
    F.col("properties.POP_2010").alias("Population"),
    F.col("properties.COMM").alias("COMM"),
    F.col("geometry").alias("geometry"),
    F.col("properties.HOUSING10").alias("HOUSING10"),
).filter(F.col("Population") > 0)  # Exclude zero or negative population

# Load Income Data (CSV format)
income_df = spark.read.csv(income_path, header=True, inferSchema=True)
income_df = income_df.withColumn(
    "Income",
    F.regexp_replace(F.col("Estimated Median Income"), "[$,]", "").cast("double")
).withColumnRenamed("Zip Code","Zip").drop("Estimated Median Income")

# Calculate total income per community (sum of income contributions per zip)
income_total = census_df.join(income_df, census_df.ZCTA10 == income_df.Zip, "inner") \
    .groupBy("COMM").agg(
        F.sum(F.col("Income") * F.col("HOUSING10")).alias("Total_Income")
    )

# Aggregate census data by community
census_agg = census_df.groupBy("COMM").agg(
    F.sum("Population").alias("Total_Population"),
    F.sum("HOUSING10").alias("Total_Households")
)

# Join census and income data
census_income = census_agg.join(income_total, "COMM", "inner")

# Calculate Mean Income Per Person
census_income = census_income.withColumn(
    "Mean_Income_Per_Person",
    F.col("Total_Income") / F.col("Total_Population")
)

# Step 1: Get the top 3 and bottom 3 communities by Mean Income Per Person
top_communities = census_income.orderBy(F.col("Mean_Income_Per_Person"), ascending=False).limit(3)
bottom_communities = census_income.orderBy(F.col("Mean_Income_Per_Person"), ascending=True).limit(3)

# Step 2: Filter crime data for top and bottom communities
top_community_codes = [row['COMM'] for row in top_communities.collect()]
bottom_community_codes = [row['COMM'] for row in bottom_communities.collect()]

# Load Crime Data (Parquet format)
crime_df = spark.read.parquet(crime_path)

# Create geometry column using ST_Point
crime_df = crime_df.withColumn("geometry", F.expr("ST_Point(LON, LAT)"))

# Step 3: Join the crime data with census data to include 'COMM' (Community) info
crime_with_comm_df = crime_df.alias("cr").join(
    census_df.alias("c"),
    F.expr("ST_Within(cr.geometry, c.geometry)"),
    "inner"
)
# .select(
#     "census_df.COMM"
# )

# Filter crime data for top communities (those with highest income)
top_crime_df = crime_with_comm_df.filter(F.col("COMM").isin(top_community_codes))

# Filter crime data for bottom communities (those with lowest income)
bottom_crime_df = crime_with_comm_df.filter(F.col("COMM").isin(bottom_community_codes))

# Step 4: Load race codes dataset to translate 'Vict Descent'
race_codes_path = "s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv"
race_codes_df = spark.read.csv(race_codes_path, header=True, inferSchema=True)

# Join the crime data with the race codes to get the full description of 'Vict Descent'
top_crime_with_race_df = top_crime_df.join(race_codes_df, top_crime_df["Vict Descent"] == race_codes_df["Vict Descent"], "left_outer")

# Aggregate by 'Vict Descent Full' and count victims for top communities
top_crime_race_count = top_crime_with_race_df.groupBy("Vict Descent Full").agg(
    F.count("*").alias("Victim_Count")
).orderBy(F.col("Victim_Count"), ascending=False)

# Join the crime data with the race codes for bottom communities
bottom_crime_with_race_df = bottom_crime_df.join(race_codes_df, bottom_crime_df["Vict Descent"] == race_codes_df["Vict Descent"], "left_outer")

# Aggregate by 'Vict Descent Full' and count victims for bottom communities
bottom_crime_race_count = bottom_crime_with_race_df.groupBy("Vict Descent Full").agg(
    F.count("*").alias("Victim_Count")
).orderBy(F.col("Victim_Count"), ascending=False)

# Clean up NULL values by adding them to the existing 'Unknown' category
top_crime_race_count_cleaned = top_crime_race_count.groupBy(
    F.when(F.col("Vict Descent Full").isNull(), "Unknown")
    .otherwise(F.col("Vict Descent Full")).alias("Vict Descent")
).agg(
    F.sum("Victim_Count").alias("#")
).orderBy(F.col("#"), ascending=False)

bottom_crime_race_count_cleaned = bottom_crime_race_count.groupBy(
    F.when(F.col("Vict Descent Full").isNull(), "Unknown")
    .otherwise(F.col("Vict Descent Full")).alias("Vict Descent")
).agg(
    F.sum("Victim_Count").alias("#")
).orderBy(F.col("#"), ascending=False)


# You can now show the cleaned-up DataFrames or export them
top_crime_race_count_cleaned.show(truncate=False)
bottom_crime_race_count_cleaned.show(truncate=False)

# Stop timing and print out the execution duration
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------------------------+----+
|Vict Descent                  |#   |
+------------------------------+----+
|White                         |5726|
|Unknown                       |836 |
|Other                         |690 |
|Hispanic/Latin/Mexican        |558 |
|Black                         |260 |
|Other Asian                   |204 |
|Chinese                       |29  |
|Japanese                      |20  |
|Filipino                      |16  |
|Korean                        |13  |
|AsianIndian                   |5   |
|American Indian/Alaskan Native|4   |
|Vietnamese                    |3   |
|Hawaiian                      |1   |
+------------------------------+----+

+------------------------------+-----+
|Vict Descent                  |#    |
+------------------------------+-----+
|Hispanic/Latin/Mexican        |19703|
|Black                         |6316 |
|Unknown                       |5077 |
|White                         |1469 |
|Other                         |807  |
|Ot

In [3]:
from pyspark.sql import SparkSession, functions as F
from sedona.register.geo_registrator import SedonaRegistrator
from sedona.spark import SedonaContext
from pyspark.sql.types import StringType
import time

# Initialize Spark Session and Sedona Context
spark = SparkSession.builder \
    .appName("CrimeAnalysisSpatialJoinSQL3") \
    .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions") \
    .config("sedona.global.charset", "utf8") \
    .config("spark.executor.instances", 2)   \
    .config("spark.executor.memory", "8g") \
    .config("spark.executor.cores", 4)  \
    .getOrCreate()

    # .config("spark.driver.memory", "16g") \

sedona = SedonaContext.create(spark)
# Start timing
start_time = time.time()
# Load datasets
census_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
income_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv"
crime_path = "s3://groups-bucket-dblab-905418150721/group24/results/q2_parquet_maindata/"

# Load Census Data (GeoJSON format)
census_df = sedona.read.format("geojson") \
    .option("multiLine", "true").load(census_path) \
    .selectExpr("explode(features) as features") \
    .select("features.*")

# Flatten the GeoJSON structure and filter valid populations
census_df = census_df.select(
    F.col("properties.ZCTA10").alias("ZCTA10"),
    F.col("properties.POP_2010").alias("Population"),
    F.col("properties.COMM").alias("COMM"),
    F.col("geometry").alias("geometry"),
    F.col("properties.HOUSING10").alias("HOUSING10"),
).filter(F.col("Population") > 0)  # Exclude zero or negative population

# Load Income Data (CSV format)
income_df = spark.read.csv(income_path, header=True, inferSchema=True)
income_df = income_df.withColumn(
    "Income",
    F.regexp_replace(F.col("Estimated Median Income"), "[$,]", "").cast("double")
).withColumnRenamed("Zip Code","Zip").drop("Estimated Median Income")

# Calculate total income per community (sum of income contributions per zip)
income_total = census_df.join(income_df, census_df.ZCTA10 == income_df.Zip, "inner") \
    .groupBy("COMM").agg(
        F.sum(F.col("Income") * F.col("HOUSING10")).alias("Total_Income")
    )

# Aggregate census data by community
census_agg = census_df.groupBy("COMM").agg(
    F.sum("Population").alias("Total_Population"),
    F.sum("HOUSING10").alias("Total_Households")
)

# Join census and income data
census_income = census_agg.join(income_total, "COMM", "inner")

# Calculate Mean Income Per Person
census_income = census_income.withColumn(
    "Mean_Income_Per_Person",
    F.col("Total_Income") / F.col("Total_Population")
)

# Step 1: Get the top 3 and bottom 3 communities by Mean Income Per Person
top_communities = census_income.orderBy(F.col("Mean_Income_Per_Person"), ascending=False).limit(3)
bottom_communities = census_income.orderBy(F.col("Mean_Income_Per_Person"), ascending=True).limit(3)

# Step 2: Filter crime data for top and bottom communities
top_community_codes = [row['COMM'] for row in top_communities.collect()]
bottom_community_codes = [row['COMM'] for row in bottom_communities.collect()]

# Load Crime Data (Parquet format)
crime_df = spark.read.parquet(crime_path)

# Create geometry column using ST_Point
crime_df = crime_df.withColumn("geometry", F.expr("ST_Point(LON, LAT)"))

# Step 3: Join the crime data with census data to include 'COMM' (Community) info
crime_with_comm_df = crime_df.alias("cr").join(
    census_df.alias("c"),
    F.expr("ST_Within(cr.geometry, c.geometry)"),
    "inner"
)
# .select(
#     "census_df.COMM"
# )

# Filter crime data for top communities (those with highest income)
top_crime_df = crime_with_comm_df.filter(F.col("COMM").isin(top_community_codes))

# Filter crime data for bottom communities (those with lowest income)
bottom_crime_df = crime_with_comm_df.filter(F.col("COMM").isin(bottom_community_codes))

# Step 4: Load race codes dataset to translate 'Vict Descent'
race_codes_path = "s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv"
race_codes_df = spark.read.csv(race_codes_path, header=True, inferSchema=True)

# Join the crime data with the race codes to get the full description of 'Vict Descent'
top_crime_with_race_df = top_crime_df.join(race_codes_df, top_crime_df["Vict Descent"] == race_codes_df["Vict Descent"], "left_outer")

# Aggregate by 'Vict Descent Full' and count victims for top communities
top_crime_race_count = top_crime_with_race_df.groupBy("Vict Descent Full").agg(
    F.count("*").alias("Victim_Count")
).orderBy(F.col("Victim_Count"), ascending=False)

# Join the crime data with the race codes for bottom communities
bottom_crime_with_race_df = bottom_crime_df.join(race_codes_df, bottom_crime_df["Vict Descent"] == race_codes_df["Vict Descent"], "left_outer")

# Aggregate by 'Vict Descent Full' and count victims for bottom communities
bottom_crime_race_count = bottom_crime_with_race_df.groupBy("Vict Descent Full").agg(
    F.count("*").alias("Victim_Count")
).orderBy(F.col("Victim_Count"), ascending=False)

# Clean up NULL values by adding them to the existing 'Unknown' category
top_crime_race_count_cleaned = top_crime_race_count.groupBy(
    F.when(F.col("Vict Descent Full").isNull(), "Unknown")
    .otherwise(F.col("Vict Descent Full")).alias("Vict Descent Full")
).agg(
    F.sum("Victim_Count").alias("Victim_Count")
).orderBy(F.col("Victim_Count"), ascending=False)

bottom_crime_race_count_cleaned = bottom_crime_race_count.groupBy(
    F.when(F.col("Vict Descent Full").isNull(), "Unknown")
    .otherwise(F.col("Vict Descent Full")).alias("Vict Descent Full")
).agg(
    F.sum("Victim_Count").alias("Victim_Count")
).orderBy(F.col("Victim_Count"), ascending=False)


# You can now show the cleaned-up DataFrames or export them
top_crime_race_count_cleaned.show(truncate=False)
bottom_crime_race_count_cleaned.show(truncate=False)

# Stop timing and print out the execution duration
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------------------------+------------+
|Vict Descent Full             |Victim_Count|
+------------------------------+------------+
|White                         |5726        |
|Unknown                       |836         |
|Other                         |690         |
|Hispanic/Latin/Mexican        |558         |
|Black                         |260         |
|Other Asian                   |204         |
|Chinese                       |29          |
|Japanese                      |20          |
|Filipino                      |16          |
|Korean                        |13          |
|AsianIndian                   |5           |
|American Indian/Alaskan Native|4           |
|Vietnamese                    |3           |
|Hawaiian                      |1           |
+------------------------------+------------+

+------------------------------+------------+
|Vict Descent Full             |Victim_Count|
+------------------------------+------------+
|Hispanic/Latin/Mexican        |1

In [4]:
The observed times for the three different configurations suggest that, as the resources allocated to Spark increase (in terms of CPU cores and memory), the performance of the analysis improves, but the improvement is not always drastic. Let's break down and analyze the results:

### Results Overview:
1. **First Case (1 core / 2 GB memory per executor)**: 
   - **Time Taken**: 101.66 seconds.
   
2. **Second Case (2 cores / 4 GB memory per executor)**: 
   - **Time Taken**: 101.32 seconds.

3. **Third Case (4 cores / 8 GB memory per executor)**: 
   - **Time Taken**: 84.34 seconds.

### Observations:
- **First and Second Cases**: 
  - The time taken between the first and second configurations is almost identical (101.66 seconds vs 101.32 seconds). This suggests that doubling the number of cores and memory had little to no effect on the performance for this workload. It's possible that the task you are running is not highly parallelizable or that it is limited by other factors (such as disk I/O, network latency, or the size of data not being large enough to fully utilize the extra cores and memory).
  
- **Second and Third Cases**: 
  - The third case (4 cores and 8 GB memory) shows a noticeable improvement (84.34 seconds) compared to the second case (101.32 seconds). This suggests that with more cores and memory, Spark can execute more operations in parallel or handle larger intermediate datasets more efficiently. The improvement here indicates that the workload benefits from additional parallelism, but the returns diminish with each increase in resources.
  
### Possible Reasons for These Observations:
1. **Diminishing Returns**: As more cores and memory are allocated, the performance improvements tend to diminish. This is typical for many types of data processing workloads. For small to moderate-sized tasks, you might not see a large speedup when doubling the resources because the overhead of managing these resources (e.g., task scheduling, memory management) may offset some of the potential benefits.
  
2. **Task Characteristics**: 
   - The nature of the task matters. For tasks that are highly parallelizable and require significant computation (e.g., large-scale matrix operations or iterative algorithms), adding more cores and memory could lead to substantial performance gains. However, for tasks that are more dependent on I/O or where the data size is small enough to fit comfortably in the available memory, adding more resources might not show as dramatic an improvement.
  
3. **Data Size**: 
   - If your dataset is not large enough, Spark might not fully utilize the additional cores. In such cases, the time for job execution may be limited by other factors such as network bandwidth, disk read/write speed, or the execution overhead.
  
4. **Job Complexity**: 
   - The job you're running may not have enough computationally intensive stages to benefit from additional cores. For example, if your job spends time waiting for I/O operations or is CPU-bound on a relatively small amount of data, adding more executors might not significantly speed up execution.

### Conclusion:
While the third configuration (4 cores / 8 GB memory) shows the best performance, the results indicate that for your particular workload, the benefits of increasing resources (cores and memory) are relatively modest. In particular, going from 1 core to 2 cores did not show significant improvement, and the performance improvement from 2 cores to 4 cores is noticeable but not drastic.

For further analysis:
- You could experiment with larger datasets or more complex operations to see if the resource allocation truly impacts performance.
- Monitoring the resource utilization during the run (e.g., CPU and memory usage) can provide insights into whether resources are being fully utilized.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
invalid syntax (<stdin>, line 1)
  File "<stdin>", line 1
    The observed times for the three different configurations suggest that, as the resources allocated to Spark increase (in terms of CPU cores and memory), the performance of the analysis improves, but the improvement is not always drastic. Let's break down and analyze the results:
        ^
SyntaxError: invalid syntax

