#### SPARK CONFIGURATION 1

In [21]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "2",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1",
        "spark.driver.memory": "2g"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
4101,application_1732639283265_4041,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
4080,application_1732639283265_4020,pyspark,idle,Link,Link,,
4084,application_1732639283265_4024,pyspark,idle,Link,Link,,
4085,application_1732639283265_4025,pyspark,idle,Link,Link,,
4086,application_1732639283265_4026,pyspark,idle,Link,Link,,
4087,application_1732639283265_4027,pyspark,idle,Link,Link,,
4093,application_1732639283265_4033,pyspark,idle,Link,Link,,
4094,application_1732639283265_4034,pyspark,idle,Link,Link,,
4096,application_1732639283265_4036,pyspark,idle,Link,Link,,
4098,application_1732639283265_4038,pyspark,idle,Link,Link,,
4099,application_1732639283265_4039,pyspark,idle,Link,Link,,


In [22]:
# Access configuration
conf = spark.sparkContext.getConf()

# Print relevant executor settings
print("Executor Instances:", conf.get("spark.executor.instances"))
print("Executor Memory:", conf.get("spark.executor.memory"))
print("Executor Cores:", conf.get("spark.executor.cores"))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executor Instances: 2
Executor Memory: 2g
Executor Cores: 1

## QUERY 4: CRIME & RACE DESCRIPTION FOR 2015 - CONFIG 1

In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, when, year, to_timestamp
from pyspark.sql import functions as F
import time

spark = SparkSession.builder \
    .appName("Crime Data Analysis with Race and Region") \
    .getOrCreate()

# Start timing
start_time = time.time()

# Load Crime Data
crime_data_path = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_df = spark.read.csv(crime_data_path, header=True, inferSchema=True)

crime_df = crime_df.select(
    col("LAT").alias("latitude"),
    col("LON").alias("longitude"),
    col("Vict Descent").alias("vict_descent"),
    col("DATE OCC").alias("date_occ")  # Επιλογή της στήλης DATE OCC για φιλτράρισμα
)

# Convert DATE OCC to Timestamp type
crime_df = crime_df.withColumn("date_occ", to_timestamp(col("date_occ"), "MM/dd/yyyy hh:mm:ss a"))

# Filter for 2015
crime_df = crime_df.filter(year(col("date_occ")) == 2015)


crime_df = crime_df.withColumn("geom", expr("ST_Point(longitude, latitude)"))

# Vict Descent CSV load
csv_path = "s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv"
vict_desc_df = spark.read.csv(csv_path, header=True, inferSchema=True)

vict_desc_df = vict_desc_df.select(
    col("Vict Descent").alias("vict_descent_code"),
    col("Vict Descent Full").alias("vict_descent_full")
)

# Load GeoJSON and group for Areas
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
blocks_df = spark.read.format("geojson") \
    .option("multiLine", "true").load(geojson_path) \
    .selectExpr("explode(features) as features") \
    .select("features.*")

flattened_df = blocks_df.select(
    [col(f"properties.{col_name}").alias(col_name) for col_name in blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]
).drop("properties").drop("type")

# Group (group_comm_1 και group_comm_2)
group_comm_1_areas = ["Malibu", "Manhattan Beach", "Hermosa Beach"]
group_comm_2_areas = ["East Los Angeles", "Lynwood", "Boyle Heights"]

grouped_df = flattened_df.withColumn(
    "group_comm",
    when(col("COMM").isin(group_comm_1_areas), "group_comm_1")
    .when(col("COMM").isin(group_comm_2_areas), "group_comm_2")
    .otherwise("other")
)

#  group_comm and geodata Join
grouped_blocks_df = grouped_df.filter(col("group_comm").isin("group_comm_1", "group_comm_2")) \
    .groupBy("group_comm") \
    .agg(
        expr("ST_Union_Aggr(geometry)").alias("unified_geometry")
    )

# join  crime_df and grouped_blocks_df
join_condition = expr("ST_Within(geom, unified_geometry)")
joined_df = crime_df.join(grouped_blocks_df, join_condition, "inner")

# Join RE_codes for "Vict Descent"
final_df = joined_df.join(vict_desc_df, joined_df.vict_descent == vict_desc_df.vict_descent_code, "inner")

# Group and count group_comm και Vict Descent
crime_counts = final_df.groupBy("group_comm", "vict_descent_full") \
    .count() \
    .orderBy("count", ascending=False)

# DataFrames for every group_comm
group_comm_1_df = crime_counts.filter(col("group_comm") == "group_comm_1")
group_comm_2_df = crime_counts.filter(col("group_comm") == "group_comm_2")

# End timing
end_time = time.time()
print(f"Execution time: {round(end_time - start_time, 3)} seconds")

# Prints
print("QUERY 4 CONFIG_1 (1core/2 GB)")

print("Race Description of Crimes committed in the LA Areas with the Highest Median Income per Person (2015):")
group_comm_1_df.drop("group_comm").show(truncate=False)

print("Race Description of Crimes committed in the LA Areas with the Lowest Median Income per Person (2015):")
group_comm_2_df.drop("group_comm").show(truncate=False)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Execution time: 46.657 seconds
QUERY 4 CONFIG_1 (1core/2 GB)
Race Description of Crimes committed in the LA Areas with the Highest Median Income per Person (2015):
+-----------------+-----+
|vict_descent_full|count|
+-----------------+-----+
+-----------------+-----+

Race Description of Crimes committed in the LA Areas with the Lowest Median Income per Person (2015):
+----------------------+-----+
|vict_descent_full     |count|
+----------------------+-----+
|Hispanic/Latin/Mexican|2878 |
|Unknown               |254  |
|White                 |212  |
|Black                 |123  |
|Other                 |70   |
|Other Asian           |32   |
|Chinese               |3    |
|Filipino              |2    |
|Pacific Islander      |2    |
+----------------------+-----+

#### SPARK CONFIGURATION 2

In [25]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "2",
        "spark.executor.memory": "4g",
        "spark.executor.cores": "2",
        "spark.driver.memory": "2g"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
4102,application_1732639283265_4042,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
4080,application_1732639283265_4020,pyspark,idle,Link,Link,,
4084,application_1732639283265_4024,pyspark,idle,Link,Link,,
4085,application_1732639283265_4025,pyspark,idle,Link,Link,,
4086,application_1732639283265_4026,pyspark,idle,Link,Link,,
4087,application_1732639283265_4027,pyspark,idle,Link,Link,,
4093,application_1732639283265_4033,pyspark,idle,Link,Link,,
4094,application_1732639283265_4034,pyspark,idle,Link,Link,,
4096,application_1732639283265_4036,pyspark,idle,Link,Link,,
4098,application_1732639283265_4038,pyspark,idle,Link,Link,,
4099,application_1732639283265_4039,pyspark,idle,Link,Link,,


In [26]:
# Access configuration
conf = spark.sparkContext.getConf()

# Print relevant executor settings
print("Executor Instances:", conf.get("spark.executor.instances"))
print("Executor Memory:", conf.get("spark.executor.memory"))
print("Executor Cores:", conf.get("spark.executor.cores"))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executor Instances: 2
Executor Memory: 4g
Executor Cores: 2

## QUERY 4: CRIME & RACE DESCRIPTION FOR 2015 - CONFIG 2

In [27]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, when, year, to_timestamp
from pyspark.sql import functions as F
import time

spark = SparkSession.builder \
    .appName("Crime Data Analysis with Race and Region") \
    .getOrCreate()

# Start timing
start_time = time.time()

# Load Crime Data
crime_data_path = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_df = spark.read.csv(crime_data_path, header=True, inferSchema=True)

crime_df = crime_df.select(
    col("LAT").alias("latitude"),
    col("LON").alias("longitude"),
    col("Vict Descent").alias("vict_descent"),
    col("DATE OCC").alias("date_occ")  # Επιλογή της στήλης DATE OCC για φιλτράρισμα
)

# Convert DATE OCC to Timestamp type
crime_df = crime_df.withColumn("date_occ", to_timestamp(col("date_occ"), "MM/dd/yyyy hh:mm:ss a"))

# Filter for 2015
crime_df = crime_df.filter(year(col("date_occ")) == 2015)


crime_df = crime_df.withColumn("geom", expr("ST_Point(longitude, latitude)"))

# Vict Descent CSV load
csv_path = "s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv"
vict_desc_df = spark.read.csv(csv_path, header=True, inferSchema=True)

vict_desc_df = vict_desc_df.select(
    col("Vict Descent").alias("vict_descent_code"),
    col("Vict Descent Full").alias("vict_descent_full")
)

# Load GeoJSON and group for Areas
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
blocks_df = spark.read.format("geojson") \
    .option("multiLine", "true").load(geojson_path) \
    .selectExpr("explode(features) as features") \
    .select("features.*")

flattened_df = blocks_df.select(
    [col(f"properties.{col_name}").alias(col_name) for col_name in blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]
).drop("properties").drop("type")

# Group (group_comm_1 και group_comm_2)
group_comm_1_areas = ["Malibu", "Manhattan Beach", "Hermosa Beach"]
group_comm_2_areas = ["East Los Angeles", "Lynwood", "Boyle Heights"]

grouped_df = flattened_df.withColumn(
    "group_comm",
    when(col("COMM").isin(group_comm_1_areas), "group_comm_1")
    .when(col("COMM").isin(group_comm_2_areas), "group_comm_2")
    .otherwise("other")
)

#  group_comm and geodata Join
grouped_blocks_df = grouped_df.filter(col("group_comm").isin("group_comm_1", "group_comm_2")) \
    .groupBy("group_comm") \
    .agg(
        expr("ST_Union_Aggr(geometry)").alias("unified_geometry")
    )

# join  crime_df and grouped_blocks_df
join_condition = expr("ST_Within(geom, unified_geometry)")
joined_df = crime_df.join(grouped_blocks_df, join_condition, "inner")

# Join RE_codes for "Vict Descent"
final_df = joined_df.join(vict_desc_df, joined_df.vict_descent == vict_desc_df.vict_descent_code, "inner")

# Group and count group_comm και Vict Descent
crime_counts = final_df.groupBy("group_comm", "vict_descent_full") \
    .count() \
    .orderBy("count", ascending=False)

# DataFrames for every group_comm
group_comm_1_df = crime_counts.filter(col("group_comm") == "group_comm_1")
group_comm_2_df = crime_counts.filter(col("group_comm") == "group_comm_2")

# End timing
end_time = time.time()
print(f"Execution time: {round(end_time - start_time, 3)} seconds")

# Prints
print("QUERY 4 CONFIG_2 (2core/2 GB)")

print("Race Description of Crimes committed in the LA Areas with the Highest Median Income per Person (2015):")
group_comm_1_df.drop("group_comm").show(truncate=False)

print("Race Description of Crimes committed in the LA Areas with the Lowest Median Income per Person (2015):")
group_comm_2_df.drop("group_comm").show(truncate=False)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Execution time: 18.875 seconds
QUERY 4 CONFIG_2 (2core/2 GB)
Race Description of Crimes committed in the LA Areas with the Highest Median Income per Person (2015):
+-----------------+-----+
|vict_descent_full|count|
+-----------------+-----+
+-----------------+-----+

Race Description of Crimes committed in the LA Areas with the Lowest Median Income per Person (2015):
+----------------------+-----+
|vict_descent_full     |count|
+----------------------+-----+
|Hispanic/Latin/Mexican|2878 |
|Unknown               |254  |
|White                 |212  |
|Black                 |123  |
|Other                 |70   |
|Other Asian           |32   |
|Chinese               |3    |
|Pacific Islander      |2    |
|Filipino              |2    |
+----------------------+-----+

#### SPARK CONFIGURATION 3

In [28]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "2",
        "spark.executor.memory": "8g",
        "spark.executor.cores": "4",
        "spark.driver.memory": "2g"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
4104,application_1732639283265_4044,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
4080,application_1732639283265_4020,pyspark,idle,Link,Link,,
4084,application_1732639283265_4024,pyspark,idle,Link,Link,,
4085,application_1732639283265_4025,pyspark,idle,Link,Link,,
4086,application_1732639283265_4026,pyspark,idle,Link,Link,,
4087,application_1732639283265_4027,pyspark,idle,Link,Link,,
4093,application_1732639283265_4033,pyspark,idle,Link,Link,,
4094,application_1732639283265_4034,pyspark,idle,Link,Link,,
4096,application_1732639283265_4036,pyspark,idle,Link,Link,,
4099,application_1732639283265_4039,pyspark,idle,Link,Link,,
4103,application_1732639283265_4043,pyspark,busy,Link,Link,,


In [29]:
# Access configuration
conf = spark.sparkContext.getConf()

# Print relevant executor settings
print("Executor Instances:", conf.get("spark.executor.instances"))
print("Executor Memory:", conf.get("spark.executor.memory"))
print("Executor Cores:", conf.get("spark.executor.cores"))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executor Instances: 2
Executor Memory: 8g
Executor Cores: 4

## QUERY 4: CRIME & RACE DESCRIPTION FOR 2015 - CONFIG 3

In [31]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, when, year, to_timestamp
from pyspark.sql import functions as F
import time

spark = SparkSession.builder \
    .appName("Crime Data Analysis with Race and Region") \
    .getOrCreate()

# Start timing
start_time = time.time()

# Load Crime Data
crime_data_path = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_df = spark.read.csv(crime_data_path, header=True, inferSchema=True)

crime_df = crime_df.select(
    col("LAT").alias("latitude"),
    col("LON").alias("longitude"),
    col("Vict Descent").alias("vict_descent"),
    col("DATE OCC").alias("date_occ")  # Επιλογή της στήλης DATE OCC για φιλτράρισμα
)

# Convert DATE OCC to Timestamp type
crime_df = crime_df.withColumn("date_occ", to_timestamp(col("date_occ"), "MM/dd/yyyy hh:mm:ss a"))

# Filter for 2015
crime_df = crime_df.filter(year(col("date_occ")) == 2015)


crime_df = crime_df.withColumn("geom", expr("ST_Point(longitude, latitude)"))

# Vict Descent CSV load
csv_path = "s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv"
vict_desc_df = spark.read.csv(csv_path, header=True, inferSchema=True)

vict_desc_df = vict_desc_df.select(
    col("Vict Descent").alias("vict_descent_code"),
    col("Vict Descent Full").alias("vict_descent_full")
)

# Load GeoJSON and group for Areas
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
blocks_df = spark.read.format("geojson") \
    .option("multiLine", "true").load(geojson_path) \
    .selectExpr("explode(features) as features") \
    .select("features.*")

flattened_df = blocks_df.select(
    [col(f"properties.{col_name}").alias(col_name) for col_name in blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]
).drop("properties").drop("type")

# Group (group_comm_1 και group_comm_2)
group_comm_1_areas = ["Malibu", "Manhattan Beach", "Hermosa Beach"]
group_comm_2_areas = ["East Los Angeles", "Lynwood", "Boyle Heights"]

grouped_df = flattened_df.withColumn(
    "group_comm",
    when(col("COMM").isin(group_comm_1_areas), "group_comm_1")
    .when(col("COMM").isin(group_comm_2_areas), "group_comm_2")
    .otherwise("other")
)

#  group_comm and geodata Join
grouped_blocks_df = grouped_df.filter(col("group_comm").isin("group_comm_1", "group_comm_2")) \
    .groupBy("group_comm") \
    .agg(
        expr("ST_Union_Aggr(geometry)").alias("unified_geometry")
    )

# join  crime_df and grouped_blocks_df
join_condition = expr("ST_Within(geom, unified_geometry)")
joined_df = crime_df.join(grouped_blocks_df, join_condition, "inner")

# Join RE_codes for "Vict Descent"
final_df = joined_df.join(vict_desc_df, joined_df.vict_descent == vict_desc_df.vict_descent_code, "inner")

# Group and count group_comm και Vict Descent
crime_counts = final_df.groupBy("group_comm", "vict_descent_full") \
    .count() \
    .orderBy("count", ascending=False)

# DataFrames for every group_comm
group_comm_1_df = crime_counts.filter(col("group_comm") == "group_comm_1")
group_comm_2_df = crime_counts.filter(col("group_comm") == "group_comm_2")

# End timing
end_time = time.time()
print(f"Execution time: {round(end_time - start_time, 3)} seconds")

# Prints
print("QUERY 4 CONFIG_3 (4core/8 GB)")

print("Race Description of Crimes committed in the LA Areas with the Highest Median Income per Person (2015):")
group_comm_1_df.drop("group_comm").show(truncate=False)

print("Race Description of Crimes committed in the LA Areas with the Lowest Median Income per Person (2015):")
group_comm_2_df.drop("group_comm").show(truncate=False)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Execution time: 5.196 seconds
QUERY 4 CONFIG_3 (4core/8 GB)
Race Description of Crimes committed in the LA Areas with the Highest Median Income per Person (2015):
+-----------------+-----+
|vict_descent_full|count|
+-----------------+-----+
+-----------------+-----+

Race Description of Crimes committed in the LA Areas with the Lowest Median Income per Person (2015):
+----------------------+-----+
|vict_descent_full     |count|
+----------------------+-----+
|Hispanic/Latin/Mexican|2878 |
|Unknown               |254  |
|White                 |212  |
|Black                 |123  |
|Other                 |70   |
|Other Asian           |32   |
|Chinese               |3    |
|Filipino              |2    |
|Pacific Islander      |2    |
+----------------------+-----+