# Query 1

## Load and describe data

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count
import time

# Build (or reuse) the Spark session, setting spark.executor.instances to 4
spark = (
    SparkSession.builder
    .appName("Query1")
    .config("spark.executor.instances", "4")
    .getOrCreate()
)

# Wall-clock reference point for the whole cell
start_time = time.time()

# S3 locations of the two crime-data CSV exports
crime_data_path_1 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_data_path_2 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"

# Read both files: the first row is the header and column types are inferred
data_1 = spark.read.csv(crime_data_path_1, header=True, inferSchema=True)
data_2 = spark.read.csv(crime_data_path_2, header=True, inferSchema=True)

# Validation: print the first 3 rows of the first dataset
data_1.limit(3).show()

# Summary statistics for each dataset, followed by each inferred schema
for dataset in (data_1, data_2):
    dataset.describe().show()
for dataset in (data_1, data_2):
    dataset.printSchema()

# Report how long the whole cell took
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3059,application_1732639283265_3017,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+--------------------+--------------------+--------+-----+---------+-----------+--------+------+--------------------+--------------+--------+--------+------------+---------+--------------------+--------------+-----------+------+------------+--------+--------+--------+--------+--------------------+--------------------+-------+---------+
|   DR_NO|           Date Rptd|            DATE OCC|TIME OCC|AREA |AREA NAME|Rpt Dist No|Part 1-2|Crm Cd|         Crm Cd Desc|       Mocodes|Vict Age|Vict Sex|Vict Descent|Premis Cd|         Premis Desc|Weapon Used Cd|Weapon Desc|Status| Status Desc|Crm Cd 1|Crm Cd 2|Crm Cd 3|Crm Cd 4|            LOCATION|        Cross Street|    LAT|      LON|
+--------+--------------------+--------------------+--------+-----+---------+-----------+--------+------+--------------------+--------------+--------+--------+------------+---------+--------------------+--------------+-----------+------+------------+--------+--------+--------+--------+--------------------

## Query - DataFrame API

In [2]:
# Wall-clock reference point for the whole cell (reading the CSVs included)
start_time = time.time()

# Re-read both crime datasets so the measurement also covers loading
crime_data_path_1 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_data_path_2 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
data_1 = spark.read.csv(crime_data_path_1, header=True, inferSchema=True)
data_2 = spark.read.csv(crime_data_path_2, header=True, inferSchema=True)

# Stack the two datasets; union() pairs columns by position, not by name
data = data_1.union(data_2)

# Keep only the rows whose crime description mentions "AGGRAVATED ASSAULT".
# NOTE: where() is a lazy transformation, so this timer only covers building
# the query plan; the rows are scanned later, when show() triggers the job.
start_time2 = time.time()

aggravated_assault_data = data.where(
    col("Crm Cd Desc").contains("AGGRAVATED ASSAULT")
)

end_time2 = time.time()
elapsed_time2 = end_time2 - start_time2
print(f"Time taken for finding all Ag. Assault Rows (DF): {elapsed_time2:.4f} seconds")

# Label every victim with an age group and count the rows per group.
# NOTE: withColumn()/groupBy()/agg() are lazy as well, so the same caveat
# applies to the timing printed for this step.
start_time2 = time.time()

victim_age = col("Vict Age")
age_group_expr = (
    when(victim_age < 18, "Children")
    .when(victim_age.between(18, 24), "Young Adults")   # inclusive on both ends
    .when(victim_age.between(25, 64), "Adults")         # inclusive on both ends
    .when(victim_age > 64, "Elderly")
)
categorized_data = aggravated_assault_data.withColumn("Age Group", age_group_expr)
age_group_counts = (
    categorized_data
    .groupBy("Age Group")
    .agg(count("Age Group").alias("Count"))
)

end_time2 = time.time()
elapsed_time2 = end_time2 - start_time2
print(f"Time taken for Grouping (DF): {elapsed_time2:.4f} seconds")

# Largest group first (descending order of Count)
sorted_age_groups = age_group_counts.orderBy("Count", ascending=False)

# show() is the action that actually executes the whole pipeline
sorted_age_groups.show()

# Report the end-to-end duration of the cell
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.4f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Time taken for finding all Ag. Assault Rows (DF): 0.0275 seconds
Time taken for Grouping (DF): 0.0629 seconds
+------------+------+
|   Age Group| Count|
+------------+------+
|      Adults|121093|
|Young Adults| 33605|
|    Children| 15928|
|     Elderly|  5985|
+------------+------+

Time taken: 5.0901 seconds

## Query - RDD API

In [3]:
def _is_aggravated_assault(row):
    """Return True when the row's crime description mentions aggravated assault.

    A missing (None) description is treated as "no match" instead of raising
    TypeError, mirroring how the DataFrame filter drops null descriptions.
    """
    description = row["Crm Cd Desc"]
    return description is not None and "AGGRAVATED ASSAULT" in description


def _age_group(age):
    """Map a victim age to its group label.

    < 18 -> "Children", 18-24 -> "Young Adults", 25-64 -> "Adults",
    anything else -> "Elderly".
    """
    if age < 18:
        return "Children"
    if age <= 24:
        return "Young Adults"
    if age <= 64:
        return "Adults"
    return "Elderly"


# Start timing
start_time = time.time()

# Load data
crime_data_path_1 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_data_path_2 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
data_1 = spark.read.csv(crime_data_path_1, header=True, inferSchema=True)
data_2 = spark.read.csv(crime_data_path_2, header=True, inferSchema=True)

# Concatenate datasets (union() pairs columns by position)
data = data_1.union(data_2)

# Convert to RDD.
# NOTE: this and the following RDD transformations are lazy; the step timers
# below only measure how long it takes to define them. The actual work runs
# when collect() is called, so only the final "Time taken" covers execution.
start_time2 = time.time()

rdd = data.rdd

end_time2 = time.time()
elapsed_time2 = end_time2 - start_time2
print(f"Time taken for conversion to RDD: {elapsed_time2:.4f} seconds")

# Filter for "aggravated assault"
start_time2 = time.time()

aggravated_assault_rdd = rdd.filter(_is_aggravated_assault)

end_time2 = time.time()
elapsed_time2 = end_time2 - start_time2
print(f"Time taken for finding all Ag. Assault Rows (RDD): {elapsed_time2:.4f} seconds")

# Categorize to age groups
start_time2 = time.time()

categorized_rdd = (
    aggravated_assault_rdd
    # Rows without an age cannot be compared against the thresholds; skip them
    # instead of failing the job with a TypeError.
    .filter(lambda row: row["Vict Age"] is not None)
    .map(lambda row: (_age_group(row["Vict Age"]), 1))
)

# Group and count
age_group_counts_rdd = categorized_rdd.reduceByKey(lambda a, b: a + b)

end_time2 = time.time()
elapsed_time2 = end_time2 - start_time2
print(f"Time taken for Grouping (RDD): {elapsed_time2:.4f} seconds")

# Sort by descending order
sorted_age_groups_rdd = age_group_counts_rdd.sortBy(lambda x: x[1], ascending=False)

# Show results.
# The loop variable is deliberately NOT called `count`: that would overwrite
# the pyspark.sql.functions.count imported at the top of the notebook and
# break the DataFrame query cell if it is run again afterwards.
for group, group_count in sorted_age_groups_rdd.collect():
    print(f"{group}: {group_count}")

# Stop timing and print out the execution duration
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.4f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Time taken for conversion to RDD: 0.0715 seconds
Time taken for finding all Ag. Assault Rows (RDD): 0.0008 seconds
Time taken for Grouping (RDD): 0.0403 seconds
Adults: 121093
Young Adults: 33605
Children: 15928
Elderly: 5985
Time taken: 8.3549 seconds

In [4]:
# Left disabled: uncomment to stop the Spark session once every query above has been run.
# spark.stop()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…