In [1]:
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count

# Paths for the two CSV files
csv_path_1 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
csv_path_2 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"


# Start Spark session
spark = SparkSession.builder \
    .appName("Query 1") \
    .config("spark.executor.instances", 4) \
    .getOrCreate()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1864,application_1732639283265_1825,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
#DATAFRAME
start_time = time.time()

# Load the first CSV file
csv_df_1 = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(csv_path_1)

# Load the second CSV file
csv_df_2 = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(csv_path_2)

# Combine the two DataFrames
combined_df = csv_df_1.union(csv_df_2)

combined_df_1 = combined_df.filter(~((col('LAT') == 0) & (col('LON') == 0)))

# Filter for aggravated assault crimes
aggravated_assault_df = combined_df_1.filter(combined_df["Crm Cd Desc"].rlike("(?i).*aggravated assault*"))

# Add age group column
age_grouped_df = aggravated_assault_df.withColumn(
    "Age Group",
    when((col("Vict Age") > 0) & (col("Vict Age") < 18), "Children")
    .when((col("Vict Age") >= 18) & (col("Vict Age") <= 24), "Young Adults")
    .when((col("Vict Age") >= 25) & (col("Vict Age") <= 64), "Adults")
    .when(col("Vict Age") > 64, "Elderly")
    .otherwise("Unknown")
)

age_group_counts_df_1 = age_grouped_df.filter(col("Age Group") != "Unknown")

# Group by age group and count incidents
age_group_counts_df_2 = age_group_counts_df_1.groupBy("Age Group").agg(count("*").alias("Incident Count"))

# Sort by incident count
sorted_age_groups_df = age_group_counts_df_2.orderBy(col("Incident Count").desc())
end_time = time.time()
df_execution_time = end_time - start_time

# Show results
sorted_age_groups_df.show()

# Measure execution time
print(f"DataFrame API execution time: {df_execution_time} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+--------------+
|   Age Group|Incident Count|
+------------+--------------+
|      Adults|        121052|
|Young Adults|         33588|
|    Children|         10825|
|     Elderly|          5985|
+------------+--------------+

DataFrame API execution time: 29.42403221130371 seconds

In [3]:
#RDD
start_time = time.time()

# Load both CSV files into RDDs
csv_rdd_1 = spark.read.csv(csv_path_1, header=True, inferSchema=True).rdd
csv_rdd_2 = spark.read.csv(csv_path_2, header=True, inferSchema=True).rdd

# Combine the RDDs
combined_rdd = csv_rdd_1.union(csv_rdd_2)

# Filter for aggravated assault crimes
filtered_rdd = combined_rdd.filter(lambda row: not (row['LAT'] == 0 and row['LON'] == 0))

filtered_rdd_1 = filtered_rdd.filter(lambda row: 'aggravated assault' in (row['Crm Cd Desc'] or "").lower())

# Map age groups
age_grouped_rdd = filtered_rdd_1.map(lambda row: (
    "Children" if 1 <= row['Vict Age'] < 18 else
    "Young Adults" if 18 <= row['Vict Age'] <= 24 else
    "Adults" if 25 <= row['Vict Age'] <= 64 else
    "Elderly" if row['Vict Age'] >= 65 else
    "Unknown", 1  # Handle any invalid or missing ages as "Unknown"
))

age_grouped_rdd_1 = age_grouped_rdd.filter(lambda row: row[0] != "Unknown")

# Reduce by key to get counts
age_group_counts_rdd = age_grouped_rdd_1.reduceByKey(lambda x, y: x + y)

# Sort by incident count
sorted_age_groups_rdd = age_group_counts_rdd.sortBy(lambda x: x[1], ascending=False)

# Collect results
results = sorted_age_groups_rdd.collect()

# End timing
end_time = time.time()
rdd_execution_time = end_time - start_time

# Print results
for age_group, count in results:
    print(f"{age_group}: {count} incidents")

print(f"RDD API execution time: {rdd_execution_time} seconds")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Adults: 121052 incidents
Young Adults: 33588 incidents
Children: 10825 incidents
Elderly: 5985 incidents
RDD API execution time: 38.487404346466064 seconds

In [26]:
from pyspark.sql.functions import col, sum, count, row_number, desc,  year, lower, to_date 
from pyspark.sql.window import Window
from pyspark.sql import functions as F
import time

# Step 2: Load Datasets
stations_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv"
crime_2010_2019_path = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_2020_present_path = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [28]:
start_time = time.time()

# Load Police Stations Data
stations_df = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(stations_path)

# Load Crime Data (2010-2019)
crime_2010_2019_df = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(crime_2010_2019_path)

# Load Crime Data (2020-Present)
crime_2020_present_df = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(crime_2020_present_path)

# Step 3: Combine Crime Datasets
combined_crime_df_1 = crime_2010_2019_df.union(crime_2020_present_df)

combined_crime_df = combined_crime_df_1.filter(~((col('LAT') == 0) & (col('LON') == 0)))

result_df = stations_df.join(
    combined_crime_df,
    lower(stations_df['DIVISION']) == lower(combined_crime_df['AREA NAME']),
    'inner'
)

# Add a Year column and filter only resolved cases
df_with_year = result_df.withColumn("Year", year(to_date(col("DATE OCC"), "MM/dd/yyyy")))

df_with_year_resolved = df_with_year.filter((col("Status Desc") != "UNK") & (col("Status Desc") != "Invest Cont"))

# Step 3: Calculate Total and Closed Cases per Year and Division
total_cases_df = df_with_year.groupBy("Year", "AREA NAME").agg(
    F.count("*").alias("TotalCases")
)

closed_cases_df = df_with_year_resolved.groupBy("Year", "AREA NAME").agg(
    F.count("*").alias("ClosedCases")
)

# Step 4: Join the Total Cases and Closed Cases DataFrames
cases_df = total_cases_df.join(closed_cases_df, ["Year", "AREA NAME"], "left")



# Step 5: Calculate the Closed Case Percentage
cases_df = cases_df.withColumn(
    "ClosedCasePercentage", 
    (F.col("ClosedCases") / F.col("TotalCases")) * 100
)

# Step 6: Rank divisions within each year based on Closed Case Percentage
window_spec = Window.partitionBy("Year").orderBy(F.col("ClosedCasePercentage").desc())
ranked_df = cases_df.withColumn("Rank", F.row_number().over(window_spec))

# Step 7: Filter top 3 divisions for each year
top_3_df = ranked_df.filter(F.col("Rank") <= 3)

# Step 8: Order by year and rank
result_df_1 = top_3_df.orderBy("Year", "Rank")

result_df = result_df_1.select("Year", "AREA NAME", "ClosedCasePercentage", "Rank")


end_time = time.time()
elapsed_time = end_time - start_time
print(f"Execution Time: {elapsed_time} seconds")


# Show the results
result_df.show(70, truncate=False)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Execution Time: 15.308379173278809 seconds
+----+-----------+--------------------+----+
|Year|AREA NAME  |ClosedCasePercentage|Rank|
+----+-----------+--------------------+----+
|2010|Rampart    |32.85090742017      |1   |
|2010|Olympic    |31.515289821999087  |2   |
|2010|Harbor     |29.36028339237341   |3   |
|2011|Olympic    |35.03192688118192   |1   |
|2011|Rampart    |32.500296103280824  |2   |
|2011|Harbor     |28.516260162601625  |3   |
|2012|Olympic    |34.295435879385195  |1   |
|2012|Rampart    |32.461037450569904  |2   |
|2012|Harbor     |29.534834324553948  |3   |
|2013|Olympic    |33.58217940999398   |1   |
|2013|Rampart    |32.1060382916053    |2   |
|2013|Harbor     |29.70696405267529   |3   |
|2014|Van Nuys   |32.00295639320029   |1   |
|2014|West Valley|31.512710797885727  |2   |
|2014|Mission    |31.21740874448456   |3   |
|2015|Van Nuys   |32.265140677157845  |1   |
|2015|Mission    |30.466622852314334  |2   |
|2015|Foothill   |30.353001803658852  |3   |
|2016|Van Nu

In [29]:
start_time = time.time()

# Load Police Stations Data
stations_df = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(stations_path)

# Load Crime Data (2010-2019)
crime_2010_2019_df = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(crime_2010_2019_path)

# Load Crime Data (2020-Present)
crime_2020_present_df = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(crime_2020_present_path)

# Step 3: Combine Crime Datasets
combined_crime_df_1 = crime_2010_2019_df.union(crime_2020_present_df)

combined_crime_df = combined_crime_df_1.filter(~((col('LAT') == 0) & (col('LON') == 0)))

result_df = stations_df.join(
    combined_crime_df,
    lower(stations_df['DIVISION']) == lower(combined_crime_df['AREA NAME']),
    'inner'
)


stations_df.createOrReplaceTempView("stations")
combined_crime_df.createOrReplaceTempView("combined_crime")

# Step 2: Run the SQL query
query = """
WITH ResolvedCases AS (
    SELECT 
        s.DIVISION, 
        c.*, 
        CASE 
            WHEN c.`Status Desc` NOT IN ('UNK', 'Invest Cont') THEN 1 
            ELSE 0 
        END AS is_resolved,
        YEAR(TO_DATE(c.`DATE OCC`, 'MM/dd/yyyy')) AS Year
    FROM 
        stations s
    INNER JOIN 
        combined_crime c
    ON 
        LOWER(s.DIVISION) = LOWER(c.`AREA NAME`)
),
TotalCases AS (
    SELECT 
        Year, 
        `AREA NAME` AS Division, 
        COUNT(*) AS TotalCases
    FROM 
        ResolvedCases
    GROUP BY 
        Year, `AREA NAME`
),
ClosedCases AS (
    SELECT 
        Year, 
        `AREA NAME` AS Division, 
        COUNT(*) AS ClosedCases
    FROM 
        ResolvedCases
    WHERE 
        is_resolved = 1
    GROUP BY 
        Year, `AREA NAME`
),
CasesWithPercentage AS (
    SELECT 
        t.Year, 
        t.Division, 
        t.TotalCases, 
        COALESCE(c.ClosedCases, 0) AS ClosedCases, 
        (COALESCE(c.ClosedCases, 0) / t.TotalCases) * 100 AS ClosedCasePercentage
    FROM 
        TotalCases t
    LEFT JOIN 
        ClosedCases c
    ON 
        t.Year = c.Year AND t.Division = c.Division
),
RankedDivisions AS (
    SELECT 
        Year, 
        Division, 
        TotalCases, 
        ClosedCases, 
        ClosedCasePercentage,
        ROW_NUMBER() OVER (PARTITION BY Year ORDER BY ClosedCasePercentage DESC) AS Rank
    FROM 
        CasesWithPercentage
)
SELECT 
    Year, 
    Division, 
    ClosedCasePercentage,
    Rank
FROM 
    RankedDivisions
WHERE 
    Rank <= 3
ORDER BY 
    Year, Rank
"""
# Execute the query
result_df = spark.sql(query)

# End timing
end_time = time.time()

# Calculate the elapsed time
execution_time = end_time - start_time

# Show the results
result_df.show(70, truncate=False)

# Print the execution time
print(f"Query execution time: {execution_time:.2f} seconds")



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+-----------+--------------------+----+
|Year|Division   |ClosedCasePercentage|Rank|
+----+-----------+--------------------+----+
|2010|Rampart    |32.85090742017      |1   |
|2010|Olympic    |31.515289821999087  |2   |
|2010|Harbor     |29.36028339237341   |3   |
|2011|Olympic    |35.03192688118192   |1   |
|2011|Rampart    |32.500296103280824  |2   |
|2011|Harbor     |28.516260162601625  |3   |
|2012|Olympic    |34.295435879385195  |1   |
|2012|Rampart    |32.461037450569904  |2   |
|2012|Harbor     |29.534834324553948  |3   |
|2013|Olympic    |33.58217940999398   |1   |
|2013|Rampart    |32.1060382916053    |2   |
|2013|Harbor     |29.70696405267529   |3   |
|2014|Van Nuys   |32.00295639320029   |1   |
|2014|West Valley|31.512710797885727  |2   |
|2014|Mission    |31.21740874448456   |3   |
|2015|Van Nuys   |32.265140677157845  |1   |
|2015|Mission    |30.466622852314334  |2   |
|2015|Foothill   |30.353001803658852  |3   |
|2016|Van Nuys   |32.1880650994575    |1   |
|2016|West

In [27]:
# Save the combined dataset as Parquet to S3
parquet_path = "s3://groups-bucket-dblab-905418150721/group16/combined_crime.parquet"

combined_crime_df_1.write.mode("overwrite").parquet(parquet_path)

# Measure execution time for reading from Parquet
start_parquet_execution = time.time()

# Read the Parquet file
parquet_df_1 = spark.read.format("parquet").load(parquet_path)

parquet_df  = parquet_df_1.filter(~((col('LAT') == 0) & (col('LON') == 0)))

result_df_from_parquet = stations_df.join(
    parquet_df,
    lower(stations_df['DIVISION']) == lower(parquet_df['AREA NAME']),
    'inner'
)

df_with_year_parquet = result_df_from_parquet.withColumn("Year", year(to_date(col("DATE OCC"), "MM/dd/yyyy")))

df_with_year_resolved_parquet = df_with_year_parquet.filter(
    (col("Status Desc") != "UNK") & (col("Status Desc") != "Invest Cont")
)

total_cases_parquet = df_with_year_parquet.groupBy("Year", "AREA NAME").agg(
    F.count("*").alias("TotalCases")
)

closed_cases_parquet = df_with_year_resolved_parquet.groupBy("Year", "AREA NAME").agg(
    F.count("*").alias("ClosedCases")
)

cases_parquet = total_cases_parquet.join(closed_cases_parquet, ["Year", "AREA NAME"], "left").withColumn(
    "ClosedCasePercentage", 
    (F.col("ClosedCases") / F.col("TotalCases")) * 100
)

ranked_parquet = cases_parquet.withColumn(
    "Rank", F.row_number().over(Window.partitionBy("Year").orderBy(F.col("ClosedCasePercentage").desc()))
)

top_3_parquet = ranked_parquet.filter(F.col("Rank") <= 3).orderBy("Year", "Rank")

result_parquet = top_3_parquet.select("Year", "AREA NAME", "ClosedCasePercentage", "Rank")

end_parquet_execution = time.time()
parquet_execution_time = end_parquet_execution - start_parquet_execution

print(f"Execution Time (Parquet): {parquet_execution_time:.2f} seconds")

result_parquet.show(70, truncate=False)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Execution Time (Parquet): 0.36 seconds
+----+-----------+--------------------+----+
|Year|AREA NAME  |ClosedCasePercentage|Rank|
+----+-----------+--------------------+----+
|2010|Rampart    |32.85090742017      |1   |
|2010|Olympic    |31.515289821999087  |2   |
|2010|Harbor     |29.36028339237341   |3   |
|2011|Olympic    |35.03192688118192   |1   |
|2011|Rampart    |32.500296103280824  |2   |
|2011|Harbor     |28.516260162601625  |3   |
|2012|Olympic    |34.295435879385195  |1   |
|2012|Rampart    |32.461037450569904  |2   |
|2012|Harbor     |29.534834324553948  |3   |
|2013|Olympic    |33.58217940999398   |1   |
|2013|Rampart    |32.1060382916053    |2   |
|2013|Harbor     |29.70696405267529   |3   |
|2014|Van Nuys   |32.00295639320029   |1   |
|2014|West Valley|31.512710797885727  |2   |
|2014|Mission    |31.21740874448456   |3   |
|2015|Van Nuys   |32.265140677157845  |1   |
|2015|Mission    |30.466622852314334  |2   |
|2015|Foothill   |30.353001803658852  |3   |
|2016|Van Nuys  