In [1]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "4",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
161,application_1764662801237_0164,pyspark,idle,Link,Link,,
164,application_1764662801237_0167,pyspark,idle,Link,Link,,
165,application_1764662801237_0168,pyspark,idle,Link,Link,,
170,application_1764662801237_0173,pyspark,idle,Link,Link,,


In [2]:
import io
import csv
import time

from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, StringType
from pyspark.sql.functions import col, when, count, desc, split, explode

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
174,application_1764662801237_0177,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# S3 locations of the notebook's inputs. All three live under the same
# project_data prefix, so that prefix is spelled out once.
_PROJECT_DATA_ROOT = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data"
_CRIME_DATA_DIR = _PROJECT_DATA_ROOT + "/LA_Crime_Data"

# Crime records, split across two CSV files by year range.
CRIMES_PATH_1 = _CRIME_DATA_DIR + "/LA_Crime_Data_2010_2019.csv"
CRIMES_PATH_2 = _CRIME_DATA_DIR + "/LA_Crime_Data_2020_2025.csv"
# Text file of MO codes; each line is parsed as "<code> <description>".
MO_CODES_PATH = _PROJECT_DATA_ROOT + "/MO_codes.txt"

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Bind `spark` for this notebook under the application name "Query 3".
# getOrCreate() hands back the already-running session when one exists.
spark = (
    SparkSession.builder
    .appName("Query 3")
    .getOrCreate()
)

def load_crime_data(path):
    """Read a crime CSV and keep only its MO-codes column.

    The first row of the file is treated as the header.

    Args:
        path: Location of the CSV file, handed to ``spark.read.csv`` as is.

    Returns:
        A DataFrame with a single column named ``mocodes`` (the file's
        ``Mocodes`` column, renamed).
    """
    mocodes_only = col("Mocodes").alias("mocodes")
    return spark.read.csv(path, header=True).select(mocodes_only)

# One single-column DataFrame per crime CSV file.
crimes1, crimes2 = (load_crime_data(p) for p in (CRIMES_PATH_1, CRIMES_PATH_2))

# Stack the rows of both files. union() pairs columns by position, which is
# fine here because each side only carries the `mocodes` column.
crimes = crimes1.union(crimes2)

# Optional sanity checks (each one triggers a Spark job when uncommented):
#crimes.printSchema()
#print(crimes.count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# spark.read.text yields one row per line of the file, in a column named `value`.
mo_codes_raw = spark.read.text(MO_CODES_PATH)

#mo_codes_raw.show(n=10)

# Split each line at the first space only (limit=2): the part before it is the
# code, and everything after it is kept intact as the description.
code_and_description = split(col("value"), " ", 2)

mo_codes_df = mo_codes_raw.select(
    code_and_description.getItem(0).alias("mo_code"),
    code_and_description.getItem(1).alias("description"),
)

#mo_codes_df.show(n=10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Join-strategy hint applied to the MO-code lookup table. With
# "SHUFFLE_REPLICATE_NL" the physical plan uses a CartesianProduct join.
# Other strategy hints documented by Spark SQL are "MERGE", "SHUFFLE_HASH"
# and "BROADCAST"; change this one value to compare them.
JOIN_HINT = "SHUFFLE_REPLICATE_NL"

# perf_counter() is monotonic, so the measured interval cannot be skewed by
# wall-clock adjustments the way time.time() can.
df_start = time.perf_counter()

# One row per individual MO code (the column holds space-separated codes),
# then the number of occurrences of each code.
codes = (
    crimes
    .filter(col("mocodes").isNotNull())
    .withColumn("mo_code", explode(split(col("mocodes"), " ")))
    .groupBy("mo_code")
    .agg(count("*").alias("frequency"))
)
#codes.show(n=10)

# Attach the description of every code and rank by frequency, highest first.
result_df = (
    codes
    .join(mo_codes_df.hint(JOIN_HINT), on="mo_code", how="inner")
    .orderBy(desc("frequency"))
)

# show() is the action that actually runs the query.
# We possibly have a small overhead from printing 50 rows, but it's okay
result_df.show(n=50, truncate=False)

df_end = time.perf_counter()
df_time = df_end - df_start

# Printing the plan is diagnostic work, so it is kept out of the timed window.
result_df.explain()

print(f"DataFrame execution time = {df_time}")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [frequency#271L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(frequency#271L DESC NULLS LAST, 1000), ENSURE_REQUIREMENTS, [plan_id=288]
      +- Project [mo_code#265, frequency#271L, description#173]
         +- CartesianProduct (mo_code#265 = mo_code#172)
            :- HashAggregate(keys=[mo_code#265], functions=[count(1)], schema specialized)
            :  +- Exchange hashpartitioning(mo_code#265, 1000), ENSURE_REQUIREMENTS, [plan_id=283]
            :     +- HashAggregate(keys=[mo_code#265], functions=[partial_count(1)], schema specialized)
            :        +- Generate explode(split(Mocodes#85,  , -1)), false, [mo_code#265]
            :           +- Union
            :              :- Project [Mocodes#39 AS mocodes#85]
            :              :  +- Filter isnotnull(Mocodes#39)
            :              :     +- FileScan csv [Mocodes#39] Batched: false, DataFilters: [isnotnull(Mocodes#39)], Format

In [7]:
# Query 3 with RDD API

sc = spark.sparkContext

# Raw text lines of the two crime CSV files.
crimes1, crimes2 = sc.textFile(CRIMES_PATH_1), sc.textFile(CRIMES_PATH_2)

# first() is an action: it fetches the first line of the first file, which is
# taken to be the CSV header.
header = crimes1.first()

# Only that one header string is captured; any line of either file that is
# exactly equal to it is dropped.
all_crime_lines = crimes1.union(crimes2)
crimes_raw = all_crime_lines.filter(lambda text_line: text_line != header)

def parse_mo_desc(line):
    """Split one MO-codes line into a ``(code, description)`` pair.

    The line is cut at its first space; the description keeps any further
    spaces.

    Args:
        line: One line of text.

    Returns:
        ``(code, description)`` when the line contains a space, otherwise
        ``None``.
    """
    code, separator, description = line.partition(" ")
    # An empty separator means the line had no space at all.
    return (code, description) if separator else None
        
# Parse every line of the MO-codes file, then discard the lines that
# parse_mo_desc rejected (it returns None for those).
mo_desc_parsed = sc.textFile(MO_CODES_PATH).map(parse_mo_desc)
mo_desc_rdd = mo_desc_parsed.filter(lambda pair: pair is not None)

# Zero-based position of the MO-codes field in a parsed crime CSV row.
# NOTE(review): assumes both crime files put that field in this position - confirm.
MO_CODE_INDEX = 10
def parse_mocodes(line):
    """Extract the individual MO codes from one line of crime CSV text.

    The line is parsed with the ``csv`` module, so quoted fields that contain
    commas are handled. The field at ``MO_CODE_INDEX`` is split on whitespace,
    so repeated, leading or trailing spaces never produce empty codes.

    Args:
        line: One line of CSV text.

    Returns:
        A list of code strings. The list is empty when the line is blank,
        cannot be parsed as CSV, has too few fields, or has an empty
        MO-codes field.
    """
    try:
        # The None default covers a blank line, for which the reader yields no row.
        row = next(csv.reader(io.StringIO(line)), None)
    except csv.Error:
        # Malformed CSV: skip the line instead of failing the whole job.
        return []

    if row is None or len(row) <= MO_CODE_INDEX:
        return []

    # split() with no argument returns [] for an empty or all-blank field.
    return row[MO_CODE_INDEX].split()

# perf_counter() is monotonic, so the measured interval cannot be skewed by
# wall-clock adjustments the way time.time() can.
rdd_start = time.perf_counter()

# One record per individual MO code found in the crime rows.
crimes_parsed = crimes_raw.flatMap(parse_mocodes)

# (code, number of occurrences)
codes_count = (
    crimes_parsed
    .map(lambda code: (code, 1))
    .reduceByKey(lambda left, right: left + right)
)

#print(codes_count.take(10))

# codes_count: (code, count)
# mo_desc_rdd: (code, description)
# join    ---> (code, (count, description))
# The map re-keys each record by its count, giving (count, (code, description)),
# so that sortByKey can order the records by frequency, highest first.
result_rdd = (
    codes_count
    .join(mo_desc_rdd)
    .map(lambda joined: (joined[1][0], (joined[0], joined[1][1])))
    .sortByKey(ascending=False)
)

# take() is the action that materialises the result. The timer stops right
# after it, so printing on the driver is not part of the measurement.
top_codes = result_rdd.take(10)

rdd_end = time.perf_counter()
rdd_time = rdd_end - rdd_start

print(top_codes)
print(f"RDD execution time = {rdd_time}")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[(1002900, ('0344', 'Removes vict property')), (548422, ('1822', 'Stranger')), (404773, ('0416', 'Hit-Hit w/ weapon')), (377536, ('0329', 'Vandalized')), (278618, ('0913', 'Victim knew Suspect')), (256188, ('2000', 'Domestic violence')), (219082, ('1300', 'Vehicle involved')), (213165, ('0400', 'Force used')), (177470, ('1402', 'Evidence Booked (any crime)')), (131229, ('1609', 'Smashed'))]
RDD execution time = 18.17698621749878

In [8]:
# Time comparison DF vs RDD
# The table is assembled line by line; the empty first and last entries
# reproduce the blank lines around it.
comparison_lines = [
    "",
    "Method | Total execution time",
    "-------|---------------------",
    f"DataFrame | {df_time}",
    f"RDD | {rdd_time}",
    "",
]
print("\n".join(comparison_lines))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…


Method | Total execution time
-------|---------------------
DataFrame | 14.23582410812378
RDD | 18.17698621749878

Method | Total execution time (s)
-------|---------------------
DataFrame | 14.23582410812378
RDD | 18.17698621749878

In [7]:
# Report the elapsed time of the DataFrame query (df_time) under a "join" label.
# NOTE(review): presumably re-run after each change of join hint to fill in the
# join-method table below - confirm.
print(f"join = {df_time}")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

join = 14.515042543411255

Join Method | Time (s)
------------|---------
SortMergeJoin | 14.949116468429565
ShuffleHashJoin | 14.515042543411255
CartesianProduct | 14.335679054260254