In [4]:
# Spark RDD code
from pyspark.sql import SparkSession
# To log our application's execution time:
import time
import csv
from io import StringIO

#Spark Session Config for 4 executors
sc = SparkSession \
    .builder \
    .appName("RDD query 1 execution") \
    .config("spark.executor.instances", "4") \
    .getOrCreate() \
    .sparkContext

def parse_csv_line(line):
    f = StringIO(line)
    reader = csv.reader(f)
    return next(reader)

# Age group converter
def age_group(data):
    try:
        age=int(data)
        if age < 18 and age > 0:
            return "Children"
        if age<25:
            return "Young adults"
        if age<65 :
            return "Adults"
        if age>64:
            return "Elders"
        else:
            return "No individual victim"
    except:
        return "error"
    
#Start timer
start_time = time.time()

#Read data from both datasets (10-19 and 20-present)
data_rdd = sc.textFile("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_*.csv") \
             .map(parse_csv_line)
#Remove Header
header = data_rdd.first()
data_rdd = data_rdd.filter(lambda row: row != header)

# Search for Aggravated, Convert by Age group and Sum in Descending order
crime_data = data_rdd \
    .filter(lambda row: "AGGRAVATED" in row[9]) \
    .map(lambda row: (age_group(row[11]), 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .sortBy(lambda pair: pair[1], ascending=False)

print(crime_data.collect())

end_time = time.time()
print(f"Time taken: {end_time-start_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[('Adults', 121093), ('Young adults', 38703), ('Children', 10830), ('Elders', 5985)]
Time taken: 24.65 seconds

In [23]:
# Spark DataFrame code
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType, TimestampType
from pyspark.sql.functions import col, udf
import time
import csv
from io import StringIO
from pyspark.sql.functions import desc

#Spark Session Config for 4 executors
spark = SparkSession \
    .builder \
    .appName("DF query 1 execution") \
    .config("spark.executor.instances", "4") \
    .getOrCreate()

conf = spark.sparkContext.getConf()
print("Executor Instances:", conf.get("spark.executor.instances"))


# Age group converter
def age_group(data):
    try:
        age=int(data)
        if age < 18 and age > 0:
            return "Children"
        if age<25:
            return "Young adults"
        if age<65 :
            return "Adults"
        if age>64:
            return "Elders"
        else:
            return "No individual victim"
    except:
        return "error"

#Start timer
start_time = time.time()


# Load Crime Data into DataFrame
crime_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_*.csv",
    header=True,
    inferSchema=True
)

#Filter AGGRAVATEED ASSAULT
aggravated_df = crime_df.filter(col("Crm Cd Desc").like("%AGGRAVATED ASSAULT%"))

# UDF AgeGroup
age_group_udf = udf(age_group, StringType())
aggravated_df = aggravated_df.withColumn("Age_group", age_group_udf(col("Vict Age")))
age_group_count_df = aggravated_df.groupBy("Age_group").count()
result_df = age_group_count_df.orderBy(desc("count"))


result_df.show()

end_time = time.time()
print(f"Time taken: {end_time-start_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executor Instances: None
+------------+------+
|   Age_group| count|
+------------+------+
|      Adults|121093|
|Young adults| 38703|
|    Children| 10830|
|      Elders|  5985|
+------------+------+

Time taken: 12.97 seconds