In [29]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import col, when
import time
import csv
from io import StringIO

def get_age_group(age):
    """Translate a victim age into its age-bracket label.

    Returns one of "<18", "18-24", "25-64" or ">64". Any value that falls
    into none of the brackets (for example a non-integer strictly between
    24 and 25, or NaN) yields None.
    """
    if age < 18:
        return "<18"

    # Closed intervals, checked in ascending order.
    for lower, upper, label in ((18, 24, "18-24"), (25, 64, "25-64")):
        if lower <= age <= upper:
            return label

    return ">64" if age > 64 else None

def process_data_with_rdd(s3_link1, s3_link2):
    """Count aggravated-assault incidents per victim age group with the RDD API.

    Both CSV files are read as text, their header line is dropped, each
    remaining line is parsed with the ``csv`` module, and the two data sets
    are unioned before filtering and counting.

    Args:
        s3_link1: Location of the first CSV file.
        s3_link2: Location of the second CSV file.

    Returns:
        dict mapping age-group label to incident count, with keys inserted
        in descending order of count.
    """
    # Positional columns used below.
    # NOTE(review): presumably these are the "Crm Cd Desc" and "Vict Age"
    # columns that the DataFrame version selects by name - confirm against
    # the CSV header.
    crime_desc_idx = 9
    victim_age_idx = 11
    min_fields = max(crime_desc_idx, victim_age_idx) + 1

    conf = (
        SparkConf()
        .setAppName("IncidentsPerAgeGroupRDD")
        .setMaster("yarn")
        .set("spark.executor.instances", "4")
    )
    sc = SparkContext.getOrCreate(conf=conf)

    def parse_line(line):
        # csv.reader handles quoted fields that contain commas; a plain
        # str.split(",") would not.
        return next(csv.reader([line]), [])

    def read_file(s3_link):
        data = sc.textFile(s3_link)
        header = data.first()
        # Every line equal to the header text is dropped, not just the first.
        return data.filter(lambda row: row != header).map(parse_line)

    def parse_age(field):
        # Anything that is not a plain run of digits (empty, negative, text)
        # becomes -1, which get_age_group places in the "<18" bracket.
        return int(field) if field.isdigit() else -1

    combined_data = read_file(s3_link1).union(read_file(s3_link2))

    # Rows that are too short to index (blank lines, fragments of records
    # split across physical lines) are skipped instead of raising IndexError
    # on an executor and failing the whole job.
    well_formed = combined_data.filter(lambda row: len(row) >= min_fields)

    aggravated_assault = well_formed.filter(
        lambda row: "AGGRAVATED ASSAULT" in row[crime_desc_idx].strip().upper()
    )
    age_groups = (
        aggravated_assault
        .map(lambda row: (get_age_group(parse_age(row[victim_age_idx])), 1))
        .filter(lambda pair: pair[0] is not None)
    )
    age_group_counts = age_groups.reduceByKey(lambda a, b: a + b)
    sorted_results = age_group_counts.sortBy(
        lambda pair: pair[1], ascending=False
    ).collect()
    return dict(sorted_results)

def process_data_with_dataframe(s3_link1, s3_link2):
    """Count aggravated-assault incidents per victim age group with the DataFrame API.

    Both CSV files are read with a header row and schema inference, unioned
    by column position, filtered on the "Crm Cd Desc" column, bucketed by
    "Vict Age", and counted per bucket.

    Args:
        s3_link1: Location of the first CSV file.
        s3_link2: Location of the second CSV file.

    Returns:
        dict mapping the "Age Group" label to its incident count, with keys
        inserted in descending order of count. Rows whose age matches no
        bracket are grouped under the key None.
    """
    conf = (
        SparkConf()
        .setAppName("IncidentsPerAgeGroupDataFrame")
        .setMaster("yarn")
        .set("spark.executor.instances", "4")
    )
    # Obtain the session explicitly. The function previously referenced a
    # name `spark` that it never defined, so it only worked where the
    # environment happened to provide such a global and raised NameError
    # everywhere else. getOrCreate() reuses an already-running session.
    spark = SparkSession.builder.config(conf=conf).getOrCreate()

    def read_file(s3_link):
        return spark.read.csv(s3_link, header=True, inferSchema=True)

    victim_age = col("Vict Age")

    df = read_file(s3_link1).union(read_file(s3_link2))
    df = df.filter(col("Crm Cd Desc").contains("AGGRAVATED ASSAULT"))
    df = df.withColumn(
        "Age Group",
        when(victim_age < 18, "<18")
        .when((victim_age >= 18) & (victim_age <= 24), "18-24")
        .when((victim_age >= 25) & (victim_age <= 64), "25-64")
        .when(victim_age > 64, ">64")
        .otherwise(None),
    )
    df = df.groupBy("Age Group").count()
    df = df.orderBy(col("count").desc())
    result_dict = {row['Age Group']: row['count'] for row in df.collect()}
    return result_dict

def print_results(results, approach_name):
    """Print a heading for *approach_name*, then one line per age group.

    ``results`` is a mapping of age-group label to count; entries are
    printed in the mapping's own iteration order.
    """
    print(f"\n{approach_name} Approach:")
    report_lines = (
        f"Age Group: {label}, Count: {total}"
        for label, total in results.items()
    )
    for line in report_lines:
        print(line)

# Locations of the two CSV inputs; both are passed to each implementation.
s3_link1 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
s3_link2 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"

# Run the RDD implementation and measure its wall-clock duration. The timed
# window covers the whole call, including context setup and collect().
start_time_rdd = time.time()
results_rdd = process_data_with_rdd(s3_link1, s3_link2)
end_time_rdd = time.time()
execution_time_rdd = end_time_rdd - start_time_rdd
print_results(results_rdd, "RDD")
print(f"Total execution time (RDD): {execution_time_rdd:.2f} seconds")

# Run the DataFrame implementation on the same inputs and time it the same way.
# NOTE(review): this run happens second in the same process, so it may reuse
# the Spark context started by the RDD run - confirm before comparing timings.
start_time_df = time.time()
results_df = process_data_with_dataframe(s3_link1, s3_link2)
end_time_df = time.time()
execution_time_df = end_time_df - start_time_df
print_results(results_df, "DataFrame")
print(f"Total execution time (DataFrame): {execution_time_df:.2f} seconds")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…


RDD Approach:
Age Group: 25-64, Count: 121093
Age Group: 18-24, Count: 33605
Age Group: <18, Count: 15928
Age Group: >64, Count: 5985
Total execution time (RDD): 25.94 seconds

DataFrame Approach:
Age Group: 25-64, Count: 121093
Age Group: 18-24, Count: 33605
Age Group: <18, Count: 15928
Age Group: >64, Count: 5985
Total execution time (DataFrame): 9.38 seconds