In [10]:
from pyspark.sql import SparkSession
# from pyspark.sql import functions as F
import time

# Build (or reuse) the Spark session for this query, asking for 4 executors
spark = (
    SparkSession.builder
    .appName("Query1_RDD")
    .config("spark.executor.instances", 4)
    .getOrCreate()
)

# Wall-clock reference point; the elapsed time is reported after the query
start_time = time.time()

# Read the two crime CSV exports (2010-2019, then 2020-present).
# header=True makes Spark take the column names from each file's first line.
crime_df_2010_2019, crime_df_2020_present = (
    spark.read.csv(csv_path, header=True)
    for csv_path in (
        "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
        "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv",
    )
)

# Stack both periods into one DataFrame. union() pairs columns by position,
# so this assumes both files share the same column order -- TODO confirm.
crime_df = crime_df_2010_2019.union(crime_df_2020_present)

# Expose the combined data as an RDD of Row objects for the RDD-based query
crime_rdd = crime_df.rdd


def _is_nonzero_coordinate(value):
    """Return True when *value* is a usable, non-zero coordinate.

    Missing values (``None``, empty string) and values that cannot be read
    as a number count as unusable. The comparison is numeric, so every
    spelling of zero (``'0'``, ``'0.0'``, ``'0.0000'``, ``0.0``) is rejected.
    """
    try:
        return float(value) != 0.0
    except (ValueError, TypeError):
        return False


def process_row(row):
    """Classify one crime record into a victim age group.

    A record is kept only when all of the following hold:

    * ``LAT`` and ``LON`` are both present and non-zero;
    * ``Crm Cd Desc`` contains "aggravated assault" (case-insensitive);
    * ``Vict Age`` can be parsed as an integer.

    Args:
        row: A record indexable by the column names ``'Vict Age'``,
            ``'LAT'``, ``'LON'`` and ``'Crm Cd Desc'`` (for example a
            ``pyspark.sql.Row`` or a plain ``dict``). Any value may be
            ``None``.

    Returns:
        A ``(group, 1)`` tuple where ``group`` is ``'Child'`` (age < 18),
        ``'Young Adult'`` (18-24), ``'Adult'`` (25-64) or ``'Senior'``
        (65 and over), ready to be counted by key; ``None`` when the record
        does not satisfy the criteria above.
    """
    # Drop records without a real location (null or zero coordinates).
    if not (_is_nonzero_coordinate(row['LAT'])
            and _is_nonzero_coordinate(row['LON'])):
        return None

    # The description may be null; treat that as "not an aggravated assault"
    # instead of letting .lower() raise AttributeError and fail the task.
    description = row['Crm Cd Desc']
    if not description or "aggravated assault" not in description.lower():
        return None

    # Parse the age last so the conversion only runs for rows that passed
    # the cheaper filters above.
    try:
        age = int(row['Vict Age'])
    except (ValueError, TypeError):
        return None  # Missing or malformed age: the record cannot be bucketed

    # NOTE(review): zero or negative ages, if the data contains them, fall
    # into 'Child' here -- confirm that this is the intended handling.
    if age < 18:
        return ('Child', 1)
    if age <= 24:
        return ('Young Adult', 1)
    if age <= 64:
        return ('Adult', 1)
    return ('Senior', 1)


# Classify every record, then discard the ones process_row rejected (None)
classified_rdd = crime_rdd.map(process_row)
age_groups_rdd = classified_rdd.filter(lambda pair: pair is not None)

# Tally how many records fall under each age-group key
# (Child, Young Adult, Adult, Senior)
age_groups_count = age_groups_rdd.countByKey()

# Report the per-group totals
print("Age Group Counts:", age_groups_count)

# Report how long the run took, measured from start_time
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Age Group Counts: defaultdict(<class 'int'>, {'Adult': 121052, 'Young Adult': 33588, 'Child': 15923, 'Senior': 5985})
Time taken: 17.82 seconds