In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count
import time

# Build (or reuse) the Spark session for this notebook, requesting 4 executors.
spark = (
    SparkSession.builder
    .appName("Query1")
    .config("spark.executor.instances", "4")
    .getOrCreate()
)

# Wall-clock timer covering the load and inspection steps below.
start_time = time.time()

# Read the two crime CSV exports from S3, letting Spark infer column types.
crime_data_path_1 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_data_path_2 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
data_1 = spark.read.csv(crime_data_path_1, header=True, inferSchema=True)
data_2 = spark.read.csv(crime_data_path_2, header=True, inferSchema=True)

# Sanity check: peek at the first 10 rows of the first dataset.
data_1.limit(10).show()

# Summary statistics for each dataset, first dataset first.
for _dataset in (data_1, data_2):
    _dataset.describe().show()

# Inferred schema of each dataset, in the same order.
for _dataset in (data_1, data_2):
    _dataset.printSchema()

# Report how long the whole cell took.
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+--------------------+--------------------+--------+-----+---------+-----------+--------+------+--------------------+--------------+--------+--------+------------+---------+--------------------+--------------+--------------------+------+------------+--------+--------+--------+--------+--------------------+--------------------+-------+---------+
|    DR_NO|           Date Rptd|            DATE OCC|TIME OCC|AREA |AREA NAME|Rpt Dist No|Part 1-2|Crm Cd|         Crm Cd Desc|       Mocodes|Vict Age|Vict Sex|Vict Descent|Premis Cd|         Premis Desc|Weapon Used Cd|         Weapon Desc|Status| Status Desc|Crm Cd 1|Crm Cd 2|Crm Cd 3|Crm Cd 4|            LOCATION|        Cross Street|    LAT|      LON|
+---------+--------------------+--------------------+--------+-----+---------+-----------+--------+------+--------------------+--------------+--------+--------+------------+---------+--------------------+--------------+--------------------+------+------------+--------+--------+--------

In [2]:
# Wall-clock timer for the DataFrame-API version of the query.
start_time = time.time()

# Read both crime CSV exports from S3 with inferred column types.
crime_data_path_1 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_data_path_2 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
data_1 = spark.read.csv(crime_data_path_1, header=True, inferSchema=True)
data_2 = spark.read.csv(crime_data_path_2, header=True, inferSchema=True)

# Stack the two datasets into a single DataFrame.
data = data_1.union(data_2)

# Keep only incidents whose crime description mentions aggravated assault.
aggravated_assault_data = data.filter(
    col("Crm Cd Desc").contains("AGGRAVATED ASSAULT")
)

# Bucket each victim into an age group; `between` is inclusive on both ends.
victim_age = col("Vict Age")
age_group = (
    when(victim_age < 18, "Children")
    .when(victim_age.between(18, 24), "Young Adults")
    .when(victim_age.between(25, 64), "Adults")
    .when(victim_age > 64, "Elderly")
)
categorized_data = aggravated_assault_data.withColumn("Age Group", age_group)

# Count incidents per age group.
age_group_counts = categorized_data.groupBy("Age Group").agg(
    count("Age Group").alias("Count")
)

# Largest group first.
sorted_age_groups = age_group_counts.orderBy("Count", ascending=False)

# Display the result table.
sorted_age_groups.show()

# Report how long the whole query took, including the load.
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+------+
|   Age Group| Count|
+------------+------+
|      Adults|121093|
|Young Adults| 33605|
|    Children| 15928|
|     Elderly|  5985|
+------------+------+

Time taken: 4.87 seconds

In [3]:
# Start timing
start_time = time.time()

# Load data
crime_data_path_1 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_data_path_2 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
data_1 = spark.read.csv(crime_data_path_1, header=True, inferSchema=True)
data_2 = spark.read.csv(crime_data_path_2, header=True, inferSchema=True)

# Concatenate datasets
data = data_1.union(data_2)

# Implementation with the RDD API
rdd = data.rdd

# Filter for "aggravated assault".
# Rows with a missing description are skipped: `in` against None would raise
# TypeError inside the executor and fail the whole job.
aggravated_assault_rdd = rdd.filter(
    lambda row: row["Crm Cd Desc"] is not None
    and "AGGRAVATED ASSAULT" in row["Crm Cd Desc"]
)

# Categorize victims by age, emitting (age_group, 1) pairs for counting.
# NOTE(review): assumes "Vict Age" is never null for these rows — a None here
# would still raise on the `<` comparison; confirm against the data.
categorized_rdd = aggravated_assault_rdd.map(
    lambda row: (
        "Children" if row["Vict Age"] < 18 else
        "Young Adults" if 18 <= row["Vict Age"] <= 24 else
        "Adults" if 25 <= row["Vict Age"] <= 64 else
        "Elderly",
        1
    )
)

# Group by age group and count incidents
age_group_counts_rdd = categorized_rdd.reduceByKey(lambda a, b: a + b)

# Sort in descending order of count
sorted_age_groups_rdd = age_group_counts_rdd.sortBy(lambda x: x[1], ascending=False)

# Show results.
# The loop variable is deliberately not called `count`: that would overwrite
# the `count` function imported from pyspark.sql.functions and break any
# later (re-)run of a cell that uses it.
for group, group_count in sorted_age_groups_rdd.collect():
    print(f"{group}: {group_count}")

# Stop timing and print out the execution duration
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Adults: 121093
Young Adults: 33605
Children: 15928
Elderly: 5985
Time taken: 7.92 seconds

In [None]:
# Shut down the Spark session and release its cluster resources.
spark.stop()