## DataFrame API - Aggravated Assault Crimes by Age Group, Sorted by Victim Age

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import time

# Create (or reuse) the SparkSession for this query, requesting four executors.
spark = (
    SparkSession.builder
    .appName("Query 1 execution")
    .config("spark.executor.instances", "4")
    .getOrCreate()
)

# Load the two crime CSV exports from S3. The header option takes the column
# names from the first row; no schema is supplied, so columns load as strings.
data = spark.read.option("header", 'true').csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
)
data_2 = spark.read.option("header", 'true').csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
)

# Print both schemas so the column names used by the later cells can be checked.
for label, frame in (("data_1", data), ("data_2", data_2)):
    print("Schema of " + label)
    frame.printSchema()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3476,application_1732639283265_3432,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Schema of data_1
root
 |-- DR_NO: string (nullable = true)
 |-- Date Rptd: string (nullable = true)
 |-- DATE OCC: string (nullable = true)
 |-- TIME OCC: string (nullable = true)
 |-- AREA : string (nullable = true)
 |-- AREA NAME: string (nullable = true)
 |-- Rpt Dist No: string (nullable = true)
 |-- Part 1-2: string (nullable = true)
 |-- Crm Cd: string (nullable = true)
 |-- Crm Cd Desc: string (nullable = true)
 |-- Mocodes: string (nullable = true)
 |-- Vict Age: string (nullable = true)
 |-- Vict Sex: string (nullable = true)
 |-- Vict Descent: string (nullable = true)
 |-- Premis Cd: string (nullable = true)
 |-- Premis Desc: string (nullable = true)
 |-- Weapon Used Cd: string (nullable = true)
 |-- Weapon Desc: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Status Desc: string (nullable = true)
 |-- Crm Cd 1: string (nullable = true)
 |-- Crm Cd 2: string (nullable = true)
 |-- Crm Cd 3: string (nullable = true)
 |-- Crm Cd 4: string (nullable = true)
 

In [2]:
# Start the wall-clock timer; the elapsed time is reported by a later cell.
start_time = time.time()

# Victims aged 65 and over.
# The same filter and projection are applied to both input DataFrames:
#   * keep only the two aggravated-assault crime descriptions,
#   * keep only rows whose "Vict Age" consists solely of digits,
#   * keep only ages of 65 or more.
# The age bound is written as `>= 65` rather than between(65, float('inf')):
# every non-null integer is <= +inf, so the upper bound never excluded a row
# and only compared the integer age against a floating-point literal.
filtered_data, filtered_data_2 = (
    frame.filter(
        frame["Crm Cd Desc"].isin(
            "ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",
            "INTIMATE PARTNER - AGGRAVATED ASSAULT"
        )
        & frame["Vict Age"].rlike("^[0-9]+$")
        & (frame["Vict Age"].cast("int") >= 65)
    ).select(
        frame["DR_NO"],
        frame["Vict Age"].cast("int").alias("Vict_Age"),
        frame["Crm Cd Desc"]
    )
    for frame in (data, data_2)
)

# Stack the two filtered results; both projections list the same three
# columns in the same order, which is what the positional union relies on.
combined_data = filtered_data.union(filtered_data_2)

# Sort by victim age, oldest first.
sorted_data = combined_data.orderBy(combined_data["Vict_Age"].desc())

# Show the first 20 rows of the sorted result.
for row in sorted_data.take(20):
    print(row)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Row(DR_NO='200808494', Vict_Age=120, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='180713962', Vict_Age=99, Crm Cd Desc='INTIMATE PARTNER - AGGRAVATED ASSAULT')
Row(DR_NO='101408873', Vict_Age=99, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='180412016', Vict_Age=99, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='170207284', Vict_Age=99, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='131209628', Vict_Age=99, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='170132132', Vict_Age=99, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='191406441', Vict_Age=99, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='171805899', Vict_Age=99, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='131012876', Vict_Age=99, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='140511426', Vict_

In [3]:
# Victims aged 25 to 64 (inclusive).
# The same filter and projection are applied to both input DataFrames:
#   * keep only the two aggravated-assault crime descriptions,
#   * keep only rows whose "Vict Age" consists solely of digits,
#   * keep only ages within 25-64.
filtered_data, filtered_data_2 = (
    frame.filter(
        frame["Crm Cd Desc"].isin(
            "ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",
            "INTIMATE PARTNER - AGGRAVATED ASSAULT"
        )
        & frame["Vict Age"].rlike("^[0-9]+$")
        & frame["Vict Age"].cast("int").between(25, 64)
    ).select(
        frame["DR_NO"],
        frame["Vict Age"].cast("int").alias("Vict_Age"),
        frame["Crm Cd Desc"]
    )
    for frame in (data, data_2)
)

# Stack the two filtered results (positional union of identical projections).
combined_data = filtered_data.union(filtered_data_2)

# Sort by victim age, oldest first.
sorted_data = combined_data.orderBy("Vict_Age", ascending=False)

# Show the first 20 rows of the sorted result.
for row in sorted_data.take(20):
    print(row)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Row(DR_NO='131805923', Vict_Age=64, Crm Cd Desc='INTIMATE PARTNER - AGGRAVATED ASSAULT')
Row(DR_NO='170304120', Vict_Age=64, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='231406706', Vict_Age=64, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='160315156', Vict_Age=64, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='231910301', Vict_Age=64, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='160117989', Vict_Age=64, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='230311571', Vict_Age=64, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='160210743', Vict_Age=64, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='231517681', Vict_Age=64, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='161317074', Vict_Age=64, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='231008517', Vict_A

In [4]:
# Victims aged 18 to 24 (inclusive).
# The same filter and projection are applied to both input DataFrames:
#   * keep only the two aggravated-assault crime descriptions,
#   * keep only rows whose "Vict Age" consists solely of digits,
#   * keep only ages within 18-24.
filtered_data, filtered_data_2 = (
    frame.filter(
        frame["Crm Cd Desc"].isin(
            "ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",
            "INTIMATE PARTNER - AGGRAVATED ASSAULT"
        )
        & frame["Vict Age"].rlike("^[0-9]+$")
        & frame["Vict Age"].cast("int").between(18, 24)
    ).select(
        frame["DR_NO"],
        frame["Vict Age"].cast("int").alias("Vict_Age"),
        frame["Crm Cd Desc"]
    )
    for frame in (data, data_2)
)

# Stack the two filtered results (positional union of identical projections).
combined_data = filtered_data.union(filtered_data_2)

# Sort by victim age, oldest first.
sorted_data = combined_data.orderBy("Vict_Age", ascending=False)

# Show the first 20 rows of the sorted result.
for row in sorted_data.take(20):
    print(row)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Row(DR_NO='101918849', Vict_Age=24, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='221816811', Vict_Age=24, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='221813012', Vict_Age=24, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='222005646', Vict_Age=24, Crm Cd Desc='INTIMATE PARTNER - AGGRAVATED ASSAULT')
Row(DR_NO='230400960', Vict_Age=24, Crm Cd Desc='INTIMATE PARTNER - AGGRAVATED ASSAULT')
Row(DR_NO='221909539', Vict_Age=24, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='221821668', Vict_Age=24, Crm Cd Desc='INTIMATE PARTNER - AGGRAVATED ASSAULT')
Row(DR_NO='220900927', Vict_Age=24, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='231301013', Vict_Age=24, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='220319017', Vict_Age=24, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='220414808', Vict_Age=24, Crm Cd Desc

In [5]:
# Victims aged 0 to 17 (inclusive).
# The same filter and projection are applied to both input DataFrames:
#   * keep only the two aggravated-assault crime descriptions,
#   * keep only rows whose "Vict Age" consists solely of digits,
#   * keep only ages within 0-17.
filtered_data, filtered_data_2 = (
    frame.filter(
        frame["Crm Cd Desc"].isin(
            "ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",
            "INTIMATE PARTNER - AGGRAVATED ASSAULT"
        )
        & frame["Vict Age"].rlike("^[0-9]+$")
        & frame["Vict Age"].cast("int").between(0, 17)
    ).select(
        frame["DR_NO"],
        frame["Vict Age"].cast("int").alias("Vict_Age"),
        frame["Crm Cd Desc"]
    )
    for frame in (data, data_2)
)

# Stack the two filtered results (positional union of identical projections).
combined_data = filtered_data.union(filtered_data_2)

# Sort by victim age, oldest first.
sorted_data = combined_data.orderBy("Vict_Age", ascending=False)

# Show the first 20 rows of the sorted result.
for row in sorted_data.take(20):
    print(row)

# Report the wall-clock time since start_time.
# NOTE(review): start_time is assigned in the first query cell (In [2]), so
# this figure spans every query cell run in between, including any pause
# between cell executions.
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Row(DR_NO='161217859', Vict_Age=17, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='231700766', Vict_Age=17, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='220901179', Vict_Age=17, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='231205118', Vict_Age=17, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='211107303', Vict_Age=17, Crm Cd Desc='INTIMATE PARTNER - AGGRAVATED ASSAULT')
Row(DR_NO='232108079', Vict_Age=17, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='221000545', Vict_Age=17, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='231604081', Vict_Age=17, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='240307102', Vict_Age=17, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='230400866', Vict_Age=17, Crm Cd Desc='ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT')
Row(DR_NO='221205673', Vict_A