## SQL API - Aggravated assault crimes by victim age group, sorted by victim age

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import time

# Build (or reuse) the Spark session for this notebook, requesting four executors
spark = (
    SparkSession.builder
    .appName("Query 1 execution")
    .config("spark.executor.instances", "4")
    .getOrCreate()
)

# Read the two crime-data CSV extracts from S3, treating the first row of each file as the header
data = spark.read.option("header", "true").csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
)
data_2 = spark.read.option("header", "true").csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
)

# Print the schema of each DataFrame under its own label
for label, frame in (("Schema of data_1", data), ("Schema of data_2", data_2)):
    print(label)
    frame.printSchema()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
353,application_1732639283265_0322,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Schema of data_1
root
 |-- DR_NO: string (nullable = true)
 |-- Date Rptd: string (nullable = true)
 |-- DATE OCC: string (nullable = true)
 |-- TIME OCC: string (nullable = true)
 |-- AREA : string (nullable = true)
 |-- AREA NAME: string (nullable = true)
 |-- Rpt Dist No: string (nullable = true)
 |-- Part 1-2: string (nullable = true)
 |-- Crm Cd: string (nullable = true)
 |-- Crm Cd Desc: string (nullable = true)
 |-- Mocodes: string (nullable = true)
 |-- Vict Age: string (nullable = true)
 |-- Vict Sex: string (nullable = true)
 |-- Vict Descent: string (nullable = true)
 |-- Premis Cd: string (nullable = true)
 |-- Premis Desc: string (nullable = true)
 |-- Weapon Used Cd: string (nullable = true)
 |-- Weapon Desc: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Status Desc: string (nullable = true)
 |-- Crm Cd 1: string (nullable = true)
 |-- Crm Cd 2: string (nullable = true)
 |-- Crm Cd 3: string (nullable = true)
 |-- Crm Cd 4: string (nullable = true)
 

In [2]:
# Start the wall-clock timer; the last age-group cell reads start_time to report the total time
start_time = time.time()

# Register both DataFrames as temporary views so they can be queried with SQL
data.createOrReplaceTempView("data")
data_2.createOrReplaceTempView("data_2")

# Children: aggravated-assault records with a victim age of 0-17, ordered by victim age.
# The two extracts are stacked with UNION ALL. The previous LEFT JOIN on DR_NO never
# referenced the joined table in SELECT or WHERE, so data_2 contributed no rows to the result.
# NOTE(review): assumes data_2 has the same DR_NO / `Vict Age` / `Crm Cd Desc` columns
# as data; its schema is not shown in the recorded output - confirm with data_2.printSchema().
id_query = """
    SELECT a.DR_NO, CAST(a.`Vict Age` AS INT) AS `Vict Age`, a.`Crm Cd Desc`
    FROM (
        SELECT DR_NO, `Vict Age`, `Crm Cd Desc` FROM data
        UNION ALL
        SELECT DR_NO, `Vict Age`, `Crm Cd Desc` FROM data_2
    ) a
    WHERE a.`Crm Cd Desc` IN ('ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT',
                              'INTIMATE PARTNER - AGGRAVATED ASSAULT')
      AND CAST(a.`Vict Age` AS INT) BETWEEN 0 AND 17
    ORDER BY `Vict Age`
"""

# Run the query (lazy until the write below triggers execution)
sorted_data = spark.sql(id_query)

# Destination folder for this age group inside the group's bucket
group_number = "9"
s3_path = f"s3://groups-bucket-dblab-905418150721/group{group_number}/Query1_SQL/Children/"

# coalesce(1) reduces the result to one partition so a single Parquet part file is written;
# mode("overwrite") replaces any previous output at the same path
sorted_data.coalesce(1).write.mode("overwrite").parquet(s3_path)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Young adults: aggravated-assault records with a victim age of 18-24, ordered by victim age.
# Relies on the "data" and "data_2" temporary views registered in the earlier cell.
# The two extracts are stacked with UNION ALL. The previous LEFT JOIN on DR_NO never
# referenced the joined table in SELECT or WHERE, so data_2 contributed no rows to the result.
# NOTE(review): assumes data_2 has the same DR_NO / `Vict Age` / `Crm Cd Desc` columns
# as data; its schema is not shown in the recorded output - confirm with data_2.printSchema().
id_query = """
    SELECT a.DR_NO, CAST(a.`Vict Age` AS INT) AS `Vict Age`, a.`Crm Cd Desc`
    FROM (
        SELECT DR_NO, `Vict Age`, `Crm Cd Desc` FROM data
        UNION ALL
        SELECT DR_NO, `Vict Age`, `Crm Cd Desc` FROM data_2
    ) a
    WHERE a.`Crm Cd Desc` IN ('ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT',
                              'INTIMATE PARTNER - AGGRAVATED ASSAULT')
      AND CAST(a.`Vict Age` AS INT) BETWEEN 18 AND 24
    ORDER BY `Vict Age`
"""

# Run the query (lazy until the write below triggers execution)
sorted_data = spark.sql(id_query)

# Destination folder for this age group inside the group's bucket
group_number = "9"
s3_path = f"s3://groups-bucket-dblab-905418150721/group{group_number}/Query1_SQL/Youth_Adult/"

# Single-partition write so one Parquet part file is produced; overwrites previous output
sorted_data.coalesce(1).write.mode("overwrite").parquet(s3_path)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Adults: aggravated-assault records with a victim age of 25-64, ordered by victim age.
# Relies on the "data" and "data_2" temporary views registered in the earlier cell.
# The two extracts are stacked with UNION ALL. The previous LEFT JOIN on DR_NO never
# referenced the joined table in SELECT or WHERE, so data_2 contributed no rows to the result.
# NOTE(review): assumes data_2 has the same DR_NO / `Vict Age` / `Crm Cd Desc` columns
# as data; its schema is not shown in the recorded output - confirm with data_2.printSchema().
id_query = """
    SELECT a.DR_NO, CAST(a.`Vict Age` AS INT) AS `Vict Age`, a.`Crm Cd Desc`
    FROM (
        SELECT DR_NO, `Vict Age`, `Crm Cd Desc` FROM data
        UNION ALL
        SELECT DR_NO, `Vict Age`, `Crm Cd Desc` FROM data_2
    ) a
    WHERE a.`Crm Cd Desc` IN ('ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT',
                              'INTIMATE PARTNER - AGGRAVATED ASSAULT')
      AND CAST(a.`Vict Age` AS INT) BETWEEN 25 AND 64
    ORDER BY `Vict Age`
"""

# Run the query (lazy until the write below triggers execution)
sorted_data = spark.sql(id_query)

# Destination folder for this age group inside the group's bucket
group_number = "9"
s3_path = f"s3://groups-bucket-dblab-905418150721/group{group_number}/Query1_SQL/Adult/"

# Single-partition write so one Parquet part file is produced; overwrites previous output
sorted_data.coalesce(1).write.mode("overwrite").parquet(s3_path)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Elderly: aggravated-assault records with a victim age above 64, ordered by victim age.
# Relies on the "data" and "data_2" temporary views registered in the earlier cell.
# The two extracts are stacked with UNION ALL. The previous LEFT JOIN on DR_NO never
# referenced the joined table in SELECT or WHERE, so data_2 contributed no rows to the result.
# NOTE(review): assumes data_2 has the same DR_NO / `Vict Age` / `Crm Cd Desc` columns
# as data; its schema is not shown in the recorded output - confirm with data_2.printSchema().
id_query = """
    SELECT a.DR_NO, CAST(a.`Vict Age` AS INT) AS `Vict Age`, a.`Crm Cd Desc`
    FROM (
        SELECT DR_NO, `Vict Age`, `Crm Cd Desc` FROM data
        UNION ALL
        SELECT DR_NO, `Vict Age`, `Crm Cd Desc` FROM data_2
    ) a
    WHERE a.`Crm Cd Desc` IN ('ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT',
                              'INTIMATE PARTNER - AGGRAVATED ASSAULT')
      AND CAST(a.`Vict Age` AS INT) > 64
    ORDER BY `Vict Age`
"""

# Run the query (lazy until the write below triggers execution)
sorted_data = spark.sql(id_query)

# Destination folder for this age group inside the group's bucket
group_number = "9"
s3_path = f"s3://groups-bucket-dblab-905418150721/group{group_number}/Query1_SQL/Elderly/"

# Single-partition write so one Parquet part file is produced; overwrites previous output
sorted_data.coalesce(1).write.mode("overwrite").parquet(s3_path)

# Report the wall-clock time elapsed since start_time was set in the first query cell
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Time taken: 22.95 seconds