Ζητούμενο 2:

In [1]:
#  1: Dataframe api implementation
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp, year, col, count as F_count, sum as F_sum, round as F_round, row_number
from pyspark.sql.window import Window
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType
import time 

# Build (or reuse) the Spark session for the DataFrame-API run of query 2,
# requesting 4 executors with 1 core and 2 GB of memory each.
# The chain is parenthesised so it needs no line-continuation backslashes
# (a trailing backslash after the last call would splice whatever line
# follows into this expression).
# NOTE(review): the cell output reports "SparkSession available as 'spark'",
# which suggests a session already exists when this code runs. getOrCreate()
# returns an existing session rather than building a new one, so the executor
# settings below may not be applied -- confirm in the Spark UI and, if
# needed, set them before the session starts.
spark = (
    SparkSession.builder
    .appName("dataframe query 2 execution")
    .config("spark.executor.instances", "4")
    .config("spark.executor.memory", "2g")
    .config("spark.executor.cores", "1")
    .getOrCreate()
)

# Handle to the SparkContext behind the session.
sc = spark.sparkContext

# Start the wall-clock timer before the input files are opened, so the
# reported time includes reading the data.
start_time = time.time()

# Read both yearly extracts using only the header row for column names.
# Schema inference is deliberately left off: it requires an additional pass
# over each file, and this query only uses "DATE OCC" (parsed with an explicit
# format string) and "Vict Descent", both as plain strings. Reading without
# inference also matches how the SQL-API variant of this query loads the same
# files, which keeps the two timings comparable.
crime1_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    header=True,
)
crime2_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
    header=True,
)

# union() pairs columns by position, not by name.
# NOTE(review): this assumes both files list their columns in the same order -- confirm.
crime_df = crime1_df.union(crime2_df)

# Derive the calendar year of each record from "DATE OCC" using its explicit
# timestamp pattern, drop rows for which no year could be derived, and keep
# only the two columns the aggregation needs.
occurrence_ts = to_timestamp(col("DATE OCC"), "yyyy MMM dd hh:mm:ss a")
crime_df_year = (
    crime_df
    .withColumn("Year", year(occurrence_ts))
    .where(col("Year").isNotNull())
    .select("Year", "Vict Descent")
)

# Number of records for every (year, victim descent) combination.
counts_df = crime_df_year.groupBy("Year", "Vict Descent").agg(F_count("*").alias("count"))

# Unordered window spanning all descent groups of one year; used to compute
# the yearly total that the percentage is based on.
window_year_total = Window.partitionBy("Year")

# Ranking window inside a year: largest count first. "Vict Descent" is added
# as a secondary key so that groups with equal counts always receive the same
# row_number(); ordering by count alone leaves the top-3 cut arbitrary on ties.
window_ranking = Window.partitionBy("Year").orderBy(
    col("count").desc(),
    col("Vict Descent"),
)

# For every year keep the three most frequent victim-descent groups together
# with their record count ("#") and their share of that year's records ("%",
# rounded to one decimal place).
final_df = (
    counts_df
    .withColumn("total_year", F_sum("count").over(window_year_total))
    .withColumn("%", F_round((col("count") / col("total_year") * 100), 1))
    .withColumn("rank", row_number().over(window_ranking))
    .filter(col("rank") <= 3)
    .select(
        col("Year"),
        col("Vict Descent"),
        col("count").alias("#"),
        col("%"),
    )
    # Newest year first, then by descending count; the descent code breaks
    # ties so the output order is reproducible between runs.
    .orderBy(col("Year").desc(), col("#").desc(), col("Vict Descent"))
)

# Run the query and bring the result (at most three rows per year) to the
# driver; the timer is stopped as soon as the action returns.
results = final_df.collect()
end_time = time.time()

# The rows are sorted by year, so a new section header is printed every time
# the year differs from the one of the previous row.
current_year = None
for record in results:
    record_year = record['Year']
    if record_year != current_year:
        print(f"\nYear {record_year}")
        print(f"{'Vict Descent':<25} {'#':<10} {'%':<5}")
        current_year = record_year

    # Replace missing values with printable defaults.
    raw_descent = record['Vict Descent']
    descent = "Unknown" if raw_descent is None else str(raw_descent)
    count_val = 0 if record['#'] is None else record['#']
    percent_val = 0.0 if record['%'] is None else record['%']
    print(f"{descent:<25} {count_val:<10} {percent_val:<5}")

elapsed_seconds = end_time - start_time
print(f"\nDataframe api implementation time: {elapsed_seconds:.2f} seconds")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1755,application_1765289937462_1739,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…


Year 2025
Vict Descent              #          %    
H                         34         35.1 
X                         24         24.7 
W                         13         13.4 

Year 2024
Vict Descent              #          %    
Unknown                   29204      22.9 
H                         28576      22.4 
W                         22958      18.0 

Year 2023
Vict Descent              #          %    
H                         69401      29.9 
W                         44615      19.2 
Unknown                   31497      13.6 

Year 2022
Vict Descent              #          %    
H                         73111      31.1 
W                         46695      19.8 
B                         34634      14.7 

Year 2021
Vict Descent              #          %    
H                         63676      30.3 
W                         44523      21.2 
B                         30173      14.4 

Year 2020
Vict Descent              #          %    
H                         61606

In [1]:
#  2: sql api implementation
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType
import time

# Build (or reuse) the Spark session for the SQL-API run of query 2,
# requesting 4 executors with 1 core and 2 GB of memory each.
# NOTE(review): the cell output reports "SparkSession available as 'spark'",
# which suggests a session already exists when this code runs. getOrCreate()
# returns an existing session rather than building a new one, so the executor
# settings below may not be applied -- confirm in the Spark UI.
spark = (
    SparkSession.builder
    .appName("sql query 2 execution")
    .config("spark.executor.instances", "4")
    .config("spark.executor.memory", "2g")
    .config("spark.executor.cores", "1")
    .getOrCreate()
)

# Start the wall-clock timer before the input files are opened, so the
# reported time includes reading the data.
start_time = time.time()

# Read both yearly extracts; the first line of each file supplies the column
# names and no schema inference is requested.
crime1_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    header="true",
)
crime2_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
    header="true",
)

# Stack the two extracts (union() pairs columns by position) and expose the
# result to Spark SQL under the view name "crime".
crime_df = crime1_df.union(crime2_df)
crime_df.createOrReplaceTempView("crime")

# Top-3 victim-descent groups per year, expressed in Spark SQL.
#   BaseCounts   - number of records per (year, descent); the year is parsed
#                  from `DATE OCC` with an explicit timestamp pattern.
#   RankedCounts - adds each group's share of its year's records (`%`, one
#                  decimal place) and its rank inside the year (rn).
# `Vict Descent` is used as a secondary sort key in both ROW_NUMBER() and the
# final ORDER BY: ordering by count alone leaves the top-3 cut and the output
# order arbitrary whenever two groups of a year have the same count.
final_df = spark.sql("""
    WITH BaseCounts AS (
        SELECT 
            year(to_timestamp(`DATE OCC`, 'yyyy MMM dd hh:mm:ss a')) AS Year,
            `Vict Descent`, 
            COUNT(*) as count
        FROM crime
        WHERE `DATE OCC` IS NOT NULL
        GROUP BY Year, `Vict Descent`
    ),
    RankedCounts AS (
        SELECT 
            Year,
            `Vict Descent`,
            count AS `#`,
            ROUND(count / SUM(count) OVER (PARTITION BY Year) * 100, 1) AS `%`,
            ROW_NUMBER() OVER (PARTITION BY Year ORDER BY count DESC, `Vict Descent`) as rn
        FROM BaseCounts
        WHERE Year IS NOT NULL
    )
    SELECT Year, `Vict Descent`, `#`, `%`
    FROM RankedCounts
    WHERE rn <= 3
    ORDER BY Year DESC, `#` DESC, `Vict Descent`
""")

# Run the query and bring the result (at most three rows per year) to the
# driver; the timer is stopped as soon as the action returns.
results = final_df.collect()

end_time = time.time()

# The rows are sorted by year, so a new section header is printed every time
# the year differs from the one of the previous row.
current_year = None
for record in results:
    record_year = record['Year']
    if record_year != current_year:
        print(f"\nYear {record_year}")
        print(f"{'Vict Descent':<25} {'#':<10} {'%':<5}")
        current_year = record_year

    # Replace missing values with printable defaults.
    raw_descent = record['Vict Descent']
    descent = "Unknown" if raw_descent is None else str(raw_descent)
    count_val = 0 if record['#'] is None else record['#']
    percent_val = 0.0 if record['%'] is None else record['%']
    print(f"{descent:<25} {count_val:<10} {percent_val:<5}")

elapsed_seconds = end_time - start_time
print(f"\nSQL api implementation time: {elapsed_seconds:.2f} seconds")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1756,application_1765289937462_1740,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…


Year 2025
Vict Descent              #          %    
H                         34         35.1 
X                         24         24.7 
W                         13         13.4 

Year 2024
Vict Descent              #          %    
Unknown                   29204      22.9 
H                         28576      22.4 
W                         22958      18.0 

Year 2023
Vict Descent              #          %    
H                         69401      29.9 
W                         44615      19.2 
Unknown                   31497      13.6 

Year 2022
Vict Descent              #          %    
H                         73111      31.1 
W                         46695      19.8 
B                         34634      14.7 

Year 2021
Vict Descent              #          %    
H                         63676      30.3 
W                         44523      21.2 
B                         30173      14.4 

Year 2020
Vict Descent              #          %    
H                         61606