Ζητούμενο 2:

In [1]:
#  1: Dataframe api implementation
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp, year, col, count as F_count, sum as F_sum, round as F_round, row_number
from pyspark.sql.window import Window
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType
import time 

spark = SparkSession \
    .builder \
    .appName("dataframe query 2 execution") \
    .config("spark.executor.instances", "4") \
    .config("spark.executor.memory", "2g") \
    .config("spark.executor.cores", "1") \
    .getOrCreate() \

sc = spark.sparkContext

start_time = time.time()

crime1_df= spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv", \
        header=True, inferSchema=True)
crime2_df= spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv", \
        header=True, inferSchema=True)
crime_df= crime1_df.union(crime2_df)

# extract the year from DATE OCC
crime_df_year = crime_df.withColumn(
    "Year",
    year(to_timestamp(col("DATE OCC"), "yyyy MMM dd hh:mm:ss a"))
).filter(col("Year").isNotNull())

# victim per descent
counts_df = (
    crime_df_year
    .groupBy("Year", "Vict Descent")
    .agg(F_count("*").alias("#"))
)
# total victims for %
total_victims_df = (
    counts_df
    .groupby("Year")
    .agg(F_sum("#").alias("total"))
)
percent = (
    counts_df
    .join(total_victims_df, on="Year", how="inner")
    .withColumn("%", F_round((col("#") / col("total")) * 100, 1))
    .select("Year", "Vict Descent", "#", "%")
)
years = [row["Year"] for row in percent.select("Year").distinct().collect()]
for y in sorted(years, reverse=True):
    print(f"\nYear {y}")
    top3 = (
        percent
        .filter(col("Year") == y)
        .orderBy(col("#").desc())
        .limit(3)
        .select("Year", "Vict Descent", "#", "%")
    )
    top3.show(truncate=False)

end_time = time.time()
print(f"\nDataframe api implementation time: {end_time-start_time:.2f} seconds")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1107,application_1765289937462_1100,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…


Year 2025
+----+------------+---+----+
|Year|Vict Descent|#  |%   |
+----+------------+---+----+
|2025|H           |34 |35.1|
|2025|X           |24 |24.7|
|2025|W           |13 |13.4|
+----+------------+---+----+


Year 2024
+----+------------+-----+----+
|Year|Vict Descent|#    |%   |
+----+------------+-----+----+
|2024|NULL        |29204|22.9|
|2024|H           |28576|22.4|
|2024|W           |22958|18.0|
+----+------------+-----+----+


Year 2023
+----+------------+-----+----+
|Year|Vict Descent|#    |%   |
+----+------------+-----+----+
|2023|H           |69401|29.9|
|2023|W           |44615|19.2|
|2023|NULL        |31497|13.6|
+----+------------+-----+----+


Year 2022
+----+------------+-----+----+
|Year|Vict Descent|#    |%   |
+----+------------+-----+----+
|2022|H           |73111|31.1|
|2022|W           |46695|19.8|
|2022|B           |34634|14.7|
+----+------------+-----+----+


Year 2021
+----+------------+-----+----+
|Year|Vict Descent|#    |%   |
+----+------------+-----+

In [1]:
#  2: sql api implementation
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType
import time

spark = SparkSession \
    .builder \
    .appName("sql query 2 execution") \
    .config("spark.executor.instances", "4") \
    .config("spark.executor.memory", "2g") \
    .config("spark.executor.cores", "1") \
    .getOrCreate() 

start_time = time.time()

crime1_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    header=True,
    inferSchema=True
)
crime2_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
    header=True,
    inferSchema=True
)

crime_df = crime1_df.union(crime2_df)
# use as sql table
crime_df.createOrReplaceTempView("crime")

spark.sql("""
CREATE OR REPLACE TEMP VIEW percent AS
SELECT
  year(to_timestamp(`DATE OCC`, 'yyyy MMM dd hh:mm:ss a')) AS Year,
  COALESCE(NULLIF(`Vict Descent`, 'NULL'), 'Unknown') AS `Vict Descent`,
  COUNT(*) AS `#`
FROM crime
WHERE `DATE OCC` IS NOT NULL
GROUP BY Year, `Vict Descent`
""")

spark.sql("""
CREATE OR REPLACE TEMP VIEW percent_full AS
SELECT
  p.Year,
  p.`Vict Descent`,
  p.`#`,
  ROUND(p.`#` / SUM(p.`#`) OVER (PARTITION BY p.Year) * 100, 1) AS `%`
FROM percent p
""")
years = [r["Year"] for r in spark.sql(
    "SELECT DISTINCT Year FROM percent_full WHERE Year IS NOT NULL"
).collect()]
for y in sorted(years, reverse=True):
    print(f"\nYear {y}")
    spark.sql(f"""
        SELECT Year, `Vict Descent`, `#`, `%`
        FROM percent_full
        WHERE Year = {y}
        ORDER BY `#` DESC
        LIMIT 3
    """).show(truncate=False)
end_time = time.time()
print(f"\nSQL api implementation time: {end_time-start_time:.2f} seconds")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1109,application_1765289937462_1102,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…


Year 2025
+----+------------+---+----+
|Year|Vict Descent|#  |%   |
+----+------------+---+----+
|2025|H           |34 |35.1|
|2025|X           |24 |24.7|
|2025|W           |13 |13.4|
+----+------------+---+----+


Year 2024
+----+------------+-----+----+
|Year|Vict Descent|#    |%   |
+----+------------+-----+----+
|2024|Unknown     |29204|22.9|
|2024|H           |28576|22.4|
|2024|W           |22958|18.0|
+----+------------+-----+----+


Year 2023
+----+------------+-----+----+
|Year|Vict Descent|#    |%   |
+----+------------+-----+----+
|2023|H           |69401|29.9|
|2023|W           |44615|19.2|
|2023|Unknown     |31497|13.6|
+----+------------+-----+----+


Year 2022
+----+------------+-----+----+
|Year|Vict Descent|#    |%   |
+----+------------+-----+----+
|2022|H           |73111|31.1|
|2022|W           |46695|19.8|
|2022|B           |34634|14.7|
+----+------------+-----+----+


Year 2021
+----+------------+-----+----+
|Year|Vict Descent|#    |%   |
+----+------------+-----+