In [31]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType, TimestampType
from pyspark.sql.functions import col, udf
import time
import csv


# Query 2 (SQL API): for every year, list the three precincts with the highest
# share of cases whose Status is anything other than 'IC', and time the run.

# Obtain (or reuse) the Spark session.
# NOTE(review): the original comment said "config for 4 executors", but no
# executor setting is applied here -- presumably the executor count is set by
# the cluster / notebook configuration; confirm before relying on the timing.
spark = (
    SparkSession.builder
    .appName("SQL API query 2 execution")
    .getOrCreate()
)
# The original cell bound the session to `sc`; keep that name as an alias so
# any later cell that still refers to it keeps working.
sc = spark

# Start timer
start_time = time.time()

# Read data from both datasets (10-19 and 20-present); the glob matches every
# "Crime_Data_from_*.csv" object under the prefix.
crime_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_*.csv",
    header=True,
    inferSchema=True,
)
crime_df.createOrReplaceTempView("crime_df")

# Pipeline of CTEs:
#   initial_data - keep only the columns needed; the year is taken as the four
#                  characters starting at position 7 of `Date Rptd`.
#   rates        - percentage of rows per (year, precinct) with Status != 'IC'.
#   ranked_data  - rank precincts inside each year by that percentage; the
#                  precinct name breaks ties so the ranking is deterministic.
# The final ORDER BY makes the displayed row order explicit instead of relying
# on how the window operator happens to emit its partitions.
query = """
    WITH initial_data AS (
        SELECT
            SUBSTR(`Date Rptd`, 7, 4) AS year,
            `AREA NAME` AS precinct,
            Status
        FROM crime_df
    ),
    rates AS (
        SELECT
            year,
            precinct,
            (COUNT(CASE WHEN Status != 'IC' THEN 1 END) * 100) / COUNT(*) AS closed_case_rate
        FROM initial_data
        GROUP BY year, precinct
    ),
    ranked_data AS (
        SELECT
            year,
            precinct,
            closed_case_rate,
            ROW_NUMBER() OVER (
                PARTITION BY year
                ORDER BY closed_case_rate DESC, precinct
            ) AS `#`
        FROM rates
    )
    SELECT
        year,
        precinct,
        closed_case_rate,
        `#`
    FROM ranked_data
    WHERE `#` < 4
    ORDER BY year, `#`
"""
results = spark.sql(query)

# show() is the action that actually triggers the read and the aggregation,
# so the timer is stopped only after it returns.
results.show(45)
end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+-----------+------------------+---+
|year|   precinct|  closed_case_rate|  #|
+----+-----------+------------------+---+
|2010|    Rampart|32.947355855318136|  1|
|2010|    Olympic|31.962706191728426|  2|
|2010|     Harbor| 29.63203463203463|  3|
|2011|    Olympic|35.212167689161554|  1|
|2011|    Rampart|32.511779630300836|  2|
|2011|     Harbor| 28.65220520201501|  3|
|2012|    Olympic|34.414818310523835|  1|
|2012|    Rampart|  32.9464181029429|  2|
|2012|     Harbor|29.815133276010318|  3|
|2013|    Olympic| 33.52812271731191|  1|
|2013|    Rampart| 32.08287360549222|  2|
|2013|     Harbor|29.164224592662055|  3|
|2014|   Van Nuys| 31.80567315834039|  1|
|2014|West Valley|31.311989956057754|  2|
|2014|    Mission|31.162790697674417|  3|
|2015|   Van Nuys|32.641346981727736|  1|
|2015|West Valley|30.275974025974026|  2|
|2015|    Mission|30.179460678380156|  3|
|2016|   Van Nuys|31.880755720117726|  1|
|2016|West Valley| 31.54798761609907|  2|
|2016|   Foothill|29.8702918433524