In [4]:
import duckdb

%load_ext sql
conn = duckdb.connect(database=":memory:", read_only=False)

%sql conn --alias duckdb

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [5]:
%%sql

-- Materialise the 2024 service-request parquet export as a DuckDB table.
-- Every column of the file is copied unchanged.
CREATE OR REPLACE TABLE service_requests_2024 AS
SELECT *
FROM read_parquet('./data/cityofnewyork/service_requests_2024.parquet')

Unnamed: 0,Count
0,3446563


In [6]:
%%sql

-- Materialise the MODZCTA lookup table from its parquet export.
-- The ZCTA column is split on ', ' into a list so it can be probed with
-- array_contains later; the other selected columns are copied as they are.
CREATE OR REPLACE TABLE modzcta AS
SELECT
    MODZCTA,
    label,
    string_split(ZCTA, ', ') AS ZCTA,
    pop_est,
    the_geom
FROM read_parquet('./data/cityofnewyork/modzcta.parquet')

Unnamed: 0,Count
0,178


In [None]:
%%sql

-- Monthly service-request outliers per MODZCTA, 2024.
--
-- 1. monthly_events   requests per resident for every (MODZCTA, month).
-- 2. monthly_bounds   Q1 and Q3 of that rate across all MODZCTAs, per month.
-- 3. Final SELECT     keeps (MODZCTA, month) rows outside the Tukey fences
--                     [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR], reduces each MODZCTA
--                     to its single highest outlying month, and returns the
--                     10 MODZCTAs with the highest such rate.
--
-- Low outliers pass the filter too, but they sort last and only reach the
-- top 10 when fewer than 10 MODZCTAs have a high outlier.

WITH monthly_events AS (
    SELECT
        m.MODZCTA,
        DATE_TRUNC('month', sr.created_date) AS event_month,
        -- NULLIF turns a zero population into NULL instead of dividing by zero.
        COUNT(*) / NULLIF(m.pop_est, 0) AS events_per_pop
    FROM service_requests_2024 sr
    -- Inner join on purpose. A request whose zip maps to no MODZCTA has no
    -- population, so its rate would be NULL and could never be reported.
    JOIN modzcta m
        ON array_contains(m.ZCTA, sr.incident_zip)
    GROUP BY
        m.MODZCTA,
        m.pop_est,
        event_month
),
monthly_bounds AS (
    SELECT
        event_month,
        -- 75th and 25th percentiles of the per-capita rate within the month.
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY events_per_pop) AS Q3,
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY events_per_pop) AS Q1
    FROM monthly_events
    GROUP BY event_month
)
SELECT
    me.MODZCTA,
    me.event_month,
    ROUND(me.events_per_pop, 4) AS events_per_capita
FROM monthly_events me
JOIN monthly_bounds mb
    ON me.event_month = mb.event_month
-- IQR rule. A rate is an outlier when it lies more than 1.5 * IQR beyond Q3 or Q1.
WHERE me.events_per_pop > mb.Q3 + 1.5 * (mb.Q3 - mb.Q1)
   OR me.events_per_pop < mb.Q1 - 1.5 * (mb.Q3 - mb.Q1)
-- One row per MODZCTA, its highest outlying month. The earliest month wins a tie
-- so that repeated runs pick the same row.
QUALIFY ROW_NUMBER() OVER (
    PARTITION BY me.MODZCTA
    ORDER BY me.events_per_pop DESC, me.event_month
) = 1
-- Rank on the unrounded rate. Sorting on the rounded display value could tie
-- and make the LIMIT cut arbitrary. MODZCTA breaks any remaining tie.
ORDER BY me.events_per_pop DESC, me.MODZCTA
LIMIT 10;

Unnamed: 0,MODZCTA,event_month,events_per_capita
0,10466,2024-12-01,0.2748
1,10004,2024-10-01,0.2414
2,11239,2024-12-01,0.1662
3,11366,2024-06-01,0.1576
4,10006,2024-07-01,0.1094
5,11101,2024-09-01,0.1011
6,10018,2024-10-01,0.095
7,10007,2024-09-01,0.0917
8,10464,2024-06-01,0.0862
9,10036,2024-10-01,0.0808


In [24]:
%%sql

-- Load the categorisation CSV into a table, letting DuckDB auto-detect the
-- delimiter, header and column types.
CREATE OR REPLACE TABLE llm_categorize_output_2024 AS
SELECT *
FROM read_csv_auto('./output/llm_categorize_output_2024.csv');

-- Monthly service-request outliers per MODZCTA, broken down by category and subcategory.
--
-- 1. categorized_events  requests per resident for every
--                        (MODZCTA, month, category, subcategory).
-- 2. categorized_bounds  Q1 and Q3 of that rate across MODZCTAs for each
--                        (month, category, subcategory).
-- 3. Final SELECT        keeps rows outside the Tukey fences
--                        [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR], reduces each
--                        (MODZCTA, category, subcategory) to its highest
--                        outlying month, and returns the 10 highest rates.
WITH categorized_events AS (
    SELECT
        m.MODZCTA,
        DATE_TRUNC('month', sr.created_date) AS event_month,
        co.category,
        co.subcategory,
        -- NULLIF turns a zero population into NULL instead of dividing by zero.
        COUNT(*) / NULLIF(m.pop_est, 0) AS events_per_pop
    FROM service_requests_2024 sr
    -- Inner join on purpose. A request whose zip maps to no MODZCTA has no
    -- population, so its rate would be NULL and could never be reported.
    JOIN modzcta m
        ON array_contains(m.ZCTA, sr.incident_zip)
    -- Requests without a category mapping are excluded. They could not be
    -- matched to their bounds by the equality join further down anyway.
    JOIN llm_categorize_output_2024 co
        ON sr.agency = co.agency
        AND sr.complaint_type = co.complaint_type
        -- NULL-safe comparison, so a request with no descriptor still matches
        -- a mapping row whose descriptor is also empty. Plain = would drop it.
        AND sr.descriptor IS NOT DISTINCT FROM co.descriptor
    GROUP BY
        m.MODZCTA,
        m.pop_est,
        event_month,
        co.category,
        co.subcategory
),
categorized_bounds AS (
    SELECT
        event_month,
        category,
        subcategory,
        -- 75th and 25th percentiles of the per-capita rate within the group.
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY events_per_pop) AS Q3,
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY events_per_pop) AS Q1
    FROM categorized_events
    GROUP BY event_month, category, subcategory
)
SELECT
    ce.MODZCTA,
    ce.event_month,
    ce.category,
    ce.subcategory,
    ROUND(ce.events_per_pop, 4) AS events_per_capita
FROM categorized_events ce
-- Equality on category and subcategory never matches NULL, so groups with a
-- missing category or subcategory do not appear in the result.
JOIN categorized_bounds cb
    ON ce.event_month = cb.event_month
    AND ce.category = cb.category
    AND ce.subcategory = cb.subcategory
-- IQR rule. A rate is an outlier when it lies more than 1.5 * IQR beyond Q3 or Q1.
WHERE ce.events_per_pop > cb.Q3 + 1.5 * (cb.Q3 - cb.Q1)
   OR ce.events_per_pop < cb.Q1 - 1.5 * (cb.Q3 - cb.Q1)
-- One row per (MODZCTA, category, subcategory), its highest outlying month.
-- The earliest month wins a tie so that repeated runs pick the same row.
QUALIFY ROW_NUMBER() OVER (
    PARTITION BY ce.MODZCTA, ce.category, ce.subcategory
    ORDER BY ce.events_per_pop DESC, ce.event_month
) = 1
-- Rank on the unrounded rate. Sorting on the rounded display value could tie
-- and make the LIMIT cut arbitrary. The remaining keys break any tie.
ORDER BY ce.events_per_pop DESC, ce.MODZCTA, ce.category, ce.subcategory
LIMIT 10;

Unnamed: 0,MODZCTA,event_month,category,subcategory,events_per_capita
0,10466,2024-12-01,Public Safety & Order,Noise & Disturbances,0.2399
1,11239,2024-12-01,Public Safety & Order,Parking,0.1318
2,11366,2024-02-01,Public Safety & Order,Non-Emergency Police Matters,0.0881
3,10004,2024-09-01,Consumer & Business Services,Consumer Complaints,0.0849
4,10004,2024-10-01,Public Safety & Order,Noise & Disturbances,0.0769
5,10004,2024-07-01,Public Safety & Order,Parking,0.0515
6,11366,2024-04-01,Public Safety & Order,Noise & Disturbances,0.0513
7,10006,2024-05-01,Consumer & Business Services,Consumer Complaints,0.051
8,11101,2024-09-01,Public Safety & Order,Parking,0.0458
9,11366,2024-08-01,Consumer & Business Services,Transportation Services,0.045
