# Polars & DuckDB: When Pandas Wont Cut It
--------------------------

In [1]:
import polars as pl

In [2]:
df = pl.read_csv("https://data.cityofnewyork.us/resource/h9gi-nx95.csv")

In [3]:
df.head(2)

crash_date,crash_time,borough,zip_code,latitude,longitude,location,on_street_name,off_street_name,cross_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle_1,contributing_factor_vehicle_2,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,collision_id,vehicle_type_code1,vehicle_type_code2,vehicle_type_code_3,vehicle_type_code_4,vehicle_type_code_5
str,str,str,i64,f64,f64,str,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,str,str,str,str,str,i64,str,str,str,str,str
"""2021-09-11T00:…","""2:39""",,,,,,"""WHITESTONE EXP…","""20 AVENUE""",,2,0,0,0,0,0,2,0,"""Aggressive Dri…","""Unspecified""",,,,4455765,"""Sedan""","""Sedan""",,,
"""2022-03-26T00:…","""11:45""",,,,,,"""QUEENSBORO BRI…",,,1,0,0,0,0,0,1,0,"""Pavement Slipp…",,,,,4513547,"""Sedan""",,,,


In [63]:
df.schema

{'crash_date': Utf8,
 'crash_time': Utf8,
 'borough': Utf8,
 'zip_code': Int64,
 'latitude': Float64,
 'longitude': Float64,
 'location': Utf8,
 'on_street_name': Utf8,
 'off_street_name': Utf8,
 'cross_street_name': Utf8,
 'number_of_persons_injured': Int64,
 'number_of_persons_killed': Int64,
 'number_of_pedestrians_injured': Int64,
 'number_of_pedestrians_killed': Int64,
 'number_of_cyclist_injured': Int64,
 'number_of_cyclist_killed': Int64,
 'number_of_motorist_injured': Int64,
 'number_of_motorist_killed': Int64,
 'contributing_factor_vehicle_1': Utf8,
 'contributing_factor_vehicle_2': Utf8,
 'contributing_factor_vehicle_3': Utf8,
 'contributing_factor_vehicle_4': Utf8,
 'contributing_factor_vehicle_5': Utf8,
 'collision_id': Int64,
 'vehicle_type_code1': Utf8,
 'vehicle_type_code2': Utf8,
 'vehicle_type_code_3': Utf8,
 'vehicle_type_code_4': Utf8,
 'vehicle_type_code_5': Utf8}

In [64]:
(df.groupby("collision_id")
   .count()
   .filter(pl.col("count") > 1))

collision_id,count
i64,u32


In [65]:
df.crash_date.is_null()

AttributeError: 'DataFrame' object has no attribute 'crash_date'

In [66]:
df["crash_date"].is_null().any()

False

In [67]:
df['crash_date'][0]

'2021-09-11T00:00:00.000'

In [72]:
df = df.with_columns(
            pl.col("crash_date").str.slice(0, length=10).alias("crash_date_str")
      ).with_columns(
            pl.col("crash_date_str").str.strptime(
                pl.Datetime, "%Y-%m-%d", strict=False).alias("crash_date")
)

df.head()

crash_date,crash_time,borough,zip_code,latitude,longitude,location,on_street_name,off_street_name,cross_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle_1,contributing_factor_vehicle_2,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,collision_id,vehicle_type_code1,vehicle_type_code2,vehicle_type_code_3,vehicle_type_code_4,vehicle_type_code_5,crash_date_str
datetime[μs],str,str,i64,f64,f64,str,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,str,str,str,str,str,i64,str,str,str,str,str,str
2021-09-11 00:00:00,"""2:39""",,,,,,"""WHITESTONE EXP…","""20 AVENUE""",,2,0,0,0,0,0,2,0,"""Aggressive Dri…","""Unspecified""",,,,4455765,"""Sedan""","""Sedan""",,,,"""2021-09-11"""
2022-03-26 00:00:00,"""11:45""",,,,,,"""QUEENSBORO BRI…",,,1,0,0,0,0,0,1,0,"""Pavement Slipp…",,,,,4513547,"""Sedan""",,,,,"""2022-03-26"""
2022-06-29 00:00:00,"""6:55""",,,,,,"""THROGS NECK BR…",,,0,0,0,0,0,0,0,0,"""Following Too …","""Unspecified""",,,,4541903,"""Sedan""","""Pick-up Truck""",,,,"""2022-06-29"""
2021-09-11 00:00:00,"""9:35""","""BROOKLYN""",11208.0,40.667202,-73.8665,""" , (40.66720…",,,"""1211 LORI…",0,0,0,0,0,0,0,0,"""Unspecified""",,,,,4456314,"""Sedan""",,,,,"""2021-09-11"""
2021-12-14 00:00:00,"""8:13""","""BROOKLYN""",11233.0,40.683304,-73.917274,""" , (40.68330…","""SARATOGA AVENU…","""DECATUR STREET…",,0,0,0,0,0,0,0,0,,,,,,4486609,,,,,,"""2021-12-14"""


In [73]:
df.groupby("borough").count()

borough,count
str,u32
"""QUEENS""",154
"""BRONX""",107
,367
"""BROOKLYN""",247
"""MANHATTAN""",98
"""STATEN ISLAND""",27


In [74]:
nn_df = df.filter(pl.col("borough").is_not_null())

In [75]:
df.filter(pl.col("borough").is_not_null()).select("borough").unique()

borough
str
"""BROOKLYN"""
"""MANHATTAN"""
"""QUEENS"""
"""BRONX"""
"""STATEN ISLAND"""


In [76]:
borough_df = pl.DataFrame({
                "borough": ["BROOKLYN", "BRONX", "MANHATTAN", "STATEN ISLAND", "QUEENS"],
                "population": [2590516, 1379946, 1596273, 2278029, 378977],
                "area":[179.7, 109.2, 58.68, 281.6, 149.0]
})

In [77]:
(df.filter(pl.col("borough").is_not_null())
   .select(["borough", "number_of_persons_injured"])
   .groupby("borough")
   .sum()
   .join(borough_df, on=["borough"])
   .select([
       "borough", 
       (pl.col("number_of_persons_injured") / pl.col("population")).alias("injuries_per_population")
   ])
)

borough,injuries_per_population
str,f64
"""BROOKLYN""",4.5e-05
"""BRONX""",3.3e-05
"""MANHATTAN""",2.5e-05
"""STATEN ISLAND""",7e-06
"""QUEENS""",0.000193


In [120]:
ctx = pl.SQLContext(crashes=df)

In [121]:
new_df = ctx.execute("""
    SELECT
        borough,
        crash_date AS day,
        SUM(number_of_persons_injured)
    FROM 
        crashes
    WHERE 
        borough IS NOT NULL
    GROUP BY 
        borough, crash_date
""", eager=False)

In [125]:
ctx = ctx.register("daily_crashes", new_df)

In [127]:
ctx.tables()

['crashes', 'daily_crashes']

In [164]:
daily_df = ctx.execute("select * from daily_crashes")

In [183]:
print(daily_df)

naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

 SELECT [col("borough"), col("day"), col("number_of_persons_injured")] FROM
   SELECT [col("borough"), col("crash_date").alias("day"), col("number_of_persons_injured")] FROM
    AGGREGATE
    	[col("number_of_persons_injured").sum()] BY [col("borough"), col("crash_date")] FROM
    	FILTER col("borough").is_not_null() FROMDF ["crash_date", "crash_time", "borough", "zip_code"]; PROJECT */30 COLUMNS; SELECTION: "None"


In [193]:
ctx.execute("""
    SELECT
        borough,
        day,
        number_of_persons_injured,
        LAG(1,number_of_persons_injured) OVER (PARTITION BY borough ORDER BY day) as prior_day_injured
FROM
    daily_crashes
ORDER BY 
    borough,
    day DESC
""", eager=True)

InvalidOperationError: unsupported SQL function: lag

In [179]:
non_lazy_daily_df = daily_df.collect()

EXECUTES ON LAZY DATAFRAMES!

In [180]:
import duckdb

In [181]:
query = duckdb.sql("""
    SELECT
        borough,
        day,
        number_of_persons_injured,
        LAG(1,number_of_persons_injured) OVER (PARTITION BY borough ORDER BY day) as prior_day_injured
FROM
    daily_df
ORDER BY 
    borough,
    day DESC
""")

In [182]:
query

┌───────────────┬─────────────────────┬───────────────────────────┬───────────────────┐
│    borough    │         day         │ number_of_persons_injured │ prior_day_injured │
│    varchar    │      timestamp      │           int64           │       int32       │
├───────────────┼─────────────────────┼───────────────────────────┼───────────────────┤
│ BRONX         │ 2022-04-24 00:00:00 │                         0 │                 1 │
│ BRONX         │ 2022-03-26 00:00:00 │                         7 │                 1 │
│ BRONX         │ 2022-03-25 00:00:00 │                         1 │                 1 │
│ BRONX         │ 2022-03-24 00:00:00 │                         1 │                 1 │
│ BRONX         │ 2022-03-22 00:00:00 │                         1 │                 1 │
│ BRONX         │ 2021-12-14 00:00:00 │                         2 │                 1 │
│ BRONX         │ 2021-12-11 00:00:00 │                         1 │                 1 │
│ BRONX         │ 2021-12-10 00:

In [170]:
query.pl()

borough,day,number_of_persons_injured,prior_day_injured
str,datetime[μs],i64,i32
"""BRONX""",2022-04-24 00:00:00,0,1
"""BRONX""",2022-03-26 00:00:00,7,1
"""BRONX""",2022-03-25 00:00:00,1,1
"""BRONX""",2022-03-24 00:00:00,1,1
"""BRONX""",2022-03-22 00:00:00,1,1
"""BRONX""",2021-12-14 00:00:00,2,1
"""BRONX""",2021-12-11 00:00:00,1,1
"""BRONX""",2021-12-10 00:00:00,1,1
"""BRONX""",2021-09-11 00:00:00,6,1
"""BRONX""",2021-09-10 00:00:00,1,1


Cummulative Sum!

In [188]:
ctx.execute("""
    SELECT
        borough,
        day,
        number_of_persons_injured,
        SUM(number_of_persons_injured) OVER (
                            PARTITION BY borough 
                            ORDER BY day ASC) AS cumulative_injuried
    FROM 
        daily_crashes
    ORDER BY
        borough,
        day DESC
""", eager=True).head(5)

borough,day,number_of_persons_injured,cumulative_injuried
str,datetime[μs],i64,i64
"""BRONX""",2022-04-24 00:00:00,0,45
"""BRONX""",2022-03-26 00:00:00,7,45
"""BRONX""",2022-03-25 00:00:00,1,45
"""BRONX""",2022-03-24 00:00:00,1,45
"""BRONX""",2022-03-22 00:00:00,1,45


WRONG!

In [191]:
query = duckdb.sql("""
    SELECT
        borough,
        day,
        number_of_persons_injured,
        SUM(number_of_persons_injured) OVER (
                            PARTITION BY borough 
                            ORDER BY day ASC) AS cumulative_injuried
    FROM 
        daily_df
    ORDER BY
        borough,
        day ASC
""")

In [192]:
query

┌───────────────┬─────────────────────┬───────────────────────────┬─────────────────────┐
│    borough    │         day         │ number_of_persons_injured │ cumulative_injuried │
│    varchar    │      timestamp      │           int64           │       int128        │
├───────────────┼─────────────────────┼───────────────────────────┼─────────────────────┤
│ BRONX         │ 2021-02-26 00:00:00 │                         0 │                   0 │
│ BRONX         │ 2021-04-06 00:00:00 │                         0 │                   0 │
│ BRONX         │ 2021-04-08 00:00:00 │                         0 │                   0 │
│ BRONX         │ 2021-04-10 00:00:00 │                         4 │                   4 │
│ BRONX         │ 2021-04-11 00:00:00 │                         0 │                   4 │
│ BRONX         │ 2021-04-12 00:00:00 │                         0 │                   4 │
│ BRONX         │ 2021-04-13 00:00:00 │                         3 │                   7 │
│ BRONX   

In [None]:
new_df = df.join(borough_df, on=["borough"], how="left")

In [10]:
# new_df.write_parquet("s3://harmonskis/nyc_accidents.parquet")