In [1]:
import polars as pl

In [2]:
# Load the raw incident export and coerce its text columns to typed values.
incident_csv = "./data/incident_dump.csv"

# "reported": timestamp text carrying a UTC offset -> tz-aware datetime,
# then shifted into Chicago local time.
reported_local = (
    pl.col("reported")
    .str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S%z")
    .dt.convert_time_zone("America/Chicago")
)

# "reported_date": "YYYY-MM-DD" text -> Date.
reported_day = pl.col("reported_date").str.to_date("%Y-%m-%d")

# "validated_location": comma-separated numbers -> list of Float64
# (presumably a latitude/longitude pair -- TODO confirm against the data).
location_floats = (
    pl.col("validated_location").str.split(",").cast(pl.List(pl.Float64))
)

df = pl.read_csv(incident_csv).with_columns(
    reported_local,
    reported_day,
    location_floats,
)

In [3]:
# Number of rows per incident type, most frequent first.
# NOTE(review): newer polars releases rename `groupby` to `group_by` and
# deprecate `pl.count()` -- confirm the pinned version before modernising.
(
    df.groupby(["incident"])
    .agg(pl.count())
    .sort("count", descending=True)
)

incident,count
str,u32
"""Theft""",1553
"""Information""",883
"""Found Property…",761
"""Lost Property""",501
"""Information / …",431
"""Medical Call""",367
"""Battery""",342
"""Liquor Law Vio…",337
"""Information / …",329
"""Mental Health …",294


In [4]:
# Substrings identifying incident types that are removed from the analysis.
# Each entry is matched as a case-sensitive regex fragment anywhere inside the
# `incident` text, so a short entry such as "Sex" also removes every label
# that merely contains it. Entries must not contain unescaped regex
# metacharacters. (The duplicated "Stalking" entry has been removed.)
excluded_list = [
    "Fondling",
    "Medical Call",
    "Luring a Minor",
    "Lost Property",
    "Stalking",
    "Sexual Assault",
    "Dating",
    "Domestic",
    "Sex",
    "Found Property",
    "Mental Health",
    "Harassment by Electronic Means",
    "Well-Being",
    "Threatening Phone Call",
    "Medical Transport",
    "Warrant",
    "Lost Wallet",
    "Fire Alarm",
    "Chemical Spill",
    "Suspicious Mail",
    "Eavesdropping",
]

# One alternation pattern ("A|B|C") so every row is scanned only once.
excluded_pattern = "|".join(excluded_list)

# Keep only the rows whose incident text matches none of the excluded terms.
# NOTE(review): rows with a null `incident` are presumably dropped as well,
# because the negated match evaluates to null -- confirm this is intended.
df = df.filter(~pl.col("incident").str.contains(excluded_pattern))

# Re-tally the remaining incident types, most frequent first.
df.groupby(["incident"]).agg(pl.count()).sort("count", descending=True)

incident,count
str,u32
"""Theft""",1553
"""Information""",883
"""Information / …",431
"""Battery""",342
"""Liquor Law Vio…",337
"""Information / …",329
"""Information / …",211
"""Information / …",209
"""Found Narcotic…",187
"""Injured Person…",186


In [5]:
def list_to_parsed_set(unparsed_list):
    """Collect the distinct, title-cased labels found in ``unparsed_list``.

    Each element is split on "/" so that a compound label such as
    "Information / Theft" contributes both "Information" and "Theft".
    Every piece is stripped of surrounding whitespace and title-cased before
    it is added to the result.

    Args:
        unparsed_list: Iterable of label strings. ``None`` entries (e.g. nulls
            coming out of a column's ``to_list()``) are skipped.

    Returns:
        set: The unique, non-empty, title-cased labels.
    """
    parsed_set = set()
    for element in unparsed_list:
        if element is None:
            continue
        # str.split returns [element] when there is no "/", so a single loop
        # covers both simple and compound labels.
        for piece in element.split("/"):
            label = piece.strip()
            # Test the *stripped* text: a leading/trailing "/" or a blank
            # element must not put an empty string into the set.
            if label:
                parsed_set.add(label.title())
    return parsed_set

In [6]:
# How many distinct labels remain once compound "A / B" incidents are split
# into their individual parts.
incident_labels = list_to_parsed_set(df["incident"].to_list())
print(len(incident_labels))

278
