In [26]:
import polars as pl

In [27]:
# Load the raw incident export and normalise the columns used below.
df = pl.read_csv("./data/incident_dump.csv").with_columns(
    [
        # Timestamp strings carry a UTC offset (%z); parse them, then express
        # the result in America/Chicago time.
        pl.col("reported")
        .str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S%z")
        .dt.convert_time_zone("America/Chicago"),
        # Date-only strings -> Date.
        pl.col("reported_date").str.to_date("%Y-%m-%d"),
        # Comma-separated numeric string -> list of floats.
        pl.col("validated_location").str.split(",").cast(pl.List(pl.Float64)),
        # Rewrite selected abbreviations in the incident label
        # (str.replace touches only the first match of each pattern).
        pl.col("incident")
        .str.replace("Dui", "DUI")
        .str.replace("Uc", "UC")
        .str.replace("Uuw", "Unlawful Use of a Weapon"),
    ]
)

In [28]:
# Tally rows per incident label, most frequent first.
(
    df.groupby("incident")
    .agg(pl.count())
    .sort("count", descending=True)
)

incident,count
str,u32
"""Theft""",1111
"""Information""",715
"""Found Property…",645
"""Information / …",392
"""Lost Property""",329
"""Medical Call""",327
"""Battery""",304
"""Information / …",278
"""Mental Health …",265
"""Liquor Law Vio…",246


In [29]:
# Incident labels to drop from the analysis. The entries are joined with "|"
# and applied as a single regular expression, so each entry matches as a
# substring anywhere in the label (e.g. "Sex" also covers "Sexual Assault").
# Entries must therefore stay free of regex metacharacters.
excluded_list = [
    "Fondling",
    "Medical Call",
    "Luring a Minor",
    "Lost Property",
    "Stalking",
    "Sexual Assault",
    "Dating",
    "Domestic",
    "Sex",
    "Found Property",
    "Mental Health",
    "Harassment by Electronic Means",
    "Well-Being",
    "Threatening Phone Call",
]
# Keep only rows whose incident label matches none of the excluded patterns.
df = df.filter(~pl.col("incident").str.contains("|".join(excluded_list)))
# Re-tally the remaining incident labels, most frequent first.
df.groupby(["incident"]).agg(pl.count()).sort("count", descending=True)

incident,count
str,u32
"""Theft""",1111
"""Information""",715
"""Information / …",392
"""Battery""",304
"""Information / …",278
"""Liquor Law Vio…",246
"""Information / …",178
"""Found Narcotic…",168
"""Information / …",152
"""Traffic Crash""",136


In [36]:
def list_to_parsed_set(unparsed_list):
    """Return the set of distinct, title-cased labels found in ``unparsed_list``.

    Each element is split on "/" so that a compound label such as
    ``"Information / Theft"`` contributes ``"Information"`` and ``"Theft"``
    separately. Every piece is stripped of surrounding whitespace and
    normalised with ``str.title()``; note that ``title()`` lowercases the
    letters after the first in each word, so ``"DUI"`` becomes ``"Dui"``.

    Args:
        unparsed_list: Iterable of label strings. ``None`` entries are skipped.

    Returns:
        set[str]: The unique normalised labels. Pieces that are empty after
        stripping (e.g. from a trailing ``" / "``) are never included.
    """
    parsed_set = set()
    for element in unparsed_list:
        if element is None:
            # Missing values carry no label.
            continue
        # split("/") yields the whole string when no "/" is present, so one
        # code path handles both simple and compound labels.
        for part in element.split("/"):
            fmt_element = part.strip()
            # Test the stripped text: a whitespace-only piece must not add "".
            if fmt_element:
                parsed_set.add(fmt_element.title())
    return parsed_set

In [37]:
# Number of distinct incident labels once compound "A / B" labels are split.
print(
    len(
        list_to_parsed_set(df.get_column("incident").to_list())
    )
)

271
