In [7]:
import polars as pl

In [8]:
# Import incidents and format columns
df = pl.read_csv(
    "./data/incident_dump.csv",
).with_columns(
    pl.col("reported")
    .str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S%z")
    .dt.convert_time_zone("America/Chicago"),
    pl.col("reported_date").str.to_date("%Y-%m-%d"),
    pl.col("validated_location").str.split(",").cast(pl.List(pl.Float64)),
    pl.col("incident").str.to_lowercase(),
)

In [9]:
df.groupby(["incident"]).agg(pl.count()).sort("count", descending=True)

incident,count
str,u32
"""theft""",1565
"""information""",892
"""found property…",775
"""lost property""",503
"""information / …",439
"""medical call""",372
"""battery""",342
"""liquor law vio…",341
"""information / …",330
"""mental health …",298


In [10]:
excluded_list = [
    "Fondling",
    "Medical Call",
    "Luring a Minor",
    "Lost Property",
    "Stalking",
    "Sexual Assault",
    "Dating",
    "Stalking",
    "Domestic",
    "Sex",
    "Found Property",
    "Mental Health",
    "Harassment by Electronic Means",
    "Well-Being",
    "Threatening Phone Call",
    "Medical Transport",
    "Warrant",
    "Lost Wallet",
    "Fire Alarm",
    "Chemical Spill",
    "Suspicious Mail",
    "Eavesdropping",
]
df = df.filter(~pl.col("incident").str.contains("|".join(excluded_list)))
df.groupby(["incident"]).agg(pl.count()).sort("count", descending=True)

incident,count
str,u32
"""theft""",1565
"""information""",892
"""found property…",775
"""lost property""",503
"""information / …",439
"""medical call""",372
"""battery""",342
"""liquor law vio…",341
"""information / …",330
"""mental health …",298


In [11]:
def list_to_parsed_set(unparsed_list):
    parsed_set = set()
    for element in unparsed_list:
        if "/" in element:
            for p in element.split("/"):
                fmt_element = p.strip().lower()
                if p:
                    parsed_set.add(fmt_element)
        else:
            fmt_element = element.strip().lower()
            parsed_set.add(fmt_element)
    return parsed_set

In [12]:
print(len(list_to_parsed_set(df["incident"].to_list())))

330
