In [181]:
import re
from typing import Tuple

import polars as pl

In [182]:
ORDINAL_STREET_REGEX = r"E\. \d{2}[a-z]{2} St."


def list_to_parsed_set(unparsed_list: [str]):
    unparsed_list.sort()
    parsed_set = set()
    for element in unparsed_list:
        if "/" in element:
            for p in element.split(r" / "):
                fmt_element = p.strip().lower()
                if p:
                    parsed_set.add(fmt_element)
        else:
            fmt_element = element.strip().lower()
            parsed_set.add(fmt_element)
    return parsed_set


def create_street_tuple(street: str, blvd: bool = False) -> Tuple[str, str, str]:
    street_type = "Ave." if not blvd else "Blvd."

    return street, f"S. {street}", f"S. {street} {street_type}"


STREET_CORRECTIONS = [
    create_street_tuple("Blackstone"),
    create_street_tuple("Cottage Grove"),
    create_street_tuple("Cornell"),
    create_street_tuple("Dorchester"),
    create_street_tuple("Drexel"),
    create_street_tuple("East View Park"),
    create_street_tuple("East End"),
    create_street_tuple("Ellis"),
    create_street_tuple("Everett"),
    create_street_tuple("Greenwood"),
    create_street_tuple("Harper"),
    create_street_tuple("Hyde Park", blvd=True),
    create_street_tuple("Ingleside"),
    create_street_tuple("Kenwood"),
    create_street_tuple("Kimbark"),
    create_street_tuple("Lake Park"),
    ("Lake Shore", "S. Lake Shore", "S. Lake Shore Dr."),
    ("Madison Park", "E. Madison Park", "E. Madison Park"),
    create_street_tuple("Maryland"),
    create_street_tuple("Oakenwald"),
    create_street_tuple("Oakwood", blvd=True),
    ("Ridgewood", "S. Ridgewood", "S. Ridgewood Ct."),
    create_street_tuple("Stony Island"),
    create_street_tuple("University"),
    create_street_tuple("Woodlawn"),
]

In [183]:
# Import incidents and format columns
df = pl.read_csv(
    "./data/incident_dump.csv",
).with_columns(
    pl.col("reported")
    .str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S%z")
    .dt.convert_time_zone("America/Chicago"),
    pl.col("reported_date").str.to_date("%Y-%m-%d"),
    pl.col("validated_location").str.split(",").cast(pl.List(pl.Float64)),
    pl.col("incident").str.to_lowercase(),
)

In [184]:
df.groupby(["incident"]).agg(pl.count()).sort("count", descending=True)

incident,count
str,u32
"""theft""",2975
"""information""",1380
"""found property…",922
"""lost property""",838
"""traffic violat…",534
"""information / …",511
"""liquor law vio…",443
"""medical call""",431
"""battery""",423
"""information / …",357


In [185]:
print(list_to_parsed_set(df["incident"].to_list()))



In [186]:
excluded_list = [
    "Fondling",
    "Medical Call",
    "Luring a Minor",
    "Lost Property",
    "Stalking",
    "Sexual Assault",
    "Dating",
    "Stalking",
    "Domestic",
    "Sex",
    "Found Property",
    "Mental Health",
    "Harassment by Electronic Means",
    "Well-Being",
    "Threatening Phone Call",
    "Medical Transport",
    "Warrant",
    "Lost Wallet",
    "Fire Alarm",
    "Chemical Spill",
    "Suspicious Mail",
    "Eavesdropping",
    "Sex Offense",
    "Sex Offender",
    "Sex Crime",
    "Domestic Aggravated Battery",
    "Dating Violence",
    "Harassing Messages",
]
df = df.filter(~pl.col("incident").str.contains("|".join(excluded_list)))
df.groupby(["incident"]).agg(pl.count()).sort("count", descending=True)

incident,count
str,u32
"""theft""",2975
"""information""",1380
"""found property…",922
"""lost property""",838
"""traffic violat…",534
"""information / …",511
"""liquor law vio…",443
"""medical call""",431
"""battery""",423
"""information / …",357


In [187]:
print(list_to_parsed_set(df["incident"].to_list()))



In [188]:
len(list_to_parsed_set(df["incident"].to_list()))

392

In [189]:
df.filter(pl.col("location").str.contains(" between ")).filter(
    ~pl.col("location").str.contains(" to ")
)

comments,disposition,incident,location,occurred,predicted_incident,reported,reported_date,ucpd_id,validated_address,validated_location
str,str,str,str,str,str,"datetime[μs, America/Chicago]",date,str,str,list[f64]
"""Two unknown su…","""Referred""","""information / …","""S. Dorchester …","""2/9/24 12:10 P…",,2024-02-09 12:18:00 CST,2024-02-09,"""2024-005301""","""between E. 51s…","[41.800894, -87.59668]"
"""Person reports…","""Referred""","""information / …","""S. Blackstone …","""1/25/24 2:40 P…",,2024-02-07 12:04:00 CST,2024-02-07,"""2024-005040""","""and S. Blackst…","[41.7933, -87.592458]"
"""Package taken …","""Referred""","""information / …","""S. Ellis Ave. …","""2/6/24 5:00 PM…",,2024-02-07 14:22:00 CST,2024-02-07,"""2024-005052""","""and S. Ellis A…","[41.795196, -87.586625]"
"""Victim reporte…","""CPD""","""information / …","""S. East End Av…","""10/20/23 3:00 …",,2024-02-07 17:23:00 CST,2024-02-07,"""24-00130""","""between E. 50t…","[41.802257, -87.605867]"
"""Suspect observ…","""CPD""","""information / …","""E. 54th St. be…","""2/4/24 1:20 PM…",,2024-02-04 13:22:00 CST,2024-02-04,"""2024-004615""","""between S. Woo…","[41.793722, -87.5949]"
"""A person walki…","""CPD""","""information / …","""S. Dorchester …","""2/2/24 10:45 P…",,2024-02-02 23:13:00 CST,2024-02-02,"""2024-004492""","""between E. 51s…","[41.800894, -87.59668]"
"""Unknown person…","""CPD""","""information / …","""S. Kenwood Ave…","""1/30/24 7:30 A…",,2024-01-30 20:33:00 CST,2024-01-30,"""2024-004004""","""and South Kenw…","[41.791219, -87.606736]"
"""Unknown person…","""CPD""","""information / …","""S. Kimbark Ave…","""1/30/24 8:15 P…",,2024-01-30 20:46:00 CST,2024-01-30,"""2024-004007""","""between E. 53r…","[41.798003, -87.594163]"
"""Person reports…","""Closed""","""lost property""","""S. Ellis Ave. …","""1/30/24 6:00 P…",,2024-01-30 22:07:00 CST,2024-01-30,"""24-00112""","""between E. 55t…","[41.794767, -87.591675]"
"""Unknown person…","""Referred""","""information / …","""S. Kenwood Ave…","""1/27/24 to 1/2…",,2024-01-29 22:33:00 CST,2024-01-29,"""2024-003903""","""between E. 57t…","[41.789607, -87.596414]"


In [190]:
df.filter(pl.col("location").str.contains(" to "))

comments,disposition,incident,location,occurred,predicted_incident,reported,reported_date,ucpd_id,validated_address,validated_location
str,str,str,str,str,str,"datetime[μs, America/Chicago]",date,str,str,list[f64]
"""Electronic har…","""Open""","""harassment by …","""E. 61st St. St…","""11/30/23 to 12…",,2023-12-28 10:32:00 CST,2023-12-28,"""23-01269""","""between S. Gre…","[41.79506, -87.596557]"
"""Two suitcases …","""Referred""","""information / …","""S. Stony Islan…","""8/30/23 to 8/3…",,2023-08-31 14:06:00 CDT,2023-08-31,"""2023-030412""","""to S. Stony Is…","[41.789607, -87.596414]"
"""Person reporte…","""Closed""","""lost property""","""1330 E. 53rd S…","""8/29/23 11:30 …",,2023-08-29 15:06:00 CDT,2023-08-29,"""23-00810""","""1330 East 53rd…","[41.795231, -87.587464]"
"""Person lost wa…","""Closed""","""lost property""","""1100 E. 57th S…","""4/17/23 10:00 …",,2023-04-24 14:39:00 CDT,2023-04-24,"""23-00386""","""1100 E. 57th S…","[41.796189, -87.588537]"
"""Person reports…","""Closed""","""lost property""","""1101 E. 56th S…","""2/20/23 to 2/2…",,2023-02-21 16:34:00 CST,2023-02-21,"""23-00171""","""1101 E. 56th S…","[41.792279, -87.599954]"
"""Person lost wr…","""Closed""","""lost property""","""929 E.E. 57th …","""12/13/21 8:45 …",,2021-12-16 16:25:00 CST,2021-12-16,"""21-00908""","""to E. 57th and…","[41.791374, -87.601302]"
"""Person reports…","""Closed""","""lost wallet""","""5100 S. Blacks…","""12/11/21 4:02 …",,2021-12-11 16:02:00 CST,2021-12-11,"""21-00889""","""5100 South Bla…","[41.789735, -87.601538]"
"""Person reporte…","""Closed""","""lost property""","""5400 S. Shore …","""7/6/21 1:30 AM…",,2021-07-06 20:38:00 CDT,2021-07-06,"""21-00391""","""5400 S. Shore …","[41.791024, -87.604903]"
"""Person lost IL…","""Closed""","""lost property""","""E. 57th St. an…","""6/16/21 2:30 P…",,2021-06-16 16:51:00 CDT,2021-06-16,"""21-00345""","""and East 57th …","[41.791347, -87.604938]"
"""Person lost ce…","""Closed""","""lost property""","""E. 60th St. an…","""5/21/21 3:40 P…",,2021-05-21 18:35:00 CDT,2021-05-21,"""21-00288""","""and East 60th …","[41.795044, -87.598097]"


In [191]:
STREET_CORRECTIONS_FINAL = [s for _, _, s in STREET_CORRECTIONS]
STREET_CORRECTIONS_FINAL.extend(["S. Shore Dr.", "Midway Plaisance"])


def parse_between_address(addr: str) -> None:
    ordinal_streets = re.findall(r"E\. \d{2}[a-z]{2} \w+", addr)
    ordinal_streets.sort()
    ordinals = list(map(int, re.findall(r"E\. (\d{2})[a-z]{2} \w+", addr)))
    ordinals.sort()
    non_ordinal_streets = [s for s in STREET_CORRECTIONS_FINAL if s in addr]

    if len(ordinals) == 2 and ordinals[1] - ordinals[0] != 1:
        print(addr)
        print(f"Large ordinals: {ordinals}")
    elif len(ordinals) == 2 and len(non_ordinal_streets) == 1:
        if ordinals[1] - ordinals[0] != 1:
            print(addr)
            print(ordinal_streets, non_ordinal_streets)
        return
        # print(f"{ordinals[0]}20 {non_ordinal_streets[0]}")
    elif len(non_ordinal_streets) > 1 and len(ordinals) == 1:
        return
        # print(f"{non_ordinal_streets[0]} and {ordinal_streets[0]}")
        # print(f"{non_ordinal_streets[1]} and {ordinal_streets[0]}")
    else:
        print(addr)
        print(ordinal_streets, non_ordinal_streets)
    print("----")

In [192]:
between_addr = (
    df.filter(pl.col("location").str.contains(" between "))
    .filter(~pl.col("location").str.contains(" to "))["location"]
    .to_list()
)

# Do NOT geocode if it has the word 'to'

# between_addr = df.filter(pl.col("location").str.contains(" to "))["location"].to_list()

In [193]:
for i in range(len(between_addr)):
    parse_between_address(between_addr[i])

S. Ellis Ave. between E. 55th St. and E. 62nd St.
Large ordinals: [55, 62]
----
E. Hyde Park Blvd. between S. Drexel Ave. and S. Ellis Ave.
[] ['S. Drexel Ave.', 'S. Ellis Ave.']
----
E. Madison Park between S. Kimbark Ave. and S. Kenwood Ave.
[] ['S. Kenwood Ave.', 'S. Kimbark Ave.', 'E. Madison Park']
----
E. Madison Park between S. Woodlawn Ave. and S. Dorchester Ave.
[] ['S. Dorchester Ave.', 'E. Madison Park', 'S. Woodlawn Ave.']
----
S. Cottage Grove Ave. between E. 60th St. and E. 53rd St. (CTA Bus)
Large ordinals: [53, 60]
----
E. Hyde Park Blvd. between S. Cornell Ave. and S. East End Ave.
[] ['S. Cornell Ave.', 'S. East End Ave.']
----
E. Hyde Park Blvd. between S. Greenwood Ave. and S. University Ave.
[] ['S. Greenwood Ave.', 'S. University Ave.']
----
S. Lake Shore Dr. between E. 43rd St. and E. 46th St.
Large ordinals: [43, 46]
----
E. Hyde Park Blvd. between S. Greenwood Ave. and S. University Ave.
[] ['S. Greenwood Ave.', 'S. University Ave.']
----
E. Hyde Park Blvd. bet