# Block + Street Name to Latitude and Longitude

Using Geopy and Nominatim, we take our collection of Blocks and Streets and determine the latitude and longitude of each one.

Note: Nominatim is rate limited to 1 request per second, so depending on the size of the incoming data, the processing may take a while.

In [87]:
import os
import time

from geopy.geocoders import Nominatim
import numpy as np
import pandas as pd

In [88]:
# Load the cleaned ticket data; only the address columns are needed here.
df = pd.read_csv("../data/cleaned_data/parking_tickets.csv")

# Collect every column name except "Block" and "Street". set.remove raises
# KeyError if either of the two expected columns is absent from the CSV.
columns = set(df.columns)
for wanted in ("Block", "Street"):
    columns.remove(wanted)

# Drop the unwanted columns, then collapse repeated (Block, Street) pairs.
df = df.drop(columns, axis=1).drop_duplicates()
df.head()

Unnamed: 0,Block,Street
0,800,RICHARDS ST
1,300,E 8TH AVE
2,500,E 19TH AVE
3,500,E 17TH AVE
4,100,E 20TH AVE


In [None]:
# hacky work-around: Nominatim can't provide a lat/lon for these places,
# so every block on these streets is removed before geocoding.
UNRESOLVABLE_STREETS = [
    "LAMEYS MILL RD",
    "BONSAI ST",
    "MENCHION MEWS",
    "THELLAIWHALTUN AVE",
]

# Streets whose name ends in "P" are removed too (example: BIRCH WALK P).
# na=False keeps the mask strictly boolean: without it a missing Street value
# produces NaN in the mask and the boolean selection below raises.
ends_with_p = df["Street"].str.endswith("P", na=False)
on_unresolvable_street = df["Street"].isin(UNRESOLVABLE_STREETS)

df = df.drop(df[ends_with_p | on_unresolvable_street].index)

In [90]:
# A one-element list rather than a plain int, so that get_lat_lon can
# advance the count in place through the reference it is handed.
counter = [0]

# Number of (Block, Street) rows that need coordinates.
total_rows = len(df)

# Nominatim client; the user agent string identifies this project.
geolocator = Nominatim(user_agent="cmpt_353_project_1")


In [91]:
# track the progress of the computation
progress_file = "progress.txt"
if os.path.exists(progress_file):
    with open(progress_file, "r") as f:
        counter[0] = int(f.read().strip())  # Read the last processed index

In [92]:
chunk_size = 25

# Slice the not-yet-processed tail of df (row counter[0] onward) into
# consecutive pieces of at most chunk_size rows each.
chunks = []
for start in range(counter[0], total_rows, chunk_size):
    chunks.append(df.iloc[start:start + chunk_size])

# Sanity check: the pieces together must cover exactly the remaining rows.
expected_size = total_rows - counter[0]
total_chunked_size = sum(map(len, chunks))
assert total_chunked_size == expected_size, \
    f"Mismatch: {total_chunked_size} != {expected_size} (expected {expected_size})"



In [93]:
# Previously geocoded rows, de-duplicated; get_lat_lon searches this frame
# before it falls back to a Nominatim request.
cached_df = pd.read_csv(
    "../data/cleaned_data/cache_block_street_with_lat_lon.csv"
).drop_duplicates()

In [94]:
def get_lat_lon(block, street, counter, cache=None, geocoder=None, total=None,
                retries=5, pause=0.9):
    """Return ``(lat, lon)`` for a Vancouver block/street pair.

    The cache frame is searched first; only on a miss is the geocoder
    queried. ``counter[0]`` is incremented exactly once per call, whatever
    the outcome.

    Parameters
    ----------
    block, street
        Combined into the query string ``"{block} {street}, Vancouver, BC"``.
    counter : list
        One-element list used as a mutable count of processed rows.
    cache : pandas.DataFrame, optional
        Frame with ``Block``, ``Street``, ``lat`` and ``lon`` columns.
        Defaults to the notebook-level ``cached_df``.
    geocoder : optional
        Object exposing ``geocode(address, timeout=...)``. Defaults to the
        notebook-level ``geolocator``.
    total : int, optional
        Row count shown in the progress messages. Defaults to the
        notebook-level ``total_rows``.
    retries : int
        Maximum number of geocoding requests made for this address.
    pause : float
        Seconds slept before every request (rate limiting).

    Returns
    -------
    tuple
        ``(latitude, longitude)``, or ``(nan, nan)`` when the geocoder has no
        result for the address or every attempt raised.
    """
    # Fall back to the notebook-level objects when nothing is injected.
    cache = cached_df if cache is None else cache
    geocoder = geolocator if geocoder is None else geocoder
    total = total_rows if total is None else total

    address = f"{block} {street}, Vancouver, BC"
    print(f"Attempting to find lat/lon for {address}")

    def _report(lat, lon):
        # The remaining-time estimate assumes roughly one row per second.
        print(f"{counter[0]}/{total} - {address}: {lat}, {lon}  | ~{((total - counter[0]) / 60):.2f} minutes remaining")

    # Search the cache. Both conditions must be built from the cache frame
    # itself and compare against the *variable* street, not a literal.
    hits = cache.loc[(cache["Block"] == block) & (cache["Street"] == street)]
    if not hits.empty:
        first_hit = hits.iloc[0]
        lat, lon = first_hit["lat"], first_hit["lon"]
        _report(lat, lon)
        counter[0] += 1
        return lat, lon

    for attempt in range(1, retries + 1):
        # Sleep before *every* request, retries included, to respect the
        # one-request-per-second limit.
        time.sleep(pause)
        try:
            location = geocoder.geocode(address, timeout=2)
        except Exception as e:
            # Best effort: any failure of the request is logged and retried.
            print(f"Error: {e}...trying again for attempt {attempt} / {retries}")
            continue

        if location:
            _report(location.latitude, location.longitude)
            counter[0] += 1
            return location.latitude, location.longitude

        # The request succeeded but found nothing; asking again would give
        # the same answer, so stop instead of looping.
        break

    counter[0] += 1
    return np.nan, np.nan

In [95]:
# CSV file that the geocoding loop writes each processed chunk to.
out_file = "../data/cleaned_data/block_street_with_lat_lon.csv"

In [96]:
for chunk_df in chunks:
    # Work on a copy so the new columns are added to the chunk, not to df.
    chunk_df = chunk_df.copy()

    # get_lat_lon advances counter[0] once per row, so remember where this
    # chunk begins before geocoding it.
    chunk_start = counter[0]

    chunk_df.loc[:,"lat"], chunk_df.loc[:,"lon"] = zip(*chunk_df.apply(
        lambda row: get_lat_lon(row["Block"], row["Street"], counter),
        axis=1
    ))

    # Start a fresh file only for the chunk that begins at row 0; every other
    # chunk (including those of a resumed run) is appended. Comparing
    # counter[0] with chunk_size instead would skip the header whenever the
    # first chunk holds fewer than chunk_size rows.
    is_first_chunk = chunk_start == 0
    chunk_df.to_csv(
        out_file,
        mode="w" if is_first_chunk else "a",
        # A resumed run whose output file is missing still needs a header.
        header=is_first_chunk or not os.path.exists(out_file),
        index=False,
    )

    # Ceiling division, so a trailing partial chunk gets the right number.
    print(f"Wrote chunk {-(-counter[0] // chunk_size)} to {out_file}")

    # bookmark progress
    with open(progress_file, "w") as f:
        f.write(str(counter[0]))

Attempting to find lat/lon for 900 W BROADWAY, Vancouver, BC


4000/4131 - 900 W BROADWAY, Vancouver, BC: 49.263649475047416, -123.1389344042681  | ~2.18 minutes remaining
Attempting to find lat/lon for 8500 SHAUGHNESSY ST, Vancouver, BC
4001/4131 - 8500 SHAUGHNESSY ST, Vancouver, BC: 49.21183978978081, -123.12876973011937  | ~2.17 minutes remaining
Attempting to find lat/lon for 700 W 27TH AVE, Vancouver, BC
4002/4131 - 700 W 27TH AVE, Vancouver, BC: 49.24716431798934, -123.12360598755744  | ~2.15 minutes remaining
Attempting to find lat/lon for 3600 SCALES PLACE, Vancouver, BC
4003/4131 - 3600 SCALES PLACE, Vancouver, BC: 49.24058028018964, -123.02507638257093  | ~2.13 minutes remaining
Attempting to find lat/lon for 4000 W 28TH AVE, Vancouver, BC
4004/4131 - 4000 W 28TH AVE, Vancouver, BC: 49.247115907458166, -123.18197003463287  | ~2.12 minutes remaining
Attempting to find lat/lon for 6900 FREMLIN ST, Vancouver, BC
4005/4131 - 6900 FREMLIN ST, Vancouver, BC: 49.20991518931212, -123.12744330100803  | ~2.10 minutes remaining
Attempting to find l

In [97]:

# Read back what the geocoding loop wrote and compare row counts:
# raw output, output without duplicates, and the input frame.
ndf = pd.read_csv("../data/cleaned_data/block_street_with_lat_lon.csv")
print(len(ndf))

ndf = ndf.drop_duplicates()
print(len(ndf))

print(len(df))

# Outer join keeps addresses that appear on only one side; those rows show
# up with missing values and are counted below.
merged = df.merge(ndf, on=["Block", "Street"], how="outer")

rows_with_nan = merged.isna().any(axis=1).sum()
print(f"Number of rows with NAN: {rows_with_nan}")

merged

4131
4130
4131
Number of rows with NAN: 1


Unnamed: 0,Block,Street,lat,lon
0,0,ABBOTT ST,49.284110,-123.106311
1,0,ALEXANDER ST,49.284079,-123.098285
2,0,ATHLETES WAY,49.271554,-123.106961
3,0,CASSIAR ST,49.248844,-123.031089
4,0,CHESS ST,49.273341,-123.086421
...,...,...,...,...
4126,9300,OAK ST,49.264950,-123.126546
4127,11100,BONSAI ST,49.257334,-123.167749
4128,11100,SCALES PLACE,49.240580,-123.025076
4129,26000,NOOTKA ST,49.249873,-123.041618


In [98]:
# Announce completion, then discard the resume bookmark (if one was written)
# so that a later run does not pick up from a stale position.
print("Finished!")
bookmark_exists = os.path.exists(progress_file)
if bookmark_exists:
    os.remove(progress_file)

Finished!
