In [13]:
import urllib.parse
import bs4
import requests

In [17]:
# Download the search page and collect, from the element with id "footer",
# every <li> entry as {stripped li text: href of the <a> inside it}.
req = requests.get("http://www.bcdoctordirectory.ca/doctor-search.html")

if req.status_code != 200:
    # Report the failure; `places` is not (re)built in this case.
    print("failed")
    print(req.status_code)
else:
    index = req.content.decode("utf-8")
    index_soup = bs4.BeautifulSoup(index, "html5lib")
    footer = index_soup.find(attrs={"id": "footer"})

    places = {}

    for tag in footer.descendants:
        if tag.name != "li":
            continue
        places[tag.string.strip()] = tag.a.attrs["href"]

In [69]:
def get_doctors(page_soup):
    """Extract the doctor links and the next-page link from a listing page.

    Only the direct children of the first element whose class is
    "newsCol" are inspected. Among those, every <a> tag that has a single
    text string is classified by its text:

    * text containing "More Doctors" -> its href becomes the next-page path
    * text containing "Review"       -> ignored
    * anything else                  -> recorded as a doctor

    Parameters
    ----------
    page_soup : object with a BeautifulSoup-style ``find(attrs=...)`` method

    Returns
    -------
    (doctors, next_path) : (list of (text, href) tuples, str)
        ``next_path`` is "" when the page has no "More Doctors" link.
        If the page has no "newsCol" element at all, ``([], "")`` is
        returned instead of raising, so a caller paging through results
        simply stops.
    """
    doctors = []
    next_path = ""

    container = page_soup.find(attrs={"class": "newsCol"})
    if container is None:
        # find() yields None when nothing matches; there is nothing to read.
        return doctors, next_path

    for tag in container.children:
        # Text nodes between the links are not <a> tags; skip them.
        if getattr(tag, "name", None) != "a":
            continue
        label = tag.string
        if label is None:
            # Anchor without a single text string (e.g. wrapping other tags).
            continue
        if "More Doctors" in label:
            next_path = tag.attrs["href"]
        elif "Review" in label:
            continue
        else:
            doctors.append((label, tag.attrs["href"]))
    return doctors, next_path

In [73]:
# Scrape every place's listing, following "next page" links until a page
# reports no further link. Result: {place name: [(doctor text, href), ...]}.
doctors_by_location = {}

for place, path in places.items():
    print(place)
    finished = False
    page = 1  # number of the listing page currently being fetched
    while not finished:
        req = requests.get(urllib.parse.urljoin("http://www.bcdoctordirectory.ca/", path))

        if req.status_code == 200:
            page_soup = bs4.BeautifulSoup(req.content.decode("utf-8"), "html5lib")
            # `path` is replaced by the next page's path ("" when there is none).
            doctors, path = get_doctors(page_soup)
            doctors_by_location.setdefault(place, []).extend(doctors)

            if path == "":
                finished = True
            else:
                page += 1
        else:
            # Report the failing status and give up on this place; anything
            # already collected for it is kept.
            print("=========================")
            # The response object is `req`; the previous `res` was undefined
            # and raised NameError here instead of printing the status.
            print(req.status_code)
            print("=========================")
            finished = True

Campbell River,
Fanny Bay
Nelson
Bowen Island
Kamloops
Aiyansh
Port Moody
Masset
Creston
Osoyoos
Mackenzie
Hope
Whistler
Ladner
Dawson Creek
Pemberton
Sparwood
Langley
Trail
Fruitvale
Oliver
Denman Island
Fernie
Agassiz
Sechelt
Gold River
Penticton
Port Alice
Keremeos
Telkwa
Lytton
Madeira Park
Alberni
Sayward
Tatlayoko Lake
Errington
Invermere
Aldergrove
Duncan
Cassidy
Pender Island
Hornby Island
Cranbrook
Stewart
Smithers
North Vancouver
Port McNeill
Salmon Arm
Kaslo
Nakusp
Castlegar
Surrey
Enderby
Highlands
Christina Lake
Egmont
Summerland
West Vancouver
Delta
Hagensborg
Vancouver
North Delta
Waglisla
Princeton
Cumberland
Kimberley
Rossland
Sardis
Coola
Sorrento
Williams Lake
Fraser Lake
Courtenay
Abbotsford,
Ucluelet
Brentwood Bay
Queen Charlotte
Elkford
Chetwynd
Shawnigan Lake
Victoria
Greenwood
Blind Bay
Okanagan Falls
Sidney
Kootenay Bay
Gabriola
100 Mile House
Powell River
Dease Lake
Barriere
Saanichton
Whaletown
Vanderhoof
Queen Charlotte City
Houston
Salmo
West Kelowna
Chilli

In [74]:
# Sanity check: number of entries collected under the "Vancouver" key.
len(doctors_by_location["Vancouver"])

3006

In [75]:
import json
# Save the scraped mapping to disk so later cells can reload it from the file.
with open("doctors.json", "w") as out_file:
    json.dump(doctors_by_location, out_file)

## Join with geography

In [3]:
import karta
import pandas
import json

# Reload the doctor listing from the JSON file.
with open("doctors.json") as doctors_file:
    doctors_by_location = json.load(doctors_file)

# Read in a dataset with populated places in Canada.
communities = karta.read_shapefile(
    "/home/natw/Documents/population-map/cgn_populated_places_points.shp"
)

In [18]:
# Merge the individual features into a single multipart geometry
# (NOTE(review): behaviour inferred from the karta helper's name).
mp = karta.vector.multipart_from_singleparts(communities)

# One row per place, taken from the GEONAME / GENERIC / PROV_TERR attributes.
df = pandas.DataFrame({
    "name": mp.d["GEONAME"],
    "type": mp.d["GENERIC"],
    "province": mp.d["PROV_TERR"],
})

# Vertices requested in the LonLatWGS84 CRS with z dropped; column 0 is
# stored as "lon" and column 1 as "lat".
xy = mp.get_vertices(crs=karta.crs.LonLatWGS84, drop_z=True)
df["lon"], df["lat"] = xy[:, 0], xy[:, 1]

In [19]:
# Restrict the places table to British Columbia.
df_bc = df.loc[df.province == "British Columbia", :]

# Keep one record per place name (the first non-null value of each column).
# reset_index() turns the group key back into an ordinary "name" column.
# Copying the index into a column instead would leave "name" as both an
# index level and a column label, which pandas rejects as ambiguous when
# merging on "name".
grp = df_bc.groupby("name").first().reset_index()

In [20]:
# Preview the first rows of the British Columbia subset.
df_bc.head()

Unnamed: 0,name,province,type,lon,lat
1,'Ksan,British Columbia,Locality,-127.679722,55.249444
2,100 Mile House,British Columbia,District Municipality,-121.295556,51.642778
3,105 Mile House,British Columbia,Locality,-121.31667,51.7
4,108 Mile Ranch,British Columbia,Community,-121.35,51.75
6,111 Mile House,British Columbia,Locality,-121.38333,51.766667


In [93]:
# Spelling differences and mistakes in the source data: scraped place name
# (after trailing commas are removed) -> name to use instead.
name_fixes = {
    "Fort St James": "Fort St. James",
    "Fort St John": "Fort St. John",
    "Port  Hardy": "Port Hardy",
    "Mayne": "Mayne Island",
    "W Vancouver": "West Vancouver",
    "Coola": "Bella Coola",
    "Bella": "Bella Coola",
    "Salt Spring Island": "Long Harbour",  # close enough
    "West Kelowna": "District of West Kelowna",
    "Campbell": "Nanaimo",  # this was an error in the data
    "Cowichan": "Lake Cowichan",
    "Queen Charlotte City": "Queen Charlotte",
    "Landing": "Mansons Landing",
    "Ranch": "108 Mile Ranch",
}

# Flatten {place: [(doctor, url), ...]} into one row per doctor, applying
# the corrections as each place name is read.
records = []
for loc, doctors in doctors_by_location.items():
    place_name = loc.strip(",")  # some scraped keys carry a stray comma
    place_name = name_fixes.get(place_name, place_name)
    records.extend((place_name, doctor, url) for doctor, url in doctors)

# Explicit column names keep the frame's layout fixed even with no rows.
doctor_df = pandas.DataFrame(records, columns=["name", "doctor_name", "urlpath"])

In [100]:
# Left-join the place records onto the doctors by "name"; doctors whose
# place has no match keep missing values in the joined columns.
joined = pandas.merge(left=doctor_df, right=grp, on="name", how="left")
joined.to_json("doctors_with_location.json")

# Keep only the rows that have no missing values.
joined_success = joined.dropna()

# Build a multipoint from the (lon, lat) pairs and export it as GeoJSON.
lon_lat_pairs = zip(joined_success.lon, joined_success.lat)
mp = karta.vector.Multipoint(lon_lat_pairs,
                             crs=karta.crs.LonLatWGS84, build_index=False)
mp.to_geojson("doctors.geojson")