# Location Data Collector for Canada

This notebook collects hierarchical location data for Canadian provinces and their regions/cities, with exact coordinates and boundaries.

In [1]:
# Install required packages
!pip install geopy pandas tqdm requests



In [2]:
import time
import logging
import os
import json
import requests
from typing import List, Dict, Optional, Tuple
from datetime import datetime

import pandas as pd
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderUnavailable
from tqdm.notebook import tqdm

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

In [3]:
# Fallback regions data for Canadian provinces
FALLBACK_REGIONS = {
    'Ontario': [
        'Algoma', 'Brant', 'Bruce', 'Chatham-Kent', 'Cochrane', 'Dufferin',
        'Durham', 'Elgin', 'Essex', 'Frontenac', 'Grey', 'Haldimand-Norfolk',
        'Haliburton', 'Halton', 'Hamilton', 'Hastings', 'Huron', 'Kawartha Lakes',
        'Kenora', 'Lambton', 'Lanark', 'Leeds and Grenville', 'Lennox and Addington',
        'Manitoulin', 'Middlesex', 'Muskoka', 'Niagara', 'Nipissing',
        'Northumberland', 'Ottawa', 'Oxford', 'Parry Sound', 'Peel', 'Perth',
        'Peterborough', 'Prescott and Russell', 'Prince Edward', 'Rainy River',
        'Renfrew', 'Simcoe', 'Stormont Dundas Glengarry', 'Sudbury', 'Thunder Bay',
        'Timiskaming', 'Toronto', 'Waterloo', 'Wellington', 'York'
    ],
    'Quebec': [
        'Abitibi-Témiscamingue', 'Bas-Saint-Laurent', 'Capitale-Nationale',
        'Centre-du-Quebec', 'Chaudière-Appalaches', 'Côte-Nord', 'Estrie',
        'Gaspésie', 'Lanaudière', 'Laurentides', 'Laval', 'Mauricie',
        'Montérégie', 'Montreal', 'Nord-du-Quebec', 'Outaouais',
        'Saguenay-Lac-Saint-Jean'
    ],
    'British Columbia': [
        'Alberni-Clayoquot', 'Bulkley-Nechako', 'Capital', 'Cariboo',
        'Central Coast', 'Central Kootenay', 'Central Okanagan',
        'Columbia-Shuswap', 'Comox Valley', 'Cowichan Valley',
        'East Kootenay', 'Fraser Valley', 'Fraser-Fort George',
        'Greater Vancouver', 'Kitimat-Stikine', 'Kootenay Boundary',
        'Mount Waddington', 'Nanaimo', 'North Okanagan',
        'Northern Rockies', 'Okanagan-Similkameen', 'Peace River',
        'Powell River', 'Skeena-Queen Charlotte', 'Squamish-Lillooet',
        'Stikine', 'Strathcona', 'Sunshine Coast', 'Thompson-Nicola'
    ],
    'Alberta': [
        'Calgary', 'Central', 'Edmonton', 'North Central',
        'Northeast', 'Northwest', 'South', 'Wood Buffalo'
    ],
    'Manitoba': [
        'Central', 'Eastman', 'Interlake', 'Northern', 'Parkland',
        'Pembina Valley', 'Westman', 'Winnipeg'
    ],
    'Saskatchewan': [
        'Central', 'East Central', 'Northeast', 'Northwest',
        'Regina', 'Saskatoon', 'Southeast', 'Southwest', 'West Central'
    ],
    'Nova Scotia': [
        'Annapolis Valley', 'Cape Breton', 'Halifax', 'North Shore',
        'South Shore', 'Yarmouth'
    ],
    'New Brunswick': [
        'Chaleur', 'Fundy', 'Miramichi', 'Moncton', 'Restigouche',
        'Saint John', 'Western Valley'
    ],
    'Newfoundland and Labrador': [
        'Avalon Peninsula', 'Central', 'Labrador', 'Western'
    ],
    'Prince Edward Island': [
        'Kings', 'Prince', 'Queens'
    ],
    'Northwest Territories': [
        'Dehcho', 'Inuvik', 'North Slave', 'Sahtu', 'South Slave'
    ],
    'Nunavut': [
        'Baffin', 'Kivalliq', 'Kitikmeot'
    ],
    'Yukon': [
        'Central Yukon', 'Klondike', 'Kluane', 'Northern Yukon',
        'Southern Lakes', 'Watson Lake'
    ]
}

In [4]:
def get_regions_for_province(province: str) -> List[str]:
    """Get regions for a province with multiple data sources and caching."""

    # First try OSM Nominatim API
    url = "https://nominatim.openstreetmap.org/search"
    params = {
        'country': 'Canada',
        'state': province,
        'format': 'json',
        'addressdetails': 1,
        'limit': 100
    }
    headers = {'User-Agent': 'CanadaLocationCollector/1.0'}

    try:
        response = requests.get(url, params=params, headers=headers)
        response.raise_for_status()
        data = response.json()

        regions = set()
        for item in data:
            if 'address' in item:
                addr = item['address']
                # Try different administrative levels
                region = (addr.get('county') or addr.get('district') or
                         addr.get('region') or addr.get('state_district'))
                if region:
                    regions.add(region)

        # If we found regions, return them
        if regions:
            return list(regions)

    except Exception as e:
        logging.warning(f"Error getting regions from API for {province}: {str(e)}")

    # If API failed or returned no results, use fallback data
    if province in FALLBACK_REGIONS:
        logging.info(f"Using fallback regions for {province}")
        return FALLBACK_REGIONS[province]

    logging.error(f"No regions found for {province} in any data source")
    return []

In [5]:
class CanadaLocationCollector:
    def __init__(self, user_agent: str = 'CanadaLocationCollector/1.0'):
        self.geolocator = Nominatim(user_agent=user_agent, timeout=30)
        self.last_request_time = 0
        self.min_cities = 6
        self.max_cities = 33

    def _respect_rate_limit(self):
        """Ensure we respect the rate limit of 1 request per second."""
        current_time = time.time()
        time_since_last = current_time - self.last_request_time
        if time_since_last < 1.0:
            time.sleep(1.0 - time_since_last)
        self.last_request_time = time.time()

    def _make_request_with_retry(self, query_func, max_retries=3):
        """Make a request with retry logic."""
        for attempt in range(max_retries):
            try:
                self._respect_rate_limit()
                return query_func()
            except Exception as e:
                if attempt == max_retries - 1:
                    logging.error(f"Error after {max_retries} attempts: {str(e)}")
                    return None
                time.sleep(2 ** attempt)  # Exponential backoff

    def get_city_details(self, city: str, region: str, province: str) -> Optional[Dict]:
        """Get detailed information for a city including postal code and address."""
        def query_city():
            # Try multiple queries to find the best result
            queries = [
                f"city hall {city}, {region}, {province}, Canada",
                f"municipal office {city}, {region}, {province}, Canada",
                f"town hall {city}, {region}, {province}, Canada",
                f"{city} centre, {region}, {province}, Canada",
                f"{city}, {region}, {province}, Canada"
            ]

            for query in queries:
                location = self.geolocator.geocode(
                    query,
                    exactly_one=True,
                    addressdetails=True
                )
                if location and location.raw.get('address', {}).get('postcode'):
                    return location
                time.sleep(1)
            return None

        location = self._make_request_with_retry(query_city)
        if location and location.raw.get('address'):
            addr = location.raw['address']
            bbox = location.raw.get('boundingbox', [])

            # Get the most detailed address possible
            address_parts = []
            if addr.get('house_number'):
                address_parts.append(addr['house_number'])
            if addr.get('road'):
                address_parts.append(addr['road'])
            elif addr.get('pedestrian'):
                address_parts.append(addr['pedestrian'])

            address = ' '.join(address_parts) if address_parts else 'City Hall'

            return {
                'ProvinceName': province,
                'RegionName': region,
                'CityName': city,
                'Address': address,
                'PostalCode': addr.get('postcode', ''),
                'Latitude': location.latitude,
                'Longitude': location.longitude,
                'Box1': bbox[0] if len(bbox) > 0 else '',
                'Box2': bbox[1] if len(bbox) > 1 else '',
                'Box3': bbox[2] if len(bbox) > 2 else '',
                'Box4': bbox[3] if len(bbox) > 3 else ''
            }
        return None

    def get_cities_in_region(self, region: str, province: str) -> List[Dict]:
        """Get cities for a specific region."""
        cities_data = []

        # Try different queries to get more cities
        queries = [
            f"cities in {region}, {province}, Canada",
            f"towns in {region}, {province}, Canada",
            f"villages in {region}, {province}, Canada",
            f"municipalities in {region}, {province}, Canada"
        ]

        for query in queries:
            if len(cities_data) >= self.max_cities:
                break

            def query_cities():
                return self.geolocator.geocode(
                    query,
                    exactly_one=False,
                    addressdetails=True,
                    limit=self.max_cities
                )

            locations = self._make_request_with_retry(query_cities)
            if locations:
                for loc in locations:
                    if len(cities_data) >= self.max_cities:
                        break

                    if loc and loc.raw.get('address'):
                        addr = loc.raw['address']
                        city = addr.get('city') or addr.get('town') or addr.get('village')
                        if city:
                            city_data = self.get_city_details(city, region, province)
                            if city_data:
                                cities_data.append(city_data)

            time.sleep(1)  # Delay between queries

        return cities_data

    def collect_province_data(self, province: str) -> pd.DataFrame:
        """Collect data for a specific province."""
        province_data = []

        # Get regions for this province
        regions = get_regions_for_province(province)
        if not regions:
            logging.error(f"No regions found for {province}")
            return pd.DataFrame()

        for region in tqdm(regions, desc=f"Processing {province} regions", leave=False):
            cities = self.get_cities_in_region(region, province)
            if len(cities) >= self.min_cities:  # Only include regions with enough cities
                province_data.extend(cities)
            time.sleep(1)  # Extra delay between regions

        return pd.DataFrame(province_data)

    def collect_all_provinces(self, output_dir: str = 'canada_data'):
        """Collect data for all provinces and save to separate CSV files."""
        os.makedirs(output_dir, exist_ok=True)

        provinces = list(FALLBACK_REGIONS.keys())

        for province in tqdm(provinces, desc="Processing provinces"):
            df = self.collect_province_data(province)
            if not df.empty:
                # Ensure exact column names and order
                columns = [
                    'ProvinceName', 'RegionName', 'CityName', 'Address',
                    'PostalCode', 'Latitude', 'Longitude',
                    'Box1', 'Box2', 'Box3', 'Box4'
                ]
                df = df[columns]  # Reorder columns

                output_file = os.path.join(output_dir, f"{province.replace(' ', '_')}_locations.csv")
                df.to_csv(output_file, index=False)
                print(f"Saved {len(df)} locations for {province} to {output_file}")

            time.sleep(2)  # Extra delay between provinces

## Run the Collector

Now let's collect data for all Canadian provinces:

In [6]:
# Initialize collector
collector = CanadaLocationCollector()

# Collect data for all provinces
collector.collect_all_provinces()

print("\nData collection complete! Check the 'canada_data' folder for CSV files.")

Processing provinces:   0%|          | 0/13 [00:00<?, ?it/s]

Processing Ontario regions:   0%|          | 0/48 [00:00<?, ?it/s]

Saved 461 locations for Ontario to canada_data/Ontario_locations.csv


Processing Quebec regions:   0%|          | 0/17 [00:00<?, ?it/s]

Saved 256 locations for Quebec to canada_data/Quebec_locations.csv


Processing British Columbia regions:   0%|          | 0/29 [00:00<?, ?it/s]

Saved 107 locations for British Columbia to canada_data/British_Columbia_locations.csv


Processing Alberta regions:   0%|          | 0/8 [00:00<?, ?it/s]

Processing Manitoba regions:   0%|          | 0/8 [00:00<?, ?it/s]

Processing Saskatchewan regions:   0%|          | 0/9 [00:00<?, ?it/s]

Processing Nova Scotia regions:   0%|          | 0/6 [00:00<?, ?it/s]

Saved 31 locations for Nova Scotia to canada_data/Nova_Scotia_locations.csv


Processing New Brunswick regions:   0%|          | 0/7 [00:00<?, ?it/s]

Saved 8 locations for New Brunswick to canada_data/New_Brunswick_locations.csv


Processing Newfoundland and Labrador regions:   0%|          | 0/4 [00:00<?, ?it/s]

Processing Prince Edward Island regions:   0%|          | 0/3 [00:00<?, ?it/s]

Saved 15 locations for Prince Edward Island to canada_data/Prince_Edward_Island_locations.csv


Processing Northwest Territories regions:   0%|          | 0/5 [00:00<?, ?it/s]

Processing Nunavut regions:   0%|          | 0/3 [00:00<?, ?it/s]

Processing Yukon regions:   0%|          | 0/6 [00:00<?, ?it/s]


Data collection complete! Check the 'canada_data' folder for CSV files.
