# For convenience: Scrape StationIDs to look up cities
Simple Python script for scraping StationIDs from Environment Canada using Beautiful Soup.

The StationIDs are listed by province on this Environment Canada [page](http://climate.weather.gc.ca/historical_data/search_historic_data_e.html). Environment Canada limits the search results to 100 rows per page, so this script loops through all result pages and grabs the StationID, Station Name, Intervals and Year Range of each station.

In [1]:
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from dateutil import rrule
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import requests
import re

In [2]:
# We'll need `fuzzywuzzy` to look up weather stations later
# Run "!pip install fuzzywuzzy --user" if you get an error

# !pip install fuzzywuzzy --user
from fuzzywuzzy import fuzz

## Parsing the Environment Canada page with Beautiful Soup

In [7]:
def parse_station_id(soup_frames, province, data_folder):
    """Extract station metadata from parsed result pages and save it as a CSV.

    Every ``<form>`` whose ``id`` contains ``stnRequest`` is treated as one
    station entry. Forms that lack any of the expected elements are skipped.

    Args:
        soup_frames: Iterable of parsed pages (BeautifulSoup objects), one per
            downloaded result page.
        province: Province code; stored in the ``Province`` column and used in
            the output file name.
        data_folder: Prefix prepended verbatim to ``stations_<province>.csv``
            (e.g. ``'data/stations/'``, trailing slash included). The parent
            directory is created when it does not exist yet.

    Returns:
        The pandas DataFrame that was written to disk, with the columns
        ``StationID``, ``Name``, ``Intervals``, ``Year Start``, ``Year End``
        and ``Province``.
    """
    import os  # Local import: `os` is not among the notebook's top-level imports

    # Compiled once instead of once per page; matches any id containing "stnRequest".
    form_id_pattern = re.compile(r'stnRequest')

    # Empty list to store the station data
    station_data = []

    for soup in soup_frames:
        for form in soup.find_all("form", {"id": form_id_pattern}):
            try:
                # The stationID is a child of the form
                station = form.find("input", {"name": "StationID"})['value']

                # The station name is the first <div> sibling following the
                # input element named lstProvince
                name = (
                    form.find("input", {"name": "lstProvince"})
                    .find_next_siblings("div")[0]
                    .text
                )

                # The intervals are listed as children of the 'select' tag named timeframe
                timeframes = form.find("select", {"name": "timeframe"}).find_all()
                intervals = [t.text for t in timeframes]

                # First and last entries of the 'Year' select are taken as the year range
                # (assumes the options are listed in ascending order -- TODO confirm)
                years = form.find("select", {"name": "Year"}).find_all()
                min_year = years[0].text
                max_year = years[-1].text
            except (AttributeError, IndexError, KeyError, TypeError):
                # The form lacks one of the expected elements: skip it, but do not
                # swallow unrelated errors (e.g. KeyboardInterrupt) as a bare except would.
                continue

            station_data.append(
                [station, name, intervals, min_year, max_year, province]
            )

    # Create a pandas dataframe using the collected data and give it the appropriate column names
    stations_df = pd.DataFrame(
        station_data,
        columns=[
            'StationID',
            'Name',
            'Intervals',
            'Year Start',
            'Year End',
            'Province',
        ],
    )

    # Make sure the destination directory exists before writing
    out_path = f'{data_folder}stations_{province}.csv'
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    stations_df.to_csv(out_path, index=False)
    return stations_df

In [8]:
# Specify Parameters
provinces = [
    "AB",
    "BC",
    "MB",
    "NB",
    "NL",
    "NT",
    "NS",
    "NU",
    "ON",
    "PE",
    "QC",
    "SK",
    "YT",
]  # Province list
max_pages = [
    4,
    4,
    2,
    1,
    1,
    1,
    1,
    2,
    4,
    1,
    4,
    2,
    1,
]  # Number of result pages per province (100 rows each), i.e. ceil(nb_stations / 100)
nb_stations = [
    319,
    367,
    107,
    40,
    95,
    99,
    67,
    124,
    308,
    17,
    332,
    117,
    48,
]  # number of stations since 2014
start_year = "2014"  # Sent as StartYear: lower bound of the searched year range
end_year = "2024"  # Sent as EndYear and Year: upper bound of the searched year range
data_folder = 'data/stations/'

rows_per_page = 100  # Sent as selRowPerPage; also the stride between page start rows
request_timeout = 30  # Seconds; without a timeout a stalled connection hangs forever
base_url = "http://climate.weather.gc.ca/historical_data/search_historic_data_stations_e.html?"

# Walk provinces and their page counts in lockstep instead of indexing parallel lists
for province, page_count in zip(provinces, max_pages):
    # Store each page in a list and parse them later
    soup_frames = []
    for i in range(page_count):
        startRow = 1 + i * rows_per_page
        print(f'Downloading Page: {i} for province {province}')

        queryProvince = (
            f"searchType=stnProv&timeframe=1&lstProvince={province}&optLimit=yearRange&"
        )
        queryYear = f"StartYear={start_year}&EndYear={end_year}&Year={end_year}&Month=7&Day=20&selRowPerPage={rows_per_page}&txtCentralLatMin=0&txtCentralLatSec=0&txtCentralLongMin=0&txtCentralLongSec=0&"
        queryStartRow = f"startRow={startRow}"

        response = requests.get(
            base_url + queryProvince + queryYear + queryStartRow,
            timeout=request_timeout,
        )  # Using requests to read the HTML source
        # Fail loudly on an HTTP error rather than parsing an error page,
        # which would silently produce an incomplete station list
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')  # Parse with Beautiful Soup
        soup_frames.append(soup)
    parse_station_id(soup_frames, province, data_folder)

Downloading Page: 0 for province AB
Downloading Page: 1 for province AB
Downloading Page: 2 for province AB
Downloading Page: 3 for province AB
Downloading Page: 0 for province BC
Downloading Page: 1 for province BC
Downloading Page: 2 for province BC
Downloading Page: 3 for province BC
Downloading Page: 0 for province MB
Downloading Page: 1 for province MB
Downloading Page: 0 for province NB
Downloading Page: 0 for province NL
Downloading Page: 0 for province NT
Downloading Page: 0 for province NS
Downloading Page: 0 for province NU
Downloading Page: 1 for province NU
Downloading Page: 0 for province ON
Downloading Page: 1 for province ON
Downloading Page: 2 for province ON
Downloading Page: 3 for province ON
Downloading Page: 0 for province PE
Downloading Page: 0 for province QC
Downloading Page: 1 for province QC
Downloading Page: 2 for province QC
Downloading Page: 3 for province QC
Downloading Page: 0 for province SK
Downloading Page: 1 for province SK
Downloading Page: 0 for prov

Combine the Québec and Ontario data, because we'll use them for our project

In [9]:
from typing import List

# Load the per-province station tables for Québec and Ontario, stack them
# (Québec rows first) and write the merged table next to the inputs.
qc_stations = pd.read_csv(f'{data_folder}stations_QC.csv')
on_stations = pd.read_csv(f'{data_folder}stations_ON.csv')
dfs: List[pd.DataFrame] = [qc_stations, on_stations]

df = pd.concat(dfs)
df.to_csv(f'{data_folder}stations_QC_ON.csv', index=False)

## Select the stations currently active (in 2024) and, for duplicated station names, keep only the entry with the latest 'Year Start'

In [11]:
# Reload the merged Québec/Ontario table and keep only the rows whose
# 'Year End' equals 2024.
stations_df = pd.read_csv(f'{data_folder}stations_QC_ON.csv')
ends_in_2024 = stations_df['Year End'].eq(2024)
stations_df_2024 = stations_df.loc[ends_in_2024]

# Preview the first rows of the selection
stations_df_2024.head()

Unnamed: 0,StationID,Name,Intervals,Year Start,Year End,Province
0,54067,AKULIVIK A,['Hourly'],2015,2024,QC
1,54068,AKULIVIK A,"['Hourly', 'Daily']",2018,2024,QC
5,5310,ARTHABASKA,"['Daily', 'Monthly']",1969,2024,QC
6,5575,ARUNDEL,"['Daily', 'Monthly']",1963,2024,QC
8,54038,AUPALUK A,['Hourly'],2015,2024,QC


In [12]:
# (rows, columns) of the subset whose 'Year End' is 2024
stations_df_2024.shape

(413, 6)

In [26]:
# Split the 2024 selection by whether the station name occurs more than once.
# keep=False flags every occurrence of a repeated name, not just the later ones.
# The mask is computed once and negated, rather than compared to True/False twice.
has_duplicate_name = stations_df_2024.duplicated('Name', keep=False)
duplicate_stations_name = stations_df_2024[has_duplicate_name]
single_stations_name = stations_df_2024[~has_duplicate_name]

In [27]:
# For every repeated station name, locate the row label holding the largest
# 'Year Start', then pull those rows out of the duplicates table.
year_start_by_name = duplicate_stations_name.groupby('Name')['Year Start']
idx = year_start_by_name.idxmax()
max_year = duplicate_stations_name.loc[idx]

In [28]:
# Stack the uniquely named stations with the rows selected for each repeated
# name, then save the cleaned Québec/Ontario table.
dfs: List[pd.DataFrame] = [single_stations_name, max_year]

df = pd.concat(dfs)
df.to_csv(f'{data_folder}clean_stations_QC_ON_2024.csv', index=False)

In [31]:
# Unique StationID values of the cleaned Québec/Ontario selection
STATION_ID = df['StationID'].unique()
# The original, misspelled name is kept as an alias so existing references keep working
SATATION_ID = STATION_ID

## Select just the Ontario data

In [35]:
# Narrow the cleaned table down to the rows whose 'Province' is 'ON' and save it
is_ontario = df['Province'] == 'ON'
df = df[is_ontario]
df.to_csv(f'{data_folder}clean_stations_ON_2024.csv', index=False)