In [70]:

pip install overpy

Note: you may need to restart the kernel to use updated packages.




In [71]:
# Source of Alert: https://dd.weather.gc.ca/alerts/cap/
# Definition to CWTO, CWVR, etc: https://eccc-msc.github.io/open-data/msc-data/alerts/readme_alerts-datamart_en/

import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import sqlalchemy as sa
from shapely.geometry import Polygon
import overpy

In [72]:
'''
Given an url, find the last folder link in this url and return a link to that
'''
def find_last_folder(url):
    # Send a GET request
    response = requests.get(url)

    # Parse the webpage's content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all the links on the webpage
    links = soup.find_all('a')

    # Last link is latest folder
    latest_folder_url = url + links[-1].get('href')

    # for i in range(len(links)):
    #     print(links[i])

    print(latest_folder_url)

    return latest_folder_url





In [73]:

# URL of the webpage
url = 'https://dd.weather.gc.ca/alerts/cap/'


new_url = find_last_folder(url) # Return the link of date folder

# find the last modified folder
sorted_new_url = new_url + '?C=M;O=A'

# new_url = find_last_folder(new_url) # Return the link of hour folder, which contains the cap file

response = requests.get(sorted_new_url)
soup = BeautifulSoup(response.content, 'html.parser')

# Find all the links on the webpage
links = soup.find_all('a')

# Last link is latest folder
new_url = new_url + links[-1].get('href')

print(new_url)

sorted_new_url = new_url + '?C=M;O=A' # Sort by Last Modified Time, so the last file is the newest

# Download the latest file
response = requests.get(sorted_new_url)
soup = BeautifulSoup(response.content, 'html.parser')

# Find all the links on the webpage
links = soup.find_all('a')

# Last link is latest folder
new_url = new_url + links[-1].get('href')

print(new_url)

response = requests.get(new_url)
soup = BeautifulSoup(response.content, 'html.parser')

# Find all the links on the webpage
links = soup.find_all('a')

print(links)

# Last link is latest folder
new_cap = new_url + links[-1].get('href')

print(new_cap)

response = requests.get(new_cap)

# Save the file to disk
with open('latest_file.cap', 'wb') as file:
    file.write(response.content)

https://dd.weather.gc.ca/alerts/cap/20230811/
https://dd.weather.gc.ca/alerts/cap/20230811/CWNT/
https://dd.weather.gc.ca/alerts/cap/20230811/CWNT/04/
[<a href="?C=N;O=D">Name</a>, <a href="?C=M;O=A">Last modified</a>, <a href="?C=S;O=A">Size</a>, <a href="?C=D;O=A">Description</a>, <a href="/alerts/cap/20230811/CWNT/">Parent Directory</a>, <a href="T_WOCN21_C_CWNT_202308110454_1989294651.cap">T_WOCN21_C_CWNT_202308110454_1989294651.cap</a>, <a href="T_WOCN21_C_CWNT_202308110454_3533499021.cap">T_WOCN21_C_CWNT_202308110454_3533499021.cap</a>, <a href="T_WOCN22_C_CWNT_202308110455_1931067247.cap">T_WOCN22_C_CWNT_202308110455_1931067247.cap</a>, <a href="T_WOCN22_C_CWNT_202308110455_3586899473.cap">T_WOCN22_C_CWNT_202308110455_3586899473.cap</a>, <a href="T_WOCN24_C_CWNT_202308110455_0858217755.cap">T_WOCN24_C_CWNT_202308110455_0858217755.cap</a>, <a href="T_WOCN24_C_CWNT_202308110455_2035238103.cap">T_WOCN24_C_CWNT_202308110455_2035238103.cap</a>]
https://dd.weather.gc.ca/alerts/cap/202

In [74]:
tree = ET.parse('latest_file.cap')

In [75]:
root = tree.getroot()


In [76]:
import time

In [77]:
info = root.find('{urn:oasis:names:tc:emergency:cap:1.2}info') # this is hard coded because there are two info, the second one is English version

In [78]:
print(info.find('{urn:oasis:names:tc:emergency:cap:1.2}description').text)


###

Wildfire smoke can be harmful to everyone’s health even at low concentrations. Continue to take actions to protect your health and reduce exposure to smoke.

People with lung disease (such as asthma) or heart disease, older adults, children, pregnant people, and people who work outdoors are at higher risk of experiencing health effects caused by wildfire smoke.

Visit www.airhealth.ca for information on how to reduce your health risk and your personal contribution to pollution levels, as well as for current and forecast AQHI values.

Please continue to monitor alerts and forecasts issued by Environment Canada.



In [79]:
polygon_coords = info.find('{urn:oasis:names:tc:emergency:cap:1.2}area').find('{urn:oasis:names:tc:emergency:cap:1.2}polygon').text

In [80]:
print(polygon_coords)

69.1682,-105.6733 69.2077,-105.6437 69.2785,-105.5217 69.3293,-105.3231 69.3484,-105.0834 69.3339,-104.8519 69.2871,-104.6455 69.2188,-104.5126 69.1378,-104.4622 69.0525,-104.5044 69.0122,-104.5629 68.9745,-104.6509 68.9448,-104.76 68.9257,-104.8746 68.9148,-105.0185 68.9165,-105.1657 68.8552,-105.2968 69.0273,-105.727 69.1682,-105.6733


In [81]:
# Convert the string to coordinates seperated by , and space
pairs = polygon_coords.split()
polygon_coords = [(float(x.split(',')[0]), float(x.split(',')[1])) for x in pairs]
print(polygon_coords)

[(69.1682, -105.6733), (69.2077, -105.6437), (69.2785, -105.5217), (69.3293, -105.3231), (69.3484, -105.0834), (69.3339, -104.8519), (69.2871, -104.6455), (69.2188, -104.5126), (69.1378, -104.4622), (69.0525, -104.5044), (69.0122, -104.5629), (68.9745, -104.6509), (68.9448, -104.76), (68.9257, -104.8746), (68.9148, -105.0185), (68.9165, -105.1657), (68.8552, -105.2968), (69.0273, -105.727), (69.1682, -105.6733)]


In [82]:
polygon = Polygon(polygon_coords)

def get_cities_from_polygon(polygon_coords):
    # Convert the tuple list to a format suitable for Overpass query
    coords_str = " ".join(f"{lat} {lon}" for lat, lon in polygon_coords)

    # Construct the Overpass QL (Query Language) string
    overpass_query = f"""
    [out:json];
    (
      node["place"="city"](poly:"{coords_str}");
      node["place"="town"](poly:"{coords_str}");
    );
    out center;
    """

    api = overpy.Overpass()
    result = api.query(overpass_query)

    city_names = [node.tags["name"] for node in result.nodes]

    return city_names

# Define your polygon
# polygon_coords = [(lat1, lon1), (lat2, lon2), ...]  # Add your coordinates here
city_list = get_cities_from_polygon(polygon_coords)
for city in city_list:
    print(city)


Iqaluktuuttiaq (Cambridge Bay)


In [84]:
# Append all areas to the list, for latter add to the dictionary

areas = info.findall('{urn:oasis:names:tc:emergency:cap:1.2}area')
area_texts = []
for i in areas:
    area_texts.append(i[0].text)


In [85]:
info_dict = {
    "language": info.find('{urn:oasis:names:tc:emergency:cap:1.2}language').text,
    "event": info.find('{urn:oasis:names:tc:emergency:cap:1.2}event').text,
    "responseType": info.find('{urn:oasis:names:tc:emergency:cap:1.2}responseType').text,
    "urgency": info.find('{urn:oasis:names:tc:emergency:cap:1.2}urgency').text,
    "severity": info.find('{urn:oasis:names:tc:emergency:cap:1.2}severity').text,
    "certainty": info.find('{urn:oasis:names:tc:emergency:cap:1.2}certainty').text,
    "audience": info.find('{urn:oasis:names:tc:emergency:cap:1.2}audience').text,
    "effective": info.find('{urn:oasis:names:tc:emergency:cap:1.2}effective').text,
    "expires": info.find('{urn:oasis:names:tc:emergency:cap:1.2}expires').text,
    "senderName": info.find('{urn:oasis:names:tc:emergency:cap:1.2}senderName').text,
    "headline": info.find('{urn:oasis:names:tc:emergency:cap:1.2}headline').text,
    "description": info.find('{urn:oasis:names:tc:emergency:cap:1.2}description').text.strip(),
    "area": area_texts # All areas in the cap
}

# Convert the dictionary to a dataframe
df = pd.DataFrame([info_dict])

# Save the dataframe to a CSV file (optional)
df.to_csv("weather_alert_info.csv", index=False)

print(df)

  language        event responseType urgency  severity certainty  \
0    en-CA  air quality      Monitor  Future  Moderate  Possible   

         audience                  effective                    expires  \
0  general public  2023-08-11T04:55:42-00:00  2023-08-11T20:55:42-00:00   

                                      senderName  \
0  Environment Canada, the Government of Nunavut   

                                  headline  \
0  special air quality statement in effect   

                                         description  \
0  ###\n\nWildfire smoke can be harmful to everyo...   

                         area  
0  [Cambridge Bay, Kugluktuk]  


Feature Engineering

In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   language      1 non-null      object
 1   event         1 non-null      object
 2   responseType  1 non-null      object
 3   urgency       1 non-null      object
 4   severity      1 non-null      object
 5   certainty     1 non-null      object
 6   audience      1 non-null      object
 7   effective     1 non-null      object
 8   expires       1 non-null      object
 9   senderName    1 non-null      object
 10  headline      1 non-null      object
 11  description   1 non-null      object
 12  area          1 non-null      object
dtypes: object(13)
memory usage: 232.0+ bytes


In [87]:
# Convert effective and expires to datetime
df['expires'] = pd.to_datetime(df['expires']) 
df['effective'] = pd.to_datetime(df['effective']) 

In [88]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   language      1 non-null      object             
 1   event         1 non-null      object             
 2   responseType  1 non-null      object             
 3   urgency       1 non-null      object             
 4   severity      1 non-null      object             
 5   certainty     1 non-null      object             
 6   audience      1 non-null      object             
 7   effective     1 non-null      datetime64[ns, UTC]
 8   expires       1 non-null      datetime64[ns, UTC]
 9   senderName    1 non-null      object             
 10  headline      1 non-null      object             
 11  description   1 non-null      object             
 12  area          1 non-null      object             
dtypes: datetime64[ns, UTC](2), object(11)
memory usage: 232.0+ bytes


## Insert data into our Database

In [89]:
my_db_secret = {
    'drivername': 'postgresql+psycopg2', 
    'host': 'mmai5100postgres.canadacentral.cloudapp.azure.com',
    'port': '5432', 
    'username': 'szhan872', 
    'password': '2023!Schulich',
    'database': 'szhan872_db'
}

In [90]:
my_db_url = sa.engine.URL.create(
    drivername = my_db_secret['drivername'],
    username = my_db_secret['username'],
    password = my_db_secret['password'],
    host = my_db_secret['host'],
    port = my_db_secret['port'],
    database = my_db_secret['database'])

In [91]:
print(my_db_url)
my_db_engine = sa.create_engine(my_db_url)

postgresql+psycopg2://szhan872:2023!Schulich@mmai5100postgres.canadacentral.cloudapp.azure.com:5432/szhan872_db


In [92]:
# Create Schema
with my_db_engine.connect() as connection: 
    connection.execute('CREATE SCHEMA IF NOT EXISTS uploads')

In [93]:
df.columns

Index(['language', 'event', 'responseType', 'urgency', 'severity', 'certainty',
       'audience', 'effective', 'expires', 'senderName', 'headline',
       'description', 'area'],
      dtype='object')

In [94]:
df.to_sql(
    name      = 'alerts',
    con       = my_db_engine,
    schema    = 'uploads',
    if_exists = 'replace',
    index     = False,
    method    = 'multi',
    dtype     = {
        'language'    : sa.types.VARCHAR(10),
        'event'       : sa.types.VARCHAR(50),
        'responseType': sa.types.VARCHAR(30),
        'urgency'     : sa.types.VARCHAR(30),
        'severity'    : sa.types.VARCHAR(30),
        'certainty'   : sa.types.VARCHAR(30),
        'audience'    : sa.types.VARCHAR(50),
        'effective'   : sa.types.DateTime,
        'expires'     : sa.types.DateTime,
        'senderName'  : sa.types.VARCHAR(50),
        'headline'    : sa.types.VARCHAR(100),
        'description' : sa.types.VARCHAR(),
        'area'        : sa.types.VARCHAR(50)
    },  
)

1