In [47]:
import pandas as pd
from datetime import datetime, timedelta

In [None]:
# Spot for user-specified options
# This script is built to access only the geomet daily climate data
# Site for discovery of climate obs and IDs: https://api.weather.gc.ca/collections/climate-daily/items?limit=10&offset=0&bbox=-180%2C34.84%2C131.89%2C82.41&sortby=-LOCAL_DATE
# Queryable parameters: https://api.weather.gc.ca/collections/climate-daily/queryables# 

# To do: Add a 2nd criterion for streaks, e.g. temperature and humidty. 

# ------ User-specified options below ------
# dt_start/end: String, format=YYYY-MM-DD, enclosed in quotes
# clim_ids: Strings ECCC Climate Identifiers, separated by pipe (|) for multiple. Enclosed in quotes
# params: List of available parameters to return, separated by '%2C'. LOCAL_DATE is required. 
# api_limit: Integer. The limit per API request. Max is 10,000 for Geomet daily climate data.
dt_start = "1950-01-01"
dt_end = "2025-12-29"
clim_ids = "2101300|2101303|2101415|2101290"
params = "ID%2CLOCAL_DATE%2CMAX_TEMPERATURE%2CMIN_TEMPERATURE%2CMEAN_TEMPERATURE" 
api_limit = 10000
# ---- End of user-specified options. Nothing below here should be changed. 

# Options: From geomet doc page. Put all options here
base_url = "https://api.weather.gc.ca/collections/climate-daily/items"
url_opts = "".join([f"?f=csv&limit={api_limit}",
                    f"&properties={params}",
                    "&sortby=LOCAL_DATE"])
clim_id = "".join([f"&CLIMATE_IDENTIFIER={clim_ids}"])

# Calculate number of days requested
start = datetime.strptime(dt_start, '%Y-%m-%d')
end = datetime.strptime(dt_end, '%Y-%m-%d')
total_days = (end - start).days + 1

geo_urls = []
if total_days <= api_limit:
    geo_url = "".join([base_url, url_opts, f"&datetime={dt_start}/{dt_end}", clim_id])
    geo_urls = [geo_url]
else:
    # Build as many calls as needed in api_limit-sized chunks
    current_start = start
    while current_start <= end:
        chunk_end = min(current_start + timedelta(days=api_limit - 1), end)
        dt_string = f"&datetime={current_start.strftime('%Y-%m-%d')}/{chunk_end.strftime('%Y-%m-%d')}"
        geo_url_chunk = "".join([base_url, url_opts, dt_string, clim_id])
        geo_urls.append(geo_url_chunk)
        current_start = chunk_end + timedelta(days=1)

# For backward compatibility, set geo_url to the first one if single
if len(geo_urls) == 1:
    geo_url = geo_urls[0]


In [49]:
# Retrieve data into a pandas df
if 'geo_urls' in locals() and len(geo_urls) > 1:
    df = pd.concat([pd.read_csv(url) for url in geo_urls], ignore_index=True)
else:
    df = pd.read_csv(geo_url)

In [50]:
# Convert data types
df['LOCAL_DATE'] = pd.to_datetime(df['LOCAL_DATE'])
df['ID'] = df['ID'].astype(str)

# Replace any non-numeric characters in temperature columns with NA for the purposes of sorting. 
params_list = params.split('%2C')
variable_cols = params_list[params_list.index('LOCAL_DATE') + 1:]
df[variable_cols] = df[variable_cols].apply(pd.to_numeric, errors='coerce')


# Sort and drop duplicates.
df = df.sort_values(['LOCAL_DATE','MIN_TEMPERATURE','MAX_TEMPERATURE']).drop_duplicates('LOCAL_DATE')

In [51]:
# Check for missing dates
min_date = df['LOCAL_DATE'].dt.date.min()
max_date = df['LOCAL_DATE'].dt.date.max()
all_dates = pd.date_range(start=min_date, end=max_date, freq='D')
existing_dates = df['LOCAL_DATE'].dt.date
missing_dates = all_dates.difference(existing_dates)



In [52]:
# Find periods where MAX_TEMPERATURE did not exceed -20.0 for 3 or more consecutive days
# First, create a dataframe with all dates filled
min_date = df['LOCAL_DATE'].min().date()
max_date = df['LOCAL_DATE'].max().date()
full_dates = pd.date_range(start=min_date, end=max_date, freq='D')
df_full = df.set_index('LOCAL_DATE').reindex(full_dates).rename_axis('LOCAL_DATE').reset_index()

# Condition: MAX_TEMPERATURE <= -20.0 or NaN (treat NaN as part of streak)
condition = (df_full['MAX_TEMPERATURE'] <= -20.0) | df_full['MAX_TEMPERATURE'].isna()

# Identify consecutive streaks
df_full['streaknum'] = (condition != condition.shift()).cumsum()

# Group by streak and get periods where condition is True for >=3 days
streaks = df_full[condition].groupby('streaknum').agg(
    start=('LOCAL_DATE', 'min'),
    end=('LOCAL_DATE', 'max'),
    count=('LOCAL_DATE', 'count'),
    missing=('MAX_TEMPERATURE', lambda x: x.isna().sum())
)
long_streaks_max = streaks[streaks['count'] >= 5].sort_values('count', ascending=False)
long_streaks_max['type'] = 'MAX_TEMPERATURE'

print("Periods where MAX_TEMPERATURE did not exceed -20.0 for 5 or more consecutive days:")
print(long_streaks_max)

Periods where MAX_TEMPERATURE did not exceed -20.0 for 5 or more consecutive days:
               start        end  count  missing             type
streaknum                                                       
673       2004-08-10 2004-10-03     55       55  MAX_TEMPERATURE
225       1968-12-27 1969-01-22     27        0  MAX_TEMPERATURE
243       1971-01-09 1971-01-31     23        0  MAX_TEMPERATURE
263       1972-02-16 1972-03-09     23       22  MAX_TEMPERATURE
601       1996-01-08 1996-01-30     23       19  MAX_TEMPERATURE
...              ...        ...    ...      ...              ...
469       1992-12-27 1992-12-31      5        2  MAX_TEMPERATURE
403       1985-02-09 1985-02-13      5        0  MAX_TEMPERATURE
653       2001-12-15 2001-12-19      5        0  MAX_TEMPERATURE
631       1999-01-05 1999-01-09      5        0  MAX_TEMPERATURE
901       2019-01-06 2019-01-10      5        0  MAX_TEMPERATURE

[132 rows x 5 columns]


In [53]:
# Find periods where MIN_TEMPERATURE reached at least -30.0 for 3 or more consecutive days
# First, create a dataframe with all dates filled
min_date = df['LOCAL_DATE'].min().date()
max_date = df['LOCAL_DATE'].max().date()
full_dates = pd.date_range(start=min_date, end=max_date, freq='D')
df_full = df.set_index('LOCAL_DATE').reindex(full_dates).rename_axis('LOCAL_DATE').reset_index()

# Condition: MIN_TEMPERATURE <= -30.0 or NaN (treat NaN as part of streak)
condition = (df_full['MIN_TEMPERATURE'] <= -30.0) | df_full['MIN_TEMPERATURE'].isna()

# Identify consecutive streaks
df_full['streak'] = (condition != condition.shift()).cumsum()

# Group by streak and get periods where condition is True for >=3 days
streaks = df_full[condition].groupby('streak').agg(
    start=('LOCAL_DATE', 'min'),
    end=('LOCAL_DATE', 'max'),
    count=('LOCAL_DATE', 'count'),
    missing=('MIN_TEMPERATURE', lambda x: x.isna().sum())
)
long_streaks_min = streaks[streaks['count'] >= 5].sort_values('count', ascending=False)
long_streaks_min['type'] = 'MIN_TEMPERATURE'

print("Periods where MIN_TEMPERATURE reached at least -30.0 for 5 or more consecutive days:")
print(long_streaks_min)

Periods where MIN_TEMPERATURE reached at least -30.0 for 5 or more consecutive days:
            start        end  count  missing             type
streak                                                       
815    2004-08-10 2004-10-03     55       55  MIN_TEMPERATURE
305    1971-01-09 1971-01-31     23        0  MIN_TEMPERATURE
327    1972-02-16 1972-03-09     23       22  MIN_TEMPERATURE
857    2007-04-08 2007-04-30     23       23  MIN_TEMPERATURE
283    1969-01-06 1969-01-23     18        0  MIN_TEMPERATURE
...           ...        ...    ...      ...              ...
731    1996-11-20 1996-11-24      5        0  MIN_TEMPERATURE
735    1996-12-22 1996-12-26      5        0  MIN_TEMPERATURE
1031   2016-12-12 2016-12-16      5        0  MIN_TEMPERATURE
1063   2019-01-06 2019-01-10      5        0  MIN_TEMPERATURE
1051   2017-12-27 2017-12-31      5        0  MIN_TEMPERATURE

[117 rows x 5 columns]


In [54]:
# Export the stats and the list of missing data to csv files in the current wd

# List of missing dates
df_missing = pd.DataFrame({'missing_dates': missing_dates})
df_missing.to_csv('missing_dates.csv', index=False)
print("Missing dates exported to missing_dates.csv")

# df of all sorted data
df.to_csv('geomet_climate_data.csv', index=False)
print("Geomet climate data exported to geomet_climate_data.csv")

# streak dataframes combined
all_streaks = pd.concat([long_streaks_max, long_streaks_min], ignore_index=True)
all_streaks.to_csv('cold_streaks.csv', index=False)
print("Cold streaks data exported to cold_streaks.csv")

Missing dates exported to missing_dates.csv
Geomet climate data exported to geomet_climate_data.csv
Cold streaks data exported to cold_streaks.csv
