Getting familiar with all CO and PPH data

In [None]:
import numpy as np
import geopandas as gp
import shapely as sp
import matplotlib.pyplot as plt
import contextily as cx
import cartopy as cp
from datetime import datetime as dt
from datetime import date as dt_date
from datetime import timedelta
import xarray as xr
import pandas as pd
import re
import os
from utils_filter import *
from utils_datetime import *

In [None]:
# Raw inputs live under the download directory; processed outputs go under
# the project-relative 'data' directory. Both trees use the same three
# sub-folders (outlooks, storm reports, PPH).
raw_dir = '~/Downloads'
outlook_raw_location, report_raw_location, pph_raw_location = (
    f'{raw_dir}/{subdir}' for subdir in ('outlooks', 'storm_reports', 'pph')
)
outlook_save_location, report_save_location, pph_save_location = (
    'data/' + subdir for subdir in ('outlooks', 'storm_reports', 'pph')
)

# Read in convective outlooks and filter to moderate-risk (MDT) days

In [None]:
# Load every multi-year convective outlook archive and stack them into a
# single GeoDataFrame (outlooks_original).
# Each entry is the [first, last] year covered by one archive on disk.
year_list = [[1987, 1991], [1992, 1999], [2000, 2007], [2008, 2015], [2016, 2023]]
outlook_list = []

for i, (first_year, last_year) in enumerate(year_list):
    print(f'reading file {i}, years {first_year}-{last_year}')
    # Archive names encode the covered period as <start>01010000_<end>12312359
    archive_path = f'{outlook_raw_location}/outlooks_{first_year}01010000_{last_year}12312359'
    outlook_list.append(gp.read_file(archive_path))

# One frame for all years, with a fresh 0..n-1 index
outlooks_original = pd.concat(outlook_list, ignore_index=True)

print('files read')

# Notebook display of the combined frame
outlooks_original

In [None]:
# Work on a copy so outlooks_original keeps the raw values, then run the
# three timestamp columns through parse_datetime.
outlooks = outlooks_original.copy()
for time_column in ('ISSUE', 'EXPIRE', 'PRODISS'):
    outlooks[time_column] = parse_datetime(outlooks[time_column])

In [None]:
# Reset the index to a clean 0..n-1 range, discarding the old index
# (drop=True) instead of keeping it as a column.
outlooks = outlooks.reset_index(drop=True)
# Notebook display of the frame
outlooks

In [None]:
# Apply the project helper fix_month_issue to the outlooks frame.
# NOTE(review): the helper is defined outside this file (star import);
# presumably it repairs bad month values in the date columns - confirm.
outlooks = fix_month_issue(outlooks)
# Notebook display of the corrected frame
outlooks

In [None]:
# Add a DATE column holding just the valid date of each outlook, derived
# from its EXPIRE timestamp.
# NOTE(review): the -1 argument presumably shifts EXPIRE back one day so that
# DATE is the day the outlook covers - confirm against create_dates.
outlooks['DATE'] = create_dates(outlooks['EXPIRE'], -1)



In [None]:
# Exploratory query used while choosing the windows below:
# timestamps = outlooks[outlooks['CYCLE'] == 8]
# timestamps[timestamps['PRODISS'].dt.hour == 11]
#
# PRODISS times can be all over the place, so for each cycle we pick a
# reasonable window of a few hours, based on the issue times seen on the
# cycle -1 forecasts for missing days and on what is not super rare among the
# cycle 6 outlooks.
# Maps cycle -> [first acceptable hour, last acceptable hour] (Z).
acceptable_time_dict = {
    cycle: [first_hour, last_hour]
    for cycle, first_hour, last_hour in (
        (6, 4, 8),      # 6: 4-8Z
        (7, 5, 9),      # 7: 5-9Z
        (17, 16, 19),   # 17: 16-19Z
        (8, 6, 11),     # 8: 6-11Z
    )
}

In [None]:
# Spot check: display up to the first 50 day-1 outlook rows valid on
# 1987-02-14. This cell only displays rows; it does not modify any CYCLE
# values (outlooks with CYCLE == -1 are re-labelled in a separate step).
outlooks[(outlooks['DAY'] == 1) & (outlooks['DATE'] == dt_date(1987, 2, 14))][0:50]

In [None]:
# Re-label outlooks whose CYCLE is -1.
# For every valid date and each expected (day, cycle) pair, check whether a
# categorical / probabilistic outlook with that cycle already exists. When it
# does not, any CYCLE == -1 rows for that date and day whose PRODISS hour falls
# inside the acceptable window for the cycle are assigned that cycle.
expected_day_cycles = ((1, 6), (2, 7), (2, 17), (3, 8))
last_year_printed = 0

# Single pass over the distinct valid dates
for valid_date in outlooks['DATE'].unique():
    # Progress indicator: print each year once, when it is first encountered
    if valid_date.year != last_year_printed:
        last_year_printed = valid_date.year
        print('year', last_year_printed)

    on_this_date = outlooks['DATE'] == valid_date
    # Snapshot of this date's rows, taken before any re-labelling below
    rows_for_date = outlooks[on_this_date]

    for day, cycle in expected_day_cycles:
        for label in ('categorical', 'probabilistic'):
            if label == 'categorical':
                kind_in_snapshot = rows_for_date['CATEGORY'] == 'CATEGORICAL'
            else:
                kind_in_snapshot = rows_for_date['CATEGORY'] != 'CATEGORICAL'

            already_labelled = (
                (rows_for_date['DAY'] == day)
                & (rows_for_date['CYCLE'] == cycle)
                & kind_in_snapshot
            )
            if already_labelled.any():
                continue

            # Acceptable issue-hour window (inclusive) for this cycle
            first_hour, last_hour = acceptable_time_dict[cycle]

            if label == 'categorical':
                kind_in_full = outlooks['CATEGORY'] == 'CATEGORICAL'
            else:
                kind_in_full = outlooks['CATEGORY'] != 'CATEGORICAL'

            # CYCLE is read from the live frame so rows re-labelled earlier
            # in this loop are no longer candidates.
            candidates = (
                on_this_date
                & (outlooks['DAY'] == day)
                & (outlooks['CYCLE'] == -1)
                & kind_in_full
                & (outlooks['PRODISS'].dt.hour >= first_hour)
                & (outlooks['PRODISS'].dt.hour <= last_hour)
            )
            if not candidates.any():
                continue

            print(f'changing cycle for {label} outlooks on {valid_date} day {day} cycle {cycle}')
            outlooks.loc[candidates, 'CYCLE'] = cycle


In [None]:
# Check that the cycle re-labelling worked: display up to the first 50 day-1
# outlook rows valid on 2009-05-28.
outlooks[(outlooks['DAY'] == 1) & (outlooks['DATE'] == dt_date(2009, 5, 28))][0:50]
# Timestamps kept for reference (YYYYMMDDHHMM strings).
# NOTE(review): presumably the dates that needed this check - confirm.
#['200204190000', '200204200000', '200204210000', '200204250000', '200205060000', '200205250000', '200207310000', '200208130000', '200208300000', '200211090000', '200212230000', '200302030000', '200303250000', '200304140000', '200304150000', '200304160000', '200305100000', '200306250000', '200306280000', '200307270000', '200307280000', '200309030000', '200312280000', '200404020000', '200404140000', '200405230000', '200408090000', '200410140000', '200503300000', '200506060000', '200508030000', '200701040000', '200905280000', '201105210000', '202005240000', '202106130000']

In [None]:
# Identify the dates with a moderate (MDT) risk.
# NOTE(review): identify_dates_above_threshold comes from a star import; from
# its name it presumably returns dates at or above 'MDT' - confirm. The result
# is later used with .tolist() and DataFrame.isin, so it must be array-like.
mod_dates = identify_dates_above_threshold(outlooks, 'MDT')

In [None]:
# Plot number of MDT days by year
years_of_mdt = get_years(mod_dates)
# One bin per calendar year. Histogram bins are half-open except the last,
# which is closed on both edges; if the edges stopped at max(year), the final
# two years would be merged into a single bar. Extending the edges one past
# the last year gives that year its own [max, max + 1] bin.
plt.hist(years_of_mdt, bins=range(min(years_of_mdt), max(years_of_mdt) + 2))

In [None]:
# dataframe containing only outlooks for days in which there was a MDT risk
mdt_outlooks = outlooks[outlooks['DATE'].isin(mod_dates)]

# convert datetimes back to strings before writing to disk
outlooks = revert_all_datetimes(outlooks)
mdt_outlooks = revert_all_datetimes(mdt_outlooks)

# Save the full outlook set as two shapefiles (first half / second half) and
# the MDT subset as a third.
# Both slices share the same split point so that every row lands in exactly
# one file; starting the second slice at split_at + 1 would silently drop the
# row at position split_at.
split_at = len(outlooks) // 2
outlooks.iloc[:split_at].to_file(outlook_save_location + '/all_outlooks_1.shp')
outlooks.iloc[split_at:].to_file(outlook_save_location + '/all_outlooks_2.shp')
mdt_outlooks.to_file(outlook_save_location + '/mdt_outlooks.shp')


# Now read, combine, filter (to mdt), and save PPH data

In [None]:
# Open the PPH file of every hazard type and fold them, one at a time and in
# list order, into a single dataset (pph_data).
hazard_types = ['wind', 'sig_wind', 'hail', 'sig_hail', 'tor', 'sig_tor', 'all_svr', 'all_sig_svr']
pph_data = None
for hazard in hazard_types:
    print('reading in ' + hazard + ' pph')
    hazard_dataset = xr.open_dataset(f'{pph_raw_location}/pper_{hazard}_1979_2023.nc')
    if pph_data is None:
        # The first file seeds the combined dataset
        pph_data = hazard_dataset
    else:
        pph_data = xr.merge([pph_data, hazard_dataset])




In [None]:
# select pph data on days with mdt risk
# Keeps only the time steps whose calendar date (time.dt.date) appears in
# mod_dates.
# NOTE(review): assumes mod_dates holds datetime.date values so the membership
# test matches - confirm.
pph_data_mod = pph_data.sel(time=pph_data.time.dt.date.isin(mod_dates.tolist()))

In [None]:
# save full and moderate pph datasets
# Writes two NetCDF files under pph_save_location: every day (all_pph.nc) and
# only the MDT-risk days (mdt_pph.nc).
pph_data.to_netcdf(pph_save_location + '/all_pph.nc')
pph_data_mod.to_netcdf(pph_save_location + '/mdt_pph.nc')

In [None]:
# Notebook display of the combined PPH dataset
pph_data

# Read in, combine, filter, and save raw storm reports

In [None]:
# Columns passed to filter_reports for the raw storm report files. Each name
# appears exactly once: 'CZ_NAME' was previously listed twice, which would
# select the same column twice.
# NOTE(review): the duplicate sat between 'CZ_TYPE' and 'CZ_NAME'; 'CZ_FIPS'
# may have been intended there - confirm before adding it.
columns = [
    'STATE', 'EVENT_TYPE', 'CZ_TYPE', 'CZ_NAME', 'WFO',
    'BEGIN_DATE_TIME', 'CZ_TIMEZONE', 'END_DATE_TIME',
    'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT', 'DEATHS_INDIRECT',
    'DAMAGE_PROPERTY', 'DAMAGE_CROPS', 'SOURCE', 'MAGNITUDE', 'MAGNITUDE_TYPE',
    'TOR_F_SCALE', 'TOR_LENGTH', 'TOR_WIDTH',
    'TOR_OTHER_WFO', 'TOR_OTHER_CZ_STATE', 'TOR_OTHER_CZ_FIPS', 'TOR_OTHER_CZ_NAME',
    'BEGIN_RANGE', 'BEGIN_AZIMUTH', 'BEGIN_LOCATION',
    'END_RANGE', 'END_AZIMUTH', 'END_LOCATION',
    'BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON',
]
# EVENT_TYPE values passed to filter_reports alongside the column list
event_types = ['Funnel Cloud', 'Hail', 'Marine Hail', 'Marine Thunderstorm Wind', 'Thunderstorm Wind', 'Tornado', 'Waterspout']




In [None]:
# read in reports and combine into all_reports
# Expand '~' once so os.listdir / os.path.join receive a real directory path.
report_raw_location = os.path.expanduser(report_raw_location)

# os.listdir returns names in arbitrary order; sorting makes the row order of
# all_reports (and therefore the index written to CSV later) reproducible
# between runs and machines.
report_files = sorted(
    name for name in os.listdir(report_raw_location)
    if 'StormEvents_details-ftp_v1.0_d' in name
)

# Read each detail file and reduce it with filter_reports using the column and
# event-type lists defined earlier.
all_reports_list = [
    filter_reports(
        pd.read_csv(os.path.join(report_raw_location, name)),
        columns,
        event_types,
    )
    for name in report_files
]

# Combine all at once
all_reports = pd.concat(all_reports_list, ignore_index=True)


: 

In [None]:
# filter all_reports to get all_reports_mdt
# DATE is derived from each report's BEGIN_DATE_TIME via parse_datetime_reports.
# NOTE(review): the helper is defined elsewhere; it presumably returns values
# comparable to the entries of mod_dates - confirm, otherwise isin matches nothing.
all_reports['DATE'] = parse_datetime_reports(all_reports['BEGIN_DATE_TIME'])
# Keep only the reports whose DATE is one of the MDT-risk dates
all_reports_mdt = all_reports[all_reports['DATE'].isin(mod_dates.tolist())]



In [None]:
# save report data
# Writes two CSV files under report_save_location: all filtered reports and the
# MDT-day subset. to_csv is called with defaults, so the index is written too.
all_reports.to_csv(report_save_location + '/all_reports.csv')
all_reports_mdt.to_csv(report_save_location + '/mdt_reports.csv')
