In [None]:
from utils_filter import *
from utils_datetime import *
from utils_geography import *
from dateutil import parser
from dateutil.relativedelta import relativedelta
from collections import Counter
import math
from shapely.vectorized import contains

# Open and Pre-process

In [None]:
# Root directory that holds the outlook shapefiles and the PPH NetCDF file.
# NOTE(review): `gp`, `pd` and `xr` are not imported in the import cell of this
# notebook; presumably they arrive through the `from utils_* import *` lines — confirm.
data_location = 'data'
# outlooks, pph, reports = read_datasets(data_location)
# The outlook polygons are read from two shapefiles and stacked into a single
# GeoDataFrame with a fresh 0..n-1 index, reusing the CRS of the first part.
outlook1 = gp.read_file(f"{data_location}/outlooks/all_outlooks_1.shp", engine="pyogrio")
outlook2 = gp.read_file(f"{data_location}/outlooks/all_outlooks_2.shp", engine="pyogrio")
outlooks = gp.GeoDataFrame(pd.concat([outlook1, outlook2], ignore_index=True), crs=outlook1.crs)
# PPH dataset; its time/x/y coordinates, lat/lon fields and `grid` attribute
# are used further down to build the gridded outlook dataset.
pph = xr.open_dataset(data_location + '/pph/all_pph.nc')

In [None]:
# Offset that is ADDED to a local timestamp to express it in UTC, keyed by the
# first three characters of the timezone label.
# Oddly-cased / permuted keys ('CSt', 'CSC', 'SCT', 'ESt') share the offset of
# the label they resemble — presumably they cover typos in the source data.
# 'UNK' uses the same 5 h offset as 'EST'.
# NOTE(review): 'GST' is given +10 h; if that label denotes Guam time (UTC+10)
# the sign would be inverted — confirm against the data provider's definition.
tz_conversions = {
    label: timedelta(hours=offset_hours)
    for label, offset_hours in (
        ('PST', 8),
        ('MST', 7),
        ('CST', 6),
        ('CSt', 6),
        ('CSC', 6),
        ('SCT', 6),
        ('EST', 5),
        ('ESt', 5),
        ('PDT', 7),
        ('MDT', 6),
        ('CDT', 5),
        ('EDT', 4),
        ('HST', 10),
        ('SST', 11),
        ('GST', 10),
        ('AKS', 9),
        ('AST', 4),
        ('UNK', 5),
        ('GMT', 0),
    )
}

def get_reports_date_strings(date_times, timezones):
    """Return the convective-day label for each report as 'YYYYMMDD0000'.

    Each timestamp string is parsed, shifted by the offset looked up in
    ``tz_conversions`` (keyed by the first three characters of the timezone
    label), and then assigned to a day whose cutoffs are 12-12 UTC: a
    shifted time with hour < 12 is attributed to the previous calendar date.

    Parameters
    ----------
    date_times : iterable of str
        Timestamp strings understood by ``dateutil.parser.parse``.
    timezones : iterable of str
        Timezone labels, paired positionally with ``date_times``.

    Returns
    -------
    list of str
        One 'YYYYMMDD0000' string per input pair; ``[]`` for empty input
        (previously empty input raised ``UnboundLocalError``).

    Raises
    ------
    KeyError
        If a timezone prefix is not present in ``tz_conversions``.
    """
    ret = []
    for raw_time, timezone in zip(date_times, timezones):
        moment = parser.parse(raw_time)
        moment = moment + tz_conversions[timezone[:3]]
        # Hours before 12 belong to the previous 12-12 UTC day.
        if moment.hour < 12:
            moment = moment - timedelta(days=1)
        # Years past 2049 are moved back a century — presumably to undo
        # two-digit years that the parser placed in the wrong century.
        if moment.year > 2049:
            moment = moment - relativedelta(years=100)
        ret.append(moment.strftime("%Y%m%d") + '0000')
    return ret

def get_pph_date_strings(times):
    """Format each element of ``times`` as a 'YYYYMMDD0000' value.

    Parameters
    ----------
    times : iterable
        Elements must expose ``.dt.strftime(fmt).values`` (as the items of
        the datetime coordinate passed in by this notebook do).

    Returns
    -------
    list
        ``element.dt.strftime("%Y%m%d").values + '0000'`` for every element,
        in input order; ``[]`` for empty input (previously empty input
        raised ``UnboundLocalError``).
    """
    return [moment.dt.strftime("%Y%m%d").values + '0000' for moment in times]

In [None]:
#reports['DATE'] = get_reports_date_strings(reports['BEGIN_DATE_TIME'], reports['CZ_TIMEZONE']) 
# Replace the PPH time coordinate with 'YYYYMMDD0000' values.
# NOTE: this cell is not idempotent — get_pph_date_strings relies on `.dt`, so
# re-running it after the coordinate has been converted is expected to fail.
pph['time'] = get_pph_date_strings(pph.time) 
# subset outlooks into only one day 1, two day 2, and one day 3 categorical outlooks 
# day 3: cycle not -1. day 2: cycle not -1. Day 1: cycle 6. Category: categorical. 
#outlooks = outlooks[(((outlooks['DAY'] == 1) & (outlooks['CYCLE'] == 6)) | ((outlooks['DAY'] == 2) & (outlooks['CYCLE'] != -1)) | ((outlooks['DAY'] == 3) & (outlooks['CYCLE'] != -1)))
#        & (outlooks['CATEGORY'] == 'CATEGORICAL')]

# reset indices (drop the old index instead of keeping it as a column)
outlooks = outlooks.reset_index(drop=True)
#reports = reports.drop(columns=['geometry'])

In [None]:
# Display the combined outlooks table as a quick visual check of the load step.
outlooks

In [None]:
# old_outlook_dataset = xr.open_dataset('data/outlooks/grid_outlooks.nc')

# Gridize outlooks

In [None]:
def get_outlooks_subset(outlooks, outlook_type):
    """Return the rows of ``outlooks`` that belong to one named outlook product.

    Rows whose THRESHOLD is 'SIGN' or 'TSTM' are dropped first. The remaining
    rows are then filtered on DAY, CYCLE and CATEGORY according to
    ``outlook_type``.

    Parameters
    ----------
    outlooks : DataFrame-like
        Must provide THRESHOLD, DAY, CYCLE and CATEGORY columns.
    outlook_type : str
        One of 'Day 3', 'Day 2 7', 'Day 2 17', 'Day 1', or a
        'Day 1' / 'Day 2 7' / 'Day 2 17' name suffixed with
        ' Wind', ' Hail' or ' Tornado'.

    Returns
    -------
    The filtered subset of ``outlooks`` (original index preserved).

    Raises
    ------
    Exception
        If ``outlook_type`` is not one of the supported names.
    """
    # (name, DAY, CYCLE, CATEGORY); CYCLE None means "any cycle other than -1".
    selectors = (
        ('Day 3', 3, None, 'ANY SEVERE'),
        ('Day 2 7', 2, 7, 'ANY SEVERE'),
        ('Day 2 17', 2, 17, 'ANY SEVERE'),
        ('Day 1', 1, 6, 'ANY SEVERE'),
        ('Day 1 Wind', 1, 6, 'WIND'),
        ('Day 1 Hail', 1, 6, 'HAIL'),
        ('Day 1 Tornado', 1, 6, 'TORNADO'),
        ('Day 2 7 Wind', 2, 7, 'WIND'),
        ('Day 2 17 Wind', 2, 17, 'WIND'),
        ('Day 2 7 Hail', 2, 7, 'HAIL'),
        ('Day 2 17 Hail', 2, 17, 'HAIL'),
        ('Day 2 7 Tornado', 2, 7, 'TORNADO'),
        ('Day 2 17 Tornado', 2, 17, 'TORNADO'),
    )

    # The threshold filter runs before the name is validated, as before.
    threshold = outlooks['THRESHOLD']
    candidates = outlooks[(threshold != 'SIGN') & (threshold != 'TSTM')]

    for name, day, cycle, category in selectors:
        if outlook_type == name:
            day_matches = candidates['DAY'] == day
            if cycle is None:
                cycle_matches = candidates['CYCLE'] != -1
            else:
                cycle_matches = candidates['CYCLE'] == cycle
            category_matches = candidates['CATEGORY'] == category
            return candidates[day_matches & cycle_matches & category_matches]

    raise Exception("Invalid outlook_type given to get_outlooks_subset")

In [None]:
# as in mca: create_gridded_outlook_dataset. But will need to work overall, not just by hazard. day 3 is overall probability, days 1 and 2 by hazard
# day 3 use ANY SEVERE, day 2 use ANY SEVERE, day 1 construct from each hazard (assuming independence? or use highest prob, which basically assumes complete dependence? Is this what is done for day 2?)
# Things are entirely categorical before 2002, so even more reason to make that the cutoff.

# Names of every gridded product; each must be accepted by get_outlooks_subset.
outlook_types = [
    'Day 3', 'Day 2 7', 'Day 2 17',
    'Day 2 7 Wind', 'Day 2 7 Hail', 'Day 2 7 Tornado',
    'Day 2 17 Wind', 'Day 2 17 Hail', 'Day 2 17 Tornado',
    'Day 1 Wind', 'Day 1 Hail', 'Day 1 Tornado',
    'Day 1',
]

# Empty container that shares the PPH grid (lat/lon fields, x/y/time coordinates)
# and adds an `outlook` coordinate with one entry per product.
outlook_dataset = xr.Dataset(
    data_vars={
        'lat': (['y', 'x'], pph['lat'].data),
        'lon': (['y', 'x'], pph['lon'].data),
    },
    coords={
        'time': (['time'], pph['time'].data),
        'x': (['x'], pph['x'].data),
        'y': (['y'], pph['y'].data),
        'outlook': (['outlook'], outlook_types),
    },
    attrs={
        'description': "outlook as a percentage as a function of date, lat/lon, and which hazard type",
        'grid': pph.grid,
    },
)

# `prob` starts as all zeros with dims (time, y, x, outlook); later cells fill it in.
outlook_dataset = outlook_dataset.assign(
    prob=(
        ('time', 'y', 'x', 'outlook'),
        np.zeros(
            tuple(len(outlook_dataset[dim]) for dim in ('time', 'y', 'x'))
            + (len(outlook_types),)
        ),
    )
)





In [None]:
# adding an additional year
#outlook_dataset = old_outlook_dataset.reindex(time=pph['time'], fill_value=0)
#del old_outlook_dataset
#outlook_dataset
# just full of zeros in new year, fill below

In [None]:
# Time steps to rasterize. All of them are selected; the commented-out suffix
# shows how to restrict the run to labels starting with a given year.
todo_times = outlook_dataset['time']#[outlook_dataset['time'].str.startswith('2023')]

In [None]:
# NetCDF file the gridded outlooks are written to (and later re-read from).
save_location = 'data/outlooks/grid_outlooks.nc'

In [None]:
def rasterize_polygons(polygons, thresholds, lat_grid, lon_grid):
    """Burn polygon values onto a grid, first polygon wins.

    Parameters
    ----------
    polygons : iterable
        Geometries accepted by ``shapely.vectorized.contains``.
    thresholds : iterable
        Value written for the polygon at the same position.
    lat_grid, lon_grid : arrays of identical shape
        Coordinates of the grid cells; longitude is passed as x and
        latitude as y to ``contains``.

    Returns
    -------
    numpy.ndarray
        Array shaped like ``lat_grid``. Cells covered by no polygon stay 0.
        A cell keeps the value of the first polygon that covered it, because
        only cells still equal to 0 are written.
    """
    raster = np.zeros(lat_grid.shape)
    for polygon, value in zip(polygons, thresholds):
        inside = contains(polygon, lon_grid, lat_grid)
        still_empty = raster == 0
        # Never overwrite a cell that an earlier polygon already claimed.
        raster[inside & still_empty] = value
    return raster

# Extract static lat/lon once
lat_grid = outlook_dataset.lat.values  # shape (y, x)
lon_grid = outlook_dataset.lon.values

# Loop over each outlook type: rasterize every date in `todo_times` and store
# the result in outlook_dataset['prob'] for that outlook type.
for outlook_type in outlook_types:
    print(outlook_type)
    outlooks_subset = get_outlooks_subset(outlooks, outlook_type)
    # Map each DATE value to the rows issued for it, for O(1) lookup per time step.
    grouped_outlooks = dict(tuple(outlooks_subset.groupby('DATE')))

    # One (y, x) raster per time step; dates without polygons stay all-zero.
    array = np.zeros((len(todo_times), lat_grid.shape[0], lat_grid.shape[1]))

    # Only referenced by the commented-out progress printing below.
    oldyear = None
    for i, date in enumerate(todo_times.values):
        #year = date[:4]
        #if year != oldyear:
        #    print(year)
        #    oldyear = year

        outlooks_date = grouped_outlooks.get(date)
        if outlooks_date is not None and len(outlooks_date) > 0:
            # Highest THRESHOLD first: rasterize_polygons only fills cells that
            # are still 0, so the first polygon covering a cell wins.
            # NOTE(review): get_outlooks_subset compares THRESHOLD with strings,
            # so this sort may be lexicographic — confirm the labels sort in
            # the same order as their numeric values.
            outlooks_date = outlooks_date.sort_values(by='THRESHOLD', ascending=False)
            polygons = outlooks_date['geometry'].tolist()
            thresholds = outlooks_date['THRESHOLD'].tolist()

            array[i] = rasterize_polygons(polygons, thresholds, lat_grid, lon_grid)

    # Convert to DataArray and assign into dataset
    me = xr.DataArray(array, coords={
        'time': todo_times,
        'y': outlook_dataset.y.values,
        'x': outlook_dataset.x.values
    }, dims=['time', 'y', 'x'])

    outlook_dataset['prob'].loc[dict(outlook=outlook_type, time=todo_times)] = me

    # Save after each outlook type (optional); the whole dataset is rewritten
    # every iteration, so progress survives an interrupted run.
    outlook_dataset.to_netcdf(save_location)

In [None]:
# Write the gridded outlooks to the shared output path. `save_location` is used
# instead of repeating the literal so this write and the later reload cannot
# drift apart.
outlook_dataset.to_netcdf(save_location)

In [None]:
# Sanity check: mean of `prob` over every time step, grid cell and outlook type.
outlook_dataset['prob'].mean()

In [None]:
# Reload the saved grid fully into memory. `xr.load_dataset` reads the data and
# closes the file, whereas `xr.open_dataset` is lazy and keeps the file open;
# a later cell writes back to this same path, which can fail while the file
# is still held open.
outlook_dataset = xr.load_dataset(save_location)
outlook_dataset

In [None]:
# Spot check: highest 'Day 1' value anywhere on the grid for the time label '202304060000'.
outlook_dataset.sel(outlook = 'Day 1', time = '202304060000').prob.max(dim = ['x', 'y'])

: 

In [None]:
# build day 1 and Feb 1 2020- day 2 probabilities: max (assuming total dependence) seems right, set to True
# dependent = True  -> combined value is the cell-wise maximum of the Wind/Hail/Tornado grids
# dependent = False -> hazards treated as independent: 1 - (1 - wind) * (1 - hail) * (1 - tornado)
dependent = True

for outlook in ['Day 1', 'Day 2 7', 'Day 2 17']:
    print(outlook)
    hazard_names = [outlook + ' Wind', outlook + ' Hail', outlook + ' Tornado']
    year = ''
    for time in outlook_dataset['time']:
        time = str(time.values)
        # Print each year once as a progress indicator.
        newyear = time[0:4]
        if newyear != year:
            print(newyear)
            year = newyear
        # (y, x, outlook) slice holding the three per-hazard grids for this time step.
        hazards = outlook_dataset['prob'].sel(time = time, outlook = hazard_names)
        if dependent:
            m = hazards.max(dim = 'outlook')
        else:
            # Kept as a DataArray so the `.values` check below works in both
            # branches (the previous NumPy result had no `.values` attribute
            # and raised AttributeError).
            m = 1 - (1 - hazards).prod(dim = 'outlook')
        # Only overwrite when some hazard grid is non-zero, so an existing
        # combined value is kept on dates that have no per-hazard data.
        if m.values.max() > 0:
            outlook_dataset['prob'].loc[dict(time = time, outlook = outlook)] = m

In [None]:
# Persist the combined probabilities. `save_location` is used instead of
# repeating the literal so this write targets the same file the dataset was
# loaded from.
outlook_dataset.to_netcdf(save_location)

# Plot

In [None]:
# check that max was a reasonable way to combine hazard probabilities
# Each curve is the grid-wide daily maximum probability, smoothed with a
# 365-sample moving average.
# NOTE(review): the slice start 8489 is presumably the index of March 30, 2002
# on the time axis (per the x-label) — confirm.
running_mean_kernel = np.ones(365)/365
for plotted_outlook in ['Day 1', 'Day 2 17', 'Day 2 7', 'Day 3']:
    daily_max = outlook_dataset['prob'].sel(outlook = plotted_outlook).max(dim=['x', 'y']).data[8489:]
    plt.plot(np.convolve(daily_max, running_mean_kernel, 'same'))
plt.legend(['Day 1', 'Day 2 17z', 'Day 2 7z', 'Day 3'])

# Title and output file depend on how the hazards were combined.
if dependent:
    plot_title = 'Daily Maximum Any-Hazard Probability Over Time'
    plot_path = 'plots/prob_over_time.png'
else:
    plot_title = 'Daily Maximum Any-Hazard Probability Over Time (assuming independent hazards)'
    plot_path = 'plots/prob_over_time_indep.png'

plt.title(plot_title)
plt.ylabel('1-year Running Mean Daily Maximum Any-Hazard Probability')
plt.xlabel('Days Since March 30, 2002')
plt.savefig(plot_path)


# This is kind of an interesting result on its own. Forecast practices have changed... higher risks issued less liberally