In [1]:
from utils_filter import *
from utils_datetime import *
from utils_geography import *
from dateutil import parser
from dateutil.relativedelta import relativedelta
import math


# TODO: 201804020000's day 3 forecasts are issued on 04/30--in reality, these forecasts were probably meant for 05/02. todo is clean for this sort of thing. Potentially in cleaning file

Subsets outlooks, pph, and report data in various ways (add label to each datapoint for each method of subsetting so can be pulled out of full dataset later)


In [2]:
# read in data
data_location = 'data'
moderate = False # only consider moderate days
labelled = True # start with already labelled data
outlooks, pph, reports = read_datasets(data_location, moderate, labelled)

reading outlooks
reading pph
reading storm reports


In [3]:
tz_conversions = {'PST': timedelta(hours=8),
                  'MST': timedelta(hours=7),
                  'CST': timedelta(hours=6),
                  'CSt': timedelta(hours=6),
                  'CSC': timedelta(hours=6),
                  'SCT': timedelta(hours=6),
                  'EST': timedelta(hours=5),
                  'ESt': timedelta(hours=5),
                  'PDT': timedelta(hours=7),
                  'MDT': timedelta(hours=6),
                  'CDT': timedelta(hours=5),
                  'EDT': timedelta(hours=4),
                  'HST': timedelta(hours=10),
                  'SST': timedelta(hours=11),
                  'GST': timedelta(hours=10),
                  'AKS': timedelta(hours=9),
                  'AST': timedelta(hours=4),
                  'UNK': timedelta(hours=5),
                  'GMT': timedelta(0)}

def get_reports_date_strings(date_times, timezones):
    # returns list of strings of date of given datetime and timezone (where day cutoffs are 12-12 UTC) formatted as 'YYYYMMDD0000'
    for datetime, timezone, i in zip(date_times, timezones, range(len(timezones))):
        #print(datetime + ' ' + timezone[:3])
        datetime = parser.parse(datetime)
        datetime = datetime + tz_conversions[timezone[:3]]
        #print(datetime)
        if (datetime.hour < 12):
            datetime = datetime - timedelta(days = 1)
        if datetime.year > 2049:
            datetime = datetime - relativedelta(years = 100)
        datetime = datetime.strftime("%Y%m%d") + '0000'
        if i == 0:
            ret = [datetime]
        else:
            ret.append(datetime)
    return ret

def get_pph_date_strings(times):
    # returns a list of strings of given dates formatted as 'YYYYMMDD0000'
    for datetime, i in zip(times, range(len(times))):
        string = datetime.dt.strftime("%Y%m%d").values + '0000'
        if i == 0:
            ret = [string]
        else:
            ret.append(string)
    return ret

In [4]:
# add dates to reports and pph in same format as in outlooks
if labelled == False:
    reports['DATE'] = get_reports_date_strings(reports['BEGIN_DATE_TIME'], reports['CZ_TIMEZONE']) 
    pph['time'] = get_pph_date_strings(pph.time) 
    # subset outlooks into only one day 1, two day 2, and one day 3 categorical outlooks 
    # day 3: cycle not -1. day 2: cycle not -1. Day 1: cycle 6. Category: categorical or none.
    outlooks = outlooks[(((outlooks['DAY'] == 1) & (outlooks['CYCLE'] == 6)) | ((outlooks['DAY'] == 2) & (outlooks['CYCLE'] != -1)) | ((outlooks['DAY'] == 3) & (outlooks['CYCLE'] != -1)))
            & (outlooks['CATEGORY'] != 'WIND') & (outlooks['CATEGORY'] != 'HAIL') & (outlooks['CATEGORY'] != 'TORNADO')& (outlooks['CATEGORY'] != 'ANY SEVERE')& (outlooks['CATEGORY'] != 'PROB')]

    # reset incicies
    outlooks = outlooks.reset_index(drop=True)

In [5]:
def add_outlooks_label(outlooks, label_dates, labels, label_name, none_label):
    # adds new column with values from labels on the corresponding list of dates in label_dates (DONE)
    print("adding a new column in outlooks")
    outlooks[label_name] = none_label
    for label, dates in zip(labels, label_dates):
        #print(label)
        outlooks[label_name].loc[outlooks['DATE'].isin(dates)] = label
    return outlooks

def add_pph_label(pph, label_dates, labels, label_name, none_label):
    # adds new variable with values from labels on the corresponding list of dates in label_dates (DONE)
    print("adding new variable in pph")
    pph[label_name] = (('time'), np.full(len(pph['time']), none_label))
    for label, dates in zip(labels, label_dates):
        #print(label) # TODO: may only be adding first 4 letters of label
        pph[label_name].loc[pph['time'].isin(dates)] = label  
    return pph

def add_reports_label(reports, label_dates, labels, label_name, none_label):
    # adds new column with values from labels on the corresponding list of dates in label_dates (DONE)
    reports[label_name] = none_label
    print("adding a new column in reports")
    for label, dates in zip(labels, label_dates):
       #print(label)
       reports[label_name].loc[reports['DATE'].isin(dates)] = label #
    return reports

def add_labels(outlooks, pph, reports, label_dates, labels, label_name, none_label):
    # adds labels, overwriting with later ones if a date has multiple labels
    return(add_outlooks_label(outlooks, label_dates, labels, label_name, none_label), 
           add_pph_label(pph, label_dates, labels, label_name, none_label),
           add_reports_label(reports, label_dates, labels, label_name, none_label))



In [6]:
new_outlooks = outlooks
new_pph = pph
new_reports = reports.drop(columns=['geometry'])

## ALWAYS RUN THROUGH HERE. THEN TO ADD MORE LABELS, RUN JUST THE LABELLING YOU WISH TO BELOW

# Subset by threshold

In [10]:
# add max threshold forecasted for valid day to each datapoint

categories = ['TSTM', 'MRGL', 'SLGT', 'ENH', 'MDT', 'HIGH']
category_dates = []
for category in categories:
    category_dates.append(identify_dates_above_threshold(outlooks, category))

(new_outlooks, new_pph, new_reports) = add_labels(new_outlooks, new_pph, new_reports, category_dates, categories, 'MAX_CAT', 'NONE')



adding a new column in outlooks
adding new variable in pph


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outlooks[label_name].loc[outlooks['DATE'].isin(dates)] = label


adding a new column in reports


# Subset by ramp up/down amount

In [7]:
# put new_outlooks in correct order

new_outlooks['DATE_ORDER'] = 0
for index, row in new_outlooks.iterrows():
    if row['DAY'] == 3:
        new_outlooks.at[index, 'DATE_ORDER'] = row['DATE'] + '1'
    elif row['DAY'] == 1:
        new_outlooks.at[index, 'DATE_ORDER'] = row['DATE'] + '4'
    elif row['CYCLE'] == 7:
        new_outlooks.at[index, 'DATE_ORDER'] = row['DATE'] + '2'
    else:
        new_outlooks.at[index, 'DATE_ORDER'] = row['DATE'] + '3'
new_outlooks = new_outlooks.sort_values('DATE_ORDER')



In [8]:
# define and add ramp category for each datapoint. Potentially add 2 binary ramp up and ramp down (4 options are [up, down, up and down, niether]). How many forecasts to consider for each day? The day 3, both day 2, and the first day 1 (so 4 forecasts ramp)
# dictionary of category to number
category_dict = {
    None : -1,
    'NONE' : -1,
    'TSTM': 0,
    'MRGL': 1,
    'SLGT': 2,
    'ENH': 3,
    'MDT': 4,
    'HIGH': 5
}

(ramp_ups, ramp_downs, ramp_categories) = create_ramp_lists(new_outlooks, category_dict)

In [9]:
# ramp up
(new_outlooks, new_pph, new_reports) = add_labels(new_outlooks, new_pph, new_reports, list(ramp_ups.values()), ramp_ups.keys(), 'RAMP_UP', 'NONE')

# ramp down
(new_outlooks, new_pph, new_reports) = add_labels(new_outlooks, new_pph, new_reports, list(ramp_downs.values()), ramp_downs.keys(), 'RAMP_DOWN', 'NONE')

# ramp categories
(new_outlooks, new_pph, new_reports) = add_labels(new_outlooks, new_pph, new_reports, list(ramp_categories.values()), ramp_categories.keys(), 'RAMP_CATEGORIES', 'NONE')


adding a new column in outlooks
adding new variable in pph


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outlooks[label_name].loc[outlooks['DATE'].isin(dates)] = label


adding a new column in reports
adding a new column in outlooks
adding new variable in pph
adding a new column in reports
adding a new column in outlooks
adding new variable in pph


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outlooks[label_name].loc[outlooks['DATE'].isin(dates)] = label


adding a new column in reports


# Subset by accurate/inaccurate

Consider forecast issue time for day 1--subset reports (and revise pph?)--look at other studies to see how they handled multiple day 1 outlooks (see the one in slack). Starting point should be use first forecasts, later ones would be a different analysis for another time. Decide if we should denote these somehow..

# Subset by season

In [14]:
# add column denoting season (4 met seasons as starting point)

seasons = ['Winter', 'Spring', 'Summer', 'Fall']
season_dates = get_season_dates(new_outlooks, seasons)
(new_outlooks, new_pph, new_reports) = add_labels(new_outlooks, new_pph, new_reports, season_dates, seasons, 'SEASON', 'NONE')


adding a new column in outlooks
adding new variable in pph
adding a new column in reports


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outlooks[label_name].loc[outlooks['DATE'].isin(dates)] = label


# Subset by region

In [15]:
# decide how to do... lower priotity, could do by state of center of highest category (or center of PPH?) Could start with super basic version
# same regions as Anderson-Frey 2016
# center is grid square with highest p_perfect prob of at least one hazard (assuming each hazard in independent)

# practically perfect probability of at least one hazard occuring near datapoint
new_pph['p_perfect_total'] = (1 - (1-new_pph['p_perfect_wind']/100)*(1-new_pph['p_perfect_hail']/100)*(1-new_pph['p_perfect_tor']/100))*100


In [75]:
# collect regions in chunks (since doing it all at once can time out)
regions = {
        'West': [],
        'Midwest': [],
        'Great Plains': [],
        'Northeast': [],
        'South': [],
        'NONE': []
    }


In [76]:
chunks = 10
time_array = pph['time']
chunk_size = math.ceil(len(time_array)/chunks)
time_arrays = [time_array[i:i + chunk_size] for i in range(0, len(time_array), chunk_size)]
for i in range(0, chunks):
    chunk_regions = create_regions(pph.sel(time = time_arrays[i]))
    for region in regions:
        regions[region] += chunk_regions[region]
    print('Added chunk ' + str(i) + ' to regions')

Finding regions for 1979
Finding regions for 1980
Finding regions for 1981
Finding regions for 1982
Finding regions for 1983
Added chunk 0 to regions
Finding regions for 1983
Finding regions for 1984
Finding regions for 1985
Finding regions for 1986
Finding regions for 1987
Added chunk 1 to regions
Finding regions for 1987
Finding regions for 1988
Finding regions for 1989
Finding regions for 1990
Finding regions for 1991
Added chunk 2 to regions
Finding regions for 1991
Finding regions for 1992
Finding regions for 1993
Finding regions for 1994
Finding regions for 1995
Added chunk 3 to regions
Finding regions for 1995
Finding regions for 1996
Finding regions for 1997
Finding regions for 1998
Finding regions for 1999
Added chunk 4 to regions
Finding regions for 1999
Finding regions for 2000
Finding regions for 2001
Finding regions for 2002
Finding regions for 2003
Added chunk 5 to regions
Finding regions for 2003
Finding regions for 2004
Finding regions for 2005
Finding regions for 2006


{'West': ['197901310000',
  '197903140000',
  '197903280000',
  '197905230000',
  '197906130000',
  '197906170000',
  '197906180000',
  '197906300000',
  '197907050000',
  '197907170000',
  '197907190000',
  '197907300000',
  '197907310000',
  '197908130000',
  '197908140000',
  '197908150000',
  '197908300000',
  '197909080000',
  '197909220000',
  '197909260000',
  '198001140000',
  '198001180000',
  '198001280000',
  '198002150000',
  '198002190000',
  '198003020000',
  '198003060000',
  '198004050000',
  '198004060000',
  '198004280000',
  '198004300000',
  '198005090000',
  '198005240000',
  '198007250000',
  '198007300000',
  '198008140000',
  '198008150000',
  '198008230000',
  '198008240000',
  '198008270000',
  '198009110000',
  '198010110000',
  '198101220000',
  '198101230000',
  '198102160000',
  '198103190000',
  '198105020000',
  '198105270000',
  '198107060000',
  '198107270000',
  '198108020000',
  '198108080000',
  '198108100000',
  '198108120000',
  '198108160000',
  

In [81]:
(new_outlooks, new_pph, new_reports) = add_labels(new_outlooks, new_pph, new_reports, list(regions.values()), regions.keys(), 'REGION', 'NONE')


adding a new column in outlooks
adding new variable in pph


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outlooks[label_name].loc[outlooks['DATE'].isin(dates)] = label


adding a new column in reports


# Subset by environmental data (to do later)

# ONCE YOU ADD THE LABELS YOU WANT, SAVE DATA

In [17]:
# resave labelled data
outlook_save_location = 'data/outlooks'
pph_save_location = 'data/pph'
report_save_location = 'data/storm_reports'

new_outlooks.to_file(outlook_save_location + '/labelled_outlooks.shp')
new_pph.to_netcdf(pph_save_location + '/labelled_pph.nc') # This file saved weirdly, but seems to be ok on rerun.
new_reports.to_csv(report_save_location + '/labelled_reports.csv')

In [10]:
# TODO: create master function that subsets data based on easy inputs once it's all been labelled. Put in a util file
outlooks

Unnamed: 0,ISSUE,EXPIRE,PRODISS,TYPE,DAY,THRESHOLD,CATEGORY,CYCLE,DATE,MAX_CAT,DATE_ORDER,RAMP_UP,RAMP_DOWN,SEASON,REGION,geometry
0,198701091200,198701101200,198701080719,C,2,,,7,198701090000,NONE,1987010900002,0,0,Winter,NONE,
1,198701091200,198701101200,198701081731,C,2,,,17,198701090000,NONE,1987010900003,0,0,Winter,NONE,
2,198701161200,198701171200,198701160615,C,1,TSTM,CATEGORICAL,6,198701160000,TSTM,1987011600004,0,0,Winter,NONE,"LINESTRING (-81.10300 30.60200, -81.08700 30.5..."
3,198701191200,198701201200,198701180703,C,2,,,7,198701190000,NONE,1987011900002,0,0,Winter,NONE,
4,198701221200,198701231200,198701210800,C,2,,,7,198701220000,NONE,1987012200002,0,0,Winter,South,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73061,202312311200,202401011200,202312310520,C,1,,,6,202312310000,TSTM,2023123100004,0,-1,Winter,NONE,
73062,202401011200,202401021200,202312300816,C,3,,,8,202401010000,NONE,2024010100001,0,0,Winter,NONE,
73063,202401011200,202401021200,202312310600,C,2,,,7,202401010000,NONE,2024010100002,0,0,Winter,NONE,
73064,202401011200,202401021200,202312311712,C,2,,,17,202401010000,NONE,2024010100003,0,0,Winter,NONE,
