In [1]:
from utils_filter import *
from utils_datetime import *
from utils_geography import *
from dateutil import parser
from dateutil.relativedelta import relativedelta
import math


# TODO: 201804020000's day 3 forecasts are issued on 04/30--in reality, these forecasts were probably meant for 05/02. todo is clean for this sort of thing. Potentially in cleaning file

Subsets outlooks, pph, and report data in various ways (add label to each datapoint for each method of subsetting so can be pulled out of full dataset later)


In [2]:
# read in data
data_location = 'data'
labelled = True # if starting with already labelled data
outlooks, pph, reports = read_datasets(data_location, 'labelled')

reading outlooks
reading pph
reading storm reports


In [3]:
tz_conversions = {'PST': timedelta(hours=8),
                  'MST': timedelta(hours=7),
                  'CST': timedelta(hours=6),
                  'CSt': timedelta(hours=6),
                  'CSC': timedelta(hours=6),
                  'SCT': timedelta(hours=6),
                  'EST': timedelta(hours=5),
                  'ESt': timedelta(hours=5),
                  'PDT': timedelta(hours=7),
                  'MDT': timedelta(hours=6),
                  'CDT': timedelta(hours=5),
                  'EDT': timedelta(hours=4),
                  'HST': timedelta(hours=10),
                  'SST': timedelta(hours=11),
                  'GST': timedelta(hours=10),
                  'AKS': timedelta(hours=9),
                  'AST': timedelta(hours=4),
                  'UNK': timedelta(hours=5),
                  'GMT': timedelta(0)}

def get_reports_date_strings(date_times, timezones):
    # returns list of strings of date of given datetime and timezone (where day cutoffs are 12-12 UTC) formatted as 'YYYYMMDD0000'
    for datetime, timezone, i in zip(date_times, timezones, range(len(timezones))):
        #print(datetime + ' ' + timezone[:3])
        datetime = parser.parse(datetime)
        datetime = datetime + tz_conversions[timezone[:3]]
        #print(datetime)
        if (datetime.hour < 12):
            datetime = datetime - timedelta(days = 1)
        if datetime.year > 2049:
            datetime = datetime - relativedelta(years = 100)
        datetime = datetime.strftime("%Y%m%d") + '0000'
        if i == 0:
            ret = [datetime]
        else:
            ret.append(datetime)
    return ret

def get_pph_date_strings(times):
    # returns a list of strings of given dates formatted as 'YYYYMMDD0000'
    for datetime, i in zip(times, range(len(times))):
        string = datetime.dt.strftime("%Y%m%d").values + '0000'
        if i == 0:
            ret = [string]
        else:
            ret.append(string)
    return ret

In [4]:
# add dates to reports and pph in same format as in outlooks
if labelled == False:
    reports['DATE'] = get_reports_date_strings(reports['BEGIN_DATE_TIME'], reports['CZ_TIMEZONE']) 
    pph['time'] = get_pph_date_strings(pph.time) 
    # subset outlooks into only one day 1, two day 2, and one day 3 categorical outlooks 
    # day 3: cycle not -1. day 2: cycle not -1. Day 1: cycle 6. Category: categorical or none.
    outlooks = outlooks[(((outlooks['DAY'] == 1) & (outlooks['CYCLE'] == 6)) | ((outlooks['DAY'] == 2) & (outlooks['CYCLE'] != -1)) | ((outlooks['DAY'] == 3) & (outlooks['CYCLE'] != -1)))
            & (outlooks['CATEGORY'] != 'WIND') & (outlooks['CATEGORY'] != 'HAIL') & (outlooks['CATEGORY'] != 'TORNADO')& (outlooks['CATEGORY'] != 'ANY SEVERE')& (outlooks['CATEGORY'] != 'PROB')]

    # reset incicies
    outlooks = outlooks.reset_index(drop=True)

In [5]:
def add_outlooks_label(outlooks, label_dates, labels, label_name, none_label):
    # adds new column with values from labels on the corresponding list of dates in label_dates (DONE)
    print("adding a new column in outlooks")
    outlooks[label_name] = none_label
    for label, dates in zip(labels, label_dates):
        #print(label)
        outlooks[label_name].loc[outlooks['DATE'].isin(dates)] = label
    return outlooks

def add_pph_label(pph, label_dates, labels, label_name, none_label):
    # adds new variable with values from labels on the corresponding list of dates in label_dates (DONE)
    print("adding new variable in pph")
    pph[label_name] = (('time'), np.full(len(pph['time']), none_label))
    for label, dates in zip(labels, label_dates):
        #print(label) # TODO: may only be adding first 4 letters of label
        pph[label_name].loc[pph['time'].isin(dates)] = label  
    return pph

def add_reports_label(reports, label_dates, labels, label_name, none_label):
    # adds new column with values from labels on the corresponding list of dates in label_dates (DONE)
    reports[label_name] = none_label
    print("adding a new column in reports")
    for label, dates in zip(labels, label_dates):
       #print(label)
       reports[label_name].loc[reports['DATE'].isin(dates)] = label #
    return reports

def add_labels(outlooks, pph, reports, label_dates, labels, label_name, none_label):
    # adds labels, overwriting with later ones if a date has multiple labels
    return(add_outlooks_label(outlooks, label_dates, labels, label_name, none_label), 
           add_pph_label(pph, label_dates, labels, label_name, none_label),
           add_reports_label(reports, label_dates, labels, label_name, none_label))



In [6]:
new_outlooks = outlooks
new_pph = pph
new_reports = reports.drop(columns=['geometry'])

## ALWAYS RUN THROUGH HERE. THEN TO ADD MORE LABELS, RUN JUST THE LABELLING YOU WISH TO BELOW

# Subset by threshold

In [None]:
# add max threshold forecasted for valid day to each datapoint

categories = ['TSTM', 'MRGL', 'SLGT', 'ENH', 'MDT', 'HIGH']
category_dates = []
for category in categories:
    category_dates.append(identify_dates_above_threshold(new_outlooks, category))

(new_outlooks, new_pph, new_reports) = add_labels(new_outlooks, new_pph, new_reports, category_dates, categories, 'MAX_CAT', 'NONE')



# Subset by ramp up/down amount

In [None]:
# put new_outlooks in correct order

# TODO: This isn't generalized. Forecast practices have changed: day 3 didn't come in until 2002. 
# Also seems like some sorts of forecasts didn't come in until '95 or '97 and that's messing with things. 
# A lot of date order 2 before then is 'none'. And some thresholds not present before then either. 
# Needs investigating (seeing when differen thresholds and date orders first happen)

new_outlooks['DATE_ORDER'] = 0
for index, row in new_outlooks.iterrows():
    if row['DAY'] == 3:
        new_outlooks.at[index, 'DATE_ORDER'] = row['DATE'] + '1'
    elif row['DAY'] == 1:
        new_outlooks.at[index, 'DATE_ORDER'] = row['DATE'] + '4'
    elif row['CYCLE'] == 7:
        new_outlooks.at[index, 'DATE_ORDER'] = row['DATE'] + '2'
    else:
        new_outlooks.at[index, 'DATE_ORDER'] = row['DATE'] + '3'
new_outlooks = new_outlooks.sort_values('DATE_ORDER')



In [None]:
def get_day_3_cutoff(outlooks):
    # returns list of dates in outlooks 
    return outlooks[outlooks['DAY'] == 3]['DATE'].min()

In [None]:
def create_ramp_lists(outlooks, category_dict):

    first_day_3 = get_day_3_cutoff(outlooks)

    ramp_ups = {
        0: [],
        1: [],
        2: [],
        3: [],
        4: [],
        5: [],
        6: []
    }

    ramp_downs = {
        0: [],
        -1: [],
        -2: [],
        -3: [],
        -4: [],
        -5: [],
        -6: []
    }

    ramp_categories = {
        'up': [],
        'down': [],
        'both': [],
        'neither': []
    }

    old_date = '0'
    old_do = '0'
    first = True

    for index, row in outlooks.iterrows(): #iterrating through each polygon in the outlook dataset
        cat = category_dict[row['THRESHOLD']]
        do = row['DATE_ORDER']
        date = row['DATE']

        if date != old_date: # New date, save ramp up and ramp down and save alongside old date, then reset ramps, max and min categories seen, and do threshold
            
            
            if first == True:
                first = False
            else:

                if max_cat_do - min_cat_date > ramp_up:
                    ramp_up = max_cat_do - min_cat_date
                if max_cat_do - max_cat_date < ramp_down:
                    ramp_down = max_cat_do - max_cat_date

                if max_cat_do > max_cat_date:
                    max_cat_date = max_cat_do
                if max_cat_do < min_cat_date:
                    min_cat_date = max_cat_do

                ramp_ups[ramp_up].append(old_date)
                ramp_downs[ramp_down].append(old_date)
                if ramp_up > 0 and ramp_down < 0:
                    ramp_categories['both'].append(old_date)
                elif ramp_up > 0:
                    ramp_categories['up'].append(old_date)
                elif ramp_down < 0:
                    ramp_categories['down'].append(old_date)
                else:
                    ramp_categories['neither'].append(old_date)

            old_date = date
            old_do = do
            
            ramp_down = 0
            ramp_up = 0
            max_cat_date = -1
            
            # TODO: consider other changes to forecast practices
            if do[-1] == '1' and date > first_day_3:
                min_cat_date = 5 
            elif do[-1] == '2': 
                min_cat_date = 5
            else: # First outlook for this date is not day 3, so day 3 had no outlook.
                min_cat_date = -1


            max_cat_do = cat


        elif do != old_do: # new outlook, update min and max categories seen, ramp value
            if max_cat_do - min_cat_date > ramp_up:
                ramp_up = max_cat_do - min_cat_date
            if max_cat_do - max_cat_date < ramp_down:
                ramp_down = max_cat_do - max_cat_date

            if max_cat_do > max_cat_date:
                max_cat_date = max_cat_do
            if max_cat_do < min_cat_date:
                min_cat_date = max_cat_do

            old_do = do

            max_cat_do = cat

        else: # Just another threshold within the same polygon
            if cat > max_cat_do:
                max_cat_do = cat
            
        
    # for last iteration
    ramp_ups[ramp_up].append(old_date)
    ramp_downs[ramp_down].append(old_date)
    if ramp_up > 0 and ramp_down < 0:
        ramp_categories['both'].append(old_date)
    elif ramp_up > 0:
        ramp_categories['up'].append(old_date)
    elif ramp_down < 0:
        ramp_categories['down'].append(old_date)
    else:
        ramp_categories['neither'].append(old_date)

    return(ramp_ups, ramp_downs, ramp_categories)


In [None]:
# define and add ramp category for each datapoint. Potentially add 2 binary ramp up and ramp down (4 options are [up, down, up and down, niether]). How many forecasts to consider for each day? The day 3, both day 2, and the first day 1 (so 4 forecasts ramp)
# dictionary of category to number
category_dict = {
    None : -1,
    'NONE' : -1,
    'TSTM': 0,
    'MRGL': 1,
    'SLGT': 2,
    'ENH': 3,
    'MDT': 4,
    'HIGH': 5
}

(ramp_ups, ramp_downs, ramp_categories) = create_ramp_lists(new_outlooks, category_dict)

In [7]:
new_outlooks

Unnamed: 0,ISSUE,EXPIRE,PRODISS,TYPE,DAY,THRESHOLD,CATEGORY,CYCLE,DATE,MAX_CAT,DATE_ORDER,RAMP_UP,RAMP_DOWN,RAMP_CATEG,SEASON,REGION,geometry
0,198701091200,198701101200,198701080719,C,2,,,7,198701090000,NONE,1987010900002,0,0,neither,Winter,NONE,
1,198701091200,198701101200,198701081731,C,2,,,17,198701090000,NONE,1987010900003,0,0,neither,Winter,NONE,
2,198701161200,198701171200,198701160615,C,1,TSTM,CATEGORICAL,6,198701160000,TSTM,1987011600004,0,0,neither,Winter,NONE,"LINESTRING (-81.10300 30.60200, -81.08700 30.5..."
3,198701191200,198701201200,198701180703,C,2,,,7,198701190000,NONE,1987011900002,0,0,neither,Winter,NONE,
4,198701221200,198701231200,198701210800,C,2,,,7,198701220000,NONE,1987012200002,0,0,neither,Winter,South,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73061,202312311200,202401011200,202312310520,C,1,,,6,202312310000,TSTM,2023123100004,0,-1,down,xxxxxx,NONE,
73062,202401011200,202401021200,202312300816,C,3,,,8,202401010000,NONE,2024010100001,0,0,neither,xxxxxx,NONE,
73063,202401011200,202401021200,202312310600,C,2,,,7,202401010000,NONE,2024010100002,0,0,neither,xxxxxx,NONE,
73064,202401011200,202401021200,202312311712,C,2,,,17,202401010000,NONE,2024010100003,0,0,neither,xxxxxx,NONE,


In [None]:

# ramp up
(new_outlooks, new_pph, new_reports) = add_labels(new_outlooks, new_pph, new_reports, list(ramp_ups.values()), ramp_ups.keys(), 'RAMP_UP', 0)

# ramp down
(new_outlooks, new_pph, new_reports) = add_labels(new_outlooks, new_pph, new_reports, list(ramp_downs.values()), ramp_downs.keys(), 'RAMP_DOWN', 0)

# ramp categories
(new_outlooks, new_pph, new_reports) = add_labels(new_outlooks, new_pph, new_reports, list(ramp_categories.values()), ramp_categories.keys(), 'RAMP_CATEGORIES', 'neither')


# Subset by accurate/inaccurate

Consider forecast issue time for day 1--subset reports (and revise pph?)--look at other studies to see how they handled multiple day 1 outlooks (see the one in slack). Starting point should be use first forecasts, later ones would be a different analysis for another time. Decide if we should denote these somehow..

# Subset by season

In [None]:
def get_season_dates(pph):
    dates = list(set(pph['time'].values))
    season_dates = [[], [], [], []]
    for date in dates:
        month = int(date[4:6])
        if month == 12 or month < 3:
            season_dates[0].append(date)
        elif month < 6:
            season_dates[1].append(date)
        elif month < 9:
            season_dates[2].append(date)
        else:
            season_dates[3].append(date)
    return season_dates

In [None]:
# add column denoting season (4 met seasons as starting point)

seasons = ['Winter', 'Spring', 'Summer', 'Fall']
season_dates = get_season_dates(new_pph)
(new_outlooks, new_pph, new_reports) = add_labels(new_outlooks, new_pph, new_reports, season_dates, seasons, 'SEASON', 'xxxxxx')


# Subset by region

In [None]:
# decide how to do... lower priotity, could do by state of center of highest category (or center of PPH?) Could start with super basic version
# same regions as Anderson-Frey 2016
# center is grid square with highest p_perfect prob of at least one hazard (assuming each hazard in independent)

# practically perfect probability of at least one hazard occuring near datapoint
new_pph['p_perfect_total'] = (1 - (1-new_pph['p_perfect_wind']/100)*(1-new_pph['p_perfect_hail']/100)*(1-new_pph['p_perfect_tor']/100))*100


In [None]:
# collect regions in chunks (since doing it all at once can time out)
regions = {
        'West': [],
        'Midwest': [],
        'Great Plains': [],
        'Northeast': [],
        'South': [],
        'NONE': []
    }


In [None]:
def get_state(lat, lon, geolocator):
    location = geolocator.reverse(str(lat)+","+str(lon))
    if location == None:
        return None
    address = location.raw['address']
    state = address.get('state', '')
    return state

def get_region(lat, lon, west_threshold_co_nm, regions_dict, geolocator):
    state = get_state(lat, lon, geolocator)
    if state == 'Colorado' or state == 'New Mexico':
        if lon < west_threshold_co_nm:
            return('West')
        else:
            return('Great Plains')
    for region in regions_dict:
        if state in regions_dict[region]:
            return region
    # Cases where highest PPH is out of contiguous states, usually just outside bc nearest gridpoint is on other side of border
    if lat > 38:
        if lon > -80.5:
            return('Northeast')
        elif lon > -104:
            return('Great Plains')
        else:
            return('West')
    else:
        if lon > -93.8:
            return('South')
        elif lon > -106.5:
            return('Great Plains')
        else:
            return('West')
    return('NONE')


def create_regions(pph):
    regions = {
        'West': [],
        'Midwest': [],
        'Great Plains': [],
        'Northeast': [],
        'South': [],
        'NONE': []
    }

    geolocator = Nominatim(user_agent="severe_thunderstorm_miles")
    west_threshold_co_nm = -105
    regions_dict = { # list of states fully within each region (doesn't include AK, HI, CO, NM)
        'West': ['Washington', 'Oregon', 'California', 'Idaho', 'Montana', 'Wyoming', 'Utah', 'Arizona'],
        'Midwest': ['North Dakota', 'South Dakota', 'Minnesota', 'Iowa', 'Wisconsin', 'Illinois', 'Michigan', 'Indiana', 'Ohio', 'Kentucky'],
        'Great Plains': ['Nebraska', 'Kansas', 'Oklahoma', 'Texas', 'Missouri'],
        'Northeast': ['Maine', 'Vermont', 'New Hampshire', 'Massachusetts', 'Rhode Island', 'Connecticut', 'New York', 'Pennsylvania', 'New Jersey', 'Delaware', 'Maryland', 'District of Columbia', 'West Virginia'],
        'South': ['Virginia', 'Arkansas', 'Louisiana', 'Tennessee', 'Mississippi', 'Alabama', 'Georgia', 'North Carolina', 'South Carolina', 'Florida']
    }

    old_year = ''
    for date, date_pph in pph.groupby('time'):
        if date_pph['p_perfect_total'].max() > 0:
            year = date[0:4]
            if year != old_year:
                print("Finding regions for " + year)
                old_year = year
            max_coords = date_pph['p_perfect_total'].argmax(dim = ['x', 'y'])
            max_x_coord = max_coords['x'].values
            max_y_coord = max_coords['y'].values
            lat = date_pph['lat'].loc[dict(x = max_x_coord, y = max_y_coord)].values
            lon = date_pph['lon'].loc[dict(x = max_x_coord, y = max_y_coord)].values
            region = get_region(lat, lon, west_threshold_co_nm, regions_dict, geolocator)
            regions[region].append(date)
            
    return(regions)

In [None]:
chunks = 10
time_array = new_pph['time']
chunk_size = math.ceil(len(time_array)/chunks)
time_arrays = [time_array[i:i + chunk_size] for i in range(0, len(time_array), chunk_size)]
for i in range(0, chunks):
    chunk_regions = create_regions(new_pph.sel(time = time_arrays[i]))
    for region in regions:
        regions[region] += chunk_regions[region]
    print('Added chunk ' + str(i) + ' to regions')

In [None]:
(new_outlooks, new_pph, new_reports) = add_labels(new_outlooks, new_pph, new_reports, list(regions.values()), regions.keys(), 'REGION', 'NONE')

# Subset by environmental data (to do later)

# Label by max total pph bin and count of storm reports (very rudamentary methods of accuracy)

In [18]:
# max p_perfect_total
max_total_pph = { # TODO: see if categories or cutoffs are correct here from SPC
    'HIGH': [],
    'MDT': [],
    'ENH': [],
    'SLGT': [],
    'MRGL': [],
    'TSTM': [],
    'ZERO': []
}

for date, date_pph in pph.groupby('time'):
    max_pph = date_pph['p_perfect_total'].max()
    if max_pph == 0:
        max_total_pph['ZERO'].append(date)
    elif max_pph < 5:
        max_total_pph['TSTM'].append(date)
    elif max_pph < 15:
        max_total_pph['MRGL'].append(date)
    elif max_pph < 30:
        max_total_pph['SLGT'].append(date)
    elif max_pph < 45:
        max_total_pph['ENH'].append(date)
    elif max_pph < 60:
        max_total_pph['MDT'].append(date)
    else:
        max_total_pph['HIGH'].append(date)
    
(new_outlooks, new_pph, new_reports) = add_labels(new_outlooks, new_pph, new_reports, list(max_total_pph.values()), max_total_pph.keys(), 'MAX_PPH', 'NONE')

# TODO: also do numerically?

{'HIGH': ['197904100000',
  '197905020000',
  '197906190000',
  '197906280000',
  '198004020000',
  '198004070000',
  '198004080000',
  '198005290000',
  '198006020000',
  '198006060000',
  '198006070000',
  '198006140000',
  '198007150000',
  '198104130000',
  '198105090000',
  '198105130000',
  '198105170000',
  '198106130000',
  '198106230000',
  '198204020000',
  '198204160000',
  '198204190000',
  '198204250000',
  '198204260000',
  '198206080000',
  '198206140000',
  '198206150000',
  '198304010000',
  '198305020000',
  '198305130000',
  '198305200000',
  '198306050000',
  '198306120000',
  '198306270000',
  '198307030000',
  '198307190000',
  '198311220000',
  '198403150000',
  '198403270000',
  '198403280000',
  '198404200000',
  '198404260000',
  '198404290000',
  '198405020000',
  '198405030000',
  '198405070000',
  '198406070000',
  '198406080000',
  '198406110000',
  '198411090000',
  '198503280000',
  '198504050000',
  '198504200000',
  '198504300000',
  '198505310000',
  

In [55]:
# num of storm reports 
# should be pretty trivial. Look up function
full_dates = reports['DATE']
dates = sorted(full_dates.unique())
date_counts = np.zeros(len(dates))
date_index = 0
for i, date in zip(range(len(full_dates)), full_dates):
    print('date ' + date)
    print('prev_date ' + dates[date_index])
    while date != dates[date_index]:
        date_index += 1
    date_counts[date_index] = date_counts[date_index] + 1

date_counts
    
    

date 195004280000
prev_date 195001030000
date 195004290000
prev_date 195004280000
date 195007050000
prev_date 195004290000
date 195007050000
prev_date 195007050000
date 195007240000
prev_date 195007050000
date 195008290000
prev_date 195007240000
date 195011040000
prev_date 195008290000
date 195011040000
prev_date 195011040000
date 195009150000
prev_date 195011040000


IndexError: list index out of range

In [35]:
dates

# ONCE YOU ADD THE LABELS YOU WANT, SAVE DATA

In [None]:
# resave labelled data
outlook_save_location = 'data/outlooks'
pph_save_location = 'data/pph'
report_save_location = 'data/storm_reports'

new_outlooks.to_file(outlook_save_location + '/labelled_outlooks.shp')
new_pph.to_netcdf(pph_save_location + '/labelled_pph.nc')
new_reports.to_csv(report_save_location + '/labelled_reports.csv')

In [None]:
# TODO: create master function that subsets data based on easy inputs once it's all been labelled. Put in a util file?