# 03 - Extract GSDR

This notebook: 
- Extracts landslide triggering rainfall for each landslide from each nearby gauge from the GSDR
- Extracts annual block maxima at a range of durations from each nearby gauge

The outputs of this notebook are:
- lsdata_gsdr_rain.csv, a csv with event rainfall (3 hour dry and 48 hour dry periods) from each nearby gauge
- annual_block_maxima.csv, a csv with the annual block maxima at a range of durations from each nearby gauge 

These outputs are read into 05_CombinePrep


**Data required**

Processed data: ls_get_gsdr_simple.csv

Original data: Global Sub-Daily Rainfall Dataset (GSDR).  The publicly available portion of the GSDR is available from the authors upon request.

Lewis, E. et al. GSDR: A Global Sub-Daily Rainfall Dataset. Journal of Climate 32, 4715–4729 (2019).

Lewis, E. et al. Quality control of a global hourly rainfall dataset. Environmental Modelling & Software 144, 105169 (2021).

*This notebook requires the intense python package, which handles the GSDR data.  This is available with the public GSDR data upon request from the authors.*

In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
import intense
from timezonefinder import TimezoneFinder
import datetime
import warnings
import zipfile
import matplotlib.pyplot as plt

**set directories for data and results directories here**

*Note: the two output files from this notebook are named 'lsdata_gsdr_rain.csv' and 'annual_block_maxima.csv'; they will be saved in the resultsdir.*
*This script will also create many text files with the extracted intense data from the zip files.  These can safely be deleted after the run.*

In [None]:
#set directory where landslide data is here
#(file names are appended to this string with '+', so a non-empty path must end with a path separator)
datadir = ''

#set directory where outputs should be saved here
#(also used by plain string concatenation, so include the trailing path separator)
resultsdir = ''

In [None]:
#set directory where GSDR data lives here
#this is the directory that holds the zip archives named in the 'Folder' column (e.g. US.zip);
#the zip file name is appended directly to this string, so keep the trailing '/'

#alternative location, currently disabled:
#gsdrdir = datadir + '20220615_GSDR/GSDR-20220615T144339Z-001/GSDR/Raw data in intense format/'
gsdrdir = datadir + '20220615_GSDR/GSDR-20220615T144339Z-001/GSDR/QC_d data/'

In [None]:
#set mode, whether to run on the publicly available GSDR data or on all GSDR data
#'public' keeps only the rows whose 'Folder' is in the list inside publicGSDR();
#any other value (e.g. 'all') keeps every row of lsdata
#mode = 'public' 
mode = 'all' 

In [None]:
#read the table (plain csv, read with pandas) of landslides and the gauges that should be accessed for each one

lsdata = pd.read_csv(datadir + 'ls_get_gsdr_simple.csv')

#parse all three timestamp columns as timezone-aware UTC: prep_ts compares values derived from
#'date_local_midnight_utc' against 'StartDate'/'EndDate', and pandas cannot compare
#timezone-naive with timezone-aware timestamps
lsdata['date_local_midnight_utc'] = pd.to_datetime(lsdata['date_local_midnight_utc'], utc = True)
lsdata['StartDate'] = pd.to_datetime(lsdata['StartDate'], utc = True)
lsdata['EndDate'] = pd.to_datetime(lsdata['EndDate'], utc = True)

#drop the unnamed index column written by an earlier to_csv, if it is present
#(errors = 'ignore' keeps this cell re-runnable and tolerant of files saved without an index)
lsdata = lsdata.drop(columns = ['Unnamed: 0'], errors = 'ignore')

### For each landslide, read the gauge data, extract rainfall metrics

In [None]:
def prep_ts(lspt, gsdrdir, daysbefore = 90, daysafter = 1):
    
    """
    Function to read the precipitation time series from the gauge associated with the landslide point,
    convert it to UTC, interpolate missing values, and subset to the specified time interval before
    and after the landslide point
    
    lspt = row of a dataframe that contains columns: 
        - 'Folder' - folder containing the data (e.g. US.zip)
        - 'OriginalID' - name of the station; '<OriginalID>.txt' is the file read from the zip folder
        - 'date_local_midnight_utc' - midnight local time on the day the landslide occurred, converted to UTC
        - 'StartDate', 'EndDate' - first and last timestamps of the gauge record, used for the coverage check
    
    gsdrdir = directory containing the GSDR subfolders
    daysbefore = how many days before the landslide should data be extracted?
    daysafter = how many days after midnight on the day of the landslide should be extracted? 
    
    Returns: 
    ts_sub = original time series, subset
    ts_fill = original time series, subset and interpolated   
    
    If the requested window is not strictly inside ('StartDate', 'EndDate'), both return values
    are empty numpy arrays.
    
    Side effect: the station text file is extracted from the zip archive into the current
    working directory.
    
    """
    
    #get the filename for the station we need
    
    zipfn = gsdrdir + lspt['Folder']
    
    #the context manager closes the archive handle even if the extraction fails
    with zipfile.ZipFile(zipfn, "r") as z:

        tsfn = z.extract(lspt['OriginalID'] + '.txt')
    
    s = intense.readIntense(tsfn)

    #find the right time zone for the gauge 

    if s.time_zone == 'UTC': 
        stz = 'UTC'
        #time zone aware in UTC
        ts = s.data.tz_localize(stz, 
                            ambiguous=np.zeros(len(s.data), dtype = bool), #if it's ambiguous, assign to standard time
                           nonexistent ='shift_forward') #if it doesn't exist, shift to next time forward


    else:
        #initiate timezone finder (only needed for gauges that are not already in UTC)
        tf = TimezoneFinder()

        stz = tf.timezone_at(lng=s.longitude, lat=s.latitude)
         #first, localize to local time zone
        ts = s.data.tz_localize(stz, 
                                ambiguous=np.zeros(len(s.data), dtype = bool), 
                                nonexistent ='shift_forward')
        #then, convert to UTC

        ts = ts.tz_convert('UTC')


    #subset to specified number of days before and after the landslide
    lsdate = lspt['date_local_midnight_utc']
    
    #check that these dates are in the range of the time series
    
    start_ts = lsdate - datetime.timedelta(days = daysbefore)
    end_ts = lsdate + datetime.timedelta(days = daysafter + 1)
    
    if (start_ts > lspt['StartDate']) & (end_ts < lspt['EndDate']):

        #sort index first, just in case for some reason it wasn't already sorted: label slicing on an
        #unsorted index can fail, so the sort has to happen before the subset is taken.
        #sort_index returns a new object, so the subset is never modified in place
        ts_sub = ts.sort_index(axis = 0).loc[start_ts:end_ts]

        #interpolate to fill nans in the time series using the pandas built in interpolation method

        ts_fill = ts_sub.interpolate(method = 'time')

        #remove any duplicates by keeping the first duplicated entry only
        #CAUTION: this could create some problems where we miss an hour of rainfall, but I don't know 
        #how else to deal with it at this point
        ts_fill = ts_fill[~ts_fill.index.duplicated()]
    
    else: #case that the time series can't be extracted because time series is not completely covered
        ts_sub = np.array([])
        ts_fill = np.array([])
    
    return ts_sub, ts_fill

In [None]:
def triggering_rain(nhdry, ts, ts_raw, lsdate): 
    
    """
    Gets length and sum of triggering rainfall event from a dry period of length nhdry to the peak hourly
    rainfall and the end of the landslide day
    
    nhdry = length of dry period preceding landslide in hours (used as the rolling window length)
    ts = time series with hourly datetime index and values of hourly precip
    ts_raw = original time series to check how many nans were interpolated in the time range
    lsdate = datetime corresponding to the start of the landslide day  
    
    Returns a list with, in this order:
    idxmxhr = time of the peak hourly rainfall between lsdate and lsdate + 24 hours
    mxhr = peak hourly rainfall in that interval
    idxtrst = start of the rainfall (one hour after the last dry window before the peak)
    htrsttopk, cptrsttopk = count of non-nan observations and rainfall sum from start to peak
    htrsttoeod, cptrsttoeod = same, from start to lsdate + 24 hours
    mwtopk = rolling nhdry-observation sum up to the peak
    nnantopk, nnantoeod = number of nans in ts_raw from start to peak / to lsdate + 24 hours
    
    Every entry is nan when ts is empty, when there is no data on the landslide day, or when no
    dry window is found before the peak (idxmxhr and mxhr are still returned in the last case).
    
    """
    
    #start with every metric undefined; values are only filled in when they can be computed
    idxmxhr = mxhr = idxtrst = np.nan
    htrsttopk = cptrsttopk = np.nan
    htrsttoeod = cptrsttoeod = np.nan
    mwtopk = np.nan
    nnantopk = nnantoeod = np.nan
    
    #nothing can be computed from a completely empty time series
    if len(ts) != 0:
        
        endofday = lsdate + datetime.timedelta(hours = 24)
        
        #rainfall on the day of the landslide and its peak
        lsday = ts.loc[lsdate:endofday]
        mxhr = lsday.max()
        
        if pd.isnull(mxhr):
            
            #no data on the day of the landslide: everything stays nan
            mxhr = np.nan
            
        else:
            
            #time at which the peak occurred
            idxmxhr = lsday.idxmax()
            
            #rolling sum labelled by the last observation in the window
            #(eg. 10 am is 8am + 9am + 10 am), cut off at the peak
            rollsum_topk = ts.rolling(nhdry).sum().loc[:idxmxhr]
            
            #time stamps at which the window was dry (sum below 0.01 mm, to catch very small values)
            dry_times = rollsum_topk[rollsum_topk<0.01].index
            
            #position of the last dry time stamp at or before the peak, -1 if there is none
            ix = dry_times.get_indexer([idxmxhr], method = 'pad')[0]
            
            if ix != -1:
                
                mwtopk = rollsum_topk
                
                #the rainfall starts one hour after the end of the last dry window
                idxtrst = dry_times[ix] + datetime.timedelta(hours = 1)
                
                #start of rainfall to the peak (inclusive): number of observations and total
                rain_topk = ts.loc[idxtrst:idxmxhr]
                htrsttopk = rain_topk.count()
                cptrsttopk = rain_topk.sum()
                
                #start of rainfall to the end of the landslide day: number of observations and total
                rain_toeod = ts.loc[idxtrst:endofday]
                htrsttoeod = rain_toeod.count()
                cptrsttoeod = rain_toeod.sum()
                
                #number of nans in the original time series over the same two intervals
                nnantopk = ts_raw.loc[idxtrst:idxmxhr].isna().sum()
                nnantoeod = ts_raw.loc[idxtrst:endofday].isna().sum()

    return [idxmxhr, 
            mxhr, 
            idxtrst, 
            htrsttopk, 
            cptrsttopk, 
            htrsttoeod, 
            cptrsttoeod, 
            mwtopk, 
            nnantopk,
            nnantoeod]



In [None]:
def get_antecedent(ts, idxdt, h):
    
    """
    Cumulative antecedent precipitation leading up to some hour (e.g. peak precip on landslide day 
    or start of triggering rainfall)
    
    ts = time series
    idxdt = datetime that you want antecedent for
    h = number of antecedent hours
    
    Returns the sum of ts from idxdt - h hours up to idxdt (both ends of the label slice are
    included), or nan if idxdt is null or ts is empty.
    
    """
    #no start of the triggering rainfall was found, or the time series is empty
    if pd.isnull(idxdt) or len(ts) == 0:
        return np.nan
    
    window_start = idxdt - datetime.timedelta(hours = h)
    
    return ts.loc[window_start:idxdt].sum()

In [None]:
def get_rainmetrics(lspt, gsdrdir):
    
    """
    Collect the triggering, event, and antecedent rainfall metrics for one landslide
    
    lspt - a dataframe row from lsdata (handed to prep_ts; its 'date_local_midnight_utc' value is
           used as the start of the landslide day and its name as the event id)
    gsdrdir - directory containing the GSDR subfolders (handed to prep_ts)
    
    Returns: 
    rd = dictionary of metrics, or None if prep_ts returned None for the raw time series
         - "tr_*" keys describe the triggering rainfall (3 hour dry period)
         - "e_*" keys describe the event rainfall (48 hour dry period)
         - "tr_ante*" keys are rainfall sums before the start of the triggering rainfall
    
    Prints lspt.name on entry and again just before returning.
    
    """
    print(lspt.name)
    
    ts_raw, ts_interp = prep_ts(lspt, gsdrdir, daysbefore = 90, daysafter = 1)
    
    rd = None
    
    if ts_raw is not None:
        
        lsdate = lspt['date_local_midnight_utc']
        
        #triggering rainfall: 3 hours dry to peak or end of day
        (tr_tpk, tr_ppk, tr_start, 
         tr_htopk, tr_cptopk, tr_htoeod, tr_cptoeod, 
         _, tr_nnantopk, tr_nnantoeod) = triggering_rain(nhdry = 3, 
                                                         ts = ts_interp, 
                                                         ts_raw = ts_raw, 
                                                         lsdate = lsdate)
        
        #event rainfall: 48 hours dry to peak or end of day (peak time and value are the same as above)
        (_, _, e_start, 
         e_htopk, e_cptopk, e_htoeod, e_cptoeod, 
         _, e_nnantopk, e_nnantoeod) = triggering_rain(nhdry = 48, 
                                                       ts = ts_interp, 
                                                       ts_raw = ts_raw, 
                                                       lsdate = lsdate)
        
        #the exceedance probability of the hourly maximum is not computed here
        #(the get_exceedanceprob call was already disabled)
        
        rd = {"event_id":lspt.name, 
              "tr_start":tr_start, 
              "tr_tpk":tr_tpk, 
              "tr_ppk":tr_ppk, 
              "tr_htopk":tr_htopk, 
              "tr_cptopk":tr_cptopk, 
              "tr_htoeod":tr_htoeod, 
              "tr_cptoeod":tr_cptoeod, 
              "tr_nnantopk":tr_nnantopk, 
              "tr_nnan_toeod":tr_nnantoeod, 
              "e_start":e_start, 
              "e_htopk":e_htopk, 
              "e_cptopk":e_cptopk, 
              "e_htoeod":e_htoeod, 
              "e_cptoeod":e_cptoeod, 
              "e_nnantopk":e_nnantopk, 
              "e_nnantoeod":e_nnantoeod}
        
        #antecedent rainfall over 24 hours and 7, 14, 21 and 28 days before the start of the
        #triggering rain; appended last so the key order of the dictionary is kept
        antecedent_hours = [("tr_ante24h", 24), 
                            ("tr_ante7d", 24*7), 
                            ("tr_ante14d", 24*14), 
                            ("tr_ante21d", 24*21), 
                            ("tr_ante28d", 24*28)]
        
        for key, nhours in antecedent_hours:
            rd[key] = get_antecedent(ts = ts_interp, idxdt = tr_start, h = nhours)

    print(lspt.name)

    return rd

### Get GSDR data

In [None]:
#get the landslide data for which we already have data in the public dataset for now 

def publicGSDR(lspt):
    
    """
    Check whether a landslide's gauge data is in one of the publicly available GSDR folders
    
    lspt = row of a dataframe (or any mapping) with a 'Folder' entry, e.g. 'US.zip'
    
    Returns True if 'Folder' is one of the folders listed below, False otherwise
    
    """
    
    public_folders = ('Belgium.zip', 'Finland.zip', 'Germany.zip', 
                      'Ireland.zip', 'ISD.zip', 'Japan.zip', 
                      'Norway.zip', 'UK.zip', 'US.zip')
    
    return lspt['Folder'] in public_folders


In [None]:
#select the rows to process: everything, unless the run is restricted to the public GSDR folders
if mode != 'public':
    lsdata_gsdr = lsdata.copy()

else:
    #flag the rows whose folder is publicly available and keep only those
    lsdata['public'] = lsdata.apply(publicGSDR, axis = 1)
    lsdata_gsdr = lsdata.loc[lsdata['public']].copy()

In [None]:
#sample a subset of points for testing
#lsdata_gsdr = lsdata_gsdr.sample(20, axis = 0).copy()

In [None]:
#number of rows of lsdata_gsdr that get_rainmetrics will be applied to
len(lsdata_gsdr)

In [None]:
#compute the rainfall metrics row by row; warnings raised on the way are recorded and discarded
with warnings.catch_warnings(record = True):
    lsdata_gsdr['rain_metrics'] = lsdata_gsdr.apply(get_rainmetrics, axis = 1, args = (gsdrdir,))

In [None]:
#expand the per-row metric dictionaries into one column per metric
joindf = pd.DataFrame(lsdata_gsdr['rain_metrics'].tolist())

#give the metrics the row labels of lsdata_gsdr, put them next to the landslide columns,
#and drop the dictionary column that is no longer needed
lsdata_gsdr_rain = pd.concat([lsdata_gsdr, joindf.set_index(lsdata_gsdr.index)], 
                             axis = 1).drop(columns = ['rain_metrics'])


In [None]:
#strip city names of white space, special characters, etc for R

#keep only the alphanumeric characters of each city name
lsdata_gsdr_rain['city'] = lsdata_gsdr_rain['UC_NM_MN'].map(lambda name: ''.join(filter(str.isalnum, name)))



In [None]:
#write the first output file; the current index is written as the first (unnamed) column
lsdata_gsdr_rain.to_csv(resultsdir + 'lsdata_gsdr_rain.csv')

In [None]:
#lsdata_gsdr_rain = pd.read_csv(resultsdir + 'lsdata_gsdr_rain.csv')

In [None]:
#replace the row labels with a fresh 0..n-1 index (the old labels are discarded);
#this happens after the csv was written, so the saved file keeps the original labels
lsdata_gsdr_rain = lsdata_gsdr_rain.reset_index(drop = True)

### Get annual block maxima from the gauges associated with landslides

In [None]:
def prep_whole_ts(station, gsdrdir):
    
    """
    Function to read the precipitation time series from a gauge, convert it to UTC,
    interpolate missing values
    
    station = row of a dataframe that contains columns: 
        - 'Folder' - folder containing the data (e.g. US.zip)
        - 'OriginalID' - name of station; '<OriginalID>.txt' is the file read from the zip folder
    gsdrdir = directory containing the GSDR subfolders
    
    
    Returns: 
    ts = original time series in UTC, sorted by time (duplicated time stamps are kept)
    ts_fill = time series interpolated over nans, with duplicated time stamps removed   
    
    Side effect: the station text file is extracted from the zip archive into the current
    working directory.
    
    """
    
    #get the filename for the station we need
    
    zipfn = gsdrdir + station['Folder']
    
    #the context manager closes the archive handle even if the extraction fails
    with zipfile.ZipFile(zipfn, "r") as z:

        tsfn = z.extract(station['OriginalID'] + '.txt')
    
    s = intense.readIntense(tsfn)

    #find the right time zone for the gauge 

    if s.time_zone == 'UTC': 
        stz = 'UTC'
        #time zone aware in UTC
        ts = s.data.tz_localize(stz, 
                            ambiguous=np.zeros(len(s.data), dtype = bool), #if it's ambiguous, assign to standard time
                           nonexistent ='shift_forward') #if it doesn't exist, shift to next time forward


    else:
        #initiate timezone finder (only needed for gauges that are not already in UTC)
        tf = TimezoneFinder()

        stz = tf.timezone_at(lng=s.longitude, lat=s.latitude)
         #first, localize to local time zone
        ts = s.data.tz_localize(stz, 
                                ambiguous=np.zeros(len(s.data), dtype = bool), 
                                nonexistent ='shift_forward')
        #then, convert to UTC

        ts = ts.tz_convert('UTC')


    #sort index, just in case for some reason it wasn't already in sorted, or pandas thinks it isn't
    ts = ts.sort_index(axis = 0)
    
    #interpolate to fill nans in the time series using the pandas built in interpolation method

    ts_fill = ts.interpolate(method = 'time')

    #remove any duplicates by keeping the first duplicated entry only
    #CAUTION: this could create some problems where we miss an hour of rainfall, but I don't know 
    #how else to deal with it at this point
    ts_fill = ts_fill[~ts_fill.index.duplicated()]

    return ts, ts_fill

In [None]:
#distinct 'NewID' values, in order of first appearance; block maxima are extracted once per station
uniquestations = lsdata_gsdr_rain['NewID'].unique() #stations with a landslide associated with it

In [None]:
#each value is used as a rolling window length (number of consecutive observations) and is stored
#in the 'duration(h)' column - presumably hours, i.e. this assumes hourly observations (TODO confirm)
durations = [1, 3, 6, 12, 24, 48, 100, 200, 500, 1000] #durations to extract block maxima for

In [None]:
#number of stations the block-maxima loop will go through
len(uniquestations)

In [None]:
with warnings.catch_warnings(record = True):
#loop through all stations associated with a landslide and extract annual block maxima at a range of durations
#(warnings raised inside the loop are recorded and discarded)

    #station columns that are copied onto every block maxima row of that station
    stationcols = ['ID_HDC_G0', 
                   'UC_NM_MN', 
                   'Folder', 
                   'OriginalID', 
                   'NewID', 
                   'Latitude',
                   'Longitude',
                   'Recordlength(hours)',
                   'Recordlength(years)',
                   'StartDate',
                   'EndDate',
                   'Missingdata(%)',
                   'geometry_y']

    #one dataframe per station is collected here and concatenated once at the end, instead of
    #re-copying the growing result with pd.concat on every iteration
    stationdfs = []

    #loop over stations

    for s in range(len(uniquestations)):
        
        print(s)

        #station information is taken from the first landslide row that uses this station
        station = lsdata_gsdr_rain[lsdata_gsdr_rain['NewID'] == uniquestations[s]].iloc[0][stationcols]

        #get time series

        ts, ts_fill = prep_whole_ts(station, gsdrdir)

        #extract block maxima at a range of durations

        durationdfs = []

        for d in durations:

            #moving window on the raw time series
            mw = ts.rolling(d).sum() #right index is sum of previous d observations. 
            #if there are nans within the window, returns nan

            #moving window on the interpolated time series
            mw_fill = ts_fill.rolling(d).sum() #right index is sum of previous d observations. 
            #if there are nans within the window, returns nan

            #take annual max. ignores nans (could be missing data, will still get max)
            #NOTE: the 'Y' alias is deprecated from pandas 2.2 in favour of 'YE'; it is kept here
            #because 'YE' is not accepted by older pandas versions
            #to_frame names the column explicitly, so the result does not depend on whether the
            #series read from the gauge file carries a name of its own
            block_max_df = mw.resample('Y').max().to_frame(name = 'raw_block_max')
            block_max_df['fill_block_max'] = mw_fill.resample('Y').max().values

            #record the number of non-nan observations in the moving window (all other hours in the year are then nan)
            block_max_df['raw_notnainmw'] = mw.resample('Y').count().values

            #record the number of non-nan observations in the raw time series (all other hours in the year are then nan)
            block_max_df['raw_notnaints'] = ts.resample('Y').count().values

            block_max_df['duration(h)'] = d
            block_max_df['year'] = block_max_df.index.year

            durationdfs.append(block_max_df)

        stationdf = pd.concat(durationdfs)

        #assign information about the station: repeat the single station row once per block maxima
        #row and give it the same index, so that the two frames line up side by side
        stationinfo = pd.DataFrame([station])

        stationinfo = stationinfo.loc[stationinfo.index.repeat(len(stationdf))]

        stationinfo = stationinfo.set_index(stationdf.index)

        stationdfs.append(pd.concat([stationinfo, stationdf], axis = 1))
            
        print(s)

    #combine all stations (pd.concat raises a ValueError here if there were no stations at all)
    maindf = pd.concat(stationdfs)

In [None]:
#strip city names of white space, special characters, etc for R

#keep only the alphanumeric characters of each city name
maindf['city'] = maindf['UC_NM_MN'].map(lambda name: ''.join(filter(str.isalnum, name)))


In [None]:
#write the second output file; the index (the time stamps of the yearly resampling bins) is written as the first column
maindf.to_csv(resultsdir + 'annual_block_maxima.csv')

In [None]:
#maindf = pd.read_csv(resultsdir + 'annual_block_maxima.csv')