# 03 - Extract GSDR

This notebook: 
- Extracts landslide triggering rainfall for each landslide from each nearby gauge from the GSDR
- Extracts annual block maxima at a range of durations from each nearby gauge

The outputs of this notebook are:
- lsdata_gsdr_rain.csv, a csv with event rainfall (3 hour dry and 48 hour dry periods) from each nearby gauge
- annual_block_maxima.csv, a csv with the annual block maxima at a range of durations from each nearby gauge 

These outputs are read into 05_CombinePrep


**Data required**

Processed data: ls_get_gsdr_simple.csv

Original data: Global Sub-Daily Rainfall Dataset (GSDR).  The publicly available portion of the GSDR is available from the authors upon request.

Lewis, E. et al. GSDR: A Global Sub-Daily Rainfall Dataset. Journal of Climate 32, 4715–4729 (2019).

Lewis, E. et al. Quality control of a global hourly rainfall dataset. Environmental Modelling & Software 144, 105169 (2021).

*This notebook requires the intense python package, which handles the GSDR data.  This is available with the public GSDR data upon request from the authors.*

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import intense
from timezonefinder import TimezoneFinder
import datetime
import warnings
import zipfile
import matplotlib.pyplot as plt

**set directories for data and results directories here**

*Note: the two output files from this notebook are named 'lsdata_gsdr_rain.csv' and 'annual_block_maxima.csv'; they will be saved in the resultsdir.*
*This script will also create many text files containing the intense data extracted from the zip files.  These can safely be deleted after the run.*

In [11]:
#set directory where landslide data is here
#(file names are appended by plain string concatenation, so include the trailing path separator, e.g. 'data/')
datadir = ''

#set directory where outputs should be saved here
#(output file names are appended by plain string concatenation, so include the trailing path separator)
resultsdir = ''

In [12]:
#set directory where GSDR data lives here
#(the zip archive name of each gauge is appended directly to this string, so keep the trailing '/')

#alternative location, currently disabled:
#gsdrdir = datadir + '20220615_GSDR/GSDR-20220615T144339Z-001/GSDR/Raw data in intense format/'
gsdrdir = datadir + '20220615_GSDR/GSDR-20220615T144339Z-001/GSDR/QC_d data/'

In [13]:
#set mode, whether to run on the publicly available GSDR data or on all GSDR data
#  'public' - keep only landslides whose gauge archive is in the public list (see publicGSDR)
#  'all'    - use every landslide in the input table
#mode = 'public' 
mode = 'all' 

In [14]:
#read the table (a plain csv, loaded with pandas) with landslides and the gauges that should be
#accessed for each one

lsdata = pd.read_csv(datadir + 'ls_get_gsdr_simple.csv')

#parse all three timestamp columns as timezone-aware UTC. They are compared with each other and used
#to slice the UTC-indexed gauge series later on, which fails if one of them is tz-naive
for datecol in ['date_local_midnight_utc', 'StartDate', 'EndDate']:
    lsdata[datecol] = pd.to_datetime(lsdata[datecol], utc = True)

#drop the leftover index column written by an earlier to_csv; do not fail if it is not there
lsdata = lsdata.drop(columns = ['Unnamed: 0'], errors = 'ignore')

### For each landslide, read the gauge data, extract rainfall metrics

In [7]:
def prep_ts(lspt, gsdrdir, daysbefore = 90, daysafter = 1):
    
    """
    Function to read the precipitation time series from the gauge associated with the landslide point,
    convert it to UTC, subset it to the specified time interval before and after the landslide point,
    and interpolate missing values inside that interval
    
    lspt = row of a dataframe that contains columns: 
        - 'Folder' - zip archive containing the data (e.g. US.zip), appended to gsdrdir
        - 'OriginalID' - station name; the archive member '<OriginalID>.txt' is read
        - 'date_local_midnight_utc' - midnight local time on the day the landslide occurred, converted to UTC
        - 'StartDate', 'EndDate' - bounds used to check that the requested window is covered by the record
    
    gsdrdir = directory containing the GSDR subfolders
    daysbefore = how many days before the landslide should data be extracted?
    daysafter = how many days after the landslide day should be extracted? The window ends
                daysafter + 1 days after 'date_local_midnight_utc'
    
    Returns: 
    ts_sub = original time series, subset
    ts_fill = original time series, subset and interpolated (duplicated timestamps removed)
    
    Both are empty numpy arrays when the window is not strictly inside (StartDate, EndDate).
    
    Side effect: the station text file is extracted from the archive into the current working directory.
    
    """
    
    #get the filename for the station we need
    
    zipfn = gsdrdir + lspt['Folder']
    
    #context manager so that the archive handle is released even if the extraction fails
    with zipfile.ZipFile(zipfn, "r") as z:
        tsfn = z.extract(lspt['OriginalID'] + '.txt')
    
    s = intense.readIntense(tsfn)

    #all-False mask: if a local time is ambiguous, assign it to standard time
    not_dst = np.zeros(len(s.data), dtype = bool)

    if s.time_zone == 'UTC': 
        #time zone aware in UTC
        #if a time doesn't exist, shift to next time forward
        ts = s.data.tz_localize('UTC', ambiguous = not_dst, nonexistent = 'shift_forward')

    else:
        #find the right time zone for the gauge (the timezone finder is only built when it is needed)
        stz = TimezoneFinder().timezone_at(lng=s.longitude, lat=s.latitude)
        
        #first, localize to local time zone, then convert to UTC
        ts = s.data.tz_localize(stz, ambiguous = not_dst, nonexistent = 'shift_forward')
        ts = ts.tz_convert('UTC')

    #sort BEFORE slicing: label slicing is only reliable on a monotonic index
    if not ts.index.is_monotonic_increasing:
        ts = ts.sort_index()

    #subset to specified number of days before and after the landslide
    lsdate = lspt['date_local_midnight_utc']
    
    start_ts = lsdate - datetime.timedelta(days = daysbefore)
    end_ts = lsdate + datetime.timedelta(days = daysafter + 1)
    
    #check that these dates are in the range of the time series
    if (start_ts > lspt['StartDate']) & (end_ts < lspt['EndDate']):

        ts_sub = ts.loc[start_ts:end_ts]

        #interpolate to fill nans in the time series using the pandas built in interpolation method
        ts_fill = ts_sub.interpolate(method = 'time')

        #remove any duplicates by keeping the first duplicated entry only
        #CAUTION: this could create some problems where we miss an hour of rainfall
        ts_fill = ts_fill[~ts_fill.index.duplicated()]
    
    else: #case that the time series can't be extracted because time series is not completely covered
        ts_sub = np.array([])
        ts_fill = np.array([])
    
    return ts_sub, ts_fill

In [8]:
def triggering_rain(nhdry, ts, ts_raw, lsdate): 
    
    """
    Gets length and sum of the triggering rainfall event, measured from the end of the last dry period 
    of length nhdry up to (a) the peak hourly rainfall on the landslide day and (b) the end of the 
    landslide day
    
    nhdry = length of dry period preceeding landslide in hours
    ts = time series with hourly datetime index and values of hourly precip
    ts_raw = original time series, used to count how many nans were interpolated in the time range
    lsdate = datetime corresponding to the start of the landslide day  
    
    Returns a list of ten items, in this order:
    [time of peak, peak value, start of triggering rain, 
     observations start-to-peak, rain start-to-peak, 
     observations start-to-end-of-day, rain start-to-end-of-day,
     rolling nhdry sum up to the peak, 
     number of raw nans start-to-peak, number of raw nans start-to-end-of-day]
    
    Anything that cannot be determined is np.nan: all ten items when the series is empty or has no 
    data on the landslide day, and everything except the peak time and value when no dry period is found.
    
    """
    
    nothing = [np.nan] * 10
    
    #completely empty time series: nothing can be computed
    if len(ts) == 0:
        return nothing
    
    #rain on the landslide day (the slice includes both end points)
    day_end = lsdate + datetime.timedelta(hours = 24)
    day_rain = ts.loc[lsdate:day_end]
    
    mxhr = day_rain.max()
    
    #peak is nan, e.g. there is no data on the day of the landslide
    if pd.isnull(mxhr):
        return nothing
    
    #when did the peak occur?
    idxmxhr = day_rain.idxmax()
    
    #moving window sum whose label is the last observation in the window
    #(eg. 10 am is 8am + 9am + 10 am), kept only up to the peak
    dry_sum = ts.rolling(nhdry).sum().loc[:idxmxhr]
    
    #labels where the nhdry sum is below 0.01 mm (essentially 0, but catches very small values)
    dry_labels = dry_sum[dry_sum < 0.01].index
    
    #position of the dry label closest to (at or before) the peak, -1 if there is none
    pos = dry_labels.get_indexer([idxmxhr], method = 'pad')[0]
    
    #no dry enough period in the time series: only the peak is known
    if pos == -1:
        return [idxmxhr, mxhr] + [np.nan] * 8
    
    #triggering rain starts one hour after the last dry label
    idxtrst = dry_labels[pos] + datetime.timedelta(hours = 1)
    
    rain_to_peak = ts.loc[idxtrst:idxmxhr]
    rain_to_eod = ts.loc[idxtrst:day_end]
    
    return [idxmxhr, 
            mxhr, 
            idxtrst, 
            rain_to_peak.count(),                        #observations from start to peak (inclusive)
            rain_to_peak.sum(),                          #rain from start to peak (inclusive)
            rain_to_eod.count(),                         #observations from start to end of landslide day
            rain_to_eod.sum(),                           #rain from start to end of landslide day
            dry_sum, 
            ts_raw.loc[idxtrst:idxmxhr].isna().sum(),    #number of raw nans from start to peak
            ts_raw.loc[idxtrst:day_end].isna().sum()]    #number of raw nans from start to end of day



In [9]:
def get_antecedent(ts, idxdt, h):
    
    """
    get cumulative antecedent precipitation before some hour (e.g. peak precip on landslide day 
    or before start of triggering rainfall)
    
    ts = time series with a datetime index
    idxdt = datetime that you want antecedent for
    h = number of antecedent hours
    
    Returns the sum of ts over the window [idxdt - h hours, idxdt), i.e. the observation stamped
    idxdt itself is NOT included. Returns np.nan if idxdt is null or ts is empty.
    
    """
    #for the case that no start to the triggering rainfall was found, 
    #or the case that the time series is empty
    if pd.isnull(idxdt) or len(ts) == 0:
        return np.nan
    
    window = ts.loc[idxdt - datetime.timedelta(hours = h):idxdt]
    
    #label slicing includes both end points, which would give h + 1 observations and would count
    #the rain at idxdt (already part of the triggering rain) as antecedent. Drop that last label.
    return window[window.index < idxdt].sum()

In [10]:
def get_rainmetrics(lspt, gsdrdir):
    
    """
    Collect triggering, event, and antecedent rainfall metrics for one landslide
    
    lspt - a dataframe row from lsdata (handed on to prep_ts)
    gsdrdir - directory containing the GSDR data (handed on to prep_ts)
    
    Returns a dictionary with one entry per metric, keyed by output column name, or None if
    prep_ts returned None as the raw series. lspt.name is printed on entry and again before returning.
    
    """
    print(lspt.name)
    
    ts, ts_fill = prep_ts(lspt, gsdrdir, daysbefore = 90, daysafter = 1)
    
    rd = None
    
    if ts is not None:
    
        lsdate = lspt['date_local_midnight_utc']

        #triggering rain: 3 hours dry to peak or end of day
        (tr_tpk, tr_ppk, tr_start, 
         tr_htopk, tr_cptopk, tr_htoeod, tr_cptoeod, 
         _, tr_nnantopk, tr_nnantoeod) = triggering_rain(nhdry = 3, ts = ts_fill, ts_raw = ts, lsdate = lsdate)

        #event: 48 hours dry to peak or end of day (peak time and value are the same as above)
        (_, _, e_start, 
         e_htopk, e_cptopk, e_htoeod, e_cptoeod, 
         _, e_nnantopk, e_nnantoeod) = triggering_rain(nhdry = 48, ts = ts_fill, ts_raw = ts, lsdate = lsdate)

        #antecedent windows before the start of the triggering rain: (output key, hours)
        ante_windows = [("tr_ante24h", 24), 
                        ("tr_ante7d", 24*7), 
                        ("tr_ante14d", 24*14), 
                        ("tr_ante21d", 24*21), 
                        ("tr_ante28d", 24*28)]

        #exceedance probabilities of the hourly maximum intensity are not computed here

        #put everything into a dictionary to return; the key order defines the column order downstream
        rd = {"event_id":lspt.name, 
              "tr_start":tr_start,
              "tr_tpk":tr_tpk,
              "tr_ppk":tr_ppk,
              "tr_htopk":tr_htopk, 
              "tr_cptopk":tr_cptopk, 
              "tr_htoeod":tr_htoeod, 
              "tr_cptoeod":tr_cptoeod,
              "tr_nnantopk":tr_nnantopk, 
              "tr_nnan_toeod":tr_nnantoeod,
              "e_start":e_start, 
              "e_htopk":e_htopk, 
              "e_cptopk":e_cptopk, 
              "e_htoeod":e_htoeod, 
              "e_cptoeod":e_cptoeod,
              "e_nnantopk":e_nnantopk,
              "e_nnantoeod":e_nnantoeod}

        for key, hours in ante_windows:
            rd[key] = get_antecedent(ts = ts_fill, idxdt = tr_start, h = hours)

    print(lspt.name)

    return rd

### Get GSDR data

In [11]:
#get the landslide data for which we already have data in the public dataset for now 

def publicGSDR(lspt):
    
    """
    Tell whether the gauge archive of a landslide is one of the archives in the public dataset
    
    lspt = row (or any mapping) with a 'Folder' entry holding the archive name, e.g. 'US.zip'
    
    Returns True if 'Folder' is in the list below, False otherwise
    
    """
    
    public_archives = ('Belgium.zip', 'Finland.zip', 'Germany.zip', 
                       'Ireland.zip', 'ISD.zip', 'Japan.zip', 
                       'Norway.zip', 'UK.zip', 'US.zip')
    
    return lspt['Folder'] in public_archives


In [12]:
if mode == 'public':
    #flag the landslides whose gauge archive is public and keep only those
    lsdata['public'] = lsdata.apply(publicGSDR, axis = 1)
    lsdata_gsdr = lsdata[lsdata['public']].copy()
    
elif mode == 'all': 
    lsdata_gsdr = lsdata.copy()

else:
    #a misspelled mode used to fall through silently to the full dataset
    raise ValueError("mode must be 'public' or 'all', got " + repr(mode))

In [13]:
#sample a subset of points for testing
#lsdata_gsdr = lsdata_gsdr.sample(20, axis = 0).copy()

In [14]:
#number of landslide rows that will be processed
lsdata_gsdr.shape[0]

4735

In [15]:
#compute the rain metrics dictionary for every landslide row; warnings raised on the way are 
#captured by the context manager and not shown
with warnings.catch_warnings(record = True):
    lsdata_gsdr['rain_metrics'] = lsdata_gsdr.apply(get_rainmetrics, axis = 1, args = (gsdrdir,))

1
1
2
2
3
3
4
4
5
5
6
6
7
7
8
8
9
9
10
10
11
11
12
12
13
13
14
14
15
15
16
16
17
17
18
18
19
19
20
20
21
21
22
22
23
23
24
24
25
25
26
26
27
27
54
54
55
55
56
56
57
57
58
58
59
59
61
61
62
62
63
63
64
64
65
65
66
66
67
67
69
69
71
71
73
73
74
74
75
75
76
76
77
77
79
79
80
80
81
81
82
82
83
83
84
84
85
85
87
87
89
89
90
90
91
91
92
92
93
93
94
94
129
129
130
130
131
131
132
132
133
133
134
134
135
135
136
136
137
137
138
138
140
140
142
142
144
144
145
145
146
146
147
147
149
149
150
150
151
151
152
152
153
153
154
154
155
155
157
157
158
158
160
160
161
161
162
162
163
163
164
164
165
165
166
166
167
167
168
168
169
169
170
170
171
171
172
172
173
173
174
174
175
175
176
176
177
177
178
178
179
179
180
180
181
181
182
182
183
183
184
184
185
185
186
186
187
187
188
188
190
190
193
193
194
194
195
195
196
196
197
197
199
199
200
200
201
201
203
203
204
204
205
205
206
206
207
207
209
209
210
210
211
211
212
212
213
213
214
214
215
215
217
217
219
219
221
221
222
222
223
223
224
224
225


1911
1914
1914
1915
1915
1917
1917
1920
1920
1921
1921
1923
1923
1926
1926
1928
1928
1929
1929
1932
1932
1933
1933
1934
1934
1937
1937
1939
1939
1941
1941
1942
1942
1944
1944
1947
1947
1948
1948
1950
1950
1953
1953
1954
1954
1957
1957
1958
1958
1960
1960
1962
1962
1965
1965
1966
1966
1969
1969
1971
1971
1972
1972
1975
1975
1976
1976
1977
1977
1978
1978
1979
1979
1980
1980
1981
1981
1982
1982
1983
1983
1984
1984
1985
1985
1986
1986
1987
1987
1988
1988
1989
1989
1990
1990
1991
1991
1992
1992
1993
1993
1994
1994
1995
1995
1996
1996
1997
1997
1998
1998
1999
1999
2029
2029
2030
2030
2031
2031
2032
2032
2033
2033
2035
2035
2036
2036
2038
2038
2039
2039
2040
2040
2043
2043
2044
2044
2045
2045
2046
2046
2047
2047
2048
2048
2049
2049
2051
2051
2053
2053
2054
2054
2055
2055
2057
2057
2058
2058
2060
2060
2061
2061
2062
2062
2063
2063
2065
2065
2066
2066
2067
2067
2068
2068
2069
2069
2070
2070
2071
2071
2072
2072
2073
2073
2074
2074
2075
2075
2076
2076
2077
2077
2078
2078
2079
2079
2080
2080
2081


3726
3727
3727
3728
3728
3729
3729
3730
3730
3731
3731
3732
3732
3767
3767
3768
3768
3769
3769
3770
3770
3771
3771
3772
3772
3773
3773
3803
3803
3804
3804
3805
3805
3806
3806
3807
3807
3808
3808
3809
3809
3810
3810
3811
3811
3812
3812
3813
3813
3814
3814
3815
3815
3816
3816
3817
3817
3818
3818
3819
3819
3820
3820
3821
3821
3822
3822
3823
3823
3824
3824
3825
3825
3826
3826
3827
3827
3831
3831
3832
3832
3834
3834
3835
3835
3836
3836
3838
3838
3839
3839
3841
3841
3842
3842
3843
3843
3846
3846
3847
3847
3848
3848
3849
3849
3850
3850
3851
3851
3852
3852
3853
3853
3934
3934
3935
3935
3936
3936
3937
3937
3938
3938
3939
3939
3940
3940
3941
3941
3942
3942
3943
3943
3944
3944
3945
3945
3946
3946
3947
3947
3948
3948
3949
3949
3950
3950
3951
3951
3952
3952
3953
3953
3954
3954
3955
3955
3957
3957
3958
3958
3959
3959
3960
3960
3961
3961
3962
3962
3963
3963
3965
3965
3967
3967
3969
3969
3970
3970
3971
3971
3972
3972
3973
3973
3975
3975
3976
3976
3977
3977
3978
3978
3979
3979
3980
3980
3981
3981
3983


5225
5226
5226
5227
5227
5228
5228
5229
5229
5230
5230
5233
5233
5235
5235
5236
5236
5237
5237
5238
5238
5239
5239
5241
5241
5243
5243
5244
5244
5245
5245
5274
5274
5275
5275
5276
5276
5277
5277
5278
5278
5303
5303
5304
5304
5305
5305
5306
5306
5307
5307
5308
5308
5309
5309
5310
5310
5311
5311
5335
5335
5336
5336
5337
5337
5338
5338
5339
5339
5340
5340
5341
5341
5342
5342
5343
5343
5344
5344
5345
5345
5346
5346
5350
5350
5351
5351
5353
5353
5355
5355
5356
5356
5358
5358
5359
5359
5360
5360
5361
5361
5362
5362
5363
5363
5364
5364
5367
5367
5397
5397
5398
5398
5399
5399
5400
5400
5401
5401
5402
5402
5403
5403
5404
5404
5405
5405
5406
5406
5407
5407
5408
5408
5409
5409
5410
5410
5411
5411
5412
5412
5432
5432
5433
5433
5434
5434
5435
5435
5436
5436
5437
5437
5438
5438
5439
5439
5440
5440
5441
5441
5442
5442
5443
5443
5471
5471
5472
5472
5473
5473
5474
5474
5475
5475
5476
5476
5477
5477
5478
5478
5479
5479
5480
5480
5481
5481
5482
5482
5483
5483
5484
5484
5485
5485
5486
5486
5487
5487
5488


6882
6885
6885
6886
6886
6889
6889
6891
6891
6892
6892
6894
6894
6896
6896
6898
6898
6901
6901
6903
6903
6905
6905
6907
6907
6908
6908
6911
6911
6913
6913
6914
6914
6917
6917
6919
6919
6920
6920
6922
6922
6924
6924
6927
6927
6928
6928
6930
6930
6932
6932
6935
6935
6936
6936
6939
6939
6940
6940
6942
6942
6945
6945
6946
6946
6948
6948
6950
6950
6952
6952
6955
6955
6956
6956
6959
6959
6960
6960
6964
6964
6966
6966
6967
6967
6969
6969
6971
6971
6975
6975
6977
6977
6978
6978
6980
6980
6982
6982
6984
6984
6986
6986
6989
6989
6990
6990
6992
6992
6994
6994
6997
6997
6999
6999
7000
7000
7003
7003
7004
7004
7006
7006
7008
7008
7011
7011
7013
7013
7015
7015
7016
7016
7018
7018
7020
7020
7023
7023
7025
7025
7026
7026
7028
7028
7030
7030
7032
7032
7034
7034
7037
7037
7038
7038
7041
7041
7042
7042
7045
7045
7047
7047
7049
7049
7051
7051
7053
7053
7055
7055
7056
7056
7059
7059
7060
7060
7062
7062
7064
7064
7066
7066
7068
7068
7071
7071
7072
7072
7074
7074
7076
7076
7077
7077
7080
7080
7081
7081
7083


8658
8659
8659
8660
8660
8661
8661
8662
8662
8663
8663
8664
8664
8665
8665
8666
8666
8667
8667
8668
8668
8669
8669
8670
8670
8671
8671
8672
8672
8673
8673
8674
8674
8675
8675
8676
8676
8677
8677
8678
8678
8679
8679
8687
8687
8694
8694
8695
8695
8696
8696
8697
8697
8698
8698
8699
8699
8700
8700
8701
8701
8702
8702
8703
8703
8704
8704
8705
8705
8706
8706
8707
8707
8708
8708
8709
8709
8710
8710
8711
8711
8712
8712
8713
8713
8714
8714
8715
8715
8716
8716
8717
8717
8718
8718
8719
8719
8720
8720
8721
8721
8722
8722
8723
8723
8724
8724
8725
8725
8726
8726
8727
8727
8728
8728
8729
8729
8730
8730
8731
8731
8732
8732
8733
8733
8734
8734
8735
8735
8736
8736
8737
8737
8738
8738
8739
8739
8740
8740
8741
8741
8742
8742
8743
8743
8744
8744
8745
8745
8746
8746
8747
8747
8748
8748
8749
8749
8750
8750
8751
8751
8752
8752
8753
8753
8755
8755
8756
8756
8757
8757
8758
8758
8759
8759
8760
8760
8761
8761
8762
8762
8763
8763
8764
8764
8765
8765
8766
8766
8767
8767
8768
8768
8798
8798
8799
8799
8800
8800
8801


In [16]:
#expand the per-landslide metric dictionaries into one column per metric
joindf = pd.DataFrame(lsdata_gsdr['rain_metrics'].tolist())

#give the metrics the same row labels as the landslide table, put them side by side, 
#then drop the dictionary column that is no longer needed
metrics_aligned = joindf.set_index(lsdata_gsdr.index)

lsdata_gsdr_rain = pd.concat([lsdata_gsdr, metrics_aligned], axis = 1).drop(columns = ['rain_metrics'])


In [19]:
#keep only the alphanumeric characters of each city name (no white space, special characters, etc) for R

lsdata_gsdr_rain['city'] = lsdata_gsdr_rain['UC_NM_MN'].map(
    lambda cityname: ''.join(filter(str.isalnum, cityname)))



In [23]:
#save the landslide table with the extracted rain metrics in the results directory
rain_csv_path = resultsdir + 'lsdata_gsdr_rain.csv'
lsdata_gsdr_rain.to_csv(rain_csv_path)

In [10]:
#lsdata_gsdr_rain = pd.read_csv(resultsdir + 'lsdata_gsdr_rain.csv')

In [11]:
#replace the row labels with a fresh 0..n-1 index (the old labels are discarded)
lsdata_gsdr_rain = lsdata_gsdr_rain.reset_index(drop = True)

### Get annual block maxima from the gauges associated with landslides

In [12]:
def prep_whole_ts(station, gsdrdir):
    
    """
    Function to read the precipitation time series from a gauge, convert it to UTC,
    interpolate missing values
    
    station = row of a dataframe that contains columns: 
        - 'Folder' - zip archive containing the data (e.g. US.zip), appended to gsdrdir
        - 'OriginalID' - name of station; the archive member '<OriginalID>.txt' is read
    gsdrdir = directory containing the GSDR subfolders
    
    
    Returns: 
    ts = original time series, sorted by time (duplicated timestamps are kept)
    ts_fill = original time series, interpolated, duplicated timestamps removed   
    
    Side effect: the station text file is extracted from the archive into the current working directory.
    
    """
    
    #get the filename for the station we need
    
    zipfn = gsdrdir + station['Folder']
    
    #context manager so that the archive handle is released even if the extraction fails
    with zipfile.ZipFile(zipfn, "r") as z:
        tsfn = z.extract(station['OriginalID'] + '.txt')
    
    s = intense.readIntense(tsfn)

    #all-False mask: if a local time is ambiguous, assign it to standard time
    not_dst = np.zeros(len(s.data), dtype = bool)

    if s.time_zone == 'UTC': 
        #time zone aware in UTC
        #if a time doesn't exist, shift to next time forward
        ts = s.data.tz_localize('UTC', ambiguous = not_dst, nonexistent = 'shift_forward')

    else:
        #find the right time zone for the gauge (the timezone finder is only built when it is needed)
        stz = TimezoneFinder().timezone_at(lng=s.longitude, lat=s.latitude)
        
        #first, localize to local time zone, then convert to UTC
        ts = s.data.tz_localize(stz, ambiguous = not_dst, nonexistent = 'shift_forward')
        ts = ts.tz_convert('UTC')

    #sort index, just in case for some reason it wasn't already in sorted, or pandas thinks it isn't
    ts = ts.sort_index()
    
    #interpolate to fill nans in the time series using the pandas built in interpolation method
    ts_fill = ts.interpolate(method = 'time')

    #remove any duplicates by keeping the first duplicated entry only
    #CAUTION: this could create some problems where we miss an hour of rainfall
    ts_fill = ts_fill[~ts_fill.index.duplicated()]

    return ts, ts_fill

In [13]:
#distinct gauge ids, in order of first appearance: every station that has a landslide associated with it
uniquestations = pd.unique(lsdata_gsdr_rain['NewID'])

In [14]:
#durations to extract block maxima for. Each value is used as the rolling window length (number of 
#consecutive observations) and is written to the 'duration(h)' output column
durations = [1, 3, 6, 12, 24, 48, 100, 200, 500, 1000] #durations to extract block maxima for

In [15]:
#number of distinct stations that will be processed
uniquestations.shape[0]

579

In [16]:
with warnings.catch_warnings(record = True):
#loop through all stations associated with a landslide and extract annual block maxima at a range of durations
#(warnings raised inside the loop are captured by the context manager and not shown)

    #station attributes that are copied onto every output row
    station_cols = ['ID_HDC_G0', 
                    'UC_NM_MN', 
                    'Folder', 
                    'OriginalID', 
                    'NewID', 
                    'Latitude',
                    'Longitude',
                    'Recordlength(hours)',
                    'Recordlength(years)',
                    'StartDate',
                    'EndDate',
                    'Missingdata(%)',
                    'geometry_y']

    #one frame per station, concatenated once after the loop. Concatenating inside the loop 
    #re-copies everything collected so far on every iteration
    station_frames = []

    #loop over stations

    for s, station_id in enumerate(uniquestations):
        
        print(s)

        #first landslide row of this station carries the station attributes
        station = lsdata_gsdr_rain[lsdata_gsdr_rain['NewID'] == station_id].iloc[0][station_cols]

        #get time series

        ts, ts_fill = prep_whole_ts(station, gsdrdir)

        #number of non-nan observations per year in the raw time series; does not depend on the duration
        raw_notna_per_year = ts.resample('Y').count().values

        #extract block maxima at a range of durations

        duration_frames = []

        for duration in durations:

            #moving window on the raw time series: right index is sum of previous observations,
            #if there are nans within the window, returns nan
            mw = ts.rolling(duration).sum()

            #moving window on the interpolated time series
            mw_fill = ts_fill.rolling(duration).sum()

            #annual max, ignores nans (could be missing data, will still get max).
            #to_frame names the column explicitly; pd.DataFrame(series, columns = [...]) gives an 
            #all-nan column whenever the series carries a name of its own
            block_max_df = mw.resample('Y').max().to_frame(name = 'raw_block_max')
            block_max_df['fill_block_max'] = mw_fill.resample('Y').max().values

            #record the number of non-nan observations in the moving window (all other hours in the year are then nan)
            block_max_df['raw_notnainmw'] = mw.resample('Y').count().values

            #record the number of non-nan observations in the raw time series (all other hours in the year are then nan)
            block_max_df['raw_notnaints'] = raw_notna_per_year

            block_max_df['duration(h)'] = duration
            block_max_df['year'] = block_max_df.index.year

            duration_frames.append(block_max_df)

        stationdf = pd.concat(duration_frames)

        #assign information about the station: repeat the single station row once per output row
        stationinfo = pd.DataFrame([station])

        stationinfo = stationinfo.loc[stationinfo.index.repeat(len(stationdf))]

        stationinfo.set_index(stationdf.index, inplace = True)

        stationdf = pd.concat([stationinfo, stationdf], axis = 1)

        station_frames.append(stationdf)
            
        print(s)

    maindf = pd.concat(station_frames)

0
0
1
1
2
2
3
3
4
4
5
5
6
6
7
7
8
8
9
9
10
10
11
11
12
12
13
13
14
14
15
15
16
16
17
17
18
18
19
19
20
20
21
21
22
22
23
23
24
24
25
25
26
26
27
27
28
28
29
29
30
30
31
31
32
32
33
33
34
34
35
35
36
36
37
37
38
38
39
39
40
40
41
41
42
42
43
43
44
44
45
45
46
46
47
47
48
48
49
49
50
50
51
51
52
52
53
53
54
54
55
55
56
56
57
57
58
58
59
59
60
60
61
61
62
62
63
63
64
64
65
65
66
66
67
67
68
68
69
69
70
70
71
71
72
72
73
73
74
74
75
75
76
76
77
77
78
78
79
79
80
80
81
81
82
82
83
83
84
84
85
85
86
86
87
87
88
88
89
89
90
90
91
91
92
92
93
93
94
94
95
95
96
96
97
97
98
98
99
99
100
100
101
101
102
102
103
103
104
104
105
105
106
106
107
107
108
108
109
109
110
110
111
111
112
112
113
113
114
114
115
115
116
116
117
117
118
118
119
119
120
120
121
121
122
122
123
123
124
124
125
125
126
126
127
127
128
128
129
129
130
130
131
131
132
132
133
133
134
134
135
135
136
136
137
137
138
138
139
139
140
140
141
141
142
142
143
143
144
144
145
145
146
146
147
147
148
148
149
149
150
150
151
151
152


In [24]:
#keep only the alphanumeric characters of each city name (no white space, special characters, etc) for R

maindf['city'] = maindf['UC_NM_MN'].map(
    lambda cityname: ''.join(filter(str.isalnum, cityname)))


In [25]:
#save the annual block maxima of all stations in the results directory
block_maxima_csv_path = resultsdir + 'annual_block_maxima.csv'
maindf.to_csv(block_maxima_csv_path)

In [5]:
#maindf = pd.read_csv(resultsdir + 'annual_block_maxima.csv')