# 02 - Identify Gauges

This notebook:
- Identifies all gauges in the GSDR within 25 km of each landslide and records their distance to the landslide.
- Identifies, for further analysis, all cities with at least 5 landslides that have nominal rainfall data coverage.

The output of this notebook is: 
- 'ls_get_gsdr_simple.csv', a CSV file listing the landslides together with all nearby GSDR gauges from which rainfall data should be extracted. This file is read by 03_ExtractGSDR.

**Data required**

Processed data: ls_urban_ts_rf_u.pkl

Original data: GSDR metadata. This data is not public, please contact the authors in case of questions.



In [None]:
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pyproj
import shapely
from shapely.ops import transform

In [None]:
#load the landslide table: rainfall-triggered urban landslides with a known day
landslide_pickle = 'ls_urban_ts_rf_u.pkl'
landslides = pd.read_pickle(landslide_pickle)

In [None]:
#load the GSDR gauge metadata table

#the GSDR metadata file is not public.  Please contact the authors in case of questions.
gsdrmeta = pd.read_excel(
    "../QC_Summary_FL13_RB13_01.xlsx",
    sheet_name = "QC_Summary_FL13_RB13_01",
    header = 1,        #the column labels are taken from row 1 (0-indexed) of the sheet
    usecols = "A:J",   #only spreadsheet columns A to J are read
)

#parse the record start and end dates as datetimes
gsdrmeta['Start Date'] = pd.to_datetime(gsdrmeta['Start Date'])
gsdrmeta['End Date'] = pd.to_datetime(gsdrmeta['End Date'])

#turn the table into a geopandas dataframe with one WGS84 point per row,
#built from the Longitude (x) and Latitude (y) columns
gauge_points = gpd.points_from_xy(gsdrmeta['Longitude'], gsdrmeta['Latitude'])
gsdrmeta = gpd.GeoDataFrame(gsdrmeta, geometry = gauge_points, crs = "EPSG:4326")


In [None]:
#replace the column names with versions that contain no white space
#(the new names are paired with the existing columns by position)
nospace_names = [
    'Folder', 'OriginalID', 'NewID', 'Latitude', 'Longitude',
    'Recordlength(hours)', 'Recordlength(years)',
    'StartDate', 'EndDate', 'Missingdata(%)', 'geometry',
]
gsdrmeta.rename(columns = dict(zip(gsdrmeta.columns, nospace_names)), inplace = True)

### Identify all gauges within 25 km of a landslide

In [None]:
#functions to find the GSDR gauges near a landslide

def _placeholder_gauge_row(columns, flag, lsidx):
    """
    Build the single-row dataframe returned when a landslide has no usable gauge.

    Every column in `columns` is set to None; 'flag' records why no gauge is
    reported and 'lsidx' identifies the landslide.
    """
    row = dict.fromkeys(columns)
    row['flag'] = flag

    rowdf = pd.DataFrame([row])
    rowdf['lsidx'] = lsidx

    return rowdf


def get_stations(lspt, 
                gsdrmeta,
                buffer_dist = 25000):
    """
    Function to get all of the GSDR stations within a defined buffer distance of a landslide point

    lspt = a landslide record with a DATE and a point geometry in WGS84; its .name is
           stored in the 'lsidx' column of the result
    gsdrmeta = geopandas dataframe of stations with StartDate, EndDate, and a spatial location in WGS84
    buffer_dist = search radius in meters

    Returns a dataframe with an 'lsidx' column and a 'flag' column:
    - flag 'coverage': one row per station inside the buffer whose record strictly
      contains the landslide DATE (StartDate < DATE < EndDate), with the station
      metadata, 'coverage' (True) and 'station_dist' (distance to the landslide,
      measured in the landslide-centered projection, i.e. in the units of buffer_dist)
    - flag 'no close gages': a single row with all metadata columns set to None
      because no station lies inside the buffer
    - flag 'no nominal coverage': a single row with all metadata columns set to None
      because no station inside the buffer has a record containing the landslide DATE
    """

    wgs84 = pyproj.CRS('EPSG:4326')

    #define azimuthal equidistant crs centered on the landslide, so the buffer and
    #the station distances are computed in a projection centered on this point
    aeqd = pyproj.Proj(proj='aeqd', ellps='WGS84', datum='WGS84', lat_0=lspt.geometry.y, lon_0=lspt.geometry.x).srs

    #reproject the landslide point
    project = pyproj.Transformer.from_crs(wgs84, aeqd, always_xy=True).transform
    lspt_a = transform(project, lspt.geometry)

    #buffer the landslide point
    lspt_ab = lspt_a.buffer(buffer_dist)

    #reproject the station data
    ga = gsdrmeta.to_crs(aeqd)

    #get the positions of all stations within buffer_dist of the landslide point
    sidx = ga.sindex.query(lspt_ab, predicate = 'intersects')

    #no station inside the buffer
    if len(sidx) == 0:
        return _placeholder_gauge_row(gsdrmeta.columns, 'no close gages', lspt.name)

    #these are the stations within the buffer distance of the landslide point
    gs = ga.iloc[sidx].copy()

    #is there theoretically coverage?
    gs['coverage'] = (lspt['DATE'] > gs['StartDate']) & (lspt['DATE'] < gs['EndDate'])

    #stations are nearby, but none has a record containing the landslide date
    if not gs['coverage'].any():
        return _placeholder_gauge_row(gsdrmeta.columns, 'no nominal coverage', lspt.name)

    #only stations that theoretically have coverage
    gsc = gs[gs['coverage']].copy()

    #record the distance from each station to the landslide point; this must be
    #done before projecting back to WGS84, while the coordinates are still projected
    gsc['station_dist'] = gsc.distance(lspt_a)

    #project stations back to WGS84
    gsc = gsc.to_crs(wgs84)

    gsc['flag'] = 'coverage'
    gsc['lsidx'] = lspt.name

    return gsc
    

In [None]:
#get all gauges within 25 km of each landslide and compile into one dataframe

#the per-landslide results are collected in a list and concatenated once at the end;
#concatenating inside the loop would re-copy the growing dataframe on every iteration
gage_frames = []

for l in range(len(landslides)):

    lspt = landslides.iloc[l]

    tempdf = get_stations(lspt, 
                gsdrmeta,
                buffer_dist = 25000)

    gage_frames.append(tempdf)

    print('end {}/{}'.format(l, len(landslides)))

#one row per landslide/gauge pair, or one placeholder row for a landslide without a usable gauge
gageinfo = pd.concat(gage_frames, axis = 0)

In [None]:
#attach the landslide attributes to every row of gageinfo: a right join keeps all
#gageinfo rows and matches their 'lsidx' values against the landslides index
ls_gages = landslides.merge(
    gageinfo,
    left_index = True,
    right_on = 'lsidx',
    how = 'right',
)



In [None]:
#keep only the landslide/gauge rows where the gauge nominally covers the landslide date
#(the explicit comparison with True also drops rows where 'coverage' is missing)
ls_gages_cov = ls_gages.loc[ls_gages['coverage'] == True]

### Identify cities with at least 5 landslides with nominal coverage

In [None]:
#one row per landslide that has at least one nearby gauge with coverage
#(the first row found for each 'lsidx' is kept)
ls_gages_cov_one = ls_gages_cov.drop_duplicates(subset = 'lsidx', keep = 'first')

In [None]:
#count how many landslides with coverage are in each city (ID_HDC_G0), sorted ascending
#only the 'lsidx' column is counted, instead of counting every column and discarding the rest

nls_per_city = ls_gages_cov_one.groupby('ID_HDC_G0')['lsidx'].count().sort_values()

In [None]:
#city IDs (the index of nls_per_city) with at least 5 landslides that have coverage
idgt5 = nls_per_city.loc[nls_per_city.ge(5)].index.values

In [None]:
#flag which landslide/gauge pairs are in a city with at least 5 landslides
#(vectorized membership test instead of a row-by-row apply)
ls_gages['gt5'] = ls_gages['ID_HDC_G0'].isin(idgt5)

In [None]:
#keep the landslide/gauge pairs that are in a city with at least 5 landslides
#and whose gauge has nominal coverage
ls_get_gsdr = ls_gages.loc[ls_gages['gt5'] & (ls_gages['coverage'] == True)]

In [None]:
#pickle for safekeeping
#ls_get_gsdr.to_pickle('20220908_ls_get_gsdr.pkl')

In [None]:
#keep only the columns needed to extract rainfall data

gsdr_extract_columns = [
    #landslide identifiers, city and date
    'inventory', 'src_index', 'inventory_id', 'inventory_id_name', 'lsidx',
    'ID_HDC_G0', 'UC_NM_MN', 'date_local_midnight_utc',
    #gauge metadata
    'Folder', 'OriginalID', 'NewID', 'Latitude', 'Longitude',
    'Recordlength(hours)', 'Recordlength(years)',
    'StartDate', 'EndDate', 'Missingdata(%)',
    #geometry_y is the geometry column contributed by gageinfo in the merge
    'geometry_y',
    #landslide/gauge pairing information
    'flag', 'coverage', 'station_dist',
]

ls_get_gsdr_simple = ls_get_gsdr.loc[:, gsdr_extract_columns]

In [None]:
#write the landslide/gauge table to csv
ls_get_gsdr_simple.to_csv(path_or_buf = 'ls_get_gsdr_simple.csv')