# Process Globe-LFMC samples and extract DEM Data
Extracts the samples for locations in the CONUS from the Globe-LFMC spreadsheet and adds the normalised DEM, climate zone, and other auxiliary data. DEM data is from the GEE SRTM DEM product, extracted using the MODIS projection and scale. Sites within the same MODIS pixel are merged. The following files are created:
- `LFMC_CONUS.csv`: CONUS data extracted from the Globe-LFMC dataset
- `LFMC_sites.csv`: sites extracted from the Globe-LFMC CONUS data and augmented with normalised DEM and location data
- `LFMC_samples.csv`: Globe-LFMC CONUS sample data augmented with auxiliary variables

### Notes
1. The `Globe-LFMC-v2.xlsx` should exist in the `INPUT_DIR` directory - by default, a sub-directory of `DATA_DIR`
2. The tiff containing the Koppen climate zone data (`Beck_KG_V1_present_0p0083.tif` available from https://figshare.com/articles/dataset/Present_and_future_K_ppen-Geiger_climate_classification_maps_at_1-km_resolution/6396959/2) should also be in `INPUT_DIR`, as should either the `legend.txt` file (available from the same site) or `Climate_zones.csv`. If `Climate_zones.csv` doesn't exist, it needs to be created from `legend.txt` by uncommenting and running the first code cell under "Climate zone processing".
3. `EXTRACT_NAME` is a sub-directory of `DATA_DIR`. It will be created if it doesn't exist. All data files created by this and other data extraction notebooks will be located in sub-directories of this directory.
4. `LFMC_CONUS.csv` is created in the `INPUT_DIR` directory.
5. All other created files are CSVs and stored in the `SAMPLE_DIR` directory, by default a sub-directory of `DATA_DIR/EXTRACT_NAME`.
6. The samples data output by this code is further processed by the MODIS extraction code to remove the snow samples.

In [1]:
import os
import numpy as np
import pandas as pd
import time
from datetime import datetime
from datetime import timedelta

import initialise
import common
from data_extract_utils import normalise_dem
from data_extract_utils import extract_koppen_data
from model_utils import normalise

Define input and output files

In [2]:
# Globe-LFMC file and sheet name
GLOBE_LFMC = os.path.join(common.SOURCE_DIR, "Globe-LFMC-v2.xlsx")
SHEET_NAME = "LFMC data"

# File Names
LFMC_RAW = os.path.join(common.SOURCE_DIR, "LFMC_CONUS.csv")                   # CSV of CONUS data extracted from the Globe-LFMC dataset
KOPPEN_TIF = os.path.join(common.SOURCE_DIR, 'Beck_KG_V1_present_0p0083.tif')  # Tiff of Koppen climate zone values
LEGEND_FILE = os.path.join(common.SOURCE_DIR, 'legend.txt')                    # Text file with Koppen climate zone legend
KOPPEN_LEGEND = os.path.join(common.SOURCE_DIR, 'Climate_zones.csv')           # CSV of Koppen climate zone legend

# Create the output directory (and any missing parents) if needed.
# exist_ok=True avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(common.DATASETS_DIR, exist_ok=True)


Other constants/parameters

In [3]:
# DEM Product, projection and resolution
# DEM_PRODUCT is the GEE asset id loaded with ee.Image() by the DEM extraction functions.
# DEM_PROJ and DEM_SCALE are passed to getRegion() by sites_by_point only;
# sites_by_pixel takes its projection and scale as arguments instead.
# NOTE(review): DEM_SCALE is presumably in metres - confirm against the GEE getRegion docs.
DEM_PRODUCT = 'USGS/SRTMGL1_003'
DEM_PROJ = "EPSG:4326"
DEM_SCALE = 30

# Floating point precision
# Number of decimal places used when rounding derived values in this notebook
FLOAT_PRE = 5

Initialise Google Earth Engine

In [4]:
# Import the Earth Engine API and initialise it; sites_by_point and
# sites_by_pixel below make their requests through this module.
import ee
ee.Initialize()

## Point-based Processing
Extracts the DEM data from GEE using the native DEM projection and resolution. Keeps the sample site latitude and longitude, and adds the elevation/slope/aspect.
- Parameter:
 - sites: Dataframe of sample sites
- Returns: Dataframe of sites, latitude and longitude and the added elevation/slope/aspect attributes

In [5]:
def sites_by_point(sites):
    """Extract elevation, slope and aspect for each site using the DEM_SCALE/DEM_PROJ settings.

    Parameters
    ----------
    sites : pandas.DataFrame
        Sample sites, one row per site, with ``Site``, ``Latitude`` and
        ``Longitude`` columns.

    Returns
    -------
    pandas.DataFrame
        The rows of ``sites`` (original latitude/longitude kept) with integer
        ``Elevation``, ``Slope`` and ``Aspect`` columns added.
    """
    dem_image = ee.Terrain.products(ee.Image(DEM_PRODUCT))
    dem_col = ee.ImageCollection(dem_image)
    regions = [
        dem_col.getRegion(ee.Geometry.Point(lon, lat), DEM_SCALE, DEM_PROJ)
        for lon, lat in zip(sites["Longitude"], sites["Latitude"])
    ]
    # All regions are fetched in a single getInfo() call. Each result is used
    # as [header_row, data_row]: the first header supplies the column names.
    dem_list = ee.List(regions).getInfo()
    dem_data = pd.DataFrame([item[1] for item in dem_list], columns=dem_list[0][0])
    # Results come back in the same order as the requested points, so assign
    # the site ids by position. Assigning the Series itself would align on the
    # index and misplace ids whenever `sites` does not have a 0..n-1 index.
    dem_data["id"] = sites["Site"].to_numpy()
    dem_data = dem_data.rename(columns={"id": "Site"})
    terrain = {"elevation": "Elevation", "slope": "Slope", "aspect": "Aspect"}
    dem_df = (
        sites.merge(dem_data[["Site"] + list(terrain)], on="Site")
        .round({band: 0 for band in terrain})
        .astype({band: "int32" for band in terrain})
        # Rename by name rather than overwriting all labels positionally, so
        # the columns of `sites` keep their own names whatever their order.
        .rename(columns=terrain)
    )
    return dem_df

## Pixel-based Processing
Extracts the DEM data at the requested projection and resolution. Terrain.products adds the slope and aspect. A reducer is used so terrain product info is added before resampling.
- Parameters:
 - sites: dataframe of sampling sites
 - scale/proj: the required scale/proj (e.g. MODIS scale/proj - or map scale/proj)
 - maxPixels: Reducer parameter specifying the maximum number of DEM pixels to use to compute each down-sampled pixel. Doesn't need to be exact but make sure it's large enough - 512 is good for MODIS
- Returns: Dataframe of sites with latitude and longitude set to the pixel centroid as returned by GEE and the added elevation/slope/aspect attributes

In [6]:
def sites_by_pixel(sites, scale, proj, maxPixels):
    """Extract elevation, slope and aspect for each site at a requested scale/projection.

    The terrain products are computed on the DEM first and then combined with
    a mean reducer, so slope and aspect are derived before any down-sampling.

    Parameters
    ----------
    sites : pandas.DataFrame
        Sample sites, one row per site, with ``Site``, ``Latitude`` and
        ``Longitude`` columns.
    scale, proj
        Scale and projection passed to ``getRegion`` (e.g. the MODIS ones).
    maxPixels : int
        ``maxPixels`` argument of ``reduceResolution``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Site``, ``Longitude``, ``Latitude``, ``Elevation``, ``Slope``
        and ``Aspect``. Longitude/latitude are the values returned by GEE, not
        the ones in ``sites``; the terrain columns are rounded to int32.
    """
    dem_image = ee.Terrain.products(ee.Image(DEM_PRODUCT)).reduceResolution(
        ee.Reducer.mean(), maxPixels=maxPixels)
    dem_col = ee.ImageCollection(dem_image)
    regions = [
        dem_col.getRegion(ee.Geometry.Point(lon, lat), scale, proj)
        for lon, lat in zip(sites["Longitude"], sites["Latitude"])
    ]
    # All regions are fetched in a single getInfo() call. Each result is used
    # as [header_row, data_row]: the first header supplies the column names.
    dem_list = ee.List(regions).getInfo()
    dem_data = pd.DataFrame([item[1] for item in dem_list], columns=dem_list[0][0])
    # Results come back in the same order as the requested points, so assign
    # the site ids by position. Assigning the Series itself would align on the
    # index and misplace ids whenever `sites` does not have a 0..n-1 index.
    dem_data["id"] = sites["Site"].to_numpy()
    # Rename by name instead of overwriting all labels positionally, so the
    # result does not depend on the order of the columns GEE returns.
    dem_data = dem_data.rename(columns={
        "id": "Site",
        "longitude": "Longitude",
        "latitude": "Latitude",
        "elevation": "Elevation",
        "slope": "Slope",
        "aspect": "Aspect",
    })
    terrain_cols = ["Elevation", "Slope", "Aspect"]
    # Selecting the wanted columns drops "time" and "hillshade".
    dem_df = (
        dem_data[["Site", "Longitude", "Latitude"] + terrain_cols]
        .round({col: 0 for col in terrain_cols})
        .astype({col: "int32" for col in terrain_cols})
    )
    return dem_df

## Main Processing
- If the LFMC_RAW file already exists, load it.
- Otherwise extract Globe LFMC data from the excel workbook sheet and save to the LFMC_RAW file.

In [7]:
# Reuse the cached CONUS extract if it exists; otherwise build it from the
# Globe-LFMC workbook and cache it for later runs.
if os.path.exists(LFMC_RAW):
    # NOTE(review): the dtype keys are integers - presumably column positions
    # in the CSV; confirm they still line up with the columns cast by name in
    # the else branch below.
    LFMC_data = pd.read_csv(LFMC_RAW, index_col=0, float_precision="high", parse_dates=["Sampling date"],
                           dtype={8: str, 10: np.int32, 11: np.int16, 14: np.int16, 23: str})
else:
    # Read the sheet and drop rows that are entirely empty
    LFMC_data = pd.read_excel(GLOBE_LFMC, SHEET_NAME).dropna(how="all")
    # Keep USA rows other than Alaska, sampled on or after common.START_DATE
    LFMC_data = LFMC_data[(LFMC_data.Country == "USA")
                          & (LFMC_data["State/Region"] != "Alaska")
                          & (LFMC_data["Sampling date"] >= common.START_DATE)]
    # The CSV is written before the integer casts below are applied
    LFMC_data.to_csv(LFMC_RAW)
    LFMC_data = LFMC_data.astype(dtype={'Sampling year': np.int32, 'Protocol': np.int16, 'Units': np.int16})
LFMC_data

### Site processing
Extract the unique sites from the Globe-LFMC data

In [9]:
# The site id is the sample ID with its final "_<suffix>" removed,
# e.g. "C4_1_3" -> "C4_1". `n` is passed by keyword because the positional
# form was deprecated in pandas 1.x and raises a TypeError from pandas 2.0.
LFMC_data["Site"] = LFMC_data["ID"].str.rsplit("_", n=1, expand=True)[0]
# One row per distinct (Site, Latitude, Longitude) combination
sites = LFMC_data[["Site", "Latitude", "Longitude"]].drop_duplicates().reset_index(drop=True)
sites

Unnamed: 0,Site,Latitude,Longitude
0,C4_1,40.214300,-112.217480
1,C4_2,40.213630,-112.213950
2,C4_3,40.210800,-112.207900
3,C4_4,40.211510,-112.212230
4,C4_5,40.212460,-112.212860
...,...,...,...
942,C16_11,34.123811,-118.583219
943,C16_12,34.124407,-118.583198
944,C16_13,34.080401,-118.557810
945,C16_14,34.054041,-118.805703


Retrieve the DEM data from GEE - run either `sites_by_pixel` (pixel mode) or `sites_by_point` (point mode)

In [10]:
# Pixel mode: DEM values for each site on the common.SCALE / common.PROJ grid
dem_df = sites_by_pixel(sites, scale=common.SCALE, proj=common.PROJ, maxPixels=512)
dem_df

Unnamed: 0,Site,Longitude,Latitude,Elevation,Slope,Aspect
0,C4_1,-112.218682,40.214582,1572,2,178
1,C4_2,-112.213226,40.214582,1577,2,173
2,C4_3,-112.206327,40.210415,1582,2,188
3,C4_4,-112.211783,40.210415,1576,2,187
4,C4_5,-112.211783,40.210415,1576,2,187
...,...,...,...,...,...,...
942,C16_11,-118.579737,34.122915,402,13,204
943,C16_12,-118.579737,34.122915,402,13,204
944,C16_13,-118.556577,34.081249,389,15,229
945,C16_14,-118.802427,34.052082,442,19,109


Normalise the DEM data. The sites data is saved later, once the climate zones have been added. Note: sites with same latitude/longitude are *not* merged yet

In [11]:
# normalise_dem is given the DEM data indexed by Site; the index is turned
# back into an ordinary "Site" column afterwards.
dem_norm = normalise_dem(
    dem_df.set_index('Site'),
    input_columns=['Longitude', 'Latitude', 'Elevation', 'Slope', 'Aspect'],
    precision=FLOAT_PRE,
).reset_index()
dem_norm

Unnamed: 0,Site,Longitude,Latitude,Elevation,Slope,Aspect,Long_sin,Long_cos,Lat_norm,Aspect_sin,Aspect_cos
0,C4_1,-112.21868,40.21458,0.26200,0.02222,178,-0.92575,-0.37814,0.72341,-0.03490,0.99939
1,C4_2,-112.21323,40.21458,0.26283,0.02222,173,-0.92578,-0.37805,0.72341,-0.12187,0.99255
2,C4_3,-112.20633,40.21042,0.26367,0.02222,188,-0.92583,-0.37794,0.72339,0.13917,0.99027
3,C4_4,-112.21178,40.21042,0.26267,0.02222,187,-0.92579,-0.37803,0.72339,0.12187,0.99255
4,C4_5,-112.21178,40.21042,0.26267,0.02222,187,-0.92579,-0.37803,0.72339,0.12187,0.99255
...,...,...,...,...,...,...,...,...,...,...,...
942,C16_11,-118.57974,34.12292,0.06700,0.14444,204,-0.87815,-0.47838,0.68957,0.40674,0.91355
943,C16_12,-118.57974,34.12292,0.06700,0.14444,204,-0.87815,-0.47838,0.68957,0.40674,0.91355
944,C16_13,-118.55658,34.08125,0.06483,0.16667,229,-0.87835,-0.47803,0.68934,0.75471,0.65606
945,C16_14,-118.80243,34.05208,0.07367,0.21111,109,-0.87629,-0.48179,0.68918,-0.94552,0.32557


### Date processing
Create dataframe with dates and normalised day-of-year

In [12]:
# Daily dates from START_DATE up to, but not including, END_DATE.
# The range is built inclusively and END_DATE is then filtered out: the
# `closed` argument of date_range was removed in pandas 2.0, and its
# replacement `inclusive` only exists from pandas 1.4, whereas this form
# behaves the same on every version.
days = pd.date_range(common.START_DATE, common.END_DATE)
days = days[days < pd.Timestamp(common.END_DATE)]
# Day-of-year mapped from the range (1, 366) onto (-pi, pi), then encoded as sine/cosine
doy = pd.Series(normalise(days.dayofyear, method='range', range=(1, 366), out_range=(-np.pi, np.pi)))
days_df = pd.DataFrame({"Date": days,
                        "Day_sin": doy.transform(np.sin).round(FLOAT_PRE),
                        "Day_cos": doy.transform(np.cos).round(FLOAT_PRE)})
days_df

Unnamed: 0,Date,Day_sin,Day_cos
0,2000-03-01,-0.85876,-0.51237
1,2000-03-02,-0.86746,-0.49751
2,2000-03-03,-0.87589,-0.48251
3,2000-03-04,-0.88407,-0.46736
4,2000-03-05,-0.89198,-0.45207
...,...,...,...
6875,2018-12-27,0.08596,-0.99630
6876,2018-12-28,0.06880,-0.99763
6877,2018-12-29,0.05162,-0.99867
6878,2018-12-30,0.03442,-0.99941


### Climate zone processing

#### Create the Koppen legend csv file
If the `KOPPEN_LEGEND` file doesn't exist, uncomment and run the following cell. This will create it from the `legend.txt` file that can be downloaded with the climate zones tiff.

In [None]:
# legend = {}
# count = 0
# with open(LEGEND_FILE) as fp:
#     for ln in fp:
#         line = ln.split(':')
#         number = line[0].strip()
#         if number.isnumeric():
#             count += 1
#             key = int(line[0].strip())
#             parts = line[1].split('[')
#             colour = parts[1].strip().strip(']').split(' ')
#             code = parts[0].strip()[:3]
#             descr = parts[0].strip()[5:]
#             value = {'Number': number, 'Code': code, 'Description': descr, 'Red': colour[0], 'Green': colour[1], 'Blue': colour[2]}
#             legend[key] = value
# legend_df = pd.DataFrame.from_dict(legend, orient='index')
# legend_df.to_csv(KOPPEN_LEGEND, index=False)

In [None]:
#### Extract climate zones for sites
Extract the climate zone for each site and add to the sites data.

In [14]:
cz_columns = ['Czone1', 'Czone2', 'Czone3']
# extract_koppen_data is called for its effect on `sites`: its return value is
# not used, and the climate zone columns are read back from `sites` below.
extract_koppen_data(KOPPEN_TIF, KOPPEN_LEGEND, sites, loc_columns=['Longitude', 'Latitude'], cz_columns=cz_columns)
site_zones = sites[['Site'] + cz_columns]
# Add the climate zones to the normalised DEM data and save the sites file
dem_norm = dem_norm.merge(site_zones, on='Site')
dem_norm.to_csv(common.LFMC_SITES, index=False)
dem_norm

Unnamed: 0,Site,Longitude,Latitude,Elevation,Slope,Aspect,Long_sin,Long_cos,Lat_norm,Aspect_sin,Aspect_cos,Czone1,Czone2,Czone3
0,C4_1,-112.21868,40.21458,0.26200,0.02222,178,-0.92575,-0.37814,0.72341,-0.03490,0.99939,B,BS,BSk
1,C4_2,-112.21323,40.21458,0.26283,0.02222,173,-0.92578,-0.37805,0.72341,-0.12187,0.99255,B,BS,BSk
2,C4_3,-112.20633,40.21042,0.26367,0.02222,188,-0.92583,-0.37794,0.72339,0.13917,0.99027,B,BS,BSk
3,C4_4,-112.21178,40.21042,0.26267,0.02222,187,-0.92579,-0.37803,0.72339,0.12187,0.99255,B,BS,BSk
4,C4_5,-112.21178,40.21042,0.26267,0.02222,187,-0.92579,-0.37803,0.72339,0.12187,0.99255,B,BS,BSk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
942,C16_11,-118.57974,34.12292,0.06700,0.14444,204,-0.87815,-0.47838,0.68957,0.40674,0.91355,C,Cs,Csa
943,C16_12,-118.57974,34.12292,0.06700,0.14444,204,-0.87815,-0.47838,0.68957,0.40674,0.91355,C,Cs,Csa
944,C16_13,-118.55658,34.08125,0.06483,0.16667,229,-0.87835,-0.47803,0.68934,0.75471,0.65606,C,Cs,Csa
945,C16_14,-118.80243,34.05208,0.07367,0.21111,109,-0.87629,-0.48179,0.68918,-0.94552,0.32557,C,Cs,Csa


### Sample processing
Create the auxiliary dataset from the samples

##### Step 1: Merge sites and sample data to add the site longitude and latitude to the samples

In [15]:
# Location of each site as held in dem_norm
site_locations = dem_norm[["Site", "Longitude", "Latitude"]]
# Sample attributes carried forward from the Globe-LFMC data
sample_attributes = LFMC_data[["ID", "Site", "Sampling date", "Sampling year", "Land Cover", "LFMC value"]]
# "Site" is the only column the two frames share, so it is the join key
samples = site_locations.merge(sample_attributes, on="Site")
samples

Unnamed: 0,Site,Longitude,Latitude,ID,Sampling date,Sampling year,Land Cover,LFMC value
0,C4_1,-112.21868,40.21458,C4_1_1,2005-06-20,2005,Shrubland,156.763000
1,C4_1,-112.21868,40.21458,C4_1_2,2005-07-05,2005,Shrubland,128.277000
2,C4_1,-112.21868,40.21458,C4_1_3,2005-07-21,2005,Shrubland,92.482000
3,C4_1,-112.21868,40.21458,C4_1_4,2005-08-08,2005,Shrubland,82.093000
4,C4_1,-112.21868,40.21458,C4_1_5,2005-08-23,2005,Shrubland,78.953000
...,...,...,...,...,...,...,...,...
126410,C16_15,-118.80827,34.05625,C16_15_3,2000-04-12,2000,"Tree cover, needleleaved, evergreen, closed to...",104.246074
126411,C16_15,-118.80827,34.05625,C16_15_4,2000-05-26,2000,"Tree cover, needleleaved, evergreen, closed to...",98.427023
126412,C16_15,-118.80827,34.05625,C16_15_5,2000-06-20,2000,"Tree cover, needleleaved, evergreen, closed to...",88.735638
126413,C16_15,-118.80827,34.05625,C16_15_6,2000-07-21,2000,"Tree cover, needleleaved, evergreen, closed to...",80.378317


##### Step 2: Merge samples for same latitude/longitude/date

In [16]:
# Sites that share the same latitude and longitude in dem_norm are given one
# common site id: the smallest Site label found at that location.
merge_columns = ["Latitude", "Longitude"]
sites_temp = (
    dem_norm[merge_columns + ["Site"]]
    .groupby(merge_columns, as_index=False)
    .min()
)

# Attach the common site id to every sample. The samples' own Site column is
# suffixed to "Site_x"; the common id from sites_temp keeps the name "Site".
samples = samples.merge(sites_temp, on=merge_columns, suffixes=("_x", None))

# Merge samples taken at the same location on the same sampling date
groupby_cols = merge_columns + ["Sampling date"]
data_cols = {
    "ID": "min",                                    # Unique sample ID is the first ID of the merged samples
    "Sampling year": "min",                         # They should all be the same, but need to select one
    "Land Cover": lambda x: pd.Series.mode(x)[0],   # Most common land cover value
    "LFMC value": "mean",                           # mean LFMC value
    "Site": "min",                                  # Site id from sites_temp
}
samples = (
    samples[groupby_cols + list(data_cols)]
    .groupby(groupby_cols, as_index=False)
    .agg(data_cols)
    .round({"LFMC value": FLOAT_PRE})
)
samples

Unnamed: 0,Latitude,Longitude,Sampling date,ID,Sampling year,Land Cover,LFMC value,Site
0,25.99792,-97.57114,2017-06-15,C6_486_1,2017,"Tree cover, needleleaved, deciduous, closed to...",147.00000,C6_486
1,25.99792,-97.57114,2017-07-11,C6_486_4,2017,"Tree cover, needleleaved, deciduous, closed to...",202.66667,C6_486
2,25.99792,-97.57114,2017-07-15,C6_486_7,2017,"Tree cover, needleleaved, deciduous, closed to...",345.00000,C6_486
3,25.99792,-97.57114,2017-08-15,C6_486_10,2017,"Tree cover, needleleaved, deciduous, closed to...",233.75000,C6_486
4,25.99792,-97.57114,2017-09-21,C6_486_12,2017,"Tree cover, needleleaved, deciduous, closed to...",191.00000,C6_486
...,...,...,...,...,...,...,...,...
68770,48.90208,-116.29750,2015-08-04,C6_91_11,2015,"Tree cover, needleleaved, evergreen, closed (>...",134.50000,C6_91
68771,48.90208,-116.29750,2015-08-12,C6_91_13,2015,"Tree cover, needleleaved, evergreen, closed (>...",115.00000,C6_91
68772,48.90208,-116.29750,2015-08-26,C6_91_15,2015,"Tree cover, needleleaved, evergreen, closed (>...",54.00000,C6_91
68773,48.90208,-116.29750,2015-09-10,C6_91_17,2015,"Tree cover, needleleaved, evergreen, closed (>...",79.50000,C6_91


##### Step 3: Add the normalised auxiliary variables (day-of-year, location and DEM) to the samples

In [17]:
# Day-of-year features: match each sample's sampling date to its row in
# days_df, then drop the duplicate date column.
samples_with_days = samples[
    ["ID", "Latitude", "Longitude", "Sampling date", "Sampling year", "Land Cover", "LFMC value", "Site"]
].merge(days_df, left_on="Sampling date", right_on="Date").drop(columns="Date")

# Site-level features: everything in dem_norm except the location columns,
# which the samples already carry.
site_features = dem_norm.drop(columns=["Longitude", "Latitude"])
aux_df = samples_with_days.merge(site_features, on="Site").sort_values("ID")

# Final column selection and order for the samples file
aux_df = aux_df[[
    'ID', 'Latitude', 'Longitude', 'Sampling date', 'Sampling year', 'Land Cover', 'LFMC value', 'Site',
    'Czone1', 'Czone2', 'Czone3',
    'Day_sin', 'Day_cos',
    'Long_sin', 'Long_cos', 'Lat_norm', 'Elevation', 'Slope', 'Aspect_sin', 'Aspect_cos',
]]
aux_df.to_csv(common.LFMC_SAMPLES, index=False)
aux_df

Unnamed: 0,ID,Latitude,Longitude,Sampling date,Sampling year,Land Cover,LFMC value,Site,Czone1,Czone2,Czone3,Day_sin,Day_cos,Long_sin,Long_cos,Lat_norm,Elevation,Slope,Aspect_sin,Aspect_cos
68073,C13_1_1,46.63541,-114.58301,2012-06-07,2012,"Tree cover, needleleaved, evergreen, closed to...",97.59127,C13_1,D,Ds,Dsc,-0.40936,0.91237,-0.90936,-0.41601,0.75909,0.27200,0.14444,-0.94552,0.32557
68080,C13_1_10,46.63541,-114.58301,2012-08-09,2012,"Tree cover, needleleaved, evergreen, closed to...",139.17631,C13_1,D,Ds,Dsc,0.61528,0.78831,-0.90936,-0.41601,0.75909,0.27200,0.14444,-0.94552,0.32557
68074,C13_1_11,46.63541,-114.58301,2012-08-16,2012,"Tree cover, needleleaved, evergreen, closed to...",134.22227,C13_1,D,Ds,Dsc,0.70558,0.70863,-0.90936,-0.41601,0.75909,0.27200,0.14444,-0.94552,0.32557
68077,C13_1_12,46.63541,-114.58301,2012-08-21,2012,"Tree cover, needleleaved, evergreen, closed to...",126.68514,C13_1,D,Ds,Dsc,0.76389,0.64535,-0.90936,-0.41601,0.75909,0.27200,0.14444,-0.94552,0.32557
68068,C13_1_13,46.63541,-114.58301,2012-08-30,2012,"Tree cover, needleleaved, evergreen, closed to...",123.16282,C13_1,D,Ds,Dsc,0.85432,0.51974,-0.90936,-0.41601,0.75909,0.27200,0.14444,-0.94552,0.32557
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55321,C6_9_68,44.37292,-68.26178,2017-07-31,2017,"Tree cover, needleleaved, evergreen, closed to...",127.20000,C6_9,D,Df,Dfb,0.47116,0.88205,-0.92889,0.37037,0.74652,0.01917,0.06667,0.85717,0.51504
55328,C6_9_73,44.37292,-68.26178,2017-08-07,2017,"Tree cover, needleleaved, evergreen, closed to...",128.20000,C6_9,D,Df,Dfb,0.57377,0.81901,-0.92889,0.37037,0.74652,0.01917,0.06667,0.85717,0.51504
55311,C6_9_78,44.37292,-68.26178,2017-08-14,2017,"Tree cover, needleleaved, evergreen, closed to...",128.60000,C6_9,D,Df,Dfb,0.66806,0.74410,-0.92889,0.37037,0.74652,0.01917,0.06667,0.85717,0.51504
55330,C6_9_83,44.37292,-68.26178,2017-09-13,2017,"Tree cover, needleleaved, evergreen, closed to...",132.60000,C6_9,D,Df,Dfb,0.94836,0.31719,-0.92889,0.37037,0.74652,0.01917,0.06667,0.85717,0.51504
