# Biomass spatial progation through GBT

Local biomass estimates are spatially propagated through Gradient Boosted Trees regression.
Input features are a set of environmental variables: elevation, precipitation, etc. Data from these features is usually loaded from raster files.

In [47]:
import gdal
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Raster data should be downsampled if it is too big to be loaded in memory
def rebin(arr, new_shape):
    """Downsampling array."""
    shape = (new_shape[0], arr.shape[0] / new_shape[0],
             new_shape[1], arr.shape[1] / new_shape[1])
    return arr.reshape(shape).mean(-1).mean(1)

In [2]:
factor = 1 # downsampling factor to acquire array data from bands

# Paths to raster files of environmental data
alt_file = "/home/nelsonsalinas/Documents/cust_layers/alt/alt.tif"

prec_folder = "/home/nelsonsalinas/Documents/cust_layers/precp"
suff = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
prec_files = ["{0}/vent_prec_{1}.tif".format(prec_folder, x) for x in suff]

ave_temp_folder = "/home/nelsonsalinas/Documents/cust_layers/ave_temp/"
suff = map(str, range(1,13))
ave_temp_files = ["{0}/{1}.tif".format(ave_temp_folder, x) for x in suff]

bio_var_folder = "/home/nelsonsalinas/Documents/cust_layers/biovars"
suff = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '14', '15', '16', '17', '18', '19']
bio_var_files = ["{0}/biovar_{1}.tif".format(bio_var_folder, x) for x in suff]

alt_ras = gdal.Open(alt_file)
#prec_ras = [gdal.Open(x) for x in prec_files]
#bio_var_ras = [gdal.Open(x) for x in bio_var_files]

# Matedata from rasters
transform = alt_ras.GetGeoTransform()
altXOrigin = transform[0]
altYOrigin = transform[3]
altPixelWidth = transform[1]
altPixelHeight = transform[5]

# Load raster data as numpy.array objects
alt_band = alt_ras.GetRasterBand(1)
alt_arr = alt_band.ReadAsArray(0, 0, alt_ras.RasterXSize, alt_ras.RasterYSize)
# Replace unknown values with NANs
np.place(alt_arr, ~np.isfinite(alt_arr), np.nan)
np.place(alt_arr, alt_arr < -30, np.nan)
np.place(alt_arr, alt_arr > 6000, np.nan)
#alt_arr = rebin(alt_arr, (alt_arr.shape[0] / factor, alt_arr.shape[1] / factor))

prec_arrs = []
for prfi in prec_files:
    prec_ras = gdal.Open(prfi)
    prec_band = prec_ras.GetRasterBand(1)
    prec_i = prec_band.ReadAsArray(0, 0, prec_ras.RasterXSize, prec_ras.RasterYSize)
    prec_i = prec_i.astype(float)
    #prec_arrs.append(prec_band.ReadAsArray(0, 0, prec_ras.RasterXSize, prec_ras.RasterYSize))
    
    # Replace unknown values with NANs
    np.place(prec_i, ~np.isfinite(prec_i), np.nan)
    np.place(prec_i, prec_i < 0, np.nan)
    np.place(prec_i, prec_i > 2000, np.nan)
    #prec_i = rebin(prec_i, (prec_i.shape[0] / factor, prec_i.shape[1] / factor))
    prec_arrs.append(prec_i)

bio_var_arrs = []
for biofi in bio_var_files:
    bio_var_ras = gdal.Open(biofi) 
    bio_var_band = bio_var_ras.GetRasterBand(1)
    bio_i = bio_var_band.ReadAsArray(0, 0, bio_var_ras.RasterXSize, bio_var_ras.RasterYSize)
    bio_i = bio_i.astype(float)
    # Replace unknown values with NANs
    np.place(bio_i, ~np.isfinite(bio_i), np.nan)
    np.place(bio_i, bio_i < -20, np.nan)
    np.place(bio_i, bio_i > 2000, np.nan)
    #bio_i = rebin(bio_i, (bio_i.shape[0] / factor, bio_i.shape[1] / factor))
    bio_var_arrs.append(bio_i)
    
ave_temp_arrs = []
for avefi in ave_temp_files:
    ave_temp_ras = gdal.Open(avefi) 
    ave_temp_band = ave_temp_ras.GetRasterBand(1)
    ave_temp_i = ave_temp_band.ReadAsArray(0, 0, ave_temp_ras.RasterXSize, ave_temp_ras.RasterYSize)
    ave_temp_i = ave_temp_i.astype(float)
    # Replace unknown values with NANs
    np.place(ave_temp_i, ~np.isfinite(ave_temp_i), np.nan)
    np.place(ave_temp_i, ave_temp_i < -20, np.nan)
    np.place(ave_temp_i, ave_temp_i > 60, np.nan)
    #bio_i = rebin(bio_i, (bio_i.shape[0] / factor, bio_i.shape[1] / factor))
    ave_temp_arrs.append(ave_temp_i)



In [3]:
# Load biomass estimations based on plot data 
biomass = pd.read_csv("biomass_all_20180118.csv")

biomass['X'] = biomass.Longitud.apply(lambda x: int((x - altXOrigin) / altPixelWidth))
biomass['Y'] = biomass.Latitud.apply(lambda y: int((y - altYOrigin) / altPixelHeight))

biopix = biomass.groupby(['X','Y']).size().reset_index().drop(columns=0)

biopix['chaveII'] = np.nan
for row in biopix.itertuples():
    biopix.loc[(biopix.X == row.X) & (biopix.Y == row.Y), 'chaveII'] = \
        biomass[(biomass.X == row.X) & (biomass.Y == row.Y)]['chaveII'].mean()

In [4]:
# Select environmental data from pixels where biomass estimates have been conducted
X = np.empty((biopix.shape[0], (len(ave_temp_arrs) + len(bio_var_arrs) + len(prec_arrs) + 1)))
X[:,0] = alt_arr[biopix.Y, biopix.X]
for inx,arr in enumerate(prec_arrs + ave_temp_arrs + bio_var_arrs):
    ni = inx + 1
    X[:,ni] = arr[biopix.Y, biopix.X]

In [5]:
# GBT estimator does not accept missing data, therefore cells with missing data will be filled with the variable mean 
imp = Imputer()
imp = imp.fit(X)
X = imp.fit_transform(X)

In [74]:
my_scaler = StandardScaler()
my_scaler.fit(X)
X_scaled = my_scaler.transform(X)

In [85]:
# First find optimal number of trees for learning rate 0.1


pars = {'n_estimators' : range(1, 10)
       }

gbtr = GradientBoostingRegressor(learning_rate=0.1, min_samples_split=22, min_samples_leaf = 20,
        max_depth = 6, max_features = "sqrt", subsample = 0.8)

grid_search = GridSearchCV(estimator = gbtr, param_grid = pars, cv = 10, n_jobs=4, \
    scoring= 'neg_mean_absolute_error')

grid_search.fit(X_scaled, biopix.chaveII.as_matrix())

GridSearchCV(cv=10, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=6,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=20, min_samples_split=22,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=0.8, verbose=0,
             warm_start=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'n_estimators': [1, 2, 3, 4, 5, 6, 7, 8, 9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_absolute_error', verbose=0)

In [86]:
grid_search.grid_scores_, grid_search.best_params_, grid_search.best_score_



([mean: -205710.21510, std: 42174.99228, params: {'n_estimators': 1},
  mean: -204075.72671, std: 40574.53088, params: {'n_estimators': 2},
  mean: -207669.29407, std: 35702.03956, params: {'n_estimators': 3},
  mean: -204548.79909, std: 39388.29956, params: {'n_estimators': 4},
  mean: -202813.12523, std: 40668.18235, params: {'n_estimators': 5},
  mean: -209679.31626, std: 37803.18897, params: {'n_estimators': 6},
  mean: -213617.96557, std: 35461.57116, params: {'n_estimators': 7},
  mean: -208242.02925, std: 37498.98554, params: {'n_estimators': 8},
  mean: -212701.53426, std: 44582.67582, params: {'n_estimators': 9}],
 {'n_estimators': 5},
 -202813.12523094259)

In [None]:
# Parameter space
pars = {'min_samples_split' = 3,
        'min_samples_leaf' = 3,
        'max_depth' = [3, 7, 12],
        'max_features' = [7, 12, 17],
        'n_estimators' = [100, 200, 500],
        'subsample' = [0.3, 0.5, 0.7]

In [None]:
# Randomly split input and response data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, biopix.chaveII.as_matrix(), test_size=0.2)

In [None]:
# Train the regression model
gbtr = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.1)
gbtr.fit(X_train, y_train)

In [None]:
# Estimate the r regression coefficient
r2_score(y_test, gbtr.predict(X_test))