# Biomass spatial propagation through GBT

Local biomass estimates are spatially propagated through Gradient Boosted Trees regression.
Input features are a set of environmental variables: elevation, precipitation, etc. Data from these features is usually loaded from raster files.

In [None]:
import gdal
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [None]:
# Raster data should be downsampled if it is too big to be loaded in memory
def rebin(arr, new_shape):
    """Downsample a 2-D array by averaging equally sized rectangular blocks.

    Parameters
    ----------
    arr : numpy.ndarray
        Two-dimensional array to downsample.
    new_shape : sequence of two numbers
        Target ``(rows, columns)``. Each value must evenly divide the
        corresponding dimension of ``arr``. Values are coerced with ``int``
        so that results of true division (e.g. ``4 / 2 == 2.0``) are accepted.

    Returns
    -------
    numpy.ndarray
        Array of shape ``new_shape`` where every cell is the mean of the
        matching block of ``arr``. A NaN anywhere in a block makes that
        cell NaN, because a plain mean is used.

    Raises
    ------
    ValueError
        If ``arr`` cannot be split evenly into ``new_shape`` blocks.
    """
    n_rows, n_cols = (int(dim) for dim in new_shape)
    if arr.shape[0] % n_rows or arr.shape[1] % n_cols:
        raise ValueError(
            "cannot rebin array of shape {0} into {1}: target dimensions "
            "must evenly divide the source dimensions".format(
                arr.shape, (n_rows, n_cols)))
    # Floor division keeps the block sizes integral; reshape rejects floats.
    blocks = arr.reshape(n_rows, arr.shape[0] // n_rows,
                         n_cols, arr.shape[1] // n_cols)
    # Average inside each block: first along block columns, then block rows.
    return blocks.mean(-1).mean(1)

In [None]:
factor = 1  # downsampling factor applied to every raster band (1 = full resolution)

# Paths to raster files of environmental data
alt_file = "/home/nelsonsalinas/Documents/cust_layers/alt/alt.tif"

prec_folder = "/home/nelsonsalinas/Documents/cust_layers/precp"
# One precipitation layer per suffix '01' .. '12'
suff = ["{0:02d}".format(month) for month in range(1, 13)]
prec_files = ["{0}/vent_prec_{1}.tif".format(prec_folder, x) for x in suff]

bio_var_folder = "/home/nelsonsalinas/Documents/cust_layers/biovars"
# One layer per suffix '1' .. '19', each listed exactly once
suff = [str(index) for index in range(1, 20)]
bio_var_files = ["{0}/biovar_{1}.tif".format(bio_var_folder, x) for x in suff]


def _open_raster(path):
    """Open `path` with GDAL and raise IOError if no dataset is returned."""
    raster = gdal.Open(path)
    if raster is None:
        raise IOError("Could not open raster file: {0}".format(path))
    return raster


def _read_band(raster, downsample):
    """Return band 1 of `raster` as a float array, negatives set to NaN, rebinned by `downsample`."""
    band = raster.GetRasterBand(1)
    arr = band.ReadAsArray(0, 0, raster.RasterXSize, raster.RasterYSize)
    # NaN can only be stored in a floating point array.
    arr = arr.astype(float)
    # Negative cells are treated as unknown values.
    # NOTE(review): confirm no layer holds legitimate negative values.
    np.place(arr, arr < 0, np.nan)
    # Floor division keeps the target shape integral.
    return rebin(arr, (arr.shape[0] // downsample, arr.shape[1] // downsample))


alt_ras = _open_raster(alt_file)

# Metadata from rasters
# NOTE(review): these describe the original grid; they are not rescaled by
# `factor`, so pixel indices derived from them only match the arrays below
# when factor == 1 -- confirm before downsampling.
transform = alt_ras.GetGeoTransform()
altXOrigin = transform[0]
altYOrigin = transform[3]
altPixelWidth = transform[1]
altPixelHeight = transform[5]

# Load raster data as numpy.array objects
alt_arr = _read_band(alt_ras, factor)
prec_arrs = [_read_band(_open_raster(path), factor) for path in prec_files]
bio_var_arrs = [_read_band(_open_raster(path), factor) for path in bio_var_files]

In [None]:
# Load biomass estimations based on plot data
biomass = pd.read_csv("biomass_all_20180118.csv")

# Convert plot coordinates into column (X) and row (Y) offsets measured from
# the altitude raster origin, in units of its pixel size.
biomass['X'] = biomass.Longitud.apply(lambda x: int((x - altXOrigin) / altPixelWidth))
biomass['Y'] = biomass.Latitud.apply(lambda y: int((y - altYOrigin) / altPixelHeight))

# One row per distinct pixel holding the mean 'chaveII' value of every plot
# that falls inside it. A single grouped aggregation replaces a Python loop
# that re-scanned both frames once per pixel.
biopix = biomass.groupby(['X', 'Y'])['chaveII'].mean().reset_index()

In [None]:
# Build the feature matrix for the pixels that have biomass estimates:
# one row per pixel, column 0 is altitude, the remaining columns are the
# precipitation layers in the order they were loaded.
# NOTE(review): pixel indices are used as-is, which presumably only matches
# the arrays when no downsampling was applied -- confirm.
layers = [alt_arr] + list(prec_arrs)
X = np.empty((biopix.shape[0], len(layers)))
for col, layer in enumerate(layers):
    X[:, col] = layer[biopix.Y, biopix.X]

In [None]:
# GBT estimator does not accept missing data, therefore cells with missing
# data are filled by the imputer (default settings; presumably the per-column
# mean -- confirm against the installed scikit-learn version).
# fit_transform fits `imp` and transforms X in one pass, so a separate fit
# beforehand is redundant.
imp = Imputer()
X = imp.fit_transform(X)

In [None]:
# Randomly split input and response data into train (80%) and test (20%) sets.
# `.values` yields the same ndarray as the deprecated `Series.as_matrix()`.
X_train, X_test, y_train, y_test = train_test_split(X, biopix.chaveII.values, test_size=0.2)

In [None]:
# Fit the gradient boosted trees regressor on the training split
gbtr = GradientBoostingRegressor(
    learning_rate=0.1,
    n_estimators=1000,
)
gbtr.fit(X_train, y_train)

In [None]:
# Score the model on the held-out test split with r2_score
# (coefficient of determination, R^2)
y_pred = gbtr.predict(X_test)
r2_score(y_test, y_pred)