In [1]:
# Standard and GIS Modules
import os
import sys
import numpy as np
import pandas as pd
import time
import scipy

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error
from sprf.spatial_random_forest import SpatialRandomForest

## Methods for cross-validation

In [4]:
def get_folds(nr_samples, nr_folds = 10, seed = None):
    """Randomly partition the sample positions 0..nr_samples-1 into cross-validation folds.

    Parameters
    ----------
    nr_samples : int
        Total number of samples to split.
    nr_folds : int, default 10
        Number of folds; must satisfy 1 <= nr_folds <= nr_samples.
    seed : int or None, default None
        If given, the shuffle is drawn from ``np.random.default_rng(seed)`` so the
        split is reproducible. If None, the global NumPy random state is used, so a
        preceding ``np.random.seed`` call still controls the split.

    Returns
    -------
    train_inds, test_inds : list of np.ndarray
        Two lists of length ``nr_folds``. ``test_inds[i]`` holds the shuffled sample
        positions held out in fold ``i``; ``train_inds[i]`` holds all remaining
        positions. Test fold sizes differ by at most one.

    Raises
    ------
    ValueError
        If ``nr_folds`` is smaller than 1 or larger than ``nr_samples``.
    """
    if nr_folds < 1:
        raise ValueError(f"nr_folds must be at least 1, got {nr_folds}")
    if nr_folds > nr_samples:
        # More folds than samples would leave some test folds empty.
        raise ValueError(
            f"nr_folds ({nr_folds}) must not exceed nr_samples ({nr_samples})"
        )

    if seed is None:
        fold_inds = np.random.permutation(nr_samples)
    else:
        fold_inds = np.random.default_rng(seed).permutation(nr_samples)

    train_inds, test_inds = [], []
    # array_split spreads the remainder over the first folds instead of piling
    # all leftover samples into the last one.
    for test_positions in np.array_split(np.arange(nr_samples), nr_folds):
        test_inds.append(fold_inds[test_positions])
        train_inds.append(np.delete(fold_inds, test_positions))
    return train_inds, test_inds

In [5]:
def prepare_data(data, target, lon="x", lat="y"):
    """Split a data frame into covariates, target and coordinates.

    Every column that is neither the target nor one of the two coordinate
    columns is treated as a covariate.

    Returns a tuple ``(covariates, target_values, coordinates)`` where
    ``covariates`` is ``data`` restricted to the covariate columns,
    ``target_values`` is ``data[target]`` and ``coordinates`` is
    ``data[[lon, lat]]``.
    """
    coord_cols = [lon, lat]
    non_covariates = coord_cols + [target]
    covariate_cols = []
    for name in data.columns:
        if name in non_covariates:
            continue
        covariate_cols.append(name)

    features = data[covariate_cols]
    labels = data[target]
    coords = data[coord_cols]
    return features, labels, coords

In [6]:
def add_metrics(test_pred, test_y, res_dict_init, method):
    """Return a copy of ``res_dict_init`` extended with the method name and its test errors.

    Parameters
    ----------
    test_pred : array-like
        Predictions for the test samples.
    test_y : array-like
        Ground-truth target values of the test samples.
    res_dict_init : dict
        Entries shared by every result row. It is copied, so the input is not modified.
    method : str
        Label stored under the "method" key.

    Returns
    -------
    dict
        Shallow copy of ``res_dict_init`` with the extra keys "method", "MSE" and "MAE".
    """
    res_dict = res_dict_init.copy()
    res_dict["method"] = method
    # scikit-learn metrics expect (y_true, y_pred). MSE and MAE are symmetric, so
    # the values are unaffected, but the conventional order keeps this safe if an
    # asymmetric metric is added later.
    res_dict["MSE"] = mean_squared_error(test_y, test_pred)
    res_dict["MAE"] = mean_absolute_error(test_y, test_pred)
    return res_dict

In [7]:
def cross_validation(data, target=None, x_coord_name=None, y_coord_name=None,
                     nr_folds=10, max_depth=10):
    """Compare four regressors on ``data`` with k-fold cross-validation.

    For every fold the following models are fitted on the training part and
    scored on the held-out part:

    1. ``global_rf``: RandomForestRegressor on the covariates only
    2. ``coord_rf``: RandomForestRegressor on covariates plus coordinate columns
    3. ``linear_regression``: LinearRegression on the covariates only
    4. ``spatial_rf``: SpatialRandomForest fitted on covariates and coordinates

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing the target column, the two coordinate columns and the
        covariates (all remaining columns).
    target : str or None
        Name of the target column. If None, ``dataset_target[DATASET]`` is read
        from the notebook globals.
    x_coord_name, y_coord_name : str or None
        Names of the coordinate columns. If None, they are looked up in the
        global ``dataset_x`` / ``dataset_y`` dicts under the key ``DATASET``,
        defaulting to "x" / "y".
    nr_folds : int, default 10
        Number of cross-validation folds.
    max_depth : int, default 10
        ``max_depth`` passed to the random forest models.

    Returns
    -------
    pd.DataFrame
        One row per method (index "method") with the mean "MSE" and "MAE" over
        all folds, sorted by ascending MSE.
    """
    # Dataset-specific information: prefer explicit arguments and only fall back
    # to the notebook-level globals when the caller did not supply them.
    if target is None:
        target = dataset_target[DATASET]
    if x_coord_name is None:
        x_coord_name = dataset_x.get(DATASET, "x")
    if y_coord_name is None:
        y_coord_name = dataset_y.get(DATASET, "y")

    train_inds, test_inds = get_folds(len(data), nr_folds = nr_folds)
    res_df = []

    # model params --> TODO: grid search
    spatial_neighbors = len(data) // 5 # one fifth of the dataset
    print("Number of neighbors considered for spatial RF:", spatial_neighbors)

    for fold in range(nr_folds):
        res_dict_init = {"fold": fold, "max_depth": max_depth}
        train_data = data.iloc[train_inds[fold]]
        test_data = data.iloc[test_inds[fold]]
        train_x, train_y, train_coords = prepare_data(train_data, target, x_coord_name, y_coord_name)
        test_x, test_y, test_coords = prepare_data(test_data, target, x_coord_name, y_coord_name)

        # Method 1: global RF
        rf = RandomForestRegressor(max_depth=max_depth)
        rf.fit(train_x, train_y)
        test_pred = rf.predict(test_x)
        res_df.append(add_metrics(test_pred, test_y, res_dict_init, "global_rf"))

        # Method 2: global RF with coordinates
        rf = RandomForestRegressor(max_depth=max_depth)
        rf.fit(train_x.join(train_coords), train_y)
        test_pred = rf.predict(test_x.join(test_coords))
        res_df.append(add_metrics(test_pred, test_y, res_dict_init, "coord_rf"))

        # Method 3: linear regression
        lr = LinearRegression()
        lr.fit(train_x, train_y)
        test_pred = lr.predict(test_x)
        res_df.append(add_metrics(test_pred, test_y, res_dict_init, "linear_regression"))

        # Method 4: spatial RF
        sp = SpatialRandomForest(max_depth=max_depth, neighbors = spatial_neighbors)
        sp.fit(train_x, train_y, train_coords)
        test_pred = sp.predict(test_x, test_coords)
        res_df.append(add_metrics(test_pred, test_y, res_dict_init, "spatial_rf"))

    # One row per (fold, method); average the errors over the folds per method.
    res_df = pd.DataFrame(res_df)
    res_all = res_df.groupby(["method"]).agg({"MSE": "mean", "MAE": "mean"}).sort_values("MSE")
    return res_all

## Run analysis for all datasets

In [8]:
# Dataset-specific configuration, read as globals by cross_validation.
# It does not change between datasets, so it is defined once outside the loop.
dataset_target = {
    "plants" : "richness_species_vascular",
    "atlantic": "Rate",
    "deforestation": "deforestation_quantile"
}
dataset_x = {} # per default: x
dataset_y = {} # per default: y

# Run the cross-validated model comparison for each dataset and print the
# per-method mean errors. DATASET must keep this name: cross_validation looks it up.
for DATASET in ["plants", "atlantic", "deforestation"]:
    print("\nDATASET", DATASET, "\n")
    data_path = os.path.join("data", DATASET+".csv")

    data = pd.read_csv(data_path)

    results = cross_validation(data)
    print(results)
    print("--------------")


DATASET plants 

Number of neighbors considered for spatial RF: 45
                            MSE          MAE
method                                      
global_rf          4.948774e+06  1323.998968
coord_rf           5.022293e+06  1341.746195
linear_regression  5.391265e+06  1578.360635
spatial_rf         6.625616e+06  1767.389305
--------------

DATASET atlantic 

Number of neighbors considered for spatial RF: 133
                         MSE       MAE
method                                
coord_rf           53.328222  5.558439
global_rf          57.127183  5.864575
spatial_rf         69.166889  6.369350
linear_regression  74.438131  6.654089
--------------

DATASET deforestation 

Number of neighbors considered for spatial RF: 483
                        MSE       MAE
method                               
coord_rf           0.438697  0.496459
global_rf          0.443818  0.500929
spatial_rf         0.506337  0.559180
linear_regression  0.680674  0.665348
--------------
