In [1]:
# Standard and GIS Modules
import os
import numpy as np
import pandas as pd
import time
import scipy
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# import sprf package
from sprf.spatial_random_forest import SpatialRandomForest
from sprf.geographical_random_forest import GeographicalRandomForest

# constants:
# Lookup from dataset name (also the CSV file stem) to the name of the
# column that is used as the prediction target for that dataset.
dataset_target = dict(
    [
        ("plants", "richness_species_vascular"),
        ("meuse", "zinc"),
        ("atlantic", "Rate"),
        ("deforestation", "deforestation_quantile"),
        ("california_housing", "median_house_value"),
    ]
)

In [2]:
print(f"Set dataset here: Must be one of {list(dataset_target.keys())}")
dataset = "plants"

# Fail fast on a typo: without this check the first symptom is a
# file-not-found error when the CSV is read in the next cell.
if dataset not in dataset_target:
    raise ValueError(
        f"Unknown dataset {dataset!r}; must be one of {list(dataset_target.keys())}"
    )

Set dataset here: Must be one of ['plants', 'meuse', 'atlantic', 'deforestation', 'california_housing']


### Load data

In [3]:
# Read "<dataset>.csv" from the local "data" directory into a DataFrame.
data = pd.read_csv(
    os.path.join("data", dataset + ".csv")
)
print("samples: ", data.shape[0])

# Column that the models below are trained to predict.
target = dataset_target[dataset]

samples:  227


### Split into train and test

In [4]:
def prepare_data(data, target, lon="x", lat="y"):
    """Split a table into covariates, target values and coordinates.

    Every column whose name is not `lon`, `lat` or `target` is treated as
    a covariate.

    Args:
        data: table indexed by column name (a pandas DataFrame in this notebook).
        target: name of the column holding the values to predict.
        lon: name of the first coordinate column (default "x").
        lat: name of the second coordinate column (default "y").

    Returns:
        Tuple `(covariates, target_values, coordinates)` where `covariates`
        is `data` restricted to the covariate columns, `target_values` is
        `data[target]` and `coordinates` is `data[[lon, lat]]`.
    """
    excluded = [lon, lat, target]
    feature_names = []
    for name in data.columns:
        if name not in excluded:
            feature_names.append(name)

    features = data[feature_names]
    response = data[target]
    coordinates = data[[lon, lat]]
    return features, response, coordinates

In [5]:
# Shuffle with a fixed seed so the 90/10 split is identical after every
# "Restart & Run All". The forests below are still randomised themselves,
# so their errors can vary slightly between runs.
SPLIT_SEED = 42
rng = np.random.default_rng(SPLIT_SEED)
inds = rng.permutation(len(data))
split = int(0.9 * len(data))  # first 90% of the shuffled rows -> training set
train_data = data.iloc[inds[:split]]
test_data = data.iloc[inds[split:]]

# split into x, y and coordinates
train_x, train_y, train_coords = prepare_data(
    train_data, target
)
test_x, test_y, test_coords = prepare_data(
    test_data, target
)

### Train and test basic random forest

In [6]:
# Hyperparameters used by the model cells below.
max_depth = 10
n_estimators = 100  # can take quite long for Geographical RF
# Neighbourhood size: one fifth of the loaded samples (integer division).
spatial_neighbors = data.shape[0] // 5

In [7]:
# Baseline: plain random forest fitted on the covariates only (no coordinates).
# n_estimators is passed explicitly so this model follows the shared setting.
rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
rf.fit(train_x, train_y)
test_pred = rf.predict(test_x)

# RMSE as sqrt(MSE): the `squared=False` flag of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in later releases.
# Arguments follow the documented (y_true, y_pred) order.
rmse_rf = np.sqrt(mean_squared_error(test_y, test_pred))
print("Error of basic Random Forest: ", rmse_rf)

Error of basic Random Forest:  1233.113304527673


### Train and test spatial random forest

In [8]:
# Spatial random forest: receives the coordinates in addition to the covariates.
# n_estimators is passed explicitly, as in the tuned spatial forest further
# down, so both spatial models are built with the same number of trees.
sp = SpatialRandomForest(
    n_estimators=n_estimators, max_depth=max_depth, neighbors=spatial_neighbors
)
sp.fit(train_x, train_y, train_coords)
test_pred = sp.predict(test_x, test_coords)

# RMSE as sqrt(MSE): the `squared=False` flag of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in later releases.
# Arguments follow the documented (y_true, y_pred) order.
rmse_spatial_rf = np.sqrt(mean_squared_error(test_y, test_pred))
print("Error of spatial Random Forest: ", rmse_spatial_rf)

Error of spatial Random Forest:  1494.09398242311


### Train and test geographical random forest

In [9]:
# Geographical random forest: like the spatial forest it is fitted and
# queried with the coordinates alongside the covariates.
geo_rf = GeographicalRandomForest(
    n_estimators=n_estimators, neighbors=spatial_neighbors, max_depth=max_depth
)
geo_rf.fit(train_x, train_y, train_coords)
test_pred = geo_rf.predict(test_x, test_coords)

# RMSE as sqrt(MSE): the `squared=False` flag of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in later releases.
# Arguments follow the documented (y_true, y_pred) order.
rmse_geo_rf = np.sqrt(mean_squared_error(test_y, test_pred))
# Label fixed: this cell evaluates the geographical model, not the spatial one.
print("Error of geographical Random Forest: ", rmse_geo_rf)

Error of spatial Random Forest:  1326.3920442520464


### Tune number of neighbors

In [10]:
# NOTE(review): neighbors=500 looks like a placeholder that tune_neighbors()
# replaces (the value printed below differs from it) — confirm against the
# sprf implementation.
regr = SpatialRandomForest(
    n_estimators=n_estimators, neighbors=500, max_depth=max_depth
)
regr.tune_neighbors(train_x, train_y, train_coords)
print("spatial rf tuned:", regr.neighbors)
regr.fit(train_x, train_y, train_coords)
test_pred = regr.predict(test_x, test_coords)

# RMSE as sqrt(MSE): the `squared=False` flag of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in later releases.
# Arguments follow the documented (y_true, y_pred) order.
rmse_spatial_rf_tuned = np.sqrt(mean_squared_error(test_y, test_pred))
print("Error of tuned spatial Random Forest: ", rmse_spatial_rf_tuned)

spatial rf tuned: 74
Error of tuned spatial Random Forest:  1452.5026591622952
