<a href="https://colab.research.google.com/github/margotgeerts/geoRF/blob/main/Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Air temperature example
This example shows how to train a geoRF regression model on data containing spatial features, using an air temperature dataset. The fitted geoRF model is then compared with a standard scikit-learn random forest trained on the same data.

In [1]:
# Fetch the geoRF repository; later cells install its requirements.txt and import `geoRF`.
!git clone https://github.com/margotgeerts/geoRF.git

Cloning into 'geoRF'...
remote: Enumerating objects: 16, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 16 (delta 1), reused 16 (delta 1), pack-reused 0[K
Receiving objects: 100% (16/16), 14.37 KiB | 2.87 MiB/s, done.
Resolving deltas: 100% (1/1), done.


In [1]:
# Work from inside the cloned repository so requirements.txt (and, presumably, the
# geoRF module imported later) resolve relative to the current directory.
%cd geoRF

/content/geoRF


In [3]:
!pip install -r requirements.txt

Collecting distgfs==1.1.0 (from -r requirements.txt (line 4))
  Downloading distgfs-1.1.0-py3-none-any.whl (23 kB)
Collecting geopandas==0.14.4 (from -r requirements.txt (line 5))
  Downloading geopandas-0.14.4-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib==1.4.0 (from -r requirements.txt (line 6))
  Downloading joblib-1.4.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.2/301.2 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
Collecting matplotlib==3.8.4 (from -r requirements.txt (line 8))
  Downloading matplotlib-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m109.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numba==0.59.1 (from -r requirements.txt (line 9))
  Downloading numba-0.59.1-cp310-cp310-manylinu

In [4]:
import io
import requests
from urllib import request
import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Get temperature data
url = 'https://springernature.figshare.com/ndownloader/files/12609182'
# Use a context manager so the HTTP response is closed once the CSV has been read.
with request.urlopen(url) as url_open:
    df = pd.read_csv(io.StringIO(url_open.read().decode('utf-8')))

# Keep the coordinates, the target (meanT) and the remaining covariate (meanP).
# The column order (y, x, meanT, meanP) is preserved because later cells refer to
# the coordinate columns by position. rename() returns a new frame, so the column
# assignments below do not write into a slice of the raw data.
df = df[['Lat', 'Lon', 'meanT', 'meanP']].rename(columns={"Lat": "y", "Lon": "x"})

# Convert X-Y coordinates to a projected coordinate system
# (x = longitude, y = latitude in EPSG:4326, reprojected to EPSG:3857).
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.x, df.y, crs='EPSG:4326'))
gdf = gdf.to_crs('EPSG:3857')

df['x'] = gdf.geometry.x
df['y'] = gdf.geometry.y

# Split data into train and test
data_train, data_test = train_test_split(df, test_size=0.3, shuffle=True, random_state=0)
# Take explicit copies so scaling the target below modifies independent frames
# instead of triggering pandas' SettingWithCopyWarning.
data_train, data_test = data_train.copy(), data_test.copy()

# Scale the target value
# The scaler is fitted on the training target only and then applied to the test
# target; ravel() flattens the (n, 1) scaler output back into a 1-D column.
scaler = MinMaxScaler()
data_train['meanT'] = scaler.fit_transform(data_train['meanT'].values.reshape(-1, 1)).ravel()
data_test['meanT'] = scaler.transform(data_test['meanT'].values.reshape(-1, 1)).ravel()

In [5]:
# Separate the target column from the feature columns (column order is kept).
target = 'meanT'
feature_cols = [col for col in data_train.columns if col != target]

X_train, y_train = data_train[feature_cols].to_numpy(), data_train[target].to_numpy()
X_test, y_test = data_test[feature_cols].to_numpy(), data_test[target].to_numpy()

In [6]:
# Inspect the two coordinate columns (their column indices are required for geoRF).
# Given the frame's column order, column 0 is the projected y and column 1 the projected x.
X_train[:, :2]

array([[  5404732.35002346, -10386319.99804493],
       [  4964404.36013148, -11186896.37998292],
       [  4162440.73969107, -11026473.86180073],
       ...,
       [  5237970.70341953, -11431888.31532075],
       [ -4184284.27482761,  16414058.91746819],
       [  7156910.262561  ,   3075757.53061815]])

In [7]:
# Dimensions of the training feature matrix: (number of rows, number of feature columns).
X_train.shape

(2153, 3)

In [8]:
from geoRF import GeoRFRegressor

# Same forest settings as the scikit-learn baseline trained later in the notebook.
georf_temp = GeoRFRegressor(
    n_estimators=100,
    max_features=None,
    n_jobs=-1,
    random_state=0,
)

# geo_features: positions of the coordinate columns in X_train (0 = y, 1 = x).
# gens='da': Dual Annealing (DA) geospatial split generator.
georf_temp.fit(X_train, y_train, geo_features=[0, 1], gens='da')

100%|██████████| 100/100 [46:36<00:00, 27.97s/it]


In [9]:
from sklearn.metrics import mean_squared_error

# RMSE on the min-max-scaled target. It is computed as sqrt(MSE) instead of passing
# `squared=False`, which newer scikit-learn releases deprecate and then remove;
# for a single-output target both forms give the same value.
print(f"train rmse: {np.sqrt(mean_squared_error(y_train, georf_temp.predict(X_train)))}")
print(f"test rmse: {np.sqrt(mean_squared_error(y_test, georf_temp.predict(X_test)))}")



train rmse: 0.014036317425465641
test rmse: 0.03631634852464978




In [10]:
from sklearn.ensemble import RandomForestRegressor

# Baseline: a standard random forest with the same settings as the geoRF model,
# trained on the same feature matrix (coordinates are treated as ordinary features).
rf_temp = RandomForestRegressor(
    n_estimators=100,
    max_features=None,
    n_jobs=-1,
    random_state=0,
)
rf_temp.fit(X_train, y_train)

In [11]:
# RMSE of the random-forest baseline on the min-max-scaled target. sqrt(MSE) is used
# instead of `squared=False`, which newer scikit-learn releases deprecate and then
# remove; for a single-output target both forms give the same value.
print(f"train rmse: {np.sqrt(mean_squared_error(y_train, rf_temp.predict(X_train)))}")
print(f"test rmse: {np.sqrt(mean_squared_error(y_test, rf_temp.predict(X_test)))}")

train rmse: 0.015041919569707926
test rmse: 0.0405312127061455


