In [2]:
import os

# os.environ["OPENBLAS_NUM_THREADS"] = "4"

# import seaborn as sns
import pandas as pd

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from xgboost import  XGBRegressor

In [3]:
df = pd.read_csv('../../../../soil-prediction/iPAGE SoilData.csv')

# df = df.drop('Data Collection Year', axis=1)
df = df.fillna('UNK')

## filter out outliers (get this from visualization)

df = df[df['SOC (%)'] < 10]
df = df[df['Nitrogen N (%)'] < 0.3]
df = df[df['Potassium K (meq/100)'] < 10]

target_cols = ['SOC (%)', 'Boron B (ug/g)', 'Zinc Zn (ug/g)']

In [4]:
df.columns

Index(['Area', 'Data Collection Year', 'soil group', 'Land class',
       'knit (surface)', 'pH', 'SOC (%)', 'Nitrogen N (%)',
       'Potassium K (meq/100)', 'Phosphorus P (ug/g)', 'Sulfur S (ug/g)',
       'Boron B (ug/g)', 'Zinc Zn (ug/g)'],
      dtype='object')

In [5]:
num_cols = ['pH', 'Nitrogen N (%)', 'Potassium K (meq/100)', 'Phosphorus P (ug/g)', 'Sulfur S (ug/g)']
cat_cols = ['Area', 'soil group', 'Land class', 'knit (surface)']

In [6]:
# df.head()
train_df = df[df['Data Collection Year']<2016]
test_df = df[df['Data Collection Year']>=2016]

train_df = train_df.drop('Data Collection Year', axis=1)
test_df = test_df.drop('Data Collection Year', axis=1)

print('total train samples:', len(train_df))
print('total test samples:', len(test_df))

train_labels = train_df[target_cols]
train_features = train_df.drop(target_cols, axis=1)

test_labels = test_df[target_cols]
test_features = test_df.drop(target_cols, axis=1)

total train samples: 514
total test samples: 99


In [7]:
num_transformer = Pipeline(
    steps=[
        ('scaler', StandardScaler())
    ]
)

cat_transformer = Pipeline(
    steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]
)

col_transformer = ColumnTransformer(
        transformers=[
            ('num', num_transformer, num_cols),
            ('cat', cat_transformer, cat_cols)
        ],
        remainder= 'passthrough'
    )

In [8]:
col_transformer.fit(train_df)

# col_names = list(col_transformer.get_feature_names_out())

# col_names = [s.replace('cat__','').replace('num__','').replace('remainder__','') for s in col_names]

train_data = col_transformer.transform(train_df).toarray()
test_data = col_transformer.transform(test_df).toarray()

# train_df = pd.DataFrame(data=train_data, columns=col_names)
# test_df = pd.DataFrame(data=test_data, columns=col_names)

In [9]:
train_labels.columns

Index(['SOC (%)', 'Boron B (ug/g)', 'Zinc Zn (ug/g)'], dtype='object')

In [11]:
alpha = [1, 5, 10]
solver = ['sparse_cg']
l1_ratio = [0.3, 0.5, 0.7, 1.0]
max_iter = [100, 500, 1000]
n_estimator = [100,300,500]
learning_rate = [0.1, 0.5, 1.0, 5.0]
loss_func = ['linear', 'square']
min_samples_split = [2, 3, 5, 7]
min_samples_leaf = [1, 3, 5, 7]

search_params = {
    'ridge': {
        'alpha': alpha,
        'max_iter': max_iter
    },
    'lasso': {
        'alpha': alpha,
        'max_iter': max_iter
    },
    'elasticNet': {
        'alpha': alpha,
        'max_iter': max_iter,
        'l1_ratio': l1_ratio
    },
    'adaboost': {
        'n_estimators': n_estimator,
        'learning_rate': learning_rate,
        'loss': loss_func
    },
    'bagging': {
        'n_estimators': n_estimator
    },
    'rf': {
        'n_estimators': n_estimator,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
    },
    'xgb': {
        'n_estimators': n_estimator,
        'learning_rate': learning_rate
    }
}

models = {
    'ridge': Ridge(random_state=0), 
    'lasso': Lasso(random_state=0), 
    'elasticNet': ElasticNet(random_state=0), 
    'adaboost': AdaBoostRegressor(random_state=0), 
    'bagging': BaggingRegressor(random_state=0), 
    'rf': RandomForestRegressor(random_state=0), 
    'xgb': XGBRegressor(random_state=0)
}

model_list = list(models.keys())


In [13]:
def train_eval_model(target_col_name):

    for model_name in model_list:
        model = models[model_name]
        params = search_params[model_name]

        print('fitting model', model_name)

        gs = GridSearchCV(
            model,
            param_grid=params,
            scoring='neg_root_mean_squared_error',
            n_jobs=4,
            cv=10,
            verbose = True
        )

        gs.fit(train_data, train_labels[target_col_name])

        print(gs.best_estimator_)

        pred = gs.predict(test_data)

        r2 = r2_score(test_labels[target_col_name], pred)
        mae = mean_absolute_error(test_labels[target_col_name], pred)
        mse = mean_squared_error(test_labels[target_col_name], pred)

        print('R2:', round(r2,2))
        print('MAE:', round(mae,2))
        print('MSE:', round(mse,2))

        print('-'*30)


In [14]:
train_eval_model('SOC (%)')

fitting model ridge
Fitting 10 folds for each of 9 candidates, totalling 90 fits
Ridge(alpha=1, max_iter=100, random_state=0)
R2: 1.0
MAE: 0.0
MSE: 0.0
------------------------------
fitting model lasso
Fitting 10 folds for each of 9 candidates, totalling 90 fits
Lasso(alpha=1, max_iter=100, random_state=0)
R2: -0.02
MAE: 0.59
MSE: 0.5
------------------------------
fitting model elasticNet
Fitting 10 folds for each of 36 candidates, totalling 360 fits
ElasticNet(alpha=1, l1_ratio=0.3, max_iter=100, random_state=0)
R2: 0.38
MAE: 0.46
MSE: 0.31
------------------------------
fitting model adaboost
Fitting 10 folds for each of 24 candidates, totalling 240 fits
AdaBoostRegressor(loss='square', n_estimators=500, random_state=0)
R2: 1.0
MAE: 0.02
MSE: 0.0
------------------------------
fitting model bagging
Fitting 10 folds for each of 3 candidates, totalling 30 fits
BaggingRegressor(n_estimators=100, random_state=0)
R2: 1.0
MAE: 0.01
MSE: 0.0
------------------------------
fitting model rf

Traceback (most recent call last):
  File "/Users/cpor0006/anaconda3/envs/for-ml/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/cpor0006/anaconda3/envs/for-ml/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/Users/cpor0006/anaconda3/envs/for-ml/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/Users/cpor0006/anaconda3/envs/for-ml/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/Users/cpor0006/anaconda3/envs/for-ml/lib/python3.10/site-packages/sklearn/metrics/_regression.py", line 581, in root_mean_squared_error
    mean_squared_error(
  File "/Users/cpor00

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.5, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=500, n_jobs=None,
             num_parallel_tree=None, random_state=0, ...)
R2: 1.0
MAE: 0.03
MSE: 0.0
------------------------------


In [15]:
train_eval_model('Boron B (ug/g)')

fitting model ridge
Fitting 10 folds for each of 9 candidates, totalling 90 fits
Ridge(alpha=1, max_iter=1000, random_state=0)
R2: 0.99
MAE: 0.03
MSE: 0.0
------------------------------
fitting model lasso
Fitting 10 folds for each of 9 candidates, totalling 90 fits
Lasso(alpha=1, max_iter=100, random_state=0)
R2: -0.15
MAE: 0.35
MSE: 0.28
------------------------------
fitting model elasticNet
Fitting 10 folds for each of 36 candidates, totalling 360 fits
ElasticNet(alpha=1, l1_ratio=0.3, max_iter=100, random_state=0)
R2: -0.15
MAE: 0.35
MSE: 0.28
------------------------------
fitting model adaboost
Fitting 10 folds for each of 24 candidates, totalling 240 fits
AdaBoostRegressor(loss='square', n_estimators=500, random_state=0)
R2: 0.93
MAE: 0.05
MSE: 0.02
------------------------------
fitting model bagging
Fitting 10 folds for each of 3 candidates, totalling 30 fits
BaggingRegressor(n_estimators=300, random_state=0)
R2: 0.89
MAE: 0.05
MSE: 0.03
------------------------------
fitting

Traceback (most recent call last):
  File "/Users/cpor0006/anaconda3/envs/for-ml/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/cpor0006/anaconda3/envs/for-ml/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/Users/cpor0006/anaconda3/envs/for-ml/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/Users/cpor0006/anaconda3/envs/for-ml/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/Users/cpor0006/anaconda3/envs/for-ml/lib/python3.10/site-packages/sklearn/metrics/_regression.py", line 581, in root_mean_squared_error
    mean_squared_error(
  File "/Users/cpor00

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=500, n_jobs=None,
             num_parallel_tree=None, random_state=0, ...)
R2: 0.95
MAE: 0.04
MSE: 0.01
------------------------------


In [16]:
train_eval_model('Zinc Zn (ug/g)')

fitting model ridge
Fitting 10 folds for each of 9 candidates, totalling 90 fits
Ridge(alpha=1, max_iter=100, random_state=0)
R2: 1.0
MAE: 0.01
MSE: 0.0
------------------------------
fitting model lasso
Fitting 10 folds for each of 9 candidates, totalling 90 fits
Lasso(alpha=1, max_iter=100, random_state=0)
R2: -0.12
MAE: 0.77
MSE: 2.88
------------------------------
fitting model elasticNet
Fitting 10 folds for each of 36 candidates, totalling 360 fits
ElasticNet(alpha=1, l1_ratio=0.3, max_iter=100, random_state=0)
R2: 0.32
MAE: 0.6
MSE: 1.74
------------------------------
fitting model adaboost
Fitting 10 folds for each of 24 candidates, totalling 240 fits
AdaBoostRegressor(loss='square', n_estimators=300, random_state=0)
R2: 0.71
MAE: 0.22
MSE: 0.76
------------------------------
fitting model bagging
Fitting 10 folds for each of 3 candidates, totalling 30 fits
BaggingRegressor(n_estimators=100, random_state=0)
R2: 0.73
MAE: 0.18
MSE: 0.68
------------------------------
fitting mod

Traceback (most recent call last):
  File "/Users/cpor0006/anaconda3/envs/for-ml/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/cpor0006/anaconda3/envs/for-ml/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/Users/cpor0006/anaconda3/envs/for-ml/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/Users/cpor0006/anaconda3/envs/for-ml/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/Users/cpor0006/anaconda3/envs/for-ml/lib/python3.10/site-packages/sklearn/metrics/_regression.py", line 581, in root_mean_squared_error
    mean_squared_error(
  File "/Users/cpor00

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=100, n_jobs=None,
             num_parallel_tree=None, random_state=0, ...)
R2: 0.75
MAE: 0.17
MSE: 0.64
------------------------------


Traceback (most recent call last):
  File "/Users/cpor0006/anaconda3/envs/for-ml/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/cpor0006/anaconda3/envs/for-ml/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/Users/cpor0006/anaconda3/envs/for-ml/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/Users/cpor0006/anaconda3/envs/for-ml/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/Users/cpor0006/anaconda3/envs/for-ml/lib/python3.10/site-packages/sklearn/metrics/_regression.py", line 581, in root_mean_squared_error
    mean_squared_error(
  File "/Users/cpor00