In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.inspection import permutation_importance

In [27]:
# Read both soil datasets from CSV files in the working directory
ipage = pd.read_csv('./ipage_scaled.csv')
# Rows with any missing value are removed from the isda frame only
isda = pd.read_csv('./isda_scaled.csv').dropna()

In [28]:
# Add initial engineered features
def _add_nutrient_features(frame):
    """Add the macronutrient total and nutrient-ratio columns to ``frame`` in place.

    Expects the columns 'Nitrogen', 'Phosphorus', 'Potassium' and 'Sulfur'.
    Ratios are plain element-wise divisions; no zero-denominator guarding is done.
    """
    nitrogen = frame['Nitrogen']
    phosphorus = frame['Phosphorus']
    potassium = frame['Potassium']

    frame['Total Macronutrient'] = nitrogen + phosphorus + potassium
    frame['NP Ratio'] = nitrogen / phosphorus
    frame['NK Ratio'] = nitrogen / potassium
    frame['KP Ratio'] = potassium / phosphorus
    frame['Sulfur Macronutrient Ratio'] = frame['Total Macronutrient'] / frame['Sulfur']


# Both datasets receive exactly the same set of derived columns
_add_nutrient_features(ipage)
_add_nutrient_features(isda)

In [29]:
# SOC, Boron and Zinc are the prediction targets; every other column is a predictor
_targets = ['SOC', 'Boron', 'Zinc']
ipage_X, ipage_y = ipage.drop(columns=_targets), ipage[_targets]
isda_X, isda_y = isda.drop(columns=_targets), isda[_targets]

# Hold out 20% of each dataset for testing, with a fixed seed so the split is repeatable
ipage_X_train, ipage_X_test, ipage_y_train, ipage_y_test = train_test_split(
    ipage_X, ipage_y, test_size=0.2, random_state=0
)
isda_X_train, isda_X_test, isda_y_train, isda_y_test = train_test_split(
    isda_X, isda_y, test_size=0.2, random_state=0
)

# Metrics written by evaluate_model, keyed by model name
results = {}

# Define function for evaluating models
def evaluate_model(y_true, y_pred, model_name):
    """Print the error metrics for one set of predictions and record them.

    MSE and MAE are requested per target (``multioutput='raw_values'``), while
    R² is requested as a single aggregate (``multioutput='variance_weighted'``).
    The three metrics are stored in the module-level ``results`` dict under
    ``model_name``. Nothing is returned.
    """
    metrics = {
        'MSE': mean_squared_error(y_true, y_pred, multioutput='raw_values'),
        'MAE': mean_absolute_error(y_true, y_pred, multioutput='raw_values'),
        'R² Score': r2_score(y_true, y_pred, multioutput='variance_weighted'),
    }

    print(f"{model_name} Performance:")
    # The dict keys double as the printed labels, in insertion order
    for label, value in metrics.items():
        print(f"{label}: {value}")

    results[model_name] = metrics
    print("-" * 30)

# Estimators to compare; each key is the label used in the printed reports
models = dict([
    ("Linear Regression", LinearRegression()),
    ("Random Forest", RandomForestRegressor(random_state=42)),
    ("XGBoost", XGBRegressor(objective='reg:squarederror')),
])

# Each dataset is a (X_train, X_test, y_train, y_test) tuple
datasets = dict(
    ipage=(ipage_X_train, ipage_X_test, ipage_y_train, ipage_y_test),
    isda=(isda_X_train, isda_X_test, isda_y_train, isda_y_test),
)

def evaluate_datasets(models, datasets, evaluate_model):
    """Fit every model on every dataset, then report metrics and feature importances.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        The same estimator objects are refit for each dataset.
    datasets : dict
        Maps a dataset name to a ``(X_train, X_test, y_train, y_test)`` tuple.
        ``X_train`` must expose ``.columns``; it labels the importance table.
    evaluate_model : callable
        Called as ``evaluate_model(y_test, y_pred, "<model> on <dataset>")``.

    Returns
    -------
    None
        All results are reported through ``print`` and ``evaluate_model``.
    """
    # Loop through datasets
    for dataset_name, (X_train, X_test, y_train, y_test) in datasets.items():
        print(f"Evaluating models on {dataset_name} dataset:")

        # A one-column DataFrame target (e.g. ``df[['SOC']]``) is a column vector;
        # scikit-learn estimators that expect a 1-D target warn about it on fit.
        # Flatten it for fitting only. Multi-column targets are passed unchanged,
        # and y_test keeps its original shape for scoring.
        fit_target = y_train
        if isinstance(y_train, pd.DataFrame) and y_train.shape[1] == 1:
            fit_target = y_train.iloc[:, 0]

        # Loop through models
        for model_name, model in models.items():
            # Train the model
            model.fit(X_train, fit_target)

            # Make predictions
            y_pred = model.predict(X_test)

            # Evaluate the model
            evaluate_model(y_test, y_pred, f"{model_name} on {dataset_name}")

            # Calculate permutation importance on the held-out data
            perm_importance = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)
            importance_df = pd.DataFrame({
                'Feature': X_train.columns,
                'Permutation Importance': perm_importance.importances_mean
            }).sort_values(by='Permutation Importance', ascending=False)

            print(f"Permutation Importance for {model_name}:")
            print(importance_df)
            print("-" * 30)

In [30]:
# Run every model on both datasets with all three targets
evaluate_datasets(models=models, datasets=datasets, evaluate_model=evaluate_model)

Evaluating models on ipage dataset:
Linear Regression on ipage Performance:
MSE: [0.54813882 1.09531533 1.09913761]
MAE: [0.54935044 0.75117384 0.78620711]
R² Score: 0.20426030020354866
------------------------------
Permutation Importance for Linear Regression:
                      Feature  Permutation Importance
0                    Nitrogen                0.202157
1                  Phosphorus                0.063551
5         Total Macronutrient                0.061016
2                   Potassium                0.021958
4                          pH                0.015487
3                      Sulfur                0.012116
6                    NP Ratio                0.008247
7                    NK Ratio                0.001122
9  Sulfur Macronutrient Ratio               -0.000032
8                    KP Ratio               -0.001591
------------------------------
Random Forest on ipage Performance:
MSE: [0.49100159 1.01694082 1.11201695]
MAE: [0.54238082 0.73018625 0.735597

## Let's test for only SOC ##

In [31]:
# Same predictors as the three-target run, but SOC is now the only target
_all_targets = ['SOC', 'Boron', 'Zinc']
ipage_SOC_X, ipage_SOC_y = ipage.drop(columns=_all_targets), ipage[['SOC']]
isda_SOC_X, isda_SOC_y = isda.drop(columns=_all_targets), isda[['SOC']]

# 80/20 train/test split with a fixed seed
ipage_SOC_X_train, ipage_SOC_X_test, ipage_SOC_y_train, ipage_SOC_y_test = train_test_split(
    ipage_SOC_X, ipage_SOC_y, test_size=0.2, random_state=0
)
isda_SOC_X_train, isda_SOC_X_test, isda_SOC_y_train, isda_SOC_y_test = train_test_split(
    isda_SOC_X, isda_SOC_y, test_size=0.2, random_state=0
)

# Start this experiment with an empty results store
results = {}

# Rebind `datasets` to the SOC-only splits
datasets = dict(
    ipage=(ipage_SOC_X_train, ipage_SOC_X_test, ipage_SOC_y_train, ipage_SOC_y_test),
    isda=(isda_SOC_X_train, isda_SOC_X_test, isda_SOC_y_train, isda_SOC_y_test),
)

evaluate_datasets(models, datasets, evaluate_model)

Evaluating models on ipage dataset:
Linear Regression on ipage Performance:
MSE: [0.54813882]
MAE: [0.54935044]
R² Score: 0.5118935175720203
------------------------------
Permutation Importance for Linear Regression:
                      Feature  Permutation Importance
0                    Nitrogen                0.574928
1                  Phosphorus                0.176443
5         Total Macronutrient                0.115220
2                   Potassium                0.022008
7                    NK Ratio                0.004606
6                    NP Ratio                0.004377
9  Sulfur Macronutrient Ratio                0.000003
8                    KP Ratio               -0.001005
4                          pH               -0.002756
3                      Sulfur               -0.008980
------------------------------
Random Forest on ipage Performance:
MSE: [0.52007793]
MAE: [0.54314175]
R² Score: 0.5368811691108668
------------------------------


  return fit_method(estimator, *args, **kwargs)


Permutation Importance for Random Forest:
                      Feature  Permutation Importance
0                    Nitrogen                0.956418
1                  Phosphorus                0.021405
2                   Potassium                0.004605
4                          pH                0.002224
9  Sulfur Macronutrient Ratio               -0.001357
5         Total Macronutrient               -0.002676
3                      Sulfur               -0.003314
7                    NK Ratio               -0.006698
8                    KP Ratio               -0.007307
6                    NP Ratio               -0.010308
------------------------------
XGBoost on ipage Performance:
MSE: [0.64041816]
MAE: [0.58140446]
R² Score: 0.4297206442381943
------------------------------
Permutation Importance for XGBoost:
                      Feature  Permutation Importance
0                    Nitrogen                0.879115
2                   Potassium                0.016240
8        

  return fit_method(estimator, *args, **kwargs)


Random Forest on isda Performance:
MSE: [0.06457244]
MAE: [0.16238077]
R² Score: 0.7261976307420726
------------------------------
Permutation Importance for Random Forest:
                      Feature  Permutation Importance
0                    Nitrogen                1.320100
2                   Potassium                0.010031
7                    NK Ratio                0.002195
3                      Sulfur                0.001739
6                    NP Ratio               -0.000428
9  Sulfur Macronutrient Ratio               -0.001338
1                  Phosphorus               -0.002792
8                    KP Ratio               -0.003989
4                          pH               -0.004680
5         Total Macronutrient               -0.010160
------------------------------
XGBoost on isda Performance:
MSE: [0.07762127]
MAE: [0.16927748]
R² Score: 0.6708675319395047
------------------------------
Permutation Importance for XGBoost:
                      Feature  Permutatio

## Now let's use SOC as part of the training data to help predict Boron and Zinc ##

In [32]:
def _boron_zinc_features(frame):
    """Build the predictor frame for the Boron/Zinc task.

    Returns a new frame with every column of ``frame`` except 'Boron' and
    'Zinc' (so SOC is kept as a predictor), plus three ratio columns that
    divide by SOC.
    """
    features = frame.drop(columns=['Boron', 'Zinc'])
    soc = features['SOC']

    # SOC specific features
    features['Macronutrient SOC Ratio'] = frame['Total Macronutrient'] / soc
    features['Sulfur SOC Ratio'] = features['Sulfur'] / soc
    features['pH SOC Ratio'] = features['pH'] / soc
    return features


# Split datasets: Boron and Zinc are the targets, SOC moves to the predictor side
ipage_BZ_X = _boron_zinc_features(ipage)
ipage_BZ_y = ipage[['Boron', 'Zinc']]
isda_BZ_X = _boron_zinc_features(isda)
isda_BZ_y = isda[['Boron', 'Zinc']]

# 80/20 train/test split with a fixed seed
ipage_BZ_X_train, ipage_BZ_X_test, ipage_BZ_y_train, ipage_BZ_y_test = train_test_split(
    ipage_BZ_X, ipage_BZ_y, test_size=0.2, random_state=0
)
isda_BZ_X_train, isda_BZ_X_test, isda_BZ_y_train, isda_BZ_y_test = train_test_split(
    isda_BZ_X, isda_BZ_y, test_size=0.2, random_state=0
)

# Start this experiment with an empty results store
results = {}

# Rebind `datasets` to the Boron/Zinc splits
datasets = dict(
    ipage=(ipage_BZ_X_train, ipage_BZ_X_test, ipage_BZ_y_train, ipage_BZ_y_test),
    isda=(isda_BZ_X_train, isda_BZ_X_test, isda_BZ_y_train, isda_BZ_y_test),
)

evaluate_datasets(models, datasets, evaluate_model)

Evaluating models on ipage dataset:
Linear Regression on ipage Performance:
MSE: [1.09445843 1.05201335]
MAE: [0.75736254 0.75594515]
R² Score: 0.076231616047518
------------------------------
Permutation Importance for Linear Regression:
                       Feature  Permutation Importance
3                          SOC                0.131996
0                     Nitrogen                0.117318
6          Total Macronutrient                0.039578
2                    Potassium                0.035186
1                   Phosphorus                0.024348
5                           pH                0.023310
4                       Sulfur                0.020905
7                     NP Ratio                0.006882
11     Macronutrient SOC Ratio                0.005039
10  Sulfur Macronutrient Ratio               -0.000062
12            Sulfur SOC Ratio               -0.000539
8                     NK Ratio               -0.000734
9                     KP Ratio               -

**While it does appear that some of the engineered features have some predictive power,
I want to experiment with some of the categorical features. I'm thinking of using PCA and k-means to combine all the categorical and numerical features into a few categories, so that I can then potentially drop the existing categorical variables to reduce dimensionality. I'd also like to break soil(knit) down into its constituent components (sand, silt, clay) and potentially combine some of them with the numerical features, depending on what is scientifically appropriate.**