In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.inspection import permutation_importance
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

import dagshub
dagshub.init(repo_owner='Omdena', repo_name='IPage', mlflow=True)

import mlflow

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

class TargetMeanEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.target_means_ = {}
    
    def fit(self, X, y):
        if isinstance(X, pd.Series):
            X = X.to_frame()
        elif isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        
        self.target_means_ = {
            col: X[col].map(y.groupby(X[col]).mean())
            for col in X.columns
        }
        return self
    
    def transform(self, X):
        if isinstance(X, pd.Series):
            X = X.to_frame()
        elif isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        
        X_encoded = X.copy()
        for col in X_encoded.columns:
            X_encoded[col] = X_encoded[col].map(self.target_means_[col])
        return X_encoded.fillna(0)  # Handle unseen categories with 0

In [6]:
data = pd.read_csv('merged_v4.csv')

data.drop(columns=['longitude', 'latitude', 'Soil group', 'Boron', 'Zinc'], inplace=True)

# Define columns
numerical_cols = ['pH', 'Nitrogen', 'Potassium', 'Phosphorus',
                  'Sulfur', 'Sand', 'Silt', 'Clay']

one_hot_cols = ['Land class', 'Soil type']

target_mean_col = 'Area'

target_col = 'SOC'

In [7]:
# Define hyperparameter distributions for both models
param_dist_rf = {
    'regressor__n_estimators': [100, 200, 300, 400],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__max_features': [None, 'sqrt', 'log2'],
    'regressor__bootstrap': [True, False]
}

param_dist_xgb = {
    'regressor__n_estimators': [100, 200, 300, 400],
    'regressor__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'regressor__max_depth': [3, 6, 9],
    'regressor__min_child_weight': [1, 3, 5],
    'regressor__subsample': [0.7, 0.8, 1.0],
    'regressor__colsample_bytree': [0.7, 0.8, 1.0],
    'regressor__gamma': [0, 0.1, 0.5],
    'regressor__reg_alpha': [0, 1, 5],
    'regressor__reg_lambda': [1, 5, 10]
}

# Set up the models
models = {
    'RandomForest': (RandomForestRegressor(random_state=0), param_dist_rf),
    'XGB': (XGBRegressor(random_state=0), param_dist_xgb)
}

In [8]:
# Split data into features (X) and targets (y)
X = data.drop(columns=target_col)
y = data[target_col]

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        # One-hot encode categorical features
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), one_hot_cols),
        # Target mean encode high cardinality feature
        ('target_mean_enc', TargetMeanEncoder(), target_mean_col),
        # Scale numerical features
        ('scaler', MinMaxScaler(), numerical_cols)
    ],
    remainder='drop'  # Drop all other columns not specified (e.g., raw categorical columns)
)

# Perform RandomizedSearchCV
best_params = {}
results = {}
# Perform RandomizedSearchCV
for name, (regressor, param_dist) in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),  # Preprocessing step
        ('regressor', regressor)        # Regressor
    ])
    
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_dist,
        n_iter=50,
        scoring=make_scorer(r2_score),
        cv=5,
        n_jobs=-1,
        random_state=0
    )
    
    random_search.fit(X_train, y_train)
    best_params[name] = random_search.best_params_
    results[name] = random_search.cv_results_
    
    print(f"Best parameters for {name}: {random_search.best_params_}")
    print(f"Best score for {name}: {random_search.best_score_}")

Best parameters for RandomForest: {'regressor__n_estimators': 200, 'regressor__min_samples_split': 5, 'regressor__min_samples_leaf': 1, 'regressor__max_features': None, 'regressor__max_depth': 10, 'regressor__bootstrap': True}
Best score for RandomForest: 0.8359795957853903
Best parameters for XGB: {'regressor__subsample': 0.8, 'regressor__reg_lambda': 1, 'regressor__reg_alpha': 0, 'regressor__n_estimators': 100, 'regressor__min_child_weight': 1, 'regressor__max_depth': 6, 'regressor__learning_rate': 0.05, 'regressor__gamma': 0.5, 'regressor__colsample_bytree': 0.8}
Best score for XGB: 0.8367698438829831


In [9]:
models = {
    'RandomForest': RandomForestRegressor(random_state=0,
                                            n_estimators=200,
                                            min_samples_split=15,
                                            min_samples_leaf=1,
                                            max_features=None,
                                            max_depth=10,
                                            bootstrap=True
                                          ),
    'XGB': XGBRegressor(random_state=0,
                        subsample=0.8,
                        reg_lambda=1,
                        reg_alpha=0,
                        n_estimators=100,
                        min_child_weight=1,
                        max_depth=6,
                        learning_rate=0.05,
                        gamma=0.5,
                        colsample_bytree=0.8
                        )
        }

In [10]:
# Initialize MLflow
mlflow.set_experiment('Taylor_v4_SOC_Tuned_20241227')
mlflow.autolog()

2024/12/27 15:56:22 INFO mlflow.tracking.fluent: Experiment with name 'Taylor_v4_SOC_Tuned_20241227' does not exist. Creating a new experiment.
2024/12/27 15:56:23 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2024/12/27 15:56:23 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2024/12/27 15:56:23 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


In [11]:
# Split data into features (X) and targets (y)
X = data.drop(columns=target_col)
y = data[target_col]

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        # One-hot encode categorical features
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), one_hot_cols),
        # Target mean encode high cardinality feature
        ('target_mean_enc', TargetMeanEncoder(), target_mean_col),
        # Scale numerical features
        ('scaler', MinMaxScaler(), numerical_cols)
    ],
    remainder='drop'  # Drop all other columns not specified (e.g., raw categorical columns)
)

# Train and test multiple models
for model_name, model in models.items():

    # Create pipeline
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('model', model)
    ])
        
    # Cross-validation
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
    print(f"Cross-validated R² scores: {scores}")
    print(f"Mean R²: {scores.mean()}")
    
    # Fit and test the pipeline
    pipeline.fit(X_train, y_train)
    print(f"Test R²: {pipeline.score(X_test, y_test)}")

2024/12/27 15:56:48 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '463a95a3539440d5ae2ade83d82034db', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run likeable-finch-587 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10/runs/463a95a3539440d5ae2ade83d82034db
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10


2024/12/27 15:56:57 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '94953caf0ead4b3d92f611ab9ec17f21', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run ambitious-newt-764 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10/runs/94953caf0ead4b3d92f611ab9ec17f21
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10


2024/12/27 15:57:08 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'e05d0332666e4b439ef918fb244c73d5', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run placid-fly-194 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10/runs/e05d0332666e4b439ef918fb244c73d5
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10


2024/12/27 15:57:23 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '0a08f1b531ae49d280d45119190e3fb7', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run clean-hen-636 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10/runs/0a08f1b531ae49d280d45119190e3fb7
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10


2024/12/27 15:57:38 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '00d0df51ad0a412d94f2490f030970ff', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run marvelous-panda-816 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10/runs/00d0df51ad0a412d94f2490f030970ff
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10
Cross-validated R² scores: [0.86735483 0.83526252 0.83883711 0.84228873 0.77836465]
Mean R²: 0.8324215691700523


2024/12/27 15:57:53 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '4415e175e38b458788750442049e0a8f', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run agreeable-conch-167 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10/runs/4415e175e38b458788750442049e0a8f
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10
Test R²: 0.8029266307163244


2024/12/27 15:58:09 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '12953f2d8afa40f6a32f599504e0568f', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run zealous-kite-178 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10/runs/12953f2d8afa40f6a32f599504e0568f
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10


2024/12/27 15:58:24 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '54c39233b252434eb7f7e8782c978278', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run casual-toad-799 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10/runs/54c39233b252434eb7f7e8782c978278
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10


2024/12/27 15:58:39 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '0b409d9d64fe4a9894447e65faf928a9', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run nervous-shark-847 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10/runs/0b409d9d64fe4a9894447e65faf928a9
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10


2024/12/27 15:58:54 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '49a9b2238ac74f518c419987f2b6c06a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run bold-croc-50 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10/runs/49a9b2238ac74f518c419987f2b6c06a
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10


2024/12/27 15:59:09 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '140684e618bb4cf7a90d3a5f01f5d2e1', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run colorful-smelt-685 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10/runs/140684e618bb4cf7a90d3a5f01f5d2e1
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10
Cross-validated R² scores: [0.85643938 0.83726338 0.83902261 0.84625972 0.80486413]
Mean R²: 0.8367698438829831


2024/12/27 15:59:24 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'c0ffb73e46b042808a87a3f5f688b4e0', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run exultant-quail-158 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10/runs/c0ffb73e46b042808a87a3f5f688b4e0
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/10
Test R²: 0.7952064277348081
