In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.inspection import permutation_importance
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

import dagshub
dagshub.init(repo_owner='Omdena', repo_name='IPage', mlflow=True)

import mlflow

In [5]:
data = pd.read_csv('merged_v4.csv')

data.drop(columns=['longitude', 'latitude', 'Soil group', 'SOC', 'Zinc'], inplace=True)

# Define columns
numerical_cols = ['pH', 'Nitrogen', 'Potassium', 'Phosphorus',
                  'Sulfur', 'Sand', 'Silt', 'Clay']

one_hot_cols = ['Land class', 'Soil type']

target_mean_col = 'Area'

target_col = 'Boron'

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class TargetMeanEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.target_means_ = {}
    
    def fit(self, X, y):
        if isinstance(X, pd.Series):
            X = X.to_frame()
        elif isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        
        self.target_means_ = {
            col: X[col].map(y.groupby(X[col]).mean())
            for col in X.columns
        }
        return self
    
    def transform(self, X):
        if isinstance(X, pd.Series):
            X = X.to_frame()
        elif isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        
        X_encoded = X.copy()
        for col in X_encoded.columns:
            X_encoded[col] = X_encoded[col].map(self.target_means_[col])
        return X_encoded.fillna(0)  # Handle unseen categories with 0

In [7]:
# Define hyperparameter distributions for both models
param_dist_rf = {
    'regressor__n_estimators': [100, 200, 300, 400],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__max_features': [None, 'sqrt', 'log2'],
    'regressor__bootstrap': [True, False]
}

param_dist_xgb = {
    'regressor__n_estimators': [100, 200, 300, 400],
    'regressor__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'regressor__max_depth': [3, 6, 9],
    'regressor__min_child_weight': [1, 3, 5],
    'regressor__subsample': [0.7, 0.8, 1.0],
    'regressor__colsample_bytree': [0.7, 0.8, 1.0],
    'regressor__gamma': [0, 0.1, 0.5],
    'regressor__reg_alpha': [0, 1, 5],
    'regressor__reg_lambda': [1, 5, 10]
}

# Set up the models
models = {
    'RandomForest': (RandomForestRegressor(random_state=0), param_dist_rf),
    'XGB': (XGBRegressor(random_state=0), param_dist_xgb)
}

In [8]:
# Split data into features (X) and targets (y)
X = data.drop(columns=target_col)
y = data[target_col]

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        # One-hot encode categorical features
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), one_hot_cols),
        # Target mean encode high cardinality feature
        ('target_mean_enc', TargetMeanEncoder(), target_mean_col),
        # Scale numerical features
        ('scaler', MinMaxScaler(), numerical_cols)
    ],
    remainder='drop'  # Drop all other columns not specified (e.g., raw categorical columns)
)

# Perform RandomizedSearchCV
best_params = {}
results = {}
# Perform RandomizedSearchCV
for name, (regressor, param_dist) in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),  # Preprocessing step
        ('regressor', regressor)        # Regressor
    ])
    
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_dist,
        n_iter=50,
        scoring=make_scorer(r2_score),
        cv=5,
        n_jobs=-1,
        random_state=0
    )
    
    random_search.fit(X_train, y_train)
    best_params[name] = random_search.best_params_
    results[name] = random_search.cv_results_
    
    print(f"Best parameters for {name}: {random_search.best_params_}")
    print(f"Best score for {name}: {random_search.best_score_}")

Best parameters for RandomForest: {'regressor__n_estimators': 200, 'regressor__min_samples_split': 10, 'regressor__min_samples_leaf': 4, 'regressor__max_features': None, 'regressor__max_depth': 20, 'regressor__bootstrap': True}
Best score for RandomForest: 0.7174628393030688
Best parameters for XGB: {'regressor__subsample': 0.7, 'regressor__reg_lambda': 5, 'regressor__reg_alpha': 0, 'regressor__n_estimators': 100, 'regressor__min_child_weight': 5, 'regressor__max_depth': 3, 'regressor__learning_rate': 0.1, 'regressor__gamma': 0, 'regressor__colsample_bytree': 0.7}
Best score for XGB: 0.7132400983668158


In [9]:
models = {
    'RandomForest': RandomForestRegressor(random_state=0,
                                            n_estimators=200,
                                            min_samples_split=10,
                                            min_samples_leaf=4,
                                            max_features=None,
                                            max_depth=20,
                                            bootstrap=True
                                          ),
    'XGB': XGBRegressor(random_state=0,
                        subsample=0.7,
                        reg_lambda=5,
                        reg_alpha=0,
                        n_estimators=100,
                        min_child_weight=5,
                        max_depth=3,
                        learning_rate=0.1,
                        gamma=0,
                        colsample_bytree=0.7
                        )
        }

In [10]:
# Initialize MLflow
mlflow.set_experiment('Taylor_v4_Boron_Tuned_20241227')
mlflow.autolog()

2024/12/27 16:01:08 INFO mlflow.tracking.fluent: Experiment with name 'Taylor_v4_Boron_Tuned_20241227' does not exist. Creating a new experiment.
2024/12/27 16:01:09 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2024/12/27 16:01:09 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2024/12/27 16:01:09 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


In [11]:
# Split data into features (X) and targets (y)
X = data.drop(columns=target_col)
y = data[target_col]

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        # One-hot encode categorical features
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), one_hot_cols),
        # Target mean encode high cardinality feature
        ('target_mean_enc', TargetMeanEncoder(), target_mean_col),
        # Scale numerical features
        ('scaler', MinMaxScaler(), numerical_cols)
    ],
    remainder='drop'  # Drop all other columns not specified (e.g., raw categorical columns)
)

# Train and test multiple models
for model_name, model in models.items():

    # Create pipeline
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('model', model)
    ])
        
    # Cross-validation
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
    print(f"Cross-validated R² scores: {scores}")
    print(f"Mean R²: {scores.mean()}")
    
    # Fit and test the pipeline
    pipeline.fit(X_train, y_train)
    print(f"Test R²: {pipeline.score(X_test, y_test)}")

2024/12/27 16:01:10 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'cd1fdbe1a5e34e1eab09db531b259dca', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run rebellious-bee-569 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12/runs/cd1fdbe1a5e34e1eab09db531b259dca
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12


2024/12/27 16:01:18 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'eebe84d82e6d45e79a1d74caad3cf572', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run abrasive-midge-497 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12/runs/eebe84d82e6d45e79a1d74caad3cf572
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12


2024/12/27 16:01:30 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '8d20ca7f94ac49eba2925a8dac25c417', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run burly-bird-245 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12/runs/8d20ca7f94ac49eba2925a8dac25c417
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12


2024/12/27 16:01:52 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '2f7182a9cf054855bdd87043d04b9192', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run youthful-boar-809 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12/runs/2f7182a9cf054855bdd87043d04b9192
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12


2024/12/27 16:02:22 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'fd2edd0888e04a43bec6453d06b9bf43', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run flawless-cow-27 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12/runs/fd2edd0888e04a43bec6453d06b9bf43
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12
Cross-validated R² scores: [0.73864585 0.74583163 0.69945988 0.7199129  0.68346394]
Mean R²: 0.7174628393030688


2024/12/27 16:02:52 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'ea5c007b372a4faeb4dda7e4e9db65ab', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run polite-squirrel-760 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12/runs/ea5c007b372a4faeb4dda7e4e9db65ab
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12
Test R²: 0.6971291661061492


2024/12/27 16:03:26 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '07ba1fdc27c947569c1542addfbb3301', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run angry-finch-183 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12/runs/07ba1fdc27c947569c1542addfbb3301
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12


2024/12/27 16:03:56 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'a70f1d6bc1e14f56a140bbc7e22aec06', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run blushing-koi-332 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12/runs/a70f1d6bc1e14f56a140bbc7e22aec06
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12


2024/12/27 16:04:25 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '8aeda9d1581e4518ae4da07ebd363c0d', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run omniscient-elk-632 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12/runs/8aeda9d1581e4518ae4da07ebd363c0d
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12


2024/12/27 16:04:55 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '08cc5f0bbb6e451087f56356a6cd49ec', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run thundering-deer-233 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12/runs/08cc5f0bbb6e451087f56356a6cd49ec
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12


2024/12/27 16:05:24 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'eca56cccae01422eb509c8fa45add0d9', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run defiant-duck-829 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12/runs/eca56cccae01422eb509c8fa45add0d9
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12
Cross-validated R² scores: [0.72787508 0.73771187 0.70404866 0.71692936 0.67963552]
Mean R²: 0.7132400983668158


2024/12/27 16:05:55 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'fb0c06d098424a8aa8b8c6761a457fc7', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run learned-kit-466 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12/runs/fb0c06d098424a8aa8b8c6761a457fc7
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/12
Test R²: 0.7047904753270067
