In [49]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.inspection import permutation_importance
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

import dagshub
dagshub.init(repo_owner='Omdena', repo_name='IPage', mlflow=True)

import mlflow

In [50]:
data = pd.read_csv('merged_v4.csv')

data.drop(columns=['longitude', 'latitude', 'SOC', 'Boron'], inplace=True)

# Define columns
numerical_cols = ['pH', 'Nitrogen', 'Potassium', 'Phosphorus',
                  'Sulfur', 'Sand', 'Silt', 'Clay']

one_hot_cols = ['Land class', 'Soil type']

target_mean_cols = ['Area', 'Soil group']

target_col = 'Zinc'

In [51]:
from sklearn.base import BaseEstimator, TransformerMixin

class TargetMeanEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.target_means_ = {}
    
    def fit(self, X, y):
        if isinstance(X, pd.Series):
            X = X.to_frame()
        elif isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        
        self.target_means_ = {
            col: X[col].map(y.groupby(X[col]).mean())
            for col in X.columns
        }
        return self
    
    def transform(self, X):
        if isinstance(X, pd.Series):
            X = X.to_frame()
        elif isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        
        X_encoded = X.copy()
        for col in X_encoded.columns:
            X_encoded[col] = X_encoded[col].map(self.target_means_[col])
        return X_encoded.fillna(0)  # Handle unseen categories with 0

In [52]:
# Define hyperparameter distributions for both models
param_dist_rf = {
    'regressor__n_estimators': [5, 10, 15],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__max_features': [None, 'sqrt', 'log2'],
    'regressor__bootstrap': [True, False]
}

param_dist_xgb = {
    'regressor__n_estimators': [5, 10, 15],
    'regressor__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'regressor__max_depth': [3, 6, 9],
    'regressor__min_child_weight': [1, 3, 5],
    'regressor__subsample': [0.7, 0.8, 1.0],
    'regressor__colsample_bytree': [0.7, 0.8, 1.0],
    'regressor__gamma': [0, 0.1, 0.5],
    'regressor__reg_alpha': [0, 1, 5],
    'regressor__reg_lambda': [1, 5, 10]
}

# Set up the models
models = {
    'RandomForest': (RandomForestRegressor(random_state=0), param_dist_rf),
    'XGB': (XGBRegressor(random_state=0), param_dist_xgb)
}

In [53]:
# Split data into features (X) and targets (y)
X = data.drop(columns=target_col)
y = data[target_col]

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        # One-hot encode categorical features
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), one_hot_cols),
        # Target mean encode high cardinality feature
        ('target_mean_enc', TargetMeanEncoder(), target_mean_col),
        # Scale numerical features
        ('scaler', MinMaxScaler(), numerical_cols)
    ],
    remainder='drop'  # Drop all other columns not specified (e.g., raw categorical columns)
)

# Perform RandomizedSearchCV
best_params = {}
results = {}
# Perform RandomizedSearchCV
for name, (regressor, param_dist) in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),  # Preprocessing step
        ('regressor', regressor)        # Regressor
    ])
    
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_dist,
        n_iter=50,
        scoring=make_scorer(r2_score),
        cv=5,
        n_jobs=-1,
        random_state=0
    )
    
    random_search.fit(X_train, y_train)
    best_params[name] = random_search.best_params_
    results[name] = random_search.cv_results_
    
    print(f"Best parameters for {name}: {random_search.best_params_}")
    print(f"Best score for {name}: {random_search.best_score_}")

Best parameters for RandomForest: {'regressor__n_estimators': 15, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 4, 'regressor__max_features': 'sqrt', 'regressor__max_depth': 20, 'regressor__bootstrap': False}
Best score for RandomForest: 0.43057319199960026
Best parameters for XGB: {'regressor__subsample': 0.7, 'regressor__reg_lambda': 5, 'regressor__reg_alpha': 1, 'regressor__n_estimators': 15, 'regressor__min_child_weight': 1, 'regressor__max_depth': 6, 'regressor__learning_rate': 0.2, 'regressor__gamma': 0, 'regressor__colsample_bytree': 0.7}
Best score for XGB: 0.43196073871162904


In [54]:
models = {
    'RandomForest': RandomForestRegressor(random_state=0,
                                            n_estimators=15,
                                            min_samples_split=2,
                                            min_samples_leaf=4,
                                            max_features='sqrt',
                                            max_depth=20,
                                            bootstrap=False
                                          ),
    'XGB': XGBRegressor(random_state=0,
                        subsample=0.7,
                        reg_lambda=5,
                        reg_alpha=1,
                        n_estimators=15,
                        min_child_weight=1,
                        max_depth=6,
                        learning_rate=0.2,
                        gamma=0,
                        colsample_bytree=0.7
                        )
        }

In [55]:
# Initialize MLflow
mlflow.set_experiment('Taylor_v4_Zinc_Tuned_20241227')
mlflow.autolog()

2024/12/27 16:00:50 INFO mlflow.tracking.fluent: Experiment with name 'Taylor_v4_Zinc_Tuned_20241227' does not exist. Creating a new experiment.
2024/12/27 16:00:51 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2024/12/27 16:00:51 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2024/12/27 16:00:51 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


In [56]:
# Split data into features (X) and targets (y)
X = data.drop(columns=target_col)
y = data[target_col]

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        # One-hot encode categorical features
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), one_hot_cols),
        # Target mean encode high cardinality feature
        ('target_mean_enc', TargetMeanEncoder(), target_mean_col),
        # Scale numerical features
        ('scaler', MinMaxScaler(), numerical_cols)
    ],
    remainder='drop'  # Drop all other columns not specified (e.g., raw categorical columns)
)

# Train and test multiple models
for model_name, model in models.items():

    # Create pipeline
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('model', model)
    ])
        
    # Cross-validation
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
    print(f"Cross-validated R² scores: {scores}")
    print(f"Mean R²: {scores.mean()}")
    
    # Fit and test the pipeline
    pipeline.fit(X_train, y_train)
    print(f"Test R²: {pipeline.score(X_test, y_test)}")

2024/12/27 16:01:39 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '5914dcf846fd430ba8a1a147f464c0f2', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run dapper-gnat-324 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11/runs/5914dcf846fd430ba8a1a147f464c0f2
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11


2024/12/27 16:02:09 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '5ffc6ea1f716423e9af4e56e2e74f099', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run vaunted-wasp-406 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11/runs/5ffc6ea1f716423e9af4e56e2e74f099
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11


2024/12/27 16:02:39 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '1e76fb20655e4e6e82555afaa0a8205a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run resilient-jay-314 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11/runs/1e76fb20655e4e6e82555afaa0a8205a
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11


2024/12/27 16:03:08 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'cc3c57fbf0ba4c47bb6b68b9800bb9fd', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run honorable-bird-410 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11/runs/cc3c57fbf0ba4c47bb6b68b9800bb9fd
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11


2024/12/27 16:03:38 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'ca61eac822e141e084278c7d71fe5c9a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run overjoyed-hound-220 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11/runs/ca61eac822e141e084278c7d71fe5c9a
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11
Cross-validated R² scores: [0.47376549 0.44649166 0.4428368  0.46961087 0.32016114]
Mean R²: 0.43057319199960026


2024/12/27 16:04:08 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'cff939a836ff4f0f9546832c80fba766', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run aged-bird-243 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11/runs/cff939a836ff4f0f9546832c80fba766
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11
Test R²: 0.44913896900555617


2024/12/27 16:04:41 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '446f03a8b98a4a22ac9757029446273b', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run nimble-hare-640 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11/runs/446f03a8b98a4a22ac9757029446273b
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11


2024/12/27 16:05:11 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'aaa172cef21f4655acd51f9691c12d86', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run unruly-panda-721 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11/runs/aaa172cef21f4655acd51f9691c12d86
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11


2024/12/27 16:05:41 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '57c37799cc914c7ba90d6c32d9ba53e5', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run useful-sloth-712 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11/runs/57c37799cc914c7ba90d6c32d9ba53e5
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11


2024/12/27 16:06:10 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'b1d0996665b54b7581b09cee412ecee6', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run flawless-swan-843 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11/runs/b1d0996665b54b7581b09cee412ecee6
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11


2024/12/27 16:06:33 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '02c3b6355bcf4d6abdeaccb8be9a8c35', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run abundant-crab-803 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11/runs/02c3b6355bcf4d6abdeaccb8be9a8c35
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11
Cross-validated R² scores: [0.50356703 0.4411069  0.42268074 0.44960535 0.34284367]
Mean R²: 0.43196073871162904


2024/12/27 16:06:48 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '42d4539a208943b492118ba0bce8af60', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run legendary-gnat-665 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11/runs/42d4539a208943b492118ba0bce8af60
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/11
Test R²: 0.4682369700197494
