In [89]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.inspection import permutation_importance
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

import dagshub
dagshub.init(repo_owner='Omdena', repo_name='IPage', mlflow=True)

import mlflow

In [90]:
# Initialize MLflow
mlflow.set_experiment('Taylor_v3_SOC_Baseline_20241225')
mlflow.autolog()

2024/12/27 15:31:15 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2024/12/27 15:31:15 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2024/12/27 15:31:15 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


In [45]:
data = pd.read_csv('merged_v3.csv')

data.info()
data['Area'].unique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2584 entries, 0 to 2583
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   longitude   2584 non-null   float64
 1   latitude    2584 non-null   float64
 2   Area        2584 non-null   object 
 3   Soil group  2584 non-null   object 
 4   Land class  2584 non-null   object 
 5   Soil type   2584 non-null   object 
 6   pH          2584 non-null   float64
 7   SOC         2584 non-null   float64
 8   Nitrogen    2584 non-null   float64
 9   Potassium   2584 non-null   float64
 10  Phosphorus  2584 non-null   float64
 11  Sulfur      2584 non-null   float64
 12  Boron       2584 non-null   float64
 13  Zinc        2584 non-null   float64
 14  Sand        2584 non-null   float64
 15  Silt        2584 non-null   float64
 16  Clay        2584 non-null   float64
dtypes: float64(13), object(4)
memory usage: 343.3+ KB


array(['Mithpukur', 'Pirgacha ', 'Gangachara', 'Kaunia upazila',
       'Taraganj Thana', 'Bauchi', 'Taraba', 'Plateau', 'Kaduna',
       'Nasarawa', 'Niger', 'Kebbi', 'Kano', 'Kwara', 'Katsina',
       'Adamawa'], dtype=object)

In [46]:
data.drop(columns=['longitude', 'latitude', 'Soil group', 'Boron', 'Zinc'], inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2584 entries, 0 to 2583
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Area        2584 non-null   object 
 1   Land class  2584 non-null   object 
 2   Soil type   2584 non-null   object 
 3   pH          2584 non-null   float64
 4   SOC         2584 non-null   float64
 5   Nitrogen    2584 non-null   float64
 6   Potassium   2584 non-null   float64
 7   Phosphorus  2584 non-null   float64
 8   Sulfur      2584 non-null   float64
 9   Sand        2584 non-null   float64
 10  Silt        2584 non-null   float64
 11  Clay        2584 non-null   float64
dtypes: float64(9), object(3)
memory usage: 242.4+ KB


In [47]:
# Define columns
numerical_cols = ['pH', 'Nitrogen', 'Potassium', 'Phosphorus',
                  'Sulfur', 'Sand', 'Silt', 'Clay']

categorical_cols = ['Area', 'Land class', 'Soil type']

target_col = 'SOC'

In [48]:
models = {
    'RandomForest': RandomForestRegressor(random_state=0),
    'XGB': XGBRegressor(random_state=0)
}

In [49]:


# Split data into features (X) and targets (y)
X = data.drop(columns=target_col)
y = data[target_col]

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        # One-hot encode categorical features
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
        # Scale numerical features
        ('scaler', MinMaxScaler(), numerical_cols)
    ],
    remainder='drop'  # Drop all other columns not specified (e.g., raw categorical columns)
)

# Train and test multiple models
for model_name, model in models.items():

    # Create pipeline
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('model', model)
    ])
        
    # Train the pipeline
    pipeline.fit(X_train, y_train)
        
    # Predict on the test set
    y_pred = pipeline.predict(X_test)

    # Evaluate and display results
    r2 = r2_score(y_test, y_pred)
    print(f"R² for {model_name}: {r2}")
    print("-" * 30)

        


2024/12/25 11:20:34 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '53910fd4a1564bb189526704c5b2e728', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run sedate-kite-138 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/53910fd4a1564bb189526704c5b2e728
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
R² for RandomForest: 0.7986670251808461
------------------------------


2024/12/25 11:20:43 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'bf9c58f6233c4f908965e6e035d7d7b2', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run masked-snipe-932 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/bf9c58f6233c4f908965e6e035d7d7b2
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
R² for XGB: 0.7707157252058728
------------------------------


In [52]:
# Upload using the DagsHub client, to a DVC tracked folder also called "data".
# Follow the instructions that appear to authorize the request.
from dagshub import upload_files
upload_files('Omdena/IPage', 'merged_v3.csv')

In [62]:
data = pd.read_csv('merged_v3.csv')
data.drop(columns=['longitude', 'latitude', 'Soil group', 'Boron', 'Zinc'], inplace=True)

# Define columns
numerical_cols = ['pH', 'Nitrogen', 'Potassium', 'Phosphorus',
                  'Sulfur', 'Sand', 'Silt', 'Clay']

one_hot_cols = ['Land class', 'Soil type']

target_mean_col = 'Area'

target_col = 'SOC'

In [63]:
models = {
    'RandomForest': RandomForestRegressor(random_state=0),
    'XGB': XGBRegressor(random_state=0)
}

In [64]:
from sklearn.base import BaseEstimator, TransformerMixin

class TargetMeanEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.target_means_ = {}
    
    def fit(self, X, y):
        if isinstance(X, pd.Series):
            X = X.to_frame()
        elif isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        
        self.target_means_ = {
            col: X[col].map(y.groupby(X[col]).mean())
            for col in X.columns
        }
        return self
    
    def transform(self, X):
        if isinstance(X, pd.Series):
            X = X.to_frame()
        elif isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        
        X_encoded = X.copy()
        for col in X_encoded.columns:
            X_encoded[col] = X_encoded[col].map(self.target_means_[col])
        return X_encoded.fillna(0)  # Handle unseen categories with 0

In [65]:
# Split data into features (X) and targets (y)
X = data.drop(columns=target_col)
y = data[target_col]

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        # One-hot encode categorical features
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), one_hot_cols),
        # Target mean encode high cardinality feature
        ('target_mean_enc', TargetMeanEncoder(), target_mean_col),
        # Scale numerical features
        ('scaler', MinMaxScaler(), numerical_cols)
    ],
    remainder='drop'  # Drop all other columns not specified (e.g., raw categorical columns)
)

# Train and test multiple models
for model_name, model in models.items():

    # Create pipeline
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('model', model)
    ])
        
    # Cross-validation
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
    print(f"Cross-validated R² scores: {scores}")
    print(f"Mean R²: {scores.mean()}")
    
    # Fit and test the pipeline
    pipeline.fit(X_train, y_train)
    print(f"Test R²: {pipeline.score(X_test, y_test)}")



Cross-validated R² scores: [0.78668703 0.83122737 0.82826091 0.74987786 0.78477628]
Mean R²: 0.7961658906150132


2024/12/27 13:00:50 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '4c85aad1c3184a7eb63f866bbdd6e031', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run shivering-hawk-593 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/4c85aad1c3184a7eb63f866bbdd6e031
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
Test R²: 0.7943046387408215
Cross-validated R² scores: [0.79033322 0.79552752 0.80648591 0.75073391 0.7899255 ]
Mean R²: 0.78660121040299


2024/12/27 13:01:00 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '2a9a6815999349858e3b1164ba7fd2ca', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run rogue-shad-504 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/2a9a6815999349858e3b1164ba7fd2ca
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
Test R²: 0.7754541029219147


In [58]:
data = pd.read_csv('merged_v3.csv')

data.drop(columns=['longitude', 'latitude', 'Soil group', 'Boron', 'Zinc'], inplace=True)

# Add engineered features to the DataFrame
data['Nitrogen / Sand'] = data['Nitrogen'] / data['Sand']
data['Nitrogen / pH'] = data['Nitrogen'] / data['pH']
data['Clay * Nitrogen'] = data['Clay'] * data['Nitrogen']
data['Nitrogen * Silt'] = data['Nitrogen'] * data['Silt']
data['Nitrogen * Sulfur'] = data['Nitrogen'] * data['Sulfur']
data['Nitrogen * pH'] = data['Nitrogen'] * data['pH']
data['NATURAL_LOGARITHM(Nitrogen)'] = np.log(data['Nitrogen'])
data['NATURAL_LOGARITHM(Sand)'] = np.log(data['Sand'])
data['SQUARE_ROOT(Nitrogen)'] = np.sqrt(data['Nitrogen'])

# Define columns
numerical_cols = ['pH', 'Nitrogen', 'Potassium', 'Phosphorus',
                  'Sulfur', 'Sand', 'Silt', 'Clay', 'Nitrogen / Sand',
                 'Nitrogen / pH', 'Clay * Nitrogen', 'Nitrogen * Silt',
                 'Nitrogen * Sulfur', 'Nitrogen * pH', 'NATURAL_LOGARITHM(Nitrogen)',
                 'NATURAL_LOGARITHM(Sand)', 'SQUARE_ROOT(Nitrogen)']

one_hot_cols = ['Land class', 'Soil type']

target_mean_col = 'Area'

target_col = 'SOC'

In [59]:
# Split data into features (X) and targets (y)
X = data.drop(columns=target_col)
y = data[target_col]

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        # One-hot encode categorical features
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), one_hot_cols),
        # Target mean encode high cardinality feature
        ('target_mean_enc', TargetMeanEncoder(), target_mean_col),
        # Scale numerical features
        ('scaler', MinMaxScaler(), numerical_cols)
    ],
    remainder='drop'  # Drop all other columns not specified (e.g., raw categorical columns)
)

# Train and test multiple models
for model_name, model in models.items():

    # Create pipeline
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('model', model)
    ])
        
    # Cross-validation
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
    print(f"Cross-validated R² scores: {scores}")
    print(f"Mean R²: {scores.mean()}")
    
    # Fit and test the pipeline
    pipeline.fit(X_train, y_train)
    print(f"Test R²: {pipeline.score(X_test, y_test)}")



Cross-validated R² scores: [0.7934679  0.82033217 0.83392184 0.77212799 0.78421204]
Mean R²: 0.8008123893847892


2024/12/27 12:50:45 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'eb3bd0b7d99240c6989f29e3b9b9bcc2', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run inquisitive-vole-672 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/eb3bd0b7d99240c6989f29e3b9b9bcc2
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
Test R²: 0.78801734036029
Cross-validated R² scores: [0.80340794 0.80903379 0.78719128 0.74579877 0.77064191]
Mean R²: 0.7832147383425022


2024/12/27 12:50:56 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '8a903742c3594887a71bbd8668199d26', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run glamorous-shark-86 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/8a903742c3594887a71bbd8668199d26
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
Test R²: 0.7632420340507827


In [68]:
data = pd.read_csv('merged_v3.csv')

data.drop(columns=['longitude', 'latitude', 'Soil group', 'Boron', 'Zinc'], inplace=True)

# Define columns
numerical_cols = ['pH', 'Nitrogen', 'Potassium', 'Phosphorus',
                  'Sulfur', 'Sand', 'Silt', 'Clay']

one_hot_cols = ['Land class', 'Soil type']

target_mean_col = 'Area'

target_col = 'SOC'

In [74]:
# Define hyperparameter distributions for both models
param_dist_rf = {
    'regressor__n_estimators': [100, 200, 300, 400],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__max_features': [None, 'sqrt', 'log2'],
    'regressor__bootstrap': [True, False]
}

param_dist_xgb = {
    'regressor__n_estimators': [100, 200, 300, 400],
    'regressor__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'regressor__max_depth': [3, 6, 9],
    'regressor__min_child_weight': [1, 3, 5],
    'regressor__subsample': [0.7, 0.8, 1.0],
    'regressor__colsample_bytree': [0.7, 0.8, 1.0],
    'regressor__gamma': [0, 0.1, 0.5],
    'regressor__reg_alpha': [0, 1, 5],
    'regressor__reg_lambda': [1, 5, 10]
}

# Set up the models
models = {
    'RandomForest': (RandomForestRegressor(random_state=0), param_dist_rf),
    'XGB': (XGBRegressor(random_state=0), param_dist_xgb)
}

In [75]:
# Split data into features (X) and targets (y)
X = data.drop(columns=target_col)
y = data[target_col]

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        # One-hot encode categorical features
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), one_hot_cols),
        # Target mean encode high cardinality feature
        ('target_mean_enc', TargetMeanEncoder(), target_mean_col),
        # Scale numerical features
        ('scaler', MinMaxScaler(), numerical_cols)
    ],
    remainder='drop'  # Drop all other columns not specified (e.g., raw categorical columns)
)

# Perform RandomizedSearchCV
best_params = {}
results = {}
# Perform RandomizedSearchCV
for name, (regressor, param_dist) in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),  # Preprocessing step
        ('regressor', regressor)        # Regressor
    ])
    
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_dist,
        n_iter=50,
        scoring=make_scorer(r2_score),
        cv=5,
        n_jobs=-1,
        random_state=0
    )
    
    random_search.fit(X_train, y_train)
    best_params[name] = random_search.best_params_
    results[name] = random_search.cv_results_
    
    print(f"Best parameters for {name}: {random_search.best_params_}")
    print(f"Best score for {name}: {random_search.best_score_}")

2024/12/27 13:18:13 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'd81969aa63e046f6a350a582e6cb4cb0', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/12/27 13:19:01 INFO mlflow.sklearn.utils: Logging the 5 best runs, 45 runs will be omitted.


🏃 View run fortunate-ram-184 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/1f4788479c6f4576940f32fdf6999dc4
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
🏃 View run sneaky-asp-645 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/f5f8296d84c64c9689c6d2ee639226b7
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
🏃 View run charming-koi-508 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/249b14842a404473896066822993f4f4
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
🏃 View run worried-auk-291 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/f6e9f32ad6db4f148764934dfab6a74e
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
🏃 View run intrigued-snake-63 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/e157837a553d48d4858bb865c07f5f8f
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlf

2024/12/27 13:19:19 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '5a4ecdf5f2ee48d08cab0073b48d6786', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/12/27 13:19:35 INFO mlflow.sklearn.utils: Logging the 5 best runs, 45 runs will be omitted.


🏃 View run tasteful-mouse-495 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/05f776070eee4c17a46e811857b1cacc
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
🏃 View run popular-lark-931 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/39a7222df7d74691b824b54868b657ab
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
🏃 View run auspicious-hound-546 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/f8044f3ad347463db31cf15e4ccc6033
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
🏃 View run sedate-koi-144 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/76eb8927bd5e4fe58bbf817539d97693
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
🏃 View run amazing-fawn-352 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/688bea12cf8149e29be46bcbcf3392fa
🧪 View experiment at: https://dagshub.com/Omdena/IPage

In [85]:
df = pd.read_csv('merged_v3.csv')

data = df[
    (df['Boron'] < 0.58) &
    (df['Zinc'] < 2.8) &
    (df['SOC'] < 3.0) &
    (df['Sulfur'] < 30.0) &
    (df['Nitrogen'] < 0.2) &
    (df['Potassium'] < 0.8) &
    (df['Phosphorus'] < 40.0)
].copy()

data.drop(columns=['longitude', 'latitude', 'Soil group', 'Boron', 'Zinc'], inplace=True)

# Define columns
numerical_cols = ['pH', 'Nitrogen', 'Potassium', 'Phosphorus',
                  'Sulfur', 'Sand', 'Silt', 'Clay']

one_hot_cols = ['Land class', 'Soil type']

target_mean_col = 'Area'

target_col = 'SOC'

In [86]:
# Define hyperparameter distributions for both models
param_dist_rf = {
    'regressor__n_estimators': [100, 200, 300, 400],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__max_features': [None, 'sqrt', 'log2'],
    'regressor__bootstrap': [True, False]
}

param_dist_xgb = {
    'regressor__n_estimators': [100, 200, 300, 400],
    'regressor__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'regressor__max_depth': [3, 6, 9],
    'regressor__min_child_weight': [1, 3, 5],
    'regressor__subsample': [0.7, 0.8, 1.0],
    'regressor__colsample_bytree': [0.7, 0.8, 1.0],
    'regressor__gamma': [0, 0.1, 0.5],
    'regressor__reg_alpha': [0, 1, 5],
    'regressor__reg_lambda': [1, 5, 10]
}

# Set up the models
models = {
    'RandomForest': (RandomForestRegressor(random_state=0), param_dist_rf),
    'XGB': (XGBRegressor(random_state=0), param_dist_xgb)
}

In [87]:
# Split data into features (X) and targets (y)
X = data.drop(columns=target_col)
y = data[target_col]

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        # One-hot encode categorical features
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), one_hot_cols),
        # Target mean encode high cardinality feature
        ('target_mean_enc', TargetMeanEncoder(), target_mean_col),
        # Scale numerical features
        ('scaler', MinMaxScaler(), numerical_cols)
    ],
    remainder='drop'  # Drop all other columns not specified (e.g., raw categorical columns)
)

# Perform RandomizedSearchCV
best_params = {}
results = {}
# Perform RandomizedSearchCV
for name, (regressor, param_dist) in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),  # Preprocessing step
        ('regressor', regressor)        # Regressor
    ])
    
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_dist,
        n_iter=50,
        scoring=make_scorer(r2_score),
        cv=5,
        n_jobs=-1,
        random_state=0
    )
    
    random_search.fit(X_train, y_train)
    best_params[name] = random_search.best_params_
    results[name] = random_search.cv_results_
    
    print(f"Best parameters for {name}: {random_search.best_params_}")
    print(f"Best score for {name}: {random_search.best_score_}")

2024/12/27 14:25:39 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '55ae8f9657ee4ff39fc0ebf5c7d5e394', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/12/27 14:26:23 INFO mlflow.sklearn.utils: Logging the 5 best runs, 45 runs will be omitted.


🏃 View run delightful-stag-343 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/d9f565244f9b482d8eca755058b8c785
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
🏃 View run sedate-crow-958 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/c47d244b8ef9423a8da66659e4fd510b
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
🏃 View run rogue-goat-891 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/b3a65b44963b42a59f9f10f5c0215b73
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
🏃 View run youthful-donkey-29 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/5aa2c66f9bea49be9121f87b06c61175
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
🏃 View run abrasive-shad-731 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/564e2ed2dac845d1a157328858217fca
🧪 View experiment at: https://dagshub.com/Omdena/IPage.

2024/12/27 14:26:42 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '34d1d831c3a94d6bafa0e9ea97d6c29e', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/12/27 14:26:58 INFO mlflow.sklearn.utils: Logging the 5 best runs, 45 runs will be omitted.


🏃 View run learned-bass-971 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/02d70f0cbe3d4ff88cc023ae9425cb53
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
🏃 View run stylish-mink-362 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/6b8cd5a08f294b3dbb720bdb78c78b5e
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
🏃 View run selective-foal-216 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/2ab387de517246dd9956f3ea1c049092
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
🏃 View run painted-gull-365 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/4f3b6d45a71c4e8c8f44f301dd8f355d
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
🏃 View run invincible-bird-883 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/94fedc4ba7434f9d91d5eca192df0446
🧪 View experiment at: https://dagshub.com/Omdena/IPag

In [94]:
df = pd.read_csv('merged_v3.csv')

data = df[
    (df['Boron'] < 0.58) &
    (df['Zinc'] < 2.8) &
    (df['SOC'] < 3.0) &
    (df['Sulfur'] < 30.0) &
    (df['Nitrogen'] < 0.2) &
    (df['Potassium'] < 0.8) &
    (df['Phosphorus'] < 40.0)
].copy()

data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2438 entries, 2 to 2583
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   longitude   2438 non-null   float64
 1   latitude    2438 non-null   float64
 2   Area        2438 non-null   object 
 3   Soil group  2438 non-null   object 
 4   Land class  2438 non-null   object 
 5   Soil type   2438 non-null   object 
 6   pH          2438 non-null   float64
 7   SOC         2438 non-null   float64
 8   Nitrogen    2438 non-null   float64
 9   Potassium   2438 non-null   float64
 10  Phosphorus  2438 non-null   float64
 11  Sulfur      2438 non-null   float64
 12  Boron       2438 non-null   float64
 13  Zinc        2438 non-null   float64
 14  Sand        2438 non-null   float64
 15  Silt        2438 non-null   float64
 16  Clay        2438 non-null   float64
dtypes: float64(13), object(4)
memory usage: 342.8+ KB


In [95]:
data.to_csv('merged_v4.csv', index=False)
upload_files('Omdena/IPage', 'merged_v4.csv')