## Implementation of LASSO for Boron Prediction

#### Import necessary libraries

In [1]:
from pathlib import Path
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, PowerTransformer
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score

import joblib
import mlflow
import mlflow.sklearn
# Feature selection
from sklearn.inspection import permutation_importance

import warnings
warnings.filterwarnings("ignore")

from sklearn.exceptions import ConvergenceWarning
# Suppress ConvergenceWarnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [2]:
#pwd

#### Load the dataset

In [3]:
file_path = "../../../data/merged_v2.csv"
# Create a Path object
data_file_path = Path(file_path)
data = pd.read_csv(data_file_path)

#### Hyperparameters

In [4]:
N_CLUSTERS = 5
CV = 15
MAX_ITER = 2000
ALPHAS = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
LASSO_MAX_ITER = [2000, 3000, 4000]
N_REPEATS = 10
POWER_TRANSFORMER_METHOD = "yeo-johnson"
GridSearchCV_SCORING = "neg_mean_absolute_error"
# GridSearchCV_SCORING = "neg_root_mean_squared_error"

#### Create geo clusters if longitude and latitude are present

In [5]:
if 'longitude' in data.columns and 'latitude' in data.columns:
    kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42)
    data['geo_cluster'] = kmeans.fit_predict(data[['longitude', 'latitude']])
    print("Geo clusters created and added to the dataset.")

Geo clusters created and added to the dataset.


#### Define target variable and features

In [6]:
target = 'Boron'
features = [col for col in data.columns if col not in ['SOC', 'Zinc', target]]

In [7]:
print(features)

['longitude', 'latitude', 'Area', 'Soil group', 'Land class', 'Soil type', 'pH', 'Nitrogen', 'Potassium', 'Phosphorus', 'Sulfur', 'Sand', 'Silt', 'Clay', 'geo_cluster']


#### Ensure geo_cluster is included and drop raw longitude and latitude if present

In [8]:
if 'longitude' in features and 'latitude' in features:
    features.remove('longitude')
    features.remove('latitude')
if 'geo_cluster' not in features and 'geo_cluster' in data.columns:
    features.append('geo_cluster')

#### Identify categorical and numerical features

In [9]:
categorical_features = data[features].select_dtypes(include=['object', 'category']).columns.tolist()
numerical_features = data[features].select_dtypes(include=['number']).columns.tolist()

#### Feature engineering: Add polynomial features for numerical data

In [10]:
polynomial_features = []
interaction_features = []
for feature in numerical_features:
    data[f'{feature}_squared'] = data[feature] ** 2
    polynomial_features.append(f'{feature}_squared')

for i in range(len(numerical_features)):
    for j in range(i + 1, len(numerical_features)):
        interaction_term = f'{numerical_features[i]}_x_{numerical_features[j]}'
        data[interaction_term] = data[numerical_features[i]] * data[numerical_features[j]]
        interaction_features.append(interaction_term)

#### Update numerical features with new polynomial features

In [11]:
features += polynomial_features + interaction_features

#### Select the feature columns for prediction

In [12]:
X = data[features]
y = data[target]

In [13]:
print(X.columns)
print(len(X.columns))

Index(['Area', 'Soil group', 'Land class', 'Soil type', 'pH', 'Nitrogen',
       'Potassium', 'Phosphorus', 'Sulfur', 'Sand', 'Silt', 'Clay',
       'geo_cluster', 'pH_squared', 'Nitrogen_squared', 'Potassium_squared',
       'Phosphorus_squared', 'Sulfur_squared', 'Sand_squared', 'Silt_squared',
       'Clay_squared', 'geo_cluster_squared', 'pH_x_Nitrogen',
       'pH_x_Potassium', 'pH_x_Phosphorus', 'pH_x_Sulfur', 'pH_x_Sand',
       'pH_x_Silt', 'pH_x_Clay', 'pH_x_geo_cluster', 'Nitrogen_x_Potassium',
       'Nitrogen_x_Phosphorus', 'Nitrogen_x_Sulfur', 'Nitrogen_x_Sand',
       'Nitrogen_x_Silt', 'Nitrogen_x_Clay', 'Nitrogen_x_geo_cluster',
       'Potassium_x_Phosphorus', 'Potassium_x_Sulfur', 'Potassium_x_Sand',
       'Potassium_x_Silt', 'Potassium_x_Clay', 'Potassium_x_geo_cluster',
       'Phosphorus_x_Sulfur', 'Phosphorus_x_Sand', 'Phosphorus_x_Silt',
       'Phosphorus_x_Clay', 'Phosphorus_x_geo_cluster', 'Sulfur_x_Sand',
       'Sulfur_x_Silt', 'Sulfur_x_Clay', 'Sulfur_x_ge

#### Split data into train and test sets

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Define preprocessing for numerical and categorical features

In [15]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [16]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

#### Combine preprocessing steps

In [17]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features + polynomial_features + interaction_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

#### Define a LASSO regression pipeline

In [18]:
lasso_pipeline_for_perm_imp = Pipeline([
    ('preprocessor', preprocessor),
    ('lasso', LassoCV(cv=CV, random_state=42, max_iter=MAX_ITER, alphas=ALPHAS))
])

#### Train the pipeline

In [19]:
lasso_pipeline_for_perm_imp.fit(X_train, y_train)

#### Calculate feature importances

In [20]:
perm_importance = permutation_importance(lasso_pipeline_for_perm_imp, X_train, y_train, n_repeats=N_REPEATS, random_state=42)
importances = perm_importance.importances_mean

#### Sort the feature importances in descending order

In [21]:
feature_importances = sorted(zip(features, importances), key=lambda x: x[1], reverse=True)
top_n_features = [feature for feature, score in feature_importances[:len(feature_importances)]]

In [22]:
print(f"Top N Features: {top_n_features}")
print(f"Length: {len(top_n_features)}")

Top N Features: ['pH_x_Potassium', 'Area', 'pH_x_Silt', 'pH_x_Nitrogen', 'pH_x_Sulfur', 'Potassium', 'Potassium_x_Sand', 'pH_squared', 'Silt_squared', 'Potassium_x_Clay', 'pH', 'Nitrogen_x_Sand', 'Silt', 'Soil group', 'Sand_x_Silt', 'Sulfur_x_Clay', 'Sulfur_x_Silt', 'Nitrogen_x_Silt', 'Phosphorus_x_Clay', 'Nitrogen_x_Phosphorus', 'Phosphorus_squared', 'pH_x_Clay', 'Nitrogen_x_Sulfur', 'Nitrogen_squared', 'Sand_x_geo_cluster', 'Land class', 'Potassium_x_geo_cluster', 'Sulfur_x_geo_cluster', 'Clay_squared', 'Potassium_squared', 'Nitrogen_x_Potassium', 'Sand_squared', 'Silt_x_geo_cluster', 'Potassium_x_Phosphorus', 'Phosphorus_x_geo_cluster', 'Nitrogen_x_geo_cluster', 'Phosphorus_x_Sulfur', 'Soil type', 'geo_cluster_squared', 'Potassium_x_Sulfur', 'Clay_x_geo_cluster', 'Potassium_x_Silt', 'Sand_x_Clay', 'Sulfur_squared', 'Nitrogen', 'Phosphorus', 'Sulfur', 'Sand', 'Clay', 'geo_cluster', 'pH_x_Phosphorus', 'pH_x_Sand', 'pH_x_geo_cluster', 'Nitrogen_x_Clay', 'Phosphorus_x_Sand', 'Phosphorus

#### Select features and update X data

In [23]:
X = data[top_n_features]
X_train, X_test = X.iloc[train_test_split(X.index, test_size=0.2, random_state=42)[0]], X.iloc[train_test_split(X.index, test_size=0.2, random_state=42)[1]]

In [24]:
print(f"Length: {len(X.columns)}")

Length: 58


#### Dynamically identify categorical and numerical features

In [25]:
categorical_features_top_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_features_top_features = X_train.select_dtypes(include=['number']).columns.tolist()

#### Combine preprocessing steps

In [26]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features_top_features),
        ('cat', categorical_transformer, categorical_features_top_features)
    ]
)

#### Define a pipeline with preprocessing and LASSO regression

In [27]:
lasso_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('lasso', LassoCV(cv=CV, random_state=42, max_iter=MAX_ITER, alphas=ALPHAS))
])

#### Define a grid of hyperparameters to search

In [28]:
param_grid = {
    'lasso__alphas': [ALPHAS],  # Use a nested list for LassoCV
    'lasso__max_iter': LASSO_MAX_ITER,
    'preprocessor__num__scaler':[StandardScaler(), PowerTransformer(method=POWER_TRANSFORMER_METHOD)],
}

#### Define GridSearchCV

In [29]:
grid_search = GridSearchCV(lasso_pipeline, param_grid, cv=CV, scoring=GridSearchCV_SCORING, verbose=1, n_jobs=-1)

#### Start MLflow experiment | Train the pipeline

In [30]:
mlflow.set_experiment("Boron Prediction with LASSO")

with mlflow.start_run():
    # Train the model with hyperparameter tuning
    grid_search.fit(X_train, y_train)

    # Get the best estimator and its parameters
    lasso_pipeline = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Log the best parameters
    mlflow.log_params(best_params)

    # Print the best hyperparameters
    print("Best Hyperparameters:")
    for param, value in best_params.items():
        print(f"{param}: {value}")

    # Log the best score
    mlflow.log_metric("best_cv_score", grid_search.best_score_)

    # Log model
    mlflow.sklearn.log_model(lasso_pipeline, "lasso_pipeline")

    # Log parameters
    mlflow.log_param("cv", CV)
    mlflow.log_param("n_clusters", N_CLUSTERS)
    mlflow.log_param("alphas", lasso_pipeline.named_steps['lasso'].alphas_)
    mlflow.log_param("max_iter", lasso_pipeline.named_steps['lasso'].max_iter)
    # mlflow.log_param("selected_features", top_n_features)

    # Make predictions using the trained model
    predictions = lasso_pipeline.predict(X_test)

    # Evaluate the model
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    # Log metrics
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("r2", r2)

    print("LASSO Regression Results for Boron Prediction:")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"R-Squared (R²): {r2:.4f}")

    # Save the best model in MLflow
    if "best_model" not in mlflow.active_run().data.tags:
        mlflow.set_tag("best_model", True)


Fitting 15 folds for each of 6 candidates, totalling 90 fits
Best Hyperparameters:
lasso__alphas: [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
lasso__max_iter: 2000
preprocessor__num__scaler: StandardScaler()




LASSO Regression Results for Boron Prediction:
Mean Absolute Error (MAE): 0.0450
Mean Squared Error (MSE): 0.0062
R-Squared (R²): 0.6826


#### Save the trained model

In [31]:
model_path = Path('lasso_boron_pipeline.pkl')
joblib.dump(lasso_pipeline, model_path)
print(f"Model saved to {model_path}")

Model saved to lasso_boron_pipeline.pkl


In [32]:
#### Load the model
# loaded_model = joblib.load(model_path)
# print("Model loaded successfully.")