In [1]:
import os

import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression, RANSACRegressor, TheilSenRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor









# Read the workbook that holds both the predictors and the target.
file_path = "data_ra_norm_filled_all.xlsx"
data = pd.read_excel(file_path)

# Predictor columns, in the order they are fed to every model.
data_columns = [
    'Wetland Type - Provincial Class',
    'Wetland Type - Federal Class',
    'Water Regime Indicator',
    'Specific Vegetation Type',
    '% Vegetation Cover for Specific Vegetation Cover Types',
    '% High Woody Canopy Cover (>5m)',
    'Phragmites present (Y/N)',
    'Soil Type',
    '% of Surface Water Present',
    'Depth of Saturation (cm)',
    'Average Depth of Living Moss (cm)',
    'Average Total Depth of Organics',
    'Average Organic Depth (cm)',
    'Hydrogeomorphic Class',
    '% Moss Cover',
]

# Target columns; only the first entry is used as the regression target.
results_columns = ['NR']

# Feature matrix and target vector.
X = data[data_columns]
y = data[results_columns[0]]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [2]:
# Hyperparameter search space for GridSearchCV, keyed by estimator class name.
# Inner keys follow the `<pipeline step>__<parameter>` convention, where the
# step name is the lower-cased class name generated by make_pipeline.
#
# Values that scikit-learn rejects outright have been removed. Those
# candidates could never win (failed fits are scored NaN), yet each one was
# still fitted on every CV fold, and the resulting fit-failure warnings were
# hidden by the blanket warnings filter used later in this notebook.
param_grid = {
    'Ridge': {
        'ridge__alpha': [0.1, 0.5, 1.0],
        # 'lbfgs' dropped: Ridge only accepts it together with positive=True.
        'ridge__solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
    },
    'DecisionTreeRegressor': {
        'decisiontreeregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'decisiontreeregressor__splitter': ['best', 'random'],
        # 1 dropped: an integer min_samples_split must be at least 2.
        'decisiontreeregressor__min_samples_split': [2, 3, 4, 5],
        # 0 dropped: an integer max_features must be at least 1.
        'decisiontreeregressor__max_features': [1, 2, 3, 'sqrt', 'log2']
    },
    'RandomForestRegressor': {
        'randomforestregressor__n_estimators': [1, 50, 100],
        'randomforestregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'randomforestregressor__min_samples_split': [2, 5],
        'randomforestregressor__max_features': [1, 3, 'sqrt', 'log2'],
    },
    'GradientBoostingRegressor': {
        'gradientboostingregressor__loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
        'gradientboostingregressor__learning_rate': [0.001, 0.01, 0.1],
        'gradientboostingregressor__n_estimators': [25, 50, 100, 250],
        'gradientboostingregressor__warm_start': [True, False],
    },
    'AdaBoostRegressor': {
        'adaboostregressor__n_estimators': [1, 20, 50, 100],
        'adaboostregressor__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0, 10],
        'adaboostregressor__loss': ['linear', 'square', 'exponential']
    },
    'KNeighborsRegressor': {
        'kneighborsregressor__n_neighbors': [2, 5, 10, 25],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__algorithm': ['ball_tree', 'kd_tree', 'brute'],
        'kneighborsregressor__leaf_size': [5, 30, 50],
        # 'haversine' dropped: it is only defined for 2-D (lat, lon) inputs,
        # not for the 15 feature columns used here.
        # NOTE(review): 'cosine' and 'nan_euclidean' are presumably only usable
        # with algorithm='brute', and 'cityblock'/'l1'/'manhattan' as well as
        # 'euclidean'/'l2' look like aliases of each other - confirm before
        # trimming the grid further.
        'kneighborsregressor__metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'nan_euclidean']
    },
    'MLPRegressor': {
        'mlpregressor__hidden_layer_sizes': [(50, 50, 50), (100, 100, 100), (100, 100, 100, 100)],
        'mlpregressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
        'mlpregressor__solver': ['lbfgs', 'sgd', 'adam'],
        'mlpregressor__learning_rate': ['constant', 'invscaling', 'adaptive'],
    },
    'ElasticNet': {
        'elasticnet__l1_ratio': [0.25, 0.5, 0.75],
        'elasticnet__fit_intercept': [True, False],
        'elasticnet__precompute': [True, False],
        'elasticnet__copy_X': [True, False],
        'elasticnet__warm_start': [True, False],
        'elasticnet__positive': [True, False],
        'elasticnet__selection': ['cyclic', 'random']
    },
    'SGDRegressor': {
        'sgdregressor__loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
        'sgdregressor__penalty': ['l2', 'l1', 'elasticnet', None],
        'sgdregressor__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
        'sgdregressor__warm_start': [True, False],
    },
    'SVR': {
        # 'precomputed' dropped: it expects a square (n_samples, n_samples)
        # kernel matrix instead of the raw feature matrix passed to fit().
        'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'svr__degree': [1, 3, 5, 10],
        'svr__gamma': ['scale', 'auto', 1.0, 5.0],
        'svr__shrinking': [True, False]
    },
    'BayesianRidge': {
        'bayesianridge__alpha_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__alpha_2': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_2': [1e-7, 1e-6, 1e-5],
    },
    'KernelRidge': {
        'kernelridge__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0],
        # 'precomputed' dropped for the same reason as in the SVR grid.
        'kernelridge__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'kernelridge__degree': [1, 2, 3, 5, 10],
        'kernelridge__coef0': [0.0, 0.5, 1.0]
    },
    'LinearRegression': {
        'linearregression__fit_intercept': [True, False],
        'linearregression__copy_X': [True, False],
        'linearregression__positive': [True, False]
    },
    'RANSACRegressor': {
        'ransacregressor__min_samples': [None, 1, 2, 5, 10, 50],
        'ransacregressor__max_trials': [1, 10, 50, 100, 150],
        'ransacregressor__loss': ['absolute_error', 'squared_error']
    },
    'TheilSenRegressor': {
        'theilsenregressor__max_subpopulation': [1, 10, 100, 1000, 10000],
        # 1, 5 and 10 dropped: n_subsamples must be at least n_features + 1
        # when an intercept is fitted, i.e. 16 for the 15 feature columns.
        'theilsenregressor__n_subsamples': [None, 25],
    }
}


In [6]:
# Candidate estimators. The list order is the order in which the training
# loop tunes them and prints their results.
models = [
    Ridge(),
    DecisionTreeRegressor(),
    GradientBoostingRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    KNeighborsRegressor(),
    MLPRegressor(max_iter=200),
    ElasticNet(max_iter=1000),
    SGDRegressor(max_iter=1000),
    SVR(cache_size=1000),
    BayesianRidge(max_iter=1000),
    KernelRidge(),
    LinearRegression(),
    RANSACRegressor(),
    TheilSenRegressor(),
]

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides convergence and fit-failure warnings raised
# during the grid search - consider narrowing the filter.
import warnings
warnings.filterwarnings(action="ignore")


In [7]:


# Train and tune hyperparameters for each model.
#
# Every scikit-learn estimator is wrapped in a StandardScaler pipeline and
# tuned with a 5-fold grid search over its entry in `param_grid`; estimators
# without an entry are fitted with their default settings. A fixed-architecture
# Keras network is trained last, triggered by the 'TensorFlow' sentinel.
# The fitted estimators are collected in `best_models`, keyed by name.
best_models = {}

for model in models + ['TensorFlow']:  # Add TensorFlow model to the loop
    print(model)
    if model == 'TensorFlow':
        # Define the TensorFlow model: nine 64-unit ReLU layers followed by a
        # single linear output unit for the regression target.
        model_tf = tf.keras.models.Sequential([
            tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(1)
        ])

        # Compile the TensorFlow model
        model_tf.compile(optimizer='adam', loss='mean_squared_error')

        # Standardize the data for the TensorFlow model. The scaler is fitted
        # on the training rows only so no test-set statistics leak in; it is
        # kept in `scaler_tf` because later cells persist it.
        scaler_tf = StandardScaler()
        X_train_scaled_tf = scaler_tf.fit_transform(X_train)
        X_test_scaled_tf = scaler_tf.transform(X_test)

        # Train the TensorFlow model
        model_tf.fit(X_train_scaled_tf, y_train, epochs=100, batch_size=32, validation_split=0.2)

        # Evaluate the TensorFlow model. RMSE is computed as sqrt(MSE) rather
        # than with `squared=False`, which was deprecated in scikit-learn 1.4
        # and removed in 1.6.
        y_pred_tf = model_tf.predict(X_test_scaled_tf)
        rmse_tf = np.sqrt(mean_squared_error(y_test, y_pred_tf))
        print(f"TensorFlow RMSE: {rmse_tf}")

        # Add TensorFlow model to best_models
        best_models['TensorFlow'] = model_tf
    else:
        model_name = model.__class__.__name__
        # Scaling lives inside the pipeline so it is re-fitted on each CV fold.
        pipeline = make_pipeline(StandardScaler(), model)
        # Perform grid search for hyperparameters
        if model_name in param_grid:
            grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=5, scoring='neg_mean_squared_error')
            grid_search.fit(X_train, y_train)
            best_models[model_name] = grid_search.best_estimator_
            print(f"Best hyperparameters for {model_name}: {grid_search.best_params_}")
        else:
            # No search space defined: fit the pipeline with default settings.
            pipeline.fit(X_train, y_train)
            best_models[model_name] = pipeline



Ridge()
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'lsqr'}
DecisionTreeRegressor()
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'friedman_mse', 'decisiontreeregressor__max_features': 3, 'decisiontreeregressor__min_samples_split': 5, 'decisiontreeregressor__splitter': 'best'}
GradientBoostingRegressor()
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.1, 'gradientboostingregressor__loss': 'huber', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': False}
RandomForestRegressor()
Best hyperparameters for RandomForestRegressor: {'randomforestregressor__criterion': 'absolute_error', 'randomforestregressor__max_features': 'log2', 'randomforestregressor__min_samples_split': 5, 'randomforestregressor__n_estimators': 100}
AdaBoostRegressor()
Best hyperparameters for AdaBoostRegressor: {'adaboostregressor__learning_rate': 10, 'adaboostregresso

In [13]:
# Persist the StandardScaler fitted for the TensorFlow model so the same
# scaling can be re-applied when the saved network is loaded later.
scaler_filename = "scaler_tf.pkl"
joblib.dump(scaler_tf, filename=scaler_filename)

['scaler_tf.pkl']

In [12]:
# Persist every fitted model and report its hold-out performance.
# `os`, `joblib` and `numpy` are imported in the notebook's top import cell.

# Directory where you want to save your models
model_directory = "saved_models"

# Create the directory if it doesn't exist (no error when it already does).
os.makedirs(model_directory, exist_ok=True)

# Number of test rows echoed per model; capped so a small test split cannot
# raise an IndexError.
n_preview = min(5, len(y_test))

# Make predictions using the best models
for model_name, model in best_models.items():
    print(f"Model: {model_name}")

    if model_name == 'TensorFlow':
        # Reuse the predictions computed right after training. The network
        # ends in Dense(1), so they are flattened to one value per row to
        # print like the scikit-learn predictions.
        y_pred = np.ravel(y_pred_tf)
        # Save the TensorFlow model
        model_filename = os.path.join(model_directory, f"{model_name}_model.h5")
        model.save(model_filename)
    else:
        # The pipeline applies its own scaler, so raw test features are passed.
        y_pred = model.predict(X_test)
        # Save the other models using joblib
        model_filename = os.path.join(model_directory, f"{model_name}_model.pkl")
        joblib.dump(model, model_filename)
    print(f"Model saved as {model_filename}")

    # Calculate and print RMSE as sqrt(MSE); the `squared=False` shortcut was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"RMSE: {rmse}")

    # Show real and predicted results for the first few samples
    print("Sample predictions:")
    for i in range(n_preview):
        print(f"Sample {i+1}: Real NR = {y_test.iloc[i]}, Predicted NR = {y_pred[i]}")

    print("\n")




Model: Ridge
Model saved as saved_models\Ridge_model.pkl
RMSE: 3.0212481768005413
Sample predictions:
Sample 1: Real NR = 10.0, Predicted NR = 8.535104961531776
Sample 2: Real NR = 3.0, Predicted NR = 2.963133334286255
Sample 3: Real NR = 0.27, Predicted NR = 5.549521374411243
Sample 4: Real NR = 10.0, Predicted NR = 4.822872683206046
Sample 5: Real NR = 1.65, Predicted NR = 5.575825322223221


Model: DecisionTreeRegressor
Model saved as saved_models\DecisionTreeRegressor_model.pkl
RMSE: 4.066867781392477
Sample predictions:
Sample 1: Real NR = 10.0, Predicted NR = 2.687049731370855
Sample 2: Real NR = 3.0, Predicted NR = 2.12
Sample 3: Real NR = 0.27, Predicted NR = 1.6864854192406071
Sample 4: Real NR = 10.0, Predicted NR = 2.1233333333333335
Sample 5: Real NR = 1.65, Predicted NR = 0.43333333333333335


Model: GradientBoostingRegressor
Model saved as saved_models\GradientBoostingRegressor_model.pkl
RMSE: 2.8021263744102036
Sample predictions:
Sample 1: Real NR = 10.0, Predicted NR =