In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
import tensorflow as tf
import matplotlib.pyplot as plt
import os

# Load the data
file_path = "data_ra_norm_filled.xlsx"
data = pd.read_excel(file_path)

# Define columns
data_columns = [
    'Wetland Type - Provincial Class',
    'Wetland Type - Federal Class',
    'Water Regime Indicator',
    'Specific Vegetation Type',
    '% Vegetation Cover for Specific Vegetation Cover Types',
    '% High Woody Canopy Cover (>5m)',
    'Phragmites present (Y/N)',
    'Soil Type',
    '% of Surface Water Present',
    'Depth of Saturation (cm)',
    'Average Depth of Living Moss (cm)',
    'Average Total Depth of Organics',
    'Average Organic Depth (cm)',
    'Hydrogeomorphic Class',
    '% Moss Cover'
]

results_columns = ['WS']

# Prepare data for regression
X = data[data_columns]
y = data[results_columns[0]]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = [Ridge(), DecisionTreeRegressor(), GradientBoostingRegressor(), RandomForestRegressor(),AdaBoostRegressor(), KNeighborsRegressor(), MLPRegressor(max_iter=200),ElasticNet(max_iter=1000),SGDRegressor(max_iter=1000),SVR(cache_size=1000),BayesianRidge(max_iter=1000),KernelRidge(),LinearRegression(), RANSACRegressor(), TheilSenRegressor()]


# Define hyperparameters to search for each model
param_grid = {
    'Ridge': {'ridge__alpha': [0.1, 0.5, 1.0]},
    'MLPRegressor': {}
}

# Train and tune hyperparameters for each model
best_models = {}

for model in models + ['TensorFlow']:  # Add TensorFlow model to the loop
    if model == 'TensorFlow':
        # Define the TensorFlow model
        model_tf = tf.keras.models.Sequential([
            tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(1)
        ])

        # Compile the TensorFlow model
        model_tf.compile(optimizer='adam', loss='mean_squared_error')

        # Standardize the data for TensorFlow model
        scaler_tf = StandardScaler()
        X_train_scaled_tf = scaler_tf.fit_transform(X_train)
        X_test_scaled_tf = scaler_tf.transform(X_test)

        # Train the TensorFlow model
        model_tf.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)

        # Evaluate the TensorFlow model
        y_pred_tf = model_tf.predict(X_test)
        rmse_tf = mean_squared_error(y_test, y_pred_tf, squared=False)
        print(f"TensorFlow RMSE: {rmse_tf}")

        # Add TensorFlow model to best_models
        best_models['TensorFlow'] = (model_tf, y_pred_tf)
    else:
        model_name = model.__class__.__name__
        pipeline = make_pipeline( model)

        # Perform grid search for hyperparameters
        if model_name in param_grid:
            grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=5, scoring='neg_mean_squared_error')
            grid_search.fit(X_train, y_train)
            best_models[model_name] = (grid_search.best_estimator_, grid_search.predict(X_test))
            print(f"Best hyperparameters for {model_name}: {grid_search.best_params_}")
        else:
            pipeline.fit(X_train, y_train)
            best_models[model_name] = (pipeline, pipeline.predict(X_test))

# Plot expected vs. predicted results for each model separately and save
output_directory = "model_plots"
os.makedirs(output_directory, exist_ok=True)

for model_name, (model, y_pred) in best_models.items():
    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, y_pred, color='blue', s=5)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title(f'Expected vs. Predicted Results for {model_name}')
    plt.savefig(os.path.join(output_directory, f"{model_name}_"+(results_columns[0])+"_plot.png"))
    plt.close()


Best hyperparameters for Ridge: {'ridge__alpha': 1.0}




Best hyperparameters for MLPRegressor: {}
Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - loss: 25.4724 - val_loss: 15.4692
Epoch 2/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 14.7191 - val_loss: 14.6754
Epoch 3/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 14.4420 - val_loss: 13.9996
Epoch 4/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 13.4583 - val_loss: 13.3702
Epoch 5/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 12.5411 - val_loss: 12.6978
Epoch 6/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 11.7582 - val_loss: 11.2170
Epoch 7/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 10.0537 - val_loss: 11.4431
Epoch 8/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 11.3247 - val_loss: 10.4829
Epoch 9/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

