In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# --- Data ingestion ---------------------------------------------------------
# Excel workbook holding the input table; point this at your own copy.
file_path = "combined_sr.xlsx"
data = pd.read_excel(file_path)

# Predictor columns, in the order they are handed to the regression.
data_columns = [
    "Wetland Type - Provincial Class",
    "Wetland Type - Federal Class",
    "Water Regime Indicator",
    "Specific Vegetation Type",
    "% Vegetation Cover for Specific Vegetation Cover Types",
    "% High Woody Canopy Cover (>5m)",
    "Phragmites present (Y/N)",
    "Soil Type",
    "% of Surface Water Present",
    "Depth of Saturation (cm)",
    "Average Depth of Living Moss (cm)",
    "Average Total Depth of Organics (moss, organics, muck) (cm)",
    "Average Organic Depth (cm)",
    "Hydrogeomorphic Class",
    "% Moss Cover",
]

# Candidate target columns; only the first entry ('SR') is modelled below.
results_columns = ["SR", "SR_Benefit"]

# --- Feature matrix / target vector -----------------------------------------
X = data.loc[:, data_columns]
y = data.loc[:, results_columns[0]]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the training portion (fit() returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and score them.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report the metrics followed by the fitted parameters.
print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)


Mean Squared Error: 4.073671243628317
R-squared: 0.4444430330258762
Coefficients: [ 0.61381577  0.17966367 -0.42590187  0.01726775 -0.66533315 -0.07166257
 -0.42198033 -0.07495849 -0.30987947 -0.08966464  1.62517037 -0.05257816
  0.28025074  0.19613973  0.21807346]
Intercept: 3.155414079055224


In [5]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
# Rebuild the feature matrix and target from `data`, `data_columns` and
# `results_columns`, which this cell expects an earlier cell to have defined.
X = data.loc[:, data_columns]
y = data.loc[:, results_columns[0]]  # first results column, i.e. 'SR'

# Fresh, unfitted estimator for the cross-validation below.
model = LinearRegression()

# Define custom scoring function for cross-validation
def custom_scorer(y_true, y_pred):
    """Return the negated mean squared error of ``y_pred`` against ``y_true``.

    The sign is flipped so that a larger score means a better fit, which is
    what the cross-validation call below expects from a scorer.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Model predictions for the same samples.

    Returns
    -------
    float
        ``-mean_squared_error(y_true, y_pred)``.
    """
    # Only the MSE is returned; the R^2 value that used to be computed here was
    # discarded, so it is no longer calculated.
    return -mean_squared_error(y_true, y_pred)

# 5-fold cross-validation; every fold is scored with the negated MSE from custom_scorer.
cv_scores = cross_val_score(
    model,
    X,
    y,
    scoring=make_scorer(custom_scorer),
    cv=5,
)

# Flip the sign back so the values read as ordinary (positive) MSE.
mse_scores = -cv_scores

# Per-fold errors, then their mean and spread.
print("Cross-validation MSE scores:", mse_scores)
print("Mean MSE:", mse_scores.mean())
print("Std MSE:", mse_scores.std())

Cross-validation MSE scores: [10.1232492   7.25818352  7.21094672  7.76624577  6.64380149]
Mean MSE: 7.800485339796525
Std MSE: 1.2145755562001035


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer

# --- Data ingestion ---------------------------------------------------------
# Excel workbook holding the input table; point this at your own copy.
file_path = "combined_sr.xlsx"
data = pd.read_excel(file_path)

# Predictor columns used as model inputs.
data_columns = [
    "Wetland Type - Provincial Class",
    "Wetland Type - Federal Class",
    "Water Regime Indicator",
    "Specific Vegetation Type",
    "% Vegetation Cover for Specific Vegetation Cover Types",
    "% High Woody Canopy Cover (>5m)",
    "Phragmites present (Y/N)",
    "Soil Type",
    "% of Surface Water Present",
    "Depth of Saturation (cm)",
    "Average Depth of Living Moss (cm)",
    "Average Total Depth of Organics (moss, organics, muck) (cm)",
    "Average Organic Depth (cm)",
    "Hydrogeomorphic Class",
    "% Moss Cover",
]

# Single target column for this cell.
results_columns = ["SR"]

# --- Feature matrix / target vector -----------------------------------------
X = data.loc[:, data_columns]
y = data.loc[:, results_columns[0]]  # 'SR'

# --- Train / test partition -------------------------------------------------
# 20% of the rows are held out; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Define custom scoring function
def custom_scorer(y_true, y_pred):
    """Compute both regression metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Model predictions for the same samples.

    Returns
    -------
    tuple
        ``(mse, r2)`` - the mean squared error followed by the R^2 score.

    NOTE(review): nothing in this cell calls this function, and it rebinds the
    name of the scorer from the earlier cross-validation cell, which returns a
    single negated MSE. Re-running that earlier cell after this one would hand
    ``make_scorer`` this tuple-returning version instead - confirm whether the
    redefinition is intended.
    """
    return mean_squared_error(y_true, y_pred), r2_score(y_true, y_pred)

# Define models
# The tree-based estimators are seeded so that re-running the cell reproduces
# the same predictions (the train/test split above is already seeded with 42).
models = [
    LinearRegression(),
    Ridge(),
    Lasso(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42)
]

# Train and make predictions using each model
for model in models:
    # Standardise the features before the estimator; this matters for the
    # penalised linear models and is harmless for the tree-based ones.
    pipeline = make_pipeline(StandardScaler(), model)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    print(f"Model: {model.__class__.__name__}")

    # Show real and predicted results for up to the first 5 held-out samples;
    # the cap avoids an IndexError when the test split has fewer than 5 rows.
    for i in range(min(5, len(y_test))):
        print(f"Sample {i+1}: Real SR = {y_test.iloc[i]}, Predicted SR = {y_pred[i]}")

    print("\n")


Model: LinearRegression
Sample 1: Real SR = 3.14, Predicted SR = 4.884658561478597
Sample 2: Real SR = 1.95, Predicted SR = 2.4321959826689517
Sample 3: Real SR = 1.071415011787847, Predicted SR = 2.7277513026211766
Sample 4: Real SR = 3.073673531374684, Predicted SR = 4.734755937089597
Sample 5: Real SR = 1.55, Predicted SR = 4.556552931983098


Model: Ridge
Sample 1: Real SR = 3.14, Predicted SR = 4.886618799911994
Sample 2: Real SR = 1.95, Predicted SR = 2.432774872830897
Sample 3: Real SR = 1.071415011787847, Predicted SR = 2.72497990132601
Sample 4: Real SR = 3.073673531374684, Predicted SR = 4.705534557114696
Sample 5: Real SR = 1.55, Predicted SR = 4.554366071870147


Model: Lasso
Sample 1: Real SR = 3.14, Predicted SR = 4.642516708804058
Sample 2: Real SR = 1.95, Predicted SR = 3.870727687214094
Sample 3: Real SR = 1.071415011787847, Predicted SR = 5.02841121959904
Sample 4: Real SR = 3.073673531374684, Predicted SR = 4.641546400656385
Sample 5: Real SR = 1.55, Predicted SR = 4

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

# --- Data ingestion ---------------------------------------------------------
# Excel workbook holding the input table; point this at your own copy.
file_path = "combined_sr.xlsx"
data = pd.read_excel(file_path)

# Predictor columns used as model inputs.
data_columns = [
    "Wetland Type - Provincial Class",
    "Wetland Type - Federal Class",
    "Water Regime Indicator",
    "Specific Vegetation Type",
    "% Vegetation Cover for Specific Vegetation Cover Types",
    "% High Woody Canopy Cover (>5m)",
    "Phragmites present (Y/N)",
    "Soil Type",
    "% of Surface Water Present",
    "Depth of Saturation (cm)",
    "Average Depth of Living Moss (cm)",
    "Average Total Depth of Organics (moss, organics, muck) (cm)",
    "Average Organic Depth (cm)",
    "Hydrogeomorphic Class",
    "% Moss Cover",
]

# Single target column for this cell.
results_columns = ["SR"]

# --- Feature matrix / target vector -----------------------------------------
X = data.loc[:, data_columns]
y = data.loc[:, results_columns[0]]  # 'SR'

# --- Train / test partition -------------------------------------------------
# 20% of the rows are held out; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Define models
# Every estimator with internal randomness is seeded so that re-running the
# cell reproduces the same fitted models and RMSE values.
models = [
    Ridge(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
    AdaBoostRegressor(random_state=42),
    SVR(),
    KNeighborsRegressor()
]

# Define hyperparameters to search for each model
# Keys are estimator class names (the tuning loop looks models up by
# model.__class__.__name__). Parameter names carry the pipeline step prefix,
# e.g. 'ridge__alpha'. An empty grid means the grid search only evaluates the
# estimator's default settings. Estimators without an entry here
# (DecisionTreeRegressor, RandomForestRegressor) skip the grid search and are
# fitted directly.
param_grid = {
    'Ridge': {'ridge__alpha': [0.1, 0.5, 1.0]},
    'GradientBoostingRegressor': {},
    'AdaBoostRegressor': {},
    'SVR': {},
    'KNeighborsRegressor': {}
}

# Fit every candidate and keep the resulting pipeline, keyed by estimator class name.
best_models = {}

for model in models:
    model_name = type(model).__name__
    # Standardise the features ahead of the estimator.
    pipeline = make_pipeline(StandardScaler(), model)

    # No grid registered for this estimator: fit it with its default settings.
    if model_name not in param_grid:
        best_models[model_name] = pipeline.fit(X_train, y_train)
        continue

    # Otherwise run a 5-fold grid search scored by negative MSE and keep the
    # refitted best pipeline.
    grid_search = GridSearchCV(
        pipeline,
        param_grid[model_name],
        cv=5,
        scoring='neg_mean_squared_error',
    )
    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search.best_estimator_
    print(f"Best hyperparameters for {model_name}: {grid_search.best_params_}")

# Make predictions using the best models
for model_name, model in best_models.items():
    print(f"Model: {model_name}")

    # Make predictions on the held-out rows
    y_pred = model.predict(X_test)

    # Show real and predicted results for up to the first 5 samples; the cap
    # avoids an IndexError when the test split has fewer than 5 rows.
    for i in range(min(5, len(y_test))):
        print(f"Sample {i+1}: Real SR = {y_test.iloc[i]}, Predicted SR = {y_pred[i]}")

    # Calculate and print RMSE.
    # The `squared=False` argument of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so take the square root of the MSE
    # explicitly; this works on every scikit-learn version.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    print(f"RMSE: {rmse}")

    print("\n")


Best hyperparameters for Ridge: {'ridge__alpha': 1.0}
Best hyperparameters for GradientBoostingRegressor: {}
Best hyperparameters for AdaBoostRegressor: {}
Best hyperparameters for SVR: {}
Best hyperparameters for KNeighborsRegressor: {}
Model: Ridge
Sample 1: Real SR = 3.14, Predicted SR = 4.886618799911994
Sample 2: Real SR = 1.95, Predicted SR = 2.432774872830897
Sample 3: Real SR = 1.071415011787847, Predicted SR = 2.72497990132601
Sample 4: Real SR = 3.073673531374684, Predicted SR = 4.705534557114696
Sample 5: Real SR = 1.55, Predicted SR = 4.554366071870147
RMSE: 2.0174905910083494


Model: DecisionTreeRegressor
Sample 1: Real SR = 3.14, Predicted SR = 2.31
Sample 2: Real SR = 1.95, Predicted SR = 1.74
Sample 3: Real SR = 1.071415011787847, Predicted SR = 3.04
Sample 4: Real SR = 3.073673531374684, Predicted SR = 10.0
Sample 5: Real SR = 1.55, Predicted SR = 2.12
RMSE: 3.0466542634224374


Model: RandomForestRegressor
Sample 1: Real SR = 3.14, Predicted SR = 3.599623261697104
Sa

