In [None]:
import pandas as pd
import joblib
import tensorflow as tf
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, VarianceThreshold
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, ElasticNet, SGDRegressor, BayesianRidge, LinearRegression, RANSACRegressor, TheilSenRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import numpy as np
import os
import ast
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from sklearn.kernel_ridge import KernelRidge

# Names of the CSV columns used as model inputs (X = data[data_columns]).
# The order matters: positional feature indices returned by the selectors are
# mapped back to names through this list. The numbering of the prefixed columns
# is not contiguous (for example there is no 'OF12' and no 'F11').
data_columns = [
    # Named attribute columns
    'Provincial_Class', 'Federal_Class', 'Regime', 'Vegetation_Type',
    'Vegetation_Cover', 'Woody_Canopy_Cover', 'Moss_Cover', 'Phragmites',
    'Soil_Type', 'Surface_Water_Present', 'Saturation_Depth',
    'Living_Moss_Depth', 'Organic_Depth', 'Hydrogeomorphic_Class',
    # 'OF'-prefixed columns
    'OF2', 'OF3', 'OF4', 'OF5', 'OF6', 'OF7', 'OF8', 'OF9', 'OF10', 'OF11',
    'OF13', 'OF14', 'OF15', 'OF16', 'OF17', 'OF18', 'OF19', 'OF20', 'OF21',
    'OF22', 'OF23', 'OF24', 'OF25', 'OF26', 'OF27', 'OF28', 'OF30', 'OF31',
    'OF33', 'OF34', 'OF37', 'OF38',
    # 'F'-prefixed columns
    'F1', 'F2', 'F3_a', 'F3_b', 'F3_c', 'F3_d', 'F3_e', 'F3_f', 'F3_g',
    'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10', 'F12', 'F13', 'F14', 'F15',
    'F16', 'F17', 'F18', 'F19', 'F20', 'F21', 'F22', 'F23', 'F24', 'F25',
    'F28', 'F29', 'F30', 'F31', 'F32', 'F33', 'F34', 'F35', 'F36', 'F37',
    'F38', 'F39', 'F40', 'F41', 'F43', 'F44', 'F45', 'F46', 'F47', 'F48',
    'F49', 'F50', 'F51', 'F52', 'F53', 'F54', 'F55', 'F56', 'F57', 'F58',
    'F59', 'F62', 'F63', 'F64', 'F65', 'F67', 'F68',
    # 'S'-prefixed columns
    'S1', 'S2', 'S4', 'S5',
]

# Feature-selection strategies to compare. Each key is the label written to the
# 'Selection Method' result column and used as the plot legend entry.
# The 0.1 variance cut-off is an example value; adjust as needed.
feature_selection_techniques = dict(
    SelectKBest_f_regression=SelectKBest(score_func=f_regression),
    SelectKBest_mutual_info_regression=SelectKBest(score_func=mutual_info_regression),
    VarianceThreshold=VarianceThreshold(threshold=0.1),
)

# Lookup from a model name (as stored in the best-models CSV) to the estimator
# class used to rebuild it. Every key is the class's own name, so the mapping is
# derived from the classes themselves rather than spelled out twice.
model_mapping = {
    estimator_class.__name__: estimator_class
    for estimator_class in (
        Ridge,
        DecisionTreeRegressor,
        GradientBoostingRegressor,
        RandomForestRegressor,
        AdaBoostRegressor,
        KNeighborsRegressor,
        MLPRegressor,
        ElasticNet,
        SGDRegressor,
        SVR,
        BayesianRidge,
        KernelRidge,
        LinearRegression,
        RANSACRegressor,
        TheilSenRegressor,
    )
}

# Seed passed to train_test_split so every run holds out the same rows.
RND = 42
# Function to load, evaluate, and retrain the best model with feature selection
def load_evaluate_and_retrain_best_model_with_feature_selection(csv_file, model_name, model_path, hyperparameters, results_column):
    """Score a saved model, then retrain it on feature subsets of growing size.

    The CSV is split 90/10 with a fixed seed. The saved model is scored on the
    held-out rows to give a baseline MSE. Then, for every entry in
    ``feature_selection_techniques`` and every subset size ``k`` from 2 up to
    ``len(data_columns)``, a fresh model is trained on the selected columns and
    scored on the same held-out rows.

    Args:
        csv_file: Path of the CSV to read. It must contain an ``id`` column,
            every column named in ``data_columns`` and ``results_column``.
        model_name: ``'TensorFlow'`` or a key of ``model_mapping``.
        model_path: File holding the saved model. Loaded with
            ``tf.keras.models.load_model`` for ``'TensorFlow'`` and with
            ``joblib.load`` otherwise.
        hyperparameters: Mapping of hyperparameters. For ``'TensorFlow'`` the
            keys ``units``, ``activation``, ``learning_rate`` and ``epochs``
            are read. For other models, every entry whose key starts with the
            lower-cased model name and contains ``'__'`` is used; the part
            after the first ``'__'`` becomes the constructor keyword.
        results_column: Name of the target column in the CSV.

    Returns:
        A tuple ``(selection_results, feature_selection_data, test_results)``:

        * ``selection_results``: dict with the baseline MSE under
          ``'Original'`` and, per selection technique, the list of MSE values
          ordered by ``k`` (starting at ``k = 2``).
        * ``feature_selection_data``: one dict per trained model with the
          technique, ``k``, MSE and the names of the selected columns.
        * ``test_results``: the same records plus the actual and predicted
          values as comma-separated strings formatted to five decimals.
    """
    # Sorting by 'id' makes the row order, and therefore the seeded split,
    # independent of the order in which rows appear in the file.
    data = pd.read_csv(csv_file)
    data = data.sort_values(by='id').reset_index(drop=True)
    data = data.drop(columns=['id'])
    X = data[data_columns]
    y = data[results_column]

    # Split the data into training and testing sets with a fixed random state
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=RND)

    # Only the standardized test rows are needed, for scoring the saved model.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_test_scaled = scaler.transform(X_test)

    # NOTE(review): the baseline is scored on standardized inputs while the
    # retrained models below see the raw columns. Confirm this matches how the
    # saved model was trained, otherwise the comparison is not like for like.
    if model_name == 'TensorFlow':
        model = tf.keras.models.load_model(model_path)
        y_pred = model.predict(X_test_scaled).ravel()
    else:
        # joblib.load unpickles the file; only load model files you trust.
        model = joblib.load(model_path)
        y_pred = model.predict(X_test_scaled)

    mse_original = mean_squared_error(y_test, y_pred)
    selection_results = {'Original': mse_original}
    feature_selection_data = []  # One record per (technique, k)
    test_results = []  # Same records plus actual and predicted values

    # Resolve the estimator class and its keyword arguments once; neither
    # depends on the selection technique or on k.
    is_tensorflow = model_name == 'TensorFlow'
    model_class = None
    model_hyperparameters = {}
    if not is_tensorflow:
        model_class = model_mapping.get(model_name)
        if not model_class:
            # Nothing can be retrained: report once and return empty curves.
            print(f"Unknown model name: {model_name}")
            for name in feature_selection_techniques:
                selection_results[name] = []
            return selection_results, feature_selection_data, test_results

        name_prefix = model_name.lower()
        model_hyperparameters = {
            key.split('__', 1)[1]: value
            for key, value in hyperparameters.items()
            # Keys without '__' have no parameter part and are skipped.
            if key.startswith(name_prefix) and '__' in key
        }

    # The held-out targets never change, so format them once.
    actual_as_text = ','.join("{:.5f}".format(val) for val in y_test.values)
    max_features = len(data_columns)

    for name, selector in feature_selection_techniques.items():
        mse_values = []

        variance_ranking = None
        if isinstance(selector, VarianceThreshold):
            # The threshold does not depend on k: fit once, then rank the
            # surviving columns from highest to lowest variance.
            selector.fit(X_train, y_train)
            kept = selector.get_support(indices=True)
            variance_ranking = kept[np.argsort(-selector.variances_[kept], kind='stable')]
        elif not isinstance(selector, SelectKBest):
            # Unsupported selector type: record an empty curve.
            selection_results[name] = mse_values
            continue

        for k in range(2, max_features + 1):
            if variance_ranking is None:
                # SelectKBest: re-score and keep the k best columns.
                selector.set_params(k=k)
                X_train_selected = selector.fit_transform(X_train, y_train)
                X_test_selected = selector.transform(X_test)
                selected_features = selector.get_support(indices=True)
            else:
                if k > len(variance_ranking):
                    # Fewer than k columns passed the threshold; every larger
                    # k would be skipped as well.
                    break
                # Train on exactly the columns that are reported, i.e. the k
                # highest-variance columns that passed the threshold.
                selected_features = variance_ranking[:k]
                X_train_selected = X_train.iloc[:, selected_features].to_numpy(dtype=float)
                X_test_selected = X_test.iloc[:, selected_features].to_numpy(dtype=float)

            # Retrain the model with the selected features
            if is_tensorflow:
                model = tf.keras.Sequential([
                    tf.keras.layers.InputLayer(input_shape=(X_train_selected.shape[1],)),
                    tf.keras.layers.Dense(hyperparameters['units'], activation=hyperparameters['activation']),
                    tf.keras.layers.Dense(1)
                ])
                model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hyperparameters['learning_rate']),
                              loss='mse')
                model.fit(X_train_selected, y_train, epochs=hyperparameters['epochs'], verbose=0)
                y_pred = model.predict(X_test_selected).ravel()
            else:
                model = model_class(**model_hyperparameters)
                model.fit(X_train_selected, y_train)
                y_pred = model.predict(X_test_selected)

            mse = mean_squared_error(y_test, y_pred)
            mse_values.append(mse)

            selected_names = [data_columns[i] for i in selected_features]

            feature_selection_data.append({
                'CSV File': csv_file,
                'Model': model_name,
                'Selection Method': name,
                'Number of Features': k,
                'MSE': mse,
                'Selected Features': selected_names
            })

            test_results.append({
                'CSV File': csv_file,
                'Model': model_name,
                'Selection Method': name,
                'Number of Features': k,
                'Actual': actual_as_text,
                'Predicted': ','.join("{:.5f}".format(val) for val in y_pred),
                'MSE': mse,
                # Separate list so the two records never share mutable state.
                'Selected Features': list(selected_names)
            })
        selection_results[name] = mse_values

    return selection_results, feature_selection_data, test_results

# Target columns to process: five base names followed by the same five names
# with a '_Benefit' suffix. Replace with your actual target column names.
result_columns_list = [
    base_name + suffix
    for suffix in ('', '_Benefit')
    for base_name in ('WS', 'NR', 'PR', 'SR', 'SFST')
]

# Function to convert a list of values to a comma-separated string
def list_to_string(values):
    """Return the items of ``values`` rendered with ``str`` and joined by commas."""
    return ','.join(str(item) for item in values)

# Define a list to hold the results from both norm and non_norm directories
all_results = []

# Ceiling applied to every plotted MSE (curves and baseline) and used as the top
# of the y-axis, so a single diverging fit cannot flatten the rest of the plot.
MSE_PLOT_CEILING = 4

# Directory holding the input CSVs for each normalisation variant.
data_directories = {
    'norm': '../../../Data_ML/4_out_csvs_regression_norm',
    'non_norm': '../../../Data_ML/4_out_csvs_regression',
}

for result_column in result_columns_list:
    print(result_column)
    for norm_type in ['norm', 'non_norm']:
        model_directory = f"../TrainingResults/{norm_type}/{result_column}"
        best_models_file = f"../TrainingResults/{norm_type}/best_models{result_column}_info.csv"
        print(model_directory)
        print(best_models_file)

        # Targets without a best-models summary are silently skipped.
        if not os.path.exists(best_models_file):
            continue

        best_models_df = pd.read_csv(best_models_file)

        if best_models_df.empty:
            continue

        # Pick the row with the lowest value in the 'rmse' column.
        best_model = best_models_df.loc[best_models_df['rmse'].idxmin()]

        # Load data for the best model
        csv_file = os.path.join(data_directories[norm_type], best_model['csv_file'])

        model_name = best_model['model_name']
        # TensorFlow models are stored as .h5 files, everything else as .pkl.
        if model_name != 'TensorFlow':
            model_file = f"{best_model['csv_file']}_{model_name}_model.pkl"
        else:
            model_file = f"{best_model['csv_file']}_TensorFlow_model.h5"
        model_path = os.path.join(model_directory, model_file)
        # The hyperparameters are stored in the CSV as a Python literal.
        hyperparameters = ast.literal_eval(best_model['hyperparameters'])

        # Perform feature selection and retraining for the best model
        reduction_results, feature_selection_data, test_results = load_evaluate_and_retrain_best_model_with_feature_selection(csv_file, model_name, model_path, hyperparameters, result_column)

        # Plot MSE values for each reduction method for the current CSV file
        plt.figure(figsize=(12, 8))

        for name, mse_values in reduction_results.items():
            # 'Original' is a single baseline value, drawn separately below.
            # An empty curve has nothing to plot and no minimum to mark.
            if name == 'Original' or not mse_values:
                continue

            clipped_mse_values = [min(mse, MSE_PLOT_CEILING) for mse in mse_values]
            # The first entry of each curve corresponds to k = 2.
            plt.plot(range(2, len(clipped_mse_values) + 2), clipped_mse_values, label=name)

            # Locate the minimum on the unclipped values so that a curve lying
            # entirely above the ceiling is still labelled with its real
            # minimum; the marker itself sits at the clipped height.
            min_index = int(np.argmin(mse_values))
            marker_position = (min_index + 2, clipped_mse_values[min_index])
            plt.plot(marker_position[0], marker_position[1], 'ko')  # 'ko' is for black dot
            plt.annotate(f'{mse_values[min_index]:.4f}', marker_position,
                         textcoords="offset points", xytext=(0, 10), ha='center')

        # Add original MSE to the plot, clipped if necessary
        original_clipped = min(reduction_results['Original'], MSE_PLOT_CEILING)
        plt.axhline(y=original_clipped, color='gray', linestyle='--', label='Original')

        # Add labels and legend
        plt.xlabel('Number of Selected Features')
        plt.ylabel('MSE')
        plt.title(f'MSE with Feature Selection for Model: {model_name} {result_column} ({best_model["csv_file"]})')
        plt.legend()
        plt.grid(True)
        plt.ylim(0, MSE_PLOT_CEILING)
        plt.tight_layout()
        plt.savefig(f"feature_selection_{result_column}_{norm_type}.png")
        plt.close()

        # Process and save test results
        test_results_df = pd.DataFrame(test_results)
        test_results_df.to_csv(f"test_results_{result_column}_{norm_type}.csv", index=False)

        # Append results to the global list for final analysis
        all_results.extend(feature_selection_data)

# Write every (target, variant, technique, k) record to a single CSV.
final_results_df = pd.DataFrame(all_results)
final_results_df.to_csv("final_feature_selection_results.csv", index=False)


WS
../TrainingResults/norm/WS
../TrainingResults/norm/best_modelsWS_info.csv
here
