# PR

In [3]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge, LinearRegression, RANSACRegressor, TheilSenRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.decomposition import PCA  # Import PCA
import warnings
import joblib
import os

# Feature columns read from every input CSV (whitespace-separated names).
data_columns = """
    OF22 OF26 OF27 F17 F20 F21 F23
    F24 F28 F29 F31 F33 F34 F35 F36 F38 F43 F44 F45 F49 F63 F65
""".split()

# "PR" is used twice: as the directory that receives the saved models and,
# through results_columns, as the name of the target column.
model_directory = "PR"
results_columns = [model_directory]

# Define the parameter grid for GridSearchCV
# Outer keys are estimator class names; inner keys use the
# "<pipeline step>__<parameter>" form produced by make_pipeline.
# Values that scikit-learn rejects on every fit have been dropped: such
# candidates are only ever scored as NaN by GridSearchCV, so they cost
# cross-validation fits without any chance of being selected.
param_grid = {
    'Ridge': {
        'ridge__alpha': [0.1, 0.5, 1.0],
        # 'lbfgs' is only accepted together with positive=True, which is not set here
        'ridge__solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
    },
    'DecisionTreeRegressor': {
        'decisiontreeregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'decisiontreeregressor__splitter': ['best', 'random'],
        # an integer min_samples_split must be >= 2
        'decisiontreeregressor__min_samples_split': [2, 3, 4, 5],
        # an integer max_features must be >= 1
        'decisiontreeregressor__max_features': [1, 2, 3, 'sqrt', 'log2']
    },
    'RandomForestRegressor': {
        'randomforestregressor__n_estimators': [1, 50, 100],
        'randomforestregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'randomforestregressor__min_samples_split': [2, 5],
        'randomforestregressor__max_features': [1, 3, 'sqrt', 'log2'],
    },
    'GradientBoostingRegressor': {
        'gradientboostingregressor__loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
        'gradientboostingregressor__learning_rate': [0.001, 0.01],
        'gradientboostingregressor__n_estimators': [25, 50, 100],
        'gradientboostingregressor__warm_start': [True, False],
    },
    'AdaBoostRegressor': {
        'adaboostregressor__n_estimators': [1, 20, 50, 100],
        'adaboostregressor__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0, 10],
        'adaboostregressor__loss': ['linear', 'square', 'exponential']
    },
    'KNeighborsRegressor': {
        'kneighborsregressor__n_neighbors': [2, 5, 10, 25],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__algorithm': ['ball_tree', 'kd_tree', 'brute'],
        'kneighborsregressor__leaf_size': [5, 30, 50],
        # 'haversine' is only defined for 2-D (lat, lon) inputs, so it was removed.
        # NOTE(review): 'cosine' and 'nan_euclidean' are kept for the 'brute'
        # algorithm; the tree-based algorithms are expected to reject them.
        'kneighborsregressor__metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'nan_euclidean']
    },
    'MLPRegressor': {
        'mlpregressor__hidden_layer_sizes': [(50, 50, 50), (100, 100, 100), (100, 100, 100, 100)],
        'mlpregressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
        'mlpregressor__solver': ['lbfgs', 'sgd', 'adam'],
        'mlpregressor__learning_rate': ['constant', 'invscaling', 'adaptive'],
    },
    'ElasticNet': {
        'elasticnet__l1_ratio': [0.25, 0.5, 0.75],
        'elasticnet__fit_intercept': [True, False],
        'elasticnet__precompute': [True, False],
        'elasticnet__copy_X': [True, False],
        'elasticnet__warm_start': [True, False],
        'elasticnet__positive': [True, False],
        'elasticnet__selection': ['cyclic', 'random']
    },
    'SGDRegressor': {
        'sgdregressor__loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
        'sgdregressor__penalty': ['l2', 'l1', 'elasticnet', None],
        'sgdregressor__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
        'sgdregressor__warm_start': [True, False],
    },
    'SVR': {
        # 'precomputed' expects a square kernel matrix as X, not a feature matrix
        'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'svr__degree': [1, 3, 5],
        'svr__gamma': ['scale', 'auto', 1.0, 5.0],
        'svr__shrinking': [True, False]
    },
    'BayesianRidge': {
        'bayesianridge__alpha_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__alpha_2': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_2': [1e-7, 1e-6, 1e-5],
    },
    'KernelRidge': {
        'kernelridge__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0],
        # 'precomputed' expects a square kernel matrix as X, not a feature matrix
        'kernelridge__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'kernelridge__degree': [1, 2, 3, 5, 10],
        'kernelridge__coef0': [0.0, 0.5, 1.0]
    },
    'LinearRegression': {
        'linearregression__fit_intercept': [True, False],
        'linearregression__copy_X': [True, False],
        'linearregression__positive': [True, False]
    },
    'RANSACRegressor': {
        'ransacregressor__min_samples': [None, 1, 2, 5, 10, 50],
        'ransacregressor__max_trials': [1, 10, 50, 100, 150],
        'ransacregressor__loss': ['absolute_error', 'squared_error']
    },
    'TheilSenRegressor': {
        'theilsenregressor__max_subpopulation': [1, 10, 100, 1000],
        # n_subsamples=1 can never reach the required n_features + 1 and was removed.
        # NOTE(review): the remaining integers must also be >= len(data_columns) + 1
        # to be accepted - confirm against the feature list of this cell.
        'theilsenregressor__n_subsamples': [None, 5, 10, 25],
    }
}

# Candidate estimators, evaluated in this order for every CSV file.
# The iterative ones are capped at 1000 iterations.
models = [
    Ridge(),
    DecisionTreeRegressor(),
    GradientBoostingRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    KNeighborsRegressor(),
    MLPRegressor(max_iter=1000),
    ElasticNet(max_iter=1000),
    SGDRegressor(max_iter=1000),
    BayesianRidge(max_iter=1000),
    KernelRidge(),
    LinearRegression(),
    RANSACRegressor(),
    TheilSenRegressor(),
]

# Suppress every Python warning from here on.
warnings.filterwarnings("ignore")



# Create the output directory for the saved models. exist_ok=True makes this a
# no-op when the directory is already there and removes the gap between the
# existence check and the creation.
os.makedirs(model_directory, exist_ok=True)

# Function to process each CSV file
def process_csv(file_path):
    """Train every candidate regressor on one CSV file and report the best one.

    The file is read with pandas; the columns in ``data_columns`` are the
    features and ``results_columns[0]`` is the target.  10% of the rows are
    held out as a test set (``random_state=42``).  Each estimator in
    ``models`` is wrapped in a ``StandardScaler`` pipeline and, when
    ``param_grid`` has an entry for its class name, tuned with a 5-fold
    ``GridSearchCV`` scored on negative MSE.  A fixed Keras dense network is
    trained last.

    Side effects: every fitted model is written into ``model_directory``
    (``.pkl`` via joblib, ``.h5`` for the Keras network) and the test-set
    predictions of all models are written to
    ``output_<csv file name>_<target>.csv`` in the working directory.

    Args:
        file_path: Path of the CSV file to process.

    Returns:
        dict with the keys ``csv_file``, ``model_name``, ``hyperparameters``
        and ``rmse`` describing the model with the lowest test-set RMSE.
    """
    data = pd.read_csv(file_path)
    X = data[data_columns]
    y = data[results_columns[0]]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    csv_name = os.path.basename(file_path)

    # Running record of the best model; "best" means lowest RMSE on the test split
    best_model_info = {
        'csv_file': csv_name,
        'model_name': None,
        'hyperparameters': None,
        'rmse': float('inf')
    }

    results = []

    for model in models + ['TensorFlow']:  # the string stands in for the Keras network
        print(f"Processing {model} for {file_path}")
        if model == 'TensorFlow':
            # Fixed architecture: seven 64-unit ReLU layers and one linear output unit
            model_tf = tf.keras.models.Sequential(
                [tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],))]
                + [tf.keras.layers.Dense(64, activation='relu') for _ in range(6)]
                + [tf.keras.layers.Dense(1)]
            )
            model_tf.compile(optimizer='adam', loss='mean_squared_error')

            # The network is not part of a pipeline, so its inputs are scaled here.
            # NOTE(review): scaler_tf is not saved with the .h5 file, although the
            # saved network expects inputs scaled this way.
            scaler_tf = StandardScaler()
            X_train_scaled_tf = scaler_tf.fit_transform(X_train)
            X_test_scaled_tf = scaler_tf.transform(X_test)

            model_tf.fit(X_train_scaled_tf, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)

            y_pred_tf = model_tf.predict(X_test_scaled_tf)
            # RMSE as sqrt(MSE): the squared=False shortcut was deprecated in
            # scikit-learn 1.4 and removed in 1.6.
            rmse_tf = mean_squared_error(y_test, y_pred_tf) ** 0.5
            print(f"TensorFlow RMSE: {rmse_tf}")

            if rmse_tf < best_model_info['rmse']:
                best_model_info.update({
                    'model_name': 'TensorFlow',
                    'hyperparameters': None,
                    'rmse': rmse_tf
                })

            # Save the TensorFlow model
            model_filename = os.path.join(model_directory, f"{csv_name}_TensorFlow_model.h5")
            model_tf.save(model_filename)

            # Keep the predictions next to the actual values
            results.append(pd.DataFrame({'Actual': y_test.values.flatten(), 'Predicted': y_pred_tf.flatten(), 'Model': 'TensorFlow'}))
        else:
            model_name = model.__class__.__name__
            pipeline = make_pipeline(StandardScaler(), model)
            if model_name in param_grid:
                # Tune the pipeline; best_estimator_ is the winning pipeline found by the search
                grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=5, scoring='neg_mean_squared_error')
                grid_search.fit(X_train, y_train)
                best_estimator = grid_search.best_estimator_
                best_params = grid_search.best_params_
                print(f"Best hyperparameters for {model_name}: {best_params}")
            else:
                # No grid for this estimator: fit it with its constructor settings
                pipeline.fit(X_train, y_train)
                best_estimator = pipeline
                best_params = None

            y_pred = best_estimator.predict(X_test)
            # RMSE as sqrt(MSE), see the note in the TensorFlow branch
            rmse = mean_squared_error(y_test, y_pred) ** 0.5
            print(f"{model_name} RMSE: {rmse}")

            if rmse < best_model_info['rmse']:
                best_model_info.update({
                    'model_name': model_name,
                    'hyperparameters': best_params,
                    'rmse': rmse
                })

            # Save the fitted pipeline (scaler + estimator)
            model_filename = os.path.join(model_directory, f"{csv_name}_{model_name}_model.pkl")
            joblib.dump(best_estimator, model_filename)

            # Keep the predictions next to the actual values
            results.append(pd.DataFrame({'Actual': y_test.values.flatten(), 'Predicted': y_pred.flatten(), 'Model': model_name}))

    # One CSV with the test-set predictions of all models, stacked row-wise
    results_df = pd.concat(results, axis=0)
    results_filename = f"output_{csv_name}_{results_columns[0]}.csv"
    results_df.to_csv(results_filename, index=False)

    return best_model_info

# Get the list of CSV files in the directory. glob returns them in arbitrary
# order; sorting keeps the row order of the summary file stable between runs.
csv_files = sorted(glob.glob('../../../Data_ML/4_out_csvs_regression/*.csv'))

# Initialize a list to store the best model information for each CSV file
best_models_info = []

# Process each CSV file
for csv_file in csv_files:
    best_model_info = process_csv(csv_file)
    best_models_info.append(best_model_info)

# Save the best model information for each CSV file to a CSV file
best_models_df = pd.DataFrame(best_models_info)
best_models_path = "best_models"+results_columns[0]+"_info.csv"
best_models_df.to_csv(best_models_path, index=False)

# Report the name of the file that was actually written (it includes the target)
print(f"Best models information saved to {best_models_path}")


Processing Ridge() for ../../../Data_ML/4_out_csvs_regression\output_bfill_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
Ridge RMSE: 0.5241212658706805
Processing DecisionTreeRegressor() for ../../../Data_ML/4_out_csvs_regression\output_bfill_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'squared_error', 'decisiontreeregressor__max_features': 'sqrt', 'decisiontreeregressor__min_samples_split': 4, 'decisiontreeregressor__splitter': 'best'}
DecisionTreeRegressor RMSE: 0.6363413529390632
Processing GradientBoostingRegressor() for ../../../Data_ML/4_out_csvs_regression\output_bfill_imputed.csv


KeyboardInterrupt: 

# SR

In [None]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge, LinearRegression, RANSACRegressor, TheilSenRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.decomposition import PCA  # Import PCA
import warnings
import joblib
import os

# Feature columns read from every input CSV (whitespace-separated names).
data_columns = """
    OF22 OF26 OF27 F9 F17 F20 F22 F28 F29 F31
    F33 F34 F35 F36 F43 F44 F45 F49 S5
""".split()

# "SR" is used twice: as the directory that receives the saved models and,
# through results_columns, as the name of the target column.
model_directory = "SR"
results_columns = [model_directory]

# Define the parameter grid for GridSearchCV
# Outer keys are estimator class names; inner keys use the
# "<pipeline step>__<parameter>" form produced by make_pipeline.
# Values that scikit-learn rejects on every fit have been dropped: such
# candidates are only ever scored as NaN by GridSearchCV, so they cost
# cross-validation fits without any chance of being selected.
param_grid = {
    'Ridge': {
        'ridge__alpha': [0.1, 0.5, 1.0],
        # 'lbfgs' is only accepted together with positive=True, which is not set here
        'ridge__solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
    },
    'DecisionTreeRegressor': {
        'decisiontreeregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'decisiontreeregressor__splitter': ['best', 'random'],
        # an integer min_samples_split must be >= 2
        'decisiontreeregressor__min_samples_split': [2, 3, 4, 5],
        # an integer max_features must be >= 1
        'decisiontreeregressor__max_features': [1, 2, 3, 'sqrt', 'log2']
    },
    'RandomForestRegressor': {
        'randomforestregressor__n_estimators': [1, 50, 100],
        'randomforestregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'randomforestregressor__min_samples_split': [2, 5],
        'randomforestregressor__max_features': [1, 3, 'sqrt', 'log2'],
    },
    'GradientBoostingRegressor': {
        'gradientboostingregressor__loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
        'gradientboostingregressor__learning_rate': [0.001, 0.01],
        'gradientboostingregressor__n_estimators': [25, 50, 100],
        'gradientboostingregressor__warm_start': [True, False],
    },
    'AdaBoostRegressor': {
        'adaboostregressor__n_estimators': [1, 20, 50, 100],
        'adaboostregressor__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0, 10],
        'adaboostregressor__loss': ['linear', 'square', 'exponential']
    },
    'KNeighborsRegressor': {
        'kneighborsregressor__n_neighbors': [2, 5, 10, 25],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__algorithm': ['ball_tree', 'kd_tree', 'brute'],
        'kneighborsregressor__leaf_size': [5, 30, 50],
        # 'haversine' is only defined for 2-D (lat, lon) inputs, so it was removed.
        # NOTE(review): 'cosine' and 'nan_euclidean' are kept for the 'brute'
        # algorithm; the tree-based algorithms are expected to reject them.
        'kneighborsregressor__metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'nan_euclidean']
    },
    'MLPRegressor': {
        'mlpregressor__hidden_layer_sizes': [(50, 50, 50), (100, 100, 100), (100, 100, 100, 100)],
        'mlpregressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
        'mlpregressor__solver': ['lbfgs', 'sgd', 'adam'],
        'mlpregressor__learning_rate': ['constant', 'invscaling', 'adaptive'],
    },
    'ElasticNet': {
        'elasticnet__l1_ratio': [0.25, 0.5, 0.75],
        'elasticnet__fit_intercept': [True, False],
        'elasticnet__precompute': [True, False],
        'elasticnet__copy_X': [True, False],
        'elasticnet__warm_start': [True, False],
        'elasticnet__positive': [True, False],
        'elasticnet__selection': ['cyclic', 'random']
    },
    'SGDRegressor': {
        'sgdregressor__loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
        'sgdregressor__penalty': ['l2', 'l1', 'elasticnet', None],
        'sgdregressor__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
        'sgdregressor__warm_start': [True, False],
    },
    'SVR': {
        # 'precomputed' expects a square kernel matrix as X, not a feature matrix
        'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'svr__degree': [1, 3, 5],
        'svr__gamma': ['scale', 'auto', 1.0, 5.0],
        'svr__shrinking': [True, False]
    },
    'BayesianRidge': {
        'bayesianridge__alpha_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__alpha_2': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_2': [1e-7, 1e-6, 1e-5],
    },
    'KernelRidge': {
        'kernelridge__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0],
        # 'precomputed' expects a square kernel matrix as X, not a feature matrix
        'kernelridge__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'kernelridge__degree': [1, 2, 3, 5, 10],
        'kernelridge__coef0': [0.0, 0.5, 1.0]
    },
    'LinearRegression': {
        'linearregression__fit_intercept': [True, False],
        'linearregression__copy_X': [True, False],
        'linearregression__positive': [True, False]
    },
    'RANSACRegressor': {
        'ransacregressor__min_samples': [None, 1, 2, 5, 10, 50],
        'ransacregressor__max_trials': [1, 10, 50, 100, 150],
        'ransacregressor__loss': ['absolute_error', 'squared_error']
    },
    'TheilSenRegressor': {
        'theilsenregressor__max_subpopulation': [1, 10, 100, 1000],
        # n_subsamples=1 can never reach the required n_features + 1 and was removed.
        # NOTE(review): the remaining integers must also be >= len(data_columns) + 1
        # to be accepted - confirm against the feature list of this cell.
        'theilsenregressor__n_subsamples': [None, 5, 10, 25],
    }
}

# Candidate estimators, evaluated in this order for every CSV file.
# The iterative ones are capped at 1000 iterations.
models = [
    Ridge(),
    DecisionTreeRegressor(),
    GradientBoostingRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    KNeighborsRegressor(),
    MLPRegressor(max_iter=1000),
    ElasticNet(max_iter=1000),
    SGDRegressor(max_iter=1000),
    BayesianRidge(max_iter=1000),
    KernelRidge(),
    LinearRegression(),
    RANSACRegressor(),
    TheilSenRegressor(),
]

# Suppress every Python warning from here on.
warnings.filterwarnings("ignore")



# Create the output directory for the saved models. exist_ok=True makes this a
# no-op when the directory is already there and removes the gap between the
# existence check and the creation.
os.makedirs(model_directory, exist_ok=True)

# Function to process each CSV file
def process_csv(file_path):
    """Train every candidate regressor on one CSV file and report the best one.

    The file is read with pandas; the columns in ``data_columns`` are the
    features and ``results_columns[0]`` is the target.  10% of the rows are
    held out as a test set (``random_state=42``).  Each estimator in
    ``models`` is wrapped in a ``StandardScaler`` pipeline and, when
    ``param_grid`` has an entry for its class name, tuned with a 5-fold
    ``GridSearchCV`` scored on negative MSE.  A fixed Keras dense network is
    trained last.

    Side effects: every fitted model is written into ``model_directory``
    (``.pkl`` via joblib, ``.h5`` for the Keras network) and the test-set
    predictions of all models are written to
    ``output_<csv file name>_<target>.csv`` in the working directory.

    Args:
        file_path: Path of the CSV file to process.

    Returns:
        dict with the keys ``csv_file``, ``model_name``, ``hyperparameters``
        and ``rmse`` describing the model with the lowest test-set RMSE.
    """
    data = pd.read_csv(file_path)
    X = data[data_columns]
    y = data[results_columns[0]]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    csv_name = os.path.basename(file_path)

    # Running record of the best model; "best" means lowest RMSE on the test split
    best_model_info = {
        'csv_file': csv_name,
        'model_name': None,
        'hyperparameters': None,
        'rmse': float('inf')
    }

    results = []

    for model in models + ['TensorFlow']:  # the string stands in for the Keras network
        print(f"Processing {model} for {file_path}")
        if model == 'TensorFlow':
            # Fixed architecture: seven 64-unit ReLU layers and one linear output unit
            model_tf = tf.keras.models.Sequential(
                [tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],))]
                + [tf.keras.layers.Dense(64, activation='relu') for _ in range(6)]
                + [tf.keras.layers.Dense(1)]
            )
            model_tf.compile(optimizer='adam', loss='mean_squared_error')

            # The network is not part of a pipeline, so its inputs are scaled here.
            # NOTE(review): scaler_tf is not saved with the .h5 file, although the
            # saved network expects inputs scaled this way.
            scaler_tf = StandardScaler()
            X_train_scaled_tf = scaler_tf.fit_transform(X_train)
            X_test_scaled_tf = scaler_tf.transform(X_test)

            model_tf.fit(X_train_scaled_tf, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)

            y_pred_tf = model_tf.predict(X_test_scaled_tf)
            # RMSE as sqrt(MSE): the squared=False shortcut was deprecated in
            # scikit-learn 1.4 and removed in 1.6.
            rmse_tf = mean_squared_error(y_test, y_pred_tf) ** 0.5
            print(f"TensorFlow RMSE: {rmse_tf}")

            if rmse_tf < best_model_info['rmse']:
                best_model_info.update({
                    'model_name': 'TensorFlow',
                    'hyperparameters': None,
                    'rmse': rmse_tf
                })

            # Save the TensorFlow model
            model_filename = os.path.join(model_directory, f"{csv_name}_TensorFlow_model.h5")
            model_tf.save(model_filename)

            # Keep the predictions next to the actual values
            results.append(pd.DataFrame({'Actual': y_test.values.flatten(), 'Predicted': y_pred_tf.flatten(), 'Model': 'TensorFlow'}))
        else:
            model_name = model.__class__.__name__
            pipeline = make_pipeline(StandardScaler(), model)
            if model_name in param_grid:
                # Tune the pipeline; best_estimator_ is the winning pipeline found by the search
                grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=5, scoring='neg_mean_squared_error')
                grid_search.fit(X_train, y_train)
                best_estimator = grid_search.best_estimator_
                best_params = grid_search.best_params_
                print(f"Best hyperparameters for {model_name}: {best_params}")
            else:
                # No grid for this estimator: fit it with its constructor settings
                pipeline.fit(X_train, y_train)
                best_estimator = pipeline
                best_params = None

            y_pred = best_estimator.predict(X_test)
            # RMSE as sqrt(MSE), see the note in the TensorFlow branch
            rmse = mean_squared_error(y_test, y_pred) ** 0.5
            print(f"{model_name} RMSE: {rmse}")

            if rmse < best_model_info['rmse']:
                best_model_info.update({
                    'model_name': model_name,
                    'hyperparameters': best_params,
                    'rmse': rmse
                })

            # Save the fitted pipeline (scaler + estimator)
            model_filename = os.path.join(model_directory, f"{csv_name}_{model_name}_model.pkl")
            joblib.dump(best_estimator, model_filename)

            # Keep the predictions next to the actual values
            results.append(pd.DataFrame({'Actual': y_test.values.flatten(), 'Predicted': y_pred.flatten(), 'Model': model_name}))

    # One CSV with the test-set predictions of all models, stacked row-wise
    results_df = pd.concat(results, axis=0)
    results_filename = f"output_{csv_name}_{results_columns[0]}.csv"
    results_df.to_csv(results_filename, index=False)

    return best_model_info

# Get the list of CSV files in the directory. glob returns them in arbitrary
# order; sorting keeps the row order of the summary file stable between runs.
csv_files = sorted(glob.glob('../../../Data_ML/4_out_csvs_regression/*.csv'))

# Initialize a list to store the best model information for each CSV file
best_models_info = []

# Process each CSV file
for csv_file in csv_files:
    best_model_info = process_csv(csv_file)
    best_models_info.append(best_model_info)

# Save the best model information for each CSV file to a CSV file
best_models_df = pd.DataFrame(best_models_info)
best_models_path = "best_models"+results_columns[0]+"_info.csv"
best_models_df.to_csv(best_models_path, index=False)

# Report the name of the file that was actually written (it includes the target)
print(f"Best models information saved to {best_models_path}")


# NR

In [None]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge, LinearRegression, RANSACRegressor, TheilSenRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.decomposition import PCA  # Import PCA
import warnings
import joblib
import os

# Feature columns read from every input CSV (whitespace-separated names).
data_columns = """
    OF16 OF18 OF22 OF25 OF26 OF27 F1 F3_a F3_b F3_c F3_d F3_e F3_f F3_g F6
    F17 F18 F20 F21 F22 F23
    F24 F28 F31 F33 F34 F36 F43 F44 F45 F48 F49 F54 F65 S5
""".split()

# "NR" is used twice: as the directory that receives the saved models and,
# through results_columns, as the name of the target column.
model_directory = "NR"
results_columns = [model_directory]

# Define the parameter grid for GridSearchCV
# Outer keys are estimator class names; inner keys use the
# "<pipeline step>__<parameter>" form produced by make_pipeline.
# Values that scikit-learn rejects on every fit have been dropped: such
# candidates are only ever scored as NaN by GridSearchCV, so they cost
# cross-validation fits without any chance of being selected.
param_grid = {
    'Ridge': {
        'ridge__alpha': [0.1, 0.5, 1.0],
        # 'lbfgs' is only accepted together with positive=True, which is not set here
        'ridge__solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
    },
    'DecisionTreeRegressor': {
        'decisiontreeregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'decisiontreeregressor__splitter': ['best', 'random'],
        # an integer min_samples_split must be >= 2
        'decisiontreeregressor__min_samples_split': [2, 3, 4, 5],
        # an integer max_features must be >= 1
        'decisiontreeregressor__max_features': [1, 2, 3, 'sqrt', 'log2']
    },
    'RandomForestRegressor': {
        'randomforestregressor__n_estimators': [1, 50, 100],
        'randomforestregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'randomforestregressor__min_samples_split': [2, 5],
        'randomforestregressor__max_features': [1, 3, 'sqrt', 'log2'],
    },
    'GradientBoostingRegressor': {
        'gradientboostingregressor__loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
        'gradientboostingregressor__learning_rate': [0.001, 0.01],
        'gradientboostingregressor__n_estimators': [25, 50, 100],
        'gradientboostingregressor__warm_start': [True, False],
    },
    'AdaBoostRegressor': {
        'adaboostregressor__n_estimators': [1, 20, 50, 100],
        'adaboostregressor__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0, 10],
        'adaboostregressor__loss': ['linear', 'square', 'exponential']
    },
    'KNeighborsRegressor': {
        'kneighborsregressor__n_neighbors': [2, 5, 10, 25],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__algorithm': ['ball_tree', 'kd_tree', 'brute'],
        'kneighborsregressor__leaf_size': [5, 30, 50],
        # 'haversine' is only defined for 2-D (lat, lon) inputs, so it was removed.
        # NOTE(review): 'cosine' and 'nan_euclidean' are kept for the 'brute'
        # algorithm; the tree-based algorithms are expected to reject them.
        'kneighborsregressor__metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'nan_euclidean']
    },
    'MLPRegressor': {
        'mlpregressor__hidden_layer_sizes': [(50, 50, 50), (100, 100, 100), (100, 100, 100, 100)],
        'mlpregressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
        'mlpregressor__solver': ['lbfgs', 'sgd', 'adam'],
        'mlpregressor__learning_rate': ['constant', 'invscaling', 'adaptive'],
    },
    'ElasticNet': {
        'elasticnet__l1_ratio': [0.25, 0.5, 0.75],
        'elasticnet__fit_intercept': [True, False],
        'elasticnet__precompute': [True, False],
        'elasticnet__copy_X': [True, False],
        'elasticnet__warm_start': [True, False],
        'elasticnet__positive': [True, False],
        'elasticnet__selection': ['cyclic', 'random']
    },
    'SGDRegressor': {
        'sgdregressor__loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
        'sgdregressor__penalty': ['l2', 'l1', 'elasticnet', None],
        'sgdregressor__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
        'sgdregressor__warm_start': [True, False],
    },
    'SVR': {
        # 'precomputed' expects a square kernel matrix as X, not a feature matrix
        'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'svr__degree': [1, 3, 5],
        'svr__gamma': ['scale', 'auto', 1.0, 5.0],
        'svr__shrinking': [True, False]
    },
    'BayesianRidge': {
        'bayesianridge__alpha_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__alpha_2': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_2': [1e-7, 1e-6, 1e-5],
    },
    'KernelRidge': {
        'kernelridge__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0],
        # 'precomputed' expects a square kernel matrix as X, not a feature matrix
        'kernelridge__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'kernelridge__degree': [1, 2, 3, 5, 10],
        'kernelridge__coef0': [0.0, 0.5, 1.0]
    },
    'LinearRegression': {
        'linearregression__fit_intercept': [True, False],
        'linearregression__copy_X': [True, False],
        'linearregression__positive': [True, False]
    },
    'RANSACRegressor': {
        'ransacregressor__min_samples': [None, 1, 2, 5, 10, 50],
        'ransacregressor__max_trials': [1, 10, 50, 100, 150],
        'ransacregressor__loss': ['absolute_error', 'squared_error']
    },
    'TheilSenRegressor': {
        'theilsenregressor__max_subpopulation': [1, 10, 100, 1000],
        # n_subsamples=1 can never reach the required n_features + 1 and was removed.
        # NOTE(review): the remaining integers must also be >= len(data_columns) + 1
        # to be accepted - confirm against the feature list of this cell.
        'theilsenregressor__n_subsamples': [None, 5, 10, 25],
    }
}

# Candidate estimators, evaluated in this order for every CSV file.
# The iterative ones are capped at 1000 iterations.
models = [
    Ridge(),
    DecisionTreeRegressor(),
    GradientBoostingRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    KNeighborsRegressor(),
    MLPRegressor(max_iter=1000),
    ElasticNet(max_iter=1000),
    SGDRegressor(max_iter=1000),
    BayesianRidge(max_iter=1000),
    KernelRidge(),
    LinearRegression(),
    RANSACRegressor(),
    TheilSenRegressor(),
]

# Suppress every Python warning from here on.
warnings.filterwarnings("ignore")



# Create the output directory for the saved models. exist_ok=True makes this a
# no-op when the directory is already there and removes the gap between the
# existence check and the creation.
os.makedirs(model_directory, exist_ok=True)

# Function to process each CSV file
def process_csv(file_path):
    """Train every candidate regressor on one CSV file and report the best one.

    The file is read with pandas; the columns in ``data_columns`` are the
    features and ``results_columns[0]`` is the target.  10% of the rows are
    held out as a test set (``random_state=42``).  Each estimator in
    ``models`` is wrapped in a ``StandardScaler`` pipeline and, when
    ``param_grid`` has an entry for its class name, tuned with a 5-fold
    ``GridSearchCV`` scored on negative MSE.  A fixed Keras dense network is
    trained last.

    Side effects: every fitted model is written into ``model_directory``
    (``.pkl`` via joblib, ``.h5`` for the Keras network) and the test-set
    predictions of all models are written to
    ``output_<csv file name>_<target>.csv`` in the working directory.

    Args:
        file_path: Path of the CSV file to process.

    Returns:
        dict with the keys ``csv_file``, ``model_name``, ``hyperparameters``
        and ``rmse`` describing the model with the lowest test-set RMSE.
    """
    data = pd.read_csv(file_path)
    X = data[data_columns]
    y = data[results_columns[0]]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    csv_name = os.path.basename(file_path)

    # Running record of the best model; "best" means lowest RMSE on the test split
    best_model_info = {
        'csv_file': csv_name,
        'model_name': None,
        'hyperparameters': None,
        'rmse': float('inf')
    }

    results = []

    for model in models + ['TensorFlow']:  # the string stands in for the Keras network
        print(f"Processing {model} for {file_path}")
        if model == 'TensorFlow':
            # Fixed architecture: seven 64-unit ReLU layers and one linear output unit
            model_tf = tf.keras.models.Sequential(
                [tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],))]
                + [tf.keras.layers.Dense(64, activation='relu') for _ in range(6)]
                + [tf.keras.layers.Dense(1)]
            )
            model_tf.compile(optimizer='adam', loss='mean_squared_error')

            # The network is not part of a pipeline, so its inputs are scaled here.
            # NOTE(review): scaler_tf is not saved with the .h5 file, although the
            # saved network expects inputs scaled this way.
            scaler_tf = StandardScaler()
            X_train_scaled_tf = scaler_tf.fit_transform(X_train)
            X_test_scaled_tf = scaler_tf.transform(X_test)

            model_tf.fit(X_train_scaled_tf, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)

            y_pred_tf = model_tf.predict(X_test_scaled_tf)
            # RMSE as sqrt(MSE): the squared=False shortcut was deprecated in
            # scikit-learn 1.4 and removed in 1.6.
            rmse_tf = mean_squared_error(y_test, y_pred_tf) ** 0.5
            print(f"TensorFlow RMSE: {rmse_tf}")

            if rmse_tf < best_model_info['rmse']:
                best_model_info.update({
                    'model_name': 'TensorFlow',
                    'hyperparameters': None,
                    'rmse': rmse_tf
                })

            # Save the TensorFlow model
            model_filename = os.path.join(model_directory, f"{csv_name}_TensorFlow_model.h5")
            model_tf.save(model_filename)

            # Keep the predictions next to the actual values
            results.append(pd.DataFrame({'Actual': y_test.values.flatten(), 'Predicted': y_pred_tf.flatten(), 'Model': 'TensorFlow'}))
        else:
            model_name = model.__class__.__name__
            pipeline = make_pipeline(StandardScaler(), model)
            if model_name in param_grid:
                # Tune the pipeline; best_estimator_ is the winning pipeline found by the search
                grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=5, scoring='neg_mean_squared_error')
                grid_search.fit(X_train, y_train)
                best_estimator = grid_search.best_estimator_
                best_params = grid_search.best_params_
                print(f"Best hyperparameters for {model_name}: {best_params}")
            else:
                # No grid for this estimator: fit it with its constructor settings
                pipeline.fit(X_train, y_train)
                best_estimator = pipeline
                best_params = None

            y_pred = best_estimator.predict(X_test)
            # RMSE as sqrt(MSE), see the note in the TensorFlow branch
            rmse = mean_squared_error(y_test, y_pred) ** 0.5
            print(f"{model_name} RMSE: {rmse}")

            if rmse < best_model_info['rmse']:
                best_model_info.update({
                    'model_name': model_name,
                    'hyperparameters': best_params,
                    'rmse': rmse
                })

            # Save the fitted pipeline (scaler + estimator)
            model_filename = os.path.join(model_directory, f"{csv_name}_{model_name}_model.pkl")
            joblib.dump(best_estimator, model_filename)

            # Keep the predictions next to the actual values
            results.append(pd.DataFrame({'Actual': y_test.values.flatten(), 'Predicted': y_pred.flatten(), 'Model': model_name}))

    # One CSV with the test-set predictions of all models, stacked row-wise
    results_df = pd.concat(results, axis=0)
    results_filename = f"output_{csv_name}_{results_columns[0]}.csv"
    results_df.to_csv(results_filename, index=False)

    return best_model_info

# Get the list of CSV files in the directory. glob returns them in arbitrary
# order; sorting keeps the row order of the summary file stable between runs.
csv_files = sorted(glob.glob('../../../Data_ML/4_out_csvs_regression/*.csv'))

# Initialize a list to store the best model information for each CSV file
best_models_info = []

# Process each CSV file
for csv_file in csv_files:
    best_model_info = process_csv(csv_file)
    best_models_info.append(best_model_info)

# Save the best model information for each CSV file to a CSV file
best_models_df = pd.DataFrame(best_models_info)
best_models_path = "best_models"+results_columns[0]+"_info.csv"
best_models_df.to_csv(best_models_path, index=False)

# Report the name of the file that was actually written (it includes the target)
print(f"Best models information saved to {best_models_path}")


# WS

In [None]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge, LinearRegression, RANSACRegressor, TheilSenRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.decomposition import PCA  # Import PCA
import warnings
import joblib
import os

# Name of the target column; the same name is used for the directory that
# receives the fitted models.
model_directory = "WS"
results_columns = [model_directory]

# Feature columns selected from every input CSV
data_columns = [
    'OF22', 'OF26',
    'F3_a', 'F3_b', 'F3_c', 'F3_d', 'F3_e', 'F3_f', 'F3_g',
    'F20', 'F21', 'F22', 'F28', 'F31',
    'F43', 'F44', 'F45', 'F48', 'F49',
]

# Hyperparameter search space for GridSearchCV, keyed by estimator class name.
# Inner keys use the "<pipeline step>__<parameter>" form; make_pipeline names
# each step after the lower-cased class name.
# Values that can never fit have been dropped: GridSearchCV scores a failed
# fit as NaN, so they only cost time, and the failures go unreported because
# warnings are silenced in this cell.
param_grid = {
    'Ridge': {
        'ridge__alpha': [0.1, 0.5, 1.0],
        # 'lbfgs' omitted: Ridge only accepts it together with positive=True
        'ridge__solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
    },
    'DecisionTreeRegressor': {
        'decisiontreeregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'decisiontreeregressor__splitter': ['best', 'random'],
        # 1 omitted: an integer min_samples_split must be at least 2
        'decisiontreeregressor__min_samples_split': [2, 3, 4, 5],
        # 0 omitted: an integer max_features must be at least 1
        'decisiontreeregressor__max_features': [1, 2, 3, 'sqrt', 'log2']
    },
    'RandomForestRegressor': {
        'randomforestregressor__n_estimators': [1, 50, 100],
        'randomforestregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'randomforestregressor__min_samples_split': [2, 5],
        'randomforestregressor__max_features': [1, 3, 'sqrt', 'log2'],
    },
    'GradientBoostingRegressor': {
        'gradientboostingregressor__loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
        'gradientboostingregressor__learning_rate': [0.001, 0.01],
        'gradientboostingregressor__n_estimators': [25, 50, 100],
        'gradientboostingregressor__warm_start': [True, False],
    },
    'AdaBoostRegressor': {
        'adaboostregressor__n_estimators': [1, 20, 50, 100],
        'adaboostregressor__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0, 10],
        'adaboostregressor__loss': ['linear', 'square', 'exponential']
    },
    'KNeighborsRegressor': {
        'kneighborsregressor__n_neighbors': [2, 5, 10, 25],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__algorithm': ['ball_tree', 'kd_tree', 'brute'],
        'kneighborsregressor__leaf_size': [5, 30, 50],
        # 'haversine' omitted: it is only defined for two-feature
        # (latitude, longitude) input, and data_columns has more than two.
        'kneighborsregressor__metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'nan_euclidean']
    },
    'MLPRegressor': {
        'mlpregressor__hidden_layer_sizes': [(50, 50, 50), (100, 100, 100), (100, 100, 100, 100)],
        'mlpregressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
        'mlpregressor__solver': ['lbfgs', 'sgd', 'adam'],
        'mlpregressor__learning_rate': ['constant', 'invscaling', 'adaptive'],
    },
    'ElasticNet': {
        'elasticnet__l1_ratio': [0.25, 0.5, 0.75],
        'elasticnet__fit_intercept': [True, False],
        'elasticnet__precompute': [True, False],
        'elasticnet__copy_X': [True, False],
        'elasticnet__warm_start': [True, False],
        'elasticnet__positive': [True, False],
        'elasticnet__selection': ['cyclic', 'random']
    },
    'SGDRegressor': {
        'sgdregressor__loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
        'sgdregressor__penalty': ['l2', 'l1', 'elasticnet', None],
        'sgdregressor__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
        'sgdregressor__warm_start': [True, False],
    },
    # NOTE: no SVR() instance is in `models`, so this entry is currently unused.
    'SVR': {
        # 'precomputed' omitted: it expects a square kernel matrix, but the
        # pipeline passes the scaled feature matrix.
        'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'svr__degree': [1, 3, 5],
        'svr__gamma': ['scale', 'auto', 1.0, 5.0],
        'svr__shrinking': [True, False]
    },
    'BayesianRidge': {
        'bayesianridge__alpha_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__alpha_2': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_2': [1e-7, 1e-6, 1e-5],
    },
    'KernelRidge': {
        'kernelridge__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0],
        # 'precomputed' omitted for the same reason as in the SVR entry
        'kernelridge__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'kernelridge__degree': [1, 2, 3, 5, 10],
        'kernelridge__coef0': [0.0, 0.5, 1.0]
    },
    'LinearRegression': {
        'linearregression__fit_intercept': [True, False],
        'linearregression__copy_X': [True, False],
        'linearregression__positive': [True, False]
    },
    'RANSACRegressor': {
        'ransacregressor__min_samples': [None, 1, 2, 5, 10, 50],
        'ransacregressor__max_trials': [1, 10, 50, 100, 150],
        'ransacregressor__loss': ['absolute_error', 'squared_error']
    },
    'TheilSenRegressor': {
        'theilsenregressor__max_subpopulation': [1, 10, 100, 1000],
        # 1 omitted: n_subsamples must be at least n_features + 1.
        # NOTE(review): the remaining small values are also rejected whenever
        # they are below n_features + 1 for this cell's data_columns.
        'theilsenregressor__n_subsamples': [None, 5, 10, 25],
    }
}

# Candidate regressors; each instance is later wrapped in a StandardScaler
# pipeline and tuned with the matching param_grid entry.
models = [
    Ridge(),
    DecisionTreeRegressor(),
    GradientBoostingRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    KNeighborsRegressor(),
    MLPRegressor(max_iter=1000),
    ElasticNet(max_iter=1000),
    SGDRegressor(max_iter=1000),
    BayesianRidge(max_iter=1000),
    KernelRidge(),
    LinearRegression(),
    RANSACRegressor(),
    TheilSenRegressor(),
]

# Silence every warning category from here on
warnings.filterwarnings("ignore")



# Make sure the model output directory exists; exist_ok=True turns this into
# a no-op when the directory is already there.
os.makedirs(model_directory, exist_ok=True)

# Function to process each CSV file
def process_csv(file_path):
    """Train every candidate model on one CSV file and report the best one.

    The rows are split 90/10 into train and test sets with a fixed
    ``random_state``. Each estimator in ``models`` is fitted inside a
    ``StandardScaler`` pipeline, using a 5-fold ``GridSearchCV`` when
    ``param_grid`` has an entry for its class name; a fixed Keras dense
    network is trained last. Every fitted model is saved under
    ``model_directory`` and all test-set predictions are written to
    ``output_<csv name>_<target>.csv`` in the working directory.

    Args:
        file_path: Path to a CSV file containing the ``data_columns`` feature
            columns and the ``results_columns[0]`` target column.

    Returns:
        dict with keys ``csv_file``, ``model_name``, ``hyperparameters`` and
        ``rmse`` describing the model with the lowest test-set RMSE.
    """
    data = pd.read_csv(file_path)
    X = data[data_columns]
    y = data[results_columns[0]]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    csv_name = os.path.basename(file_path)
    best_model_info = {
        'csv_file': csv_name,
        'model_name': None,
        'hyperparameters': None,
        'rmse': float('inf')
    }

    results = []

    def record(name, params, y_pred):
        """Score one model's test predictions and track the best model so far."""
        # RMSE is computed as sqrt(MSE) instead of passing ``squared=False``,
        # which is deprecated and rejected by newer scikit-learn releases.
        rmse = mean_squared_error(y_test, y_pred) ** 0.5
        print(f"{name} RMSE: {rmse}")

        if rmse < best_model_info['rmse']:
            best_model_info.update({
                'model_name': name,
                'hyperparameters': params,
                'rmse': rmse
            })

        # Keep the predictions next to the actual values for the output CSV
        results.append(pd.DataFrame({'Actual': y_test.values.flatten(), 'Predicted': y_pred.flatten(), 'Model': name}))

    # scikit-learn candidates
    for model in models:
        print(f"Processing {model} for {file_path}")
        model_name = model.__class__.__name__
        pipeline = make_pipeline(StandardScaler(), model)

        if model_name in param_grid:
            # Tune the hyperparameters with 5-fold cross-validation
            grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=5, scoring='neg_mean_squared_error')
            grid_search.fit(X_train, y_train)
            best_estimator = grid_search.best_estimator_
            best_params = grid_search.best_params_
            print(f"Best hyperparameters for {model_name}: {best_params}")
        else:
            # No grid for this estimator: fit it with its current settings
            pipeline.fit(X_train, y_train)
            best_estimator = pipeline
            best_params = None

        record(model_name, best_params, best_estimator.predict(X_test))

        # Save the fitted pipeline (scaler + estimator)
        model_filename = os.path.join(model_directory, f"{csv_name}_{model_name}_model.pkl")
        joblib.dump(best_estimator, model_filename)

    # TensorFlow candidate: seven hidden Dense(64, relu) layers and one output unit
    print(f"Processing TensorFlow for {file_path}")
    extra_hidden = [tf.keras.layers.Dense(64, activation='relu') for _ in range(6)]
    model_tf = tf.keras.models.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        *extra_hidden,
        tf.keras.layers.Dense(1)
    ])
    model_tf.compile(optimizer='adam', loss='mean_squared_error')

    # The network gets its own scaler, fitted on the training rows only
    scaler_tf = StandardScaler()
    X_train_scaled_tf = scaler_tf.fit_transform(X_train)
    X_test_scaled_tf = scaler_tf.transform(X_test)

    model_tf.fit(X_train_scaled_tf, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)

    record('TensorFlow', None, model_tf.predict(X_test_scaled_tf))

    # Save the TensorFlow model
    model_filename = os.path.join(model_directory, f"{csv_name}_TensorFlow_model.h5")
    model_tf.save(model_filename)

    # Save the predictions and actual values of all models to a CSV file
    results_df = pd.concat(results, axis=0)
    results_filename = f"output_{csv_name}_{results_columns[0]}.csv"
    results_df.to_csv(results_filename, index=False)

    return best_model_info

# Collect the input CSV files. glob returns them in arbitrary order, so sort
# to make the processing order (and the row order of the summary) stable.
csv_files = sorted(glob.glob('../../../Data_ML/4_out_csvs_regression/*.csv'))

# Best-model summary, one dict per CSV file
best_models_info = []

# Process each CSV file
for csv_file in csv_files:
    best_model_info = process_csv(csv_file)
    best_models_info.append(best_model_info)

# Save the best model information for each CSV file to a CSV file
best_models_df = pd.DataFrame(best_models_info)
best_models_filename = "best_models" + results_columns[0] + "_info.csv"
best_models_df.to_csv(best_models_filename, index=False)

# Report the file that was actually written (the name embeds the target column)
print(f"Best models information saved to {best_models_filename}")


# SFST

In [None]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge, LinearRegression, RANSACRegressor, TheilSenRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.decomposition import PCA  # Import PCA
import warnings
import joblib
import os

# Name of the target column; the same name is used for the directory that
# receives the fitted models.
model_directory = "SFST"
results_columns = [model_directory]

# Feature columns selected from every input CSV
data_columns = [
    'F1',
    'F3_a', 'F3_b', 'F3_c', 'F3_d', 'F3_e', 'F3_f', 'F3_g',
    'F14', 'F17', 'F21', 'F24', 'F25', 'F29',
    'F31', 'F33', 'F34', 'F43', 'F47', 'F48',
]

# Hyperparameter search space for GridSearchCV, keyed by estimator class name.
# Inner keys use the "<pipeline step>__<parameter>" form; make_pipeline names
# each step after the lower-cased class name.
# Values that can never fit have been dropped: GridSearchCV scores a failed
# fit as NaN, so they only cost time, and the failures go unreported because
# warnings are silenced in this cell.
param_grid = {
    'Ridge': {
        'ridge__alpha': [0.1, 0.5, 1.0],
        # 'lbfgs' omitted: Ridge only accepts it together with positive=True
        'ridge__solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
    },
    'DecisionTreeRegressor': {
        'decisiontreeregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'decisiontreeregressor__splitter': ['best', 'random'],
        # 1 omitted: an integer min_samples_split must be at least 2
        'decisiontreeregressor__min_samples_split': [2, 3, 4, 5],
        # 0 omitted: an integer max_features must be at least 1
        'decisiontreeregressor__max_features': [1, 2, 3, 'sqrt', 'log2']
    },
    'RandomForestRegressor': {
        'randomforestregressor__n_estimators': [1, 50, 100],
        'randomforestregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'randomforestregressor__min_samples_split': [2, 5],
        'randomforestregressor__max_features': [1, 3, 'sqrt', 'log2'],
    },
    'GradientBoostingRegressor': {
        'gradientboostingregressor__loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
        'gradientboostingregressor__learning_rate': [0.001, 0.01],
        'gradientboostingregressor__n_estimators': [25, 50, 100],
        'gradientboostingregressor__warm_start': [True, False],
    },
    'AdaBoostRegressor': {
        'adaboostregressor__n_estimators': [1, 20, 50, 100],
        'adaboostregressor__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0, 10],
        'adaboostregressor__loss': ['linear', 'square', 'exponential']
    },
    'KNeighborsRegressor': {
        'kneighborsregressor__n_neighbors': [2, 5, 10, 25],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__algorithm': ['ball_tree', 'kd_tree', 'brute'],
        'kneighborsregressor__leaf_size': [5, 30, 50],
        # 'haversine' omitted: it is only defined for two-feature
        # (latitude, longitude) input, and data_columns has more than two.
        'kneighborsregressor__metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'nan_euclidean']
    },
    'MLPRegressor': {
        'mlpregressor__hidden_layer_sizes': [(50, 50, 50), (100, 100, 100), (100, 100, 100, 100)],
        'mlpregressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
        'mlpregressor__solver': ['lbfgs', 'sgd', 'adam'],
        'mlpregressor__learning_rate': ['constant', 'invscaling', 'adaptive'],
    },
    'ElasticNet': {
        'elasticnet__l1_ratio': [0.25, 0.5, 0.75],
        'elasticnet__fit_intercept': [True, False],
        'elasticnet__precompute': [True, False],
        'elasticnet__copy_X': [True, False],
        'elasticnet__warm_start': [True, False],
        'elasticnet__positive': [True, False],
        'elasticnet__selection': ['cyclic', 'random']
    },
    'SGDRegressor': {
        'sgdregressor__loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
        'sgdregressor__penalty': ['l2', 'l1', 'elasticnet', None],
        'sgdregressor__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
        'sgdregressor__warm_start': [True, False],
    },
    # NOTE: no SVR() instance is in `models`, so this entry is currently unused.
    'SVR': {
        # 'precomputed' omitted: it expects a square kernel matrix, but the
        # pipeline passes the scaled feature matrix.
        'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'svr__degree': [1, 3, 5],
        'svr__gamma': ['scale', 'auto', 1.0, 5.0],
        'svr__shrinking': [True, False]
    },
    'BayesianRidge': {
        'bayesianridge__alpha_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__alpha_2': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_2': [1e-7, 1e-6, 1e-5],
    },
    'KernelRidge': {
        'kernelridge__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0],
        # 'precomputed' omitted for the same reason as in the SVR entry
        'kernelridge__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'kernelridge__degree': [1, 2, 3, 5, 10],
        'kernelridge__coef0': [0.0, 0.5, 1.0]
    },
    'LinearRegression': {
        'linearregression__fit_intercept': [True, False],
        'linearregression__copy_X': [True, False],
        'linearregression__positive': [True, False]
    },
    'RANSACRegressor': {
        'ransacregressor__min_samples': [None, 1, 2, 5, 10, 50],
        'ransacregressor__max_trials': [1, 10, 50, 100, 150],
        'ransacregressor__loss': ['absolute_error', 'squared_error']
    },
    'TheilSenRegressor': {
        'theilsenregressor__max_subpopulation': [1, 10, 100, 1000],
        # 1 omitted: n_subsamples must be at least n_features + 1.
        # NOTE(review): the remaining small values are also rejected whenever
        # they are below n_features + 1 for this cell's data_columns.
        'theilsenregressor__n_subsamples': [None, 5, 10, 25],
    }
}

# Candidate regressors; each instance is later wrapped in a StandardScaler
# pipeline and tuned with the matching param_grid entry.
models = [
    Ridge(),
    DecisionTreeRegressor(),
    GradientBoostingRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    KNeighborsRegressor(),
    MLPRegressor(max_iter=1000),
    ElasticNet(max_iter=1000),
    SGDRegressor(max_iter=1000),
    BayesianRidge(max_iter=1000),
    KernelRidge(),
    LinearRegression(),
    RANSACRegressor(),
    TheilSenRegressor(),
]

# Silence every warning category from here on
warnings.filterwarnings("ignore")



# Make sure the model output directory exists; exist_ok=True turns this into
# a no-op when the directory is already there.
os.makedirs(model_directory, exist_ok=True)

# Function to process each CSV file
def process_csv(file_path):
    """Train every candidate model on one CSV file and report the best one.

    The rows are split 90/10 into train and test sets with a fixed
    ``random_state``. Each estimator in ``models`` is fitted inside a
    ``StandardScaler`` pipeline, using a 5-fold ``GridSearchCV`` when
    ``param_grid`` has an entry for its class name; a fixed Keras dense
    network is trained last. Every fitted model is saved under
    ``model_directory`` and all test-set predictions are written to
    ``output_<csv name>_<target>.csv`` in the working directory.

    Args:
        file_path: Path to a CSV file containing the ``data_columns`` feature
            columns and the ``results_columns[0]`` target column.

    Returns:
        dict with keys ``csv_file``, ``model_name``, ``hyperparameters`` and
        ``rmse`` describing the model with the lowest test-set RMSE.
    """
    data = pd.read_csv(file_path)
    X = data[data_columns]
    y = data[results_columns[0]]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    csv_name = os.path.basename(file_path)
    best_model_info = {
        'csv_file': csv_name,
        'model_name': None,
        'hyperparameters': None,
        'rmse': float('inf')
    }

    results = []

    def record(name, params, y_pred):
        """Score one model's test predictions and track the best model so far."""
        # RMSE is computed as sqrt(MSE) instead of passing ``squared=False``,
        # which is deprecated and rejected by newer scikit-learn releases.
        rmse = mean_squared_error(y_test, y_pred) ** 0.5
        print(f"{name} RMSE: {rmse}")

        if rmse < best_model_info['rmse']:
            best_model_info.update({
                'model_name': name,
                'hyperparameters': params,
                'rmse': rmse
            })

        # Keep the predictions next to the actual values for the output CSV
        results.append(pd.DataFrame({'Actual': y_test.values.flatten(), 'Predicted': y_pred.flatten(), 'Model': name}))

    # scikit-learn candidates
    for model in models:
        print(f"Processing {model} for {file_path}")
        model_name = model.__class__.__name__
        pipeline = make_pipeline(StandardScaler(), model)

        if model_name in param_grid:
            # Tune the hyperparameters with 5-fold cross-validation
            grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=5, scoring='neg_mean_squared_error')
            grid_search.fit(X_train, y_train)
            best_estimator = grid_search.best_estimator_
            best_params = grid_search.best_params_
            print(f"Best hyperparameters for {model_name}: {best_params}")
        else:
            # No grid for this estimator: fit it with its current settings
            pipeline.fit(X_train, y_train)
            best_estimator = pipeline
            best_params = None

        record(model_name, best_params, best_estimator.predict(X_test))

        # Save the fitted pipeline (scaler + estimator)
        model_filename = os.path.join(model_directory, f"{csv_name}_{model_name}_model.pkl")
        joblib.dump(best_estimator, model_filename)

    # TensorFlow candidate: seven hidden Dense(64, relu) layers and one output unit
    print(f"Processing TensorFlow for {file_path}")
    extra_hidden = [tf.keras.layers.Dense(64, activation='relu') for _ in range(6)]
    model_tf = tf.keras.models.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        *extra_hidden,
        tf.keras.layers.Dense(1)
    ])
    model_tf.compile(optimizer='adam', loss='mean_squared_error')

    # The network gets its own scaler, fitted on the training rows only
    scaler_tf = StandardScaler()
    X_train_scaled_tf = scaler_tf.fit_transform(X_train)
    X_test_scaled_tf = scaler_tf.transform(X_test)

    model_tf.fit(X_train_scaled_tf, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)

    record('TensorFlow', None, model_tf.predict(X_test_scaled_tf))

    # Save the TensorFlow model
    model_filename = os.path.join(model_directory, f"{csv_name}_TensorFlow_model.h5")
    model_tf.save(model_filename)

    # Save the predictions and actual values of all models to a CSV file
    results_df = pd.concat(results, axis=0)
    results_filename = f"output_{csv_name}_{results_columns[0]}.csv"
    results_df.to_csv(results_filename, index=False)

    return best_model_info

# Collect the input CSV files. glob returns them in arbitrary order, so sort
# to make the processing order (and the row order of the summary) stable.
csv_files = sorted(glob.glob('../../../Data_ML/4_out_csvs_regression/*.csv'))

# Best-model summary, one dict per CSV file
best_models_info = []

# Process each CSV file
for csv_file in csv_files:
    best_model_info = process_csv(csv_file)
    best_models_info.append(best_model_info)

# Save the best model information for each CSV file to a CSV file
best_models_df = pd.DataFrame(best_models_info)
best_models_filename = "best_models" + results_columns[0] + "_info.csv"
best_models_df.to_csv(best_models_filename, index=False)

# Report the file that was actually written (the name embeds the target column)
print(f"Best models information saved to {best_models_filename}")


# WS Benefit

In [1]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge, LinearRegression, RANSACRegressor, TheilSenRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.decomposition import PCA  # Import PCA
import warnings
import joblib
import os

# Target column predicted in this cell
results_columns = ['WS_Benefit']

# Feature columns selected from every input CSV
data_columns = [
    'OF8', 'OF17', 'OF18',
    'OF23', 'OF24', 'F51',
]

# Hyperparameter search space for GridSearchCV, keyed by estimator class name.
# Inner keys use the "<pipeline step>__<parameter>" form; make_pipeline names
# each step after the lower-cased class name.
# Values that can never fit have been dropped: GridSearchCV scores a failed
# fit as NaN, so they only cost time, and the failures go unreported because
# warnings are silenced in this cell.
param_grid = {
    'Ridge': {
        'ridge__alpha': [0.1, 0.5, 1.0],
        # 'lbfgs' omitted: Ridge only accepts it together with positive=True
        'ridge__solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
    },
    'DecisionTreeRegressor': {
        'decisiontreeregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'decisiontreeregressor__splitter': ['best', 'random'],
        # 1 omitted: an integer min_samples_split must be at least 2
        'decisiontreeregressor__min_samples_split': [2, 3, 4, 5],
        # 0 omitted: an integer max_features must be at least 1
        'decisiontreeregressor__max_features': [1, 2, 3, 'sqrt', 'log2']
    },
    'RandomForestRegressor': {
        'randomforestregressor__n_estimators': [1, 50, 100],
        'randomforestregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'randomforestregressor__min_samples_split': [2, 5],
        'randomforestregressor__max_features': [1, 3, 'sqrt', 'log2'],
    },
    'GradientBoostingRegressor': {
        'gradientboostingregressor__loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
        'gradientboostingregressor__learning_rate': [0.001, 0.01],
        'gradientboostingregressor__n_estimators': [25, 50, 100],
        'gradientboostingregressor__warm_start': [True, False],
    },
    'AdaBoostRegressor': {
        'adaboostregressor__n_estimators': [1, 20, 50, 100],
        'adaboostregressor__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0, 10],
        'adaboostregressor__loss': ['linear', 'square', 'exponential']
    },
    'KNeighborsRegressor': {
        'kneighborsregressor__n_neighbors': [2, 5, 10, 25],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__algorithm': ['ball_tree', 'kd_tree', 'brute'],
        'kneighborsregressor__leaf_size': [5, 30, 50],
        # 'haversine' omitted: it is only defined for two-feature
        # (latitude, longitude) input, and data_columns has more than two.
        'kneighborsregressor__metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'nan_euclidean']
    },
    'MLPRegressor': {
        'mlpregressor__hidden_layer_sizes': [(50, 50, 50), (100, 100, 100), (100, 100, 100, 100)],
        'mlpregressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
        'mlpregressor__solver': ['lbfgs', 'sgd', 'adam'],
        'mlpregressor__learning_rate': ['constant', 'invscaling', 'adaptive'],
    },
    'ElasticNet': {
        'elasticnet__l1_ratio': [0.25, 0.5, 0.75],
        'elasticnet__fit_intercept': [True, False],
        'elasticnet__precompute': [True, False],
        'elasticnet__copy_X': [True, False],
        'elasticnet__warm_start': [True, False],
        'elasticnet__positive': [True, False],
        'elasticnet__selection': ['cyclic', 'random']
    },
    'SGDRegressor': {
        'sgdregressor__loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
        'sgdregressor__penalty': ['l2', 'l1', 'elasticnet', None],
        'sgdregressor__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
        'sgdregressor__warm_start': [True, False],
    },
    # NOTE: no SVR() instance is in `models`, so this entry is currently unused.
    'SVR': {
        # 'precomputed' omitted: it expects a square kernel matrix, but the
        # pipeline passes the scaled feature matrix.
        'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'svr__degree': [1, 3, 5],
        'svr__gamma': ['scale', 'auto', 1.0, 5.0],
        'svr__shrinking': [True, False]
    },
    'BayesianRidge': {
        'bayesianridge__alpha_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__alpha_2': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_2': [1e-7, 1e-6, 1e-5],
    },
    'KernelRidge': {
        'kernelridge__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0],
        # 'precomputed' omitted for the same reason as in the SVR entry
        'kernelridge__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'kernelridge__degree': [1, 2, 3, 5, 10],
        'kernelridge__coef0': [0.0, 0.5, 1.0]
    },
    'LinearRegression': {
        'linearregression__fit_intercept': [True, False],
        'linearregression__copy_X': [True, False],
        'linearregression__positive': [True, False]
    },
    'RANSACRegressor': {
        'ransacregressor__min_samples': [None, 1, 2, 5, 10, 50],
        'ransacregressor__max_trials': [1, 10, 50, 100, 150],
        'ransacregressor__loss': ['absolute_error', 'squared_error']
    },
    'TheilSenRegressor': {
        'theilsenregressor__max_subpopulation': [1, 10, 100, 1000],
        # 1 omitted: n_subsamples must be at least n_features + 1.
        # NOTE(review): the remaining small values are also rejected whenever
        # they are below n_features + 1 for this cell's data_columns.
        'theilsenregressor__n_subsamples': [None, 5, 10, 25],
    }
}

# Candidate regressors; each instance is later wrapped in a StandardScaler
# pipeline and tuned with the matching param_grid entry.
models = [
    Ridge(),
    DecisionTreeRegressor(),
    GradientBoostingRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    KNeighborsRegressor(),
    MLPRegressor(max_iter=1000),
    ElasticNet(max_iter=1000),
    SGDRegressor(max_iter=1000),
    BayesianRidge(max_iter=1000),
    KernelRidge(),
    LinearRegression(),
    RANSACRegressor(),
    TheilSenRegressor(),
]

# Silence every warning category from here on
warnings.filterwarnings("ignore")

# Directory where the fitted models are saved
model_directory = "WS_Benefit"

# Make sure the directory exists; exist_ok=True turns this into a no-op when
# it is already there.
os.makedirs(model_directory, exist_ok=True)

def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    The square root is taken explicitly instead of passing ``squared=False``
    to ``mean_squared_error``: that keyword was deprecated in scikit-learn 1.4
    and removed in 1.6, so relying on it fails on current releases.
    """
    return mean_squared_error(y_true, y_pred) ** 0.5


def _train_tensorflow(X_train, X_test, y_train):
    """Fit the fixed dense Keras network and return (model, scaler, test predictions)."""
    # Seven hidden layers of 64 ReLU units followed by a single linear output.
    layers = [tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],))]
    layers += [tf.keras.layers.Dense(64, activation='relu') for _ in range(6)]
    layers.append(tf.keras.layers.Dense(1))
    model_tf = tf.keras.models.Sequential(layers)
    model_tf.compile(optimizer='adam', loss='mean_squared_error')

    # The network is not wrapped in a pipeline, so it is standardized by hand.
    scaler_tf = StandardScaler()
    X_train_scaled_tf = scaler_tf.fit_transform(X_train)
    X_test_scaled_tf = scaler_tf.transform(X_test)

    model_tf.fit(X_train_scaled_tf, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)

    # predict() returns shape (n_samples, 1); flatten once for scoring and export.
    y_pred_tf = model_tf.predict(X_test_scaled_tf).flatten()
    return model_tf, scaler_tf, y_pred_tf


def _train_sklearn(model, X_train, y_train):
    """Fit ``model`` behind a StandardScaler and return (fitted pipeline, best params).

    A 5-fold grid search is run when ``param_grid`` has an entry for the
    model's class name; otherwise the pipeline is fitted with the model's
    current settings and the returned params are ``None``.
    """
    model_name = model.__class__.__name__
    pipeline = make_pipeline(StandardScaler(), model)
    if model_name not in param_grid:
        pipeline.fit(X_train, y_train)
        return pipeline, None

    grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    print(f"Best hyperparameters for {model_name}: {grid_search.best_params_}")
    return grid_search.best_estimator_, grid_search.best_params_


# Function to process each CSV file
def process_csv(file_path):
    """Train and evaluate every candidate model on one CSV file.

    Reads ``file_path``, uses the module-level ``data_columns`` as features and
    ``results_columns[0]`` as the target, holds out 10% of the rows
    (``random_state=42``) and fits every estimator in ``models`` plus one Keras
    network on the remaining rows.

    Side effects:
        * every fitted model is written into ``model_directory`` (``.pkl`` for
          scikit-learn pipelines, ``.h5`` plus a scaler ``.pkl`` for Keras);
        * the actual/predicted values of all models on the held-out rows are
          written to ``output_<csv name>_<target>.csv`` in the working directory.

    Args:
        file_path: Path of the CSV file to process.

    Returns:
        dict with keys ``csv_file``, ``model_name``, ``hyperparameters`` and
        ``rmse`` describing the model with the lowest held-out RMSE.
        ``hyperparameters`` is ``None`` for models that were not grid-searched.

    Note:
        The held-out split is used both to report RMSE and to pick the winner,
        so the winning RMSE is an optimistic estimate.
    """
    data = pd.read_csv(file_path)
    X = data[data_columns]
    y = data[results_columns[0]]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    csv_name = os.path.basename(file_path)
    best_model_info = {
        'csv_file': csv_name,
        'model_name': None,
        'hyperparameters': None,
        'rmse': float('inf')
    }

    results = []

    for model in models + ['TensorFlow']:  # Add TensorFlow model to the loop
        print(f"Processing {model} for {file_path}")
        if model == 'TensorFlow':
            model_name, best_params = 'TensorFlow', None
            model_tf, scaler_tf, y_pred = _train_tensorflow(X_train, X_test, y_train)
        else:
            model_name = model.__class__.__name__
            best_estimator, best_params = _train_sklearn(model, X_train, y_train)
            y_pred = best_estimator.predict(X_test)

        rmse = _rmse(y_test, y_pred)
        print(f"{model_name} RMSE: {rmse}")

        if rmse < best_model_info['rmse']:
            best_model_info.update({
                'model_name': model_name,
                'hyperparameters': best_params,
                'rmse': rmse
            })

        # Save the model
        if model == 'TensorFlow':
            model_tf.save(os.path.join(model_directory, f"{csv_name}_TensorFlow_model.h5"))
            # The network was trained on standardized inputs; without the
            # fitted scaler the saved model cannot be applied to new data.
            joblib.dump(scaler_tf, os.path.join(model_directory, f"{csv_name}_TensorFlow_scaler.pkl"))
        else:
            joblib.dump(best_estimator, os.path.join(model_directory, f"{csv_name}_{model_name}_model.pkl"))

        # Save the predictions and actual values
        results.append(pd.DataFrame({
            'Actual': y_test.values.flatten(),
            'Predicted': y_pred.flatten(),
            'Model': model_name
        }))

    # Save the predictions and actual values to a CSV file
    results_df = pd.concat(results, axis=0)
    results_filename = f"output_{csv_name}_{results_columns[0]}.csv"
    results_df.to_csv(results_filename, index=False)

    return best_model_info

# Get the list of CSV files in the directory.
# glob returns entries in filesystem order, so sort to make the processing
# order (and the row order of the summary file) reproducible.
csv_files = sorted(glob.glob('../../../Data_ML/4_out_csvs_regression/*.csv'))
if not csv_files:
    # An unmatched pattern would otherwise silently produce an empty summary.
    print("No CSV files matched ../../../Data_ML/4_out_csvs_regression/*.csv")

# Initialize a list to store the best model information for each CSV file
best_models_info = []

# Process each CSV file
for csv_file in csv_files:
    best_model_info = process_csv(csv_file)
    best_models_info.append(best_model_info)

# Save the best model information for each CSV file to a CSV file
best_models_df = pd.DataFrame(best_models_info)
best_models_filename = "best_models" + results_columns[0] + "_info.csv"
best_models_df.to_csv(best_models_filename, index=False)

# Report the file that was actually written (the name includes the target column).
print(f"Best models information saved to {best_models_filename}")


Processing Ridge() for ../../Data_ML/out_csvs_regression\output_bfill_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 0.1, 'ridge__solver': 'saga'}
Ridge RMSE: 1.8848853842797528
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_bfill_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'absolute_error', 'decisiontreeregressor__max_features': 'sqrt', 'decisiontreeregressor__min_samples_split': 3, 'decisiontreeregressor__splitter': 'best'}
DecisionTreeRegressor RMSE: 0.6715891867163182
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_bfill_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'quantile', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 4.083722805842152
Processing RandomForestReg



TensorFlow RMSE: 2.2621494722183244
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_custom_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 0.1, 'ridge__solver': 'sag'}
Ridge RMSE: 1.8999574897948137
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_custom_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'absolute_error', 'decisiontreeregressor__max_features': 'log2', 'decisiontreeregressor__min_samples_split': 4, 'decisiontreeregressor__splitter': 'best'}
DecisionTreeRegressor RMSE: 1.500418100460827
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_custom_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'quantile', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 4.08372



TensorFlow RMSE: 2.057158305222768
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_ffill_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 0.1, 'ridge__solver': 'sag'}
Ridge RMSE: 1.8774241105756202
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_ffill_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'squared_error', 'decisiontreeregressor__max_features': 1, 'decisiontreeregressor__min_samples_split': 5, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 1.3328404360523394
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_ffill_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'quantile', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 4.083722805842



TensorFlow RMSE: 2.0290451925674144
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_interpolated.csv
Best hyperparameters for Ridge: {'ridge__alpha': 0.1, 'ridge__solver': 'sag'}
Ridge RMSE: 1.8791450929430225
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_interpolated.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'absolute_error', 'decisiontreeregressor__max_features': 2, 'decisiontreeregressor__min_samples_split': 3, 'decisiontreeregressor__splitter': 'best'}
DecisionTreeRegressor RMSE: 2.1531910068325946
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_interpolated.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'quantile', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 4.083722805842152



TensorFlow RMSE: 2.105431225075473
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_iterative_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 0.1, 'ridge__solver': 'saga'}
Ridge RMSE: 1.874135523693133
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_iterative_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'absolute_error', 'decisiontreeregressor__max_features': 'log2', 'decisiontreeregressor__min_samples_split': 2, 'decisiontreeregressor__splitter': 'best'}
DecisionTreeRegressor RMSE: 0.8799435858540673
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_iterative_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'quantile', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE



1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 56ms/step




TensorFlow RMSE: 2.271535947106056
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_knn_imputed_custom.csv
Best hyperparameters for Ridge: {'ridge__alpha': 0.1, 'ridge__solver': 'saga'}
Ridge RMSE: 1.8678069431886781
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_knn_imputed_custom.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'absolute_error', 'decisiontreeregressor__max_features': 2, 'decisiontreeregressor__min_samples_split': 5, 'decisiontreeregressor__splitter': 'best'}
DecisionTreeRegressor RMSE: 1.0604813597068616
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_knn_imputed_custom.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'quantile', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE:



1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 56ms/step




TensorFlow RMSE: 2.1587577946735625
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_mean_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 0.1, 'ridge__solver': 'cholesky'}
Ridge RMSE: 1.87606467296504
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_mean_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'absolute_error', 'decisiontreeregressor__max_features': 3, 'decisiontreeregressor__min_samples_split': 4, 'decisiontreeregressor__splitter': 'best'}
DecisionTreeRegressor RMSE: 1.971390088478782
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_mean_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'quantile', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 4.0837228058421



TensorFlow RMSE: 2.174785017052277
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_median_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 0.1, 'ridge__solver': 'saga'}
Ridge RMSE: 1.9005464791839861
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_median_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'absolute_error', 'decisiontreeregressor__max_features': 2, 'decisiontreeregressor__min_samples_split': 5, 'decisiontreeregressor__splitter': 'best'}
DecisionTreeRegressor RMSE: 1.5397200387145216
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_median_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'quantile', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 4.083722805



TensorFlow RMSE: 2.2688944611943596
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_mode_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 0.1, 'ridge__solver': 'saga'}
Ridge RMSE: 1.887360512732951
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_mode_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'absolute_error', 'decisiontreeregressor__max_features': 'sqrt', 'decisiontreeregressor__min_samples_split': 5, 'decisiontreeregressor__splitter': 'best'}
DecisionTreeRegressor RMSE: 0.9238432342278484
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_mode_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'quantile', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 4.0837228058



TensorFlow RMSE: 2.2640221102734284
Best models information saved to best_models_info.csv


# NR Benefit

In [2]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge, LinearRegression, RANSACRegressor, TheilSenRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.decomposition import PCA  # Import PCA
import warnings
import joblib
import os

# Feature columns read from each CSV: the "OF<n>" columns first, then the
# "F<n>" columns, in the order they are fed to the models.
data_columns = (
    [f"OF{number}" for number in (9, 10, 11, 19, 20, 21, 22, 23, 24)]
    + [f"F{number}" for number in (13, 41, 50, 51, 52)]
)

# Target column(s); only the first entry is used as the regression target.
results_columns = ["NR_Benefit"]

# Define the parameter grid for GridSearchCV.
# Keys are estimator class names; the inner keys use the "<step>__<param>"
# form that make_pipeline generates (step name = lower-cased class name).
#
# Values that scikit-learn always rejects for this setup were removed. With
# the default error_score they only produced failed fits (scored NaN and never
# selected), so dropping them saves search time without changing the winner.
param_grid = {
    'Ridge': {
        'ridge__alpha': [0.1, 0.5, 1.0],
        # 'lbfgs' removed: it is only accepted together with positive=True.
        'ridge__solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
    },
    'DecisionTreeRegressor': {
        'decisiontreeregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'decisiontreeregressor__splitter': ['best', 'random'],
        # 1 removed: an integer min_samples_split must be at least 2.
        'decisiontreeregressor__min_samples_split': [2, 3, 4, 5],
        # 0 removed: an integer max_features must be at least 1.
        'decisiontreeregressor__max_features': [1, 2, 3, 'sqrt', 'log2']
    },
    'RandomForestRegressor': {
        'randomforestregressor__n_estimators': [1, 50, 100],
        'randomforestregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'randomforestregressor__min_samples_split': [2, 5],
        'randomforestregressor__max_features': [1, 3, 'sqrt', 'log2'],
    },
    'GradientBoostingRegressor': {
        'gradientboostingregressor__loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
        'gradientboostingregressor__learning_rate': [0.001, 0.01],
        'gradientboostingregressor__n_estimators': [25, 50, 100],
        'gradientboostingregressor__warm_start': [True, False],
    },
    'AdaBoostRegressor': {
        'adaboostregressor__n_estimators': [1, 20, 50, 100],
        'adaboostregressor__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0, 10],
        'adaboostregressor__loss': ['linear', 'square', 'exponential']
    },
    'KNeighborsRegressor': {
        'kneighborsregressor__n_neighbors': [2, 5, 10, 25],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__algorithm': ['ball_tree', 'kd_tree', 'brute'],
        'kneighborsregressor__leaf_size': [5, 30, 50],
        # 'haversine' removed: it is only defined for 2-dimensional inputs,
        # and this cell uses 14 feature columns.
        'kneighborsregressor__metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'nan_euclidean']
    },
    'MLPRegressor': {
        'mlpregressor__hidden_layer_sizes': [(50, 50, 50), (100, 100, 100), (100, 100, 100, 100)],
        'mlpregressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
        'mlpregressor__solver': ['lbfgs', 'sgd', 'adam'],
        'mlpregressor__learning_rate': ['constant', 'invscaling', 'adaptive'],
    },
    'ElasticNet': {
        'elasticnet__l1_ratio': [0.25, 0.5, 0.75],
        'elasticnet__fit_intercept': [True, False],
        'elasticnet__precompute': [True, False],
        'elasticnet__copy_X': [True, False],
        'elasticnet__warm_start': [True, False],
        'elasticnet__positive': [True, False],
        'elasticnet__selection': ['cyclic', 'random']
    },
    'SGDRegressor': {
        'sgdregressor__loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
        'sgdregressor__penalty': ['l2', 'l1', 'elasticnet', None],
        'sgdregressor__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
        'sgdregressor__warm_start': [True, False],
    },
    'SVR': {
        # 'precomputed' removed: it expects a square kernel matrix as X, not
        # the raw feature table used here.
        'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'svr__degree': [1, 3, 5, 10],
        'svr__gamma': ['scale', 'auto', 1.0, 5.0],
        'svr__shrinking': [True, False]
    },
    'BayesianRidge': {
        'bayesianridge__alpha_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__alpha_2': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_2': [1e-7, 1e-6, 1e-5],
    },
    'KernelRidge': {
        'kernelridge__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0],
        # 'precomputed' removed for the same reason as in the SVR grid.
        'kernelridge__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'kernelridge__degree': [1, 2, 3, 5, 10],
        'kernelridge__coef0': [0.0, 0.5, 1.0]
    },
    'LinearRegression': {
        'linearregression__fit_intercept': [True, False],
        'linearregression__copy_X': [True, False],
        'linearregression__positive': [True, False]
    },
    'RANSACRegressor': {
        'ransacregressor__min_samples': [None, 1, 2, 5, 10, 50],
        'ransacregressor__max_trials': [1, 10, 50, 100, 150],
        'ransacregressor__loss': ['absolute_error', 'squared_error']
    },
    'TheilSenRegressor': {
        'theilsenregressor__max_subpopulation': [1, 10, 100, 1000],
        # 1, 5 and 10 removed: n_subsamples must be at least n_features + 1
        # (15 for the 14 feature columns of this cell) when fitting an intercept.
        'theilsenregressor__n_subsamples': [None, 25],
    }
}

# Candidate estimators. process_csv looks each one up in param_grid by class
# name to decide whether to grid-search it.
# NOTE(review): param_grid defines an 'SVR' grid, but no SVR() instance is
# listed here, so that grid is never used -- confirm whether that is intended.
models = [
    Ridge(), DecisionTreeRegressor(), GradientBoostingRegressor(), RandomForestRegressor(), AdaBoostRegressor(),
    KNeighborsRegressor(), MLPRegressor(max_iter=1000), ElasticNet(max_iter=1000), SGDRegressor(max_iter=1000),
    BayesianRidge(max_iter=1000), KernelRidge(), LinearRegression(), RANSACRegressor(),
    TheilSenRegressor()
]
# Blanket suppression: this also hides the warnings GridSearchCV emits when a
# parameter combination fails to fit.
warnings.filterwarnings("ignore")

# Directory where you want to save your models
model_directory = "NR_Benefit"

# Create the directory if it doesn't exist (exist_ok avoids the
# check-then-create race of os.path.exists + os.makedirs).
os.makedirs(model_directory, exist_ok=True)

def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    The square root is taken explicitly instead of passing ``squared=False``
    to ``mean_squared_error``: that keyword was deprecated in scikit-learn 1.4
    and removed in 1.6, so relying on it fails on current releases.
    """
    return mean_squared_error(y_true, y_pred) ** 0.5


def _train_tensorflow(X_train, X_test, y_train):
    """Fit the fixed dense Keras network and return (model, scaler, test predictions)."""
    # Seven hidden layers of 64 ReLU units followed by a single linear output.
    layers = [tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],))]
    layers += [tf.keras.layers.Dense(64, activation='relu') for _ in range(6)]
    layers.append(tf.keras.layers.Dense(1))
    model_tf = tf.keras.models.Sequential(layers)
    model_tf.compile(optimizer='adam', loss='mean_squared_error')

    # The network is not wrapped in a pipeline, so it is standardized by hand.
    scaler_tf = StandardScaler()
    X_train_scaled_tf = scaler_tf.fit_transform(X_train)
    X_test_scaled_tf = scaler_tf.transform(X_test)

    model_tf.fit(X_train_scaled_tf, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)

    # predict() returns shape (n_samples, 1); flatten once for scoring and export.
    y_pred_tf = model_tf.predict(X_test_scaled_tf).flatten()
    return model_tf, scaler_tf, y_pred_tf


def _train_sklearn(model, X_train, y_train):
    """Fit ``model`` behind a StandardScaler and return (fitted pipeline, best params).

    A 5-fold grid search is run when ``param_grid`` has an entry for the
    model's class name; otherwise the pipeline is fitted with the model's
    current settings and the returned params are ``None``.
    """
    model_name = model.__class__.__name__
    pipeline = make_pipeline(StandardScaler(), model)
    if model_name not in param_grid:
        pipeline.fit(X_train, y_train)
        return pipeline, None

    grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    print(f"Best hyperparameters for {model_name}: {grid_search.best_params_}")
    return grid_search.best_estimator_, grid_search.best_params_


# Function to process each CSV file
def process_csv(file_path):
    """Train and evaluate every candidate model on one CSV file.

    Reads ``file_path``, uses the module-level ``data_columns`` as features and
    ``results_columns[0]`` as the target, holds out 10% of the rows
    (``random_state=42``) and fits every estimator in ``models`` plus one Keras
    network on the remaining rows.

    Side effects:
        * every fitted model is written into ``model_directory`` (``.pkl`` for
          scikit-learn pipelines, ``.h5`` plus a scaler ``.pkl`` for Keras);
        * the actual/predicted values of all models on the held-out rows are
          written to ``output_<csv name>_<target>.csv`` in the working directory.

    Args:
        file_path: Path of the CSV file to process.

    Returns:
        dict with keys ``csv_file``, ``model_name``, ``hyperparameters`` and
        ``rmse`` describing the model with the lowest held-out RMSE.
        ``hyperparameters`` is ``None`` for models that were not grid-searched.

    Note:
        The held-out split is used both to report RMSE and to pick the winner,
        so the winning RMSE is an optimistic estimate.
    """
    data = pd.read_csv(file_path)
    X = data[data_columns]
    y = data[results_columns[0]]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    csv_name = os.path.basename(file_path)
    best_model_info = {
        'csv_file': csv_name,
        'model_name': None,
        'hyperparameters': None,
        'rmse': float('inf')
    }

    results = []

    for model in models + ['TensorFlow']:  # Add TensorFlow model to the loop
        print(f"Processing {model} for {file_path}")
        if model == 'TensorFlow':
            model_name, best_params = 'TensorFlow', None
            model_tf, scaler_tf, y_pred = _train_tensorflow(X_train, X_test, y_train)
        else:
            model_name = model.__class__.__name__
            best_estimator, best_params = _train_sklearn(model, X_train, y_train)
            y_pred = best_estimator.predict(X_test)

        rmse = _rmse(y_test, y_pred)
        print(f"{model_name} RMSE: {rmse}")

        if rmse < best_model_info['rmse']:
            best_model_info.update({
                'model_name': model_name,
                'hyperparameters': best_params,
                'rmse': rmse
            })

        # Save the model
        if model == 'TensorFlow':
            model_tf.save(os.path.join(model_directory, f"{csv_name}_TensorFlow_model.h5"))
            # The network was trained on standardized inputs; without the
            # fitted scaler the saved model cannot be applied to new data.
            joblib.dump(scaler_tf, os.path.join(model_directory, f"{csv_name}_TensorFlow_scaler.pkl"))
        else:
            joblib.dump(best_estimator, os.path.join(model_directory, f"{csv_name}_{model_name}_model.pkl"))

        # Save the predictions and actual values
        results.append(pd.DataFrame({
            'Actual': y_test.values.flatten(),
            'Predicted': y_pred.flatten(),
            'Model': model_name
        }))

    # Save the predictions and actual values to a CSV file
    results_df = pd.concat(results, axis=0)
    results_filename = f"output_{csv_name}_{results_columns[0]}.csv"
    results_df.to_csv(results_filename, index=False)

    return best_model_info

# Get the list of CSV files in the directory.
# glob returns entries in filesystem order, so sort to make the processing
# order (and the row order of the summary file) reproducible.
csv_files = sorted(glob.glob('../../../Data_ML/4_out_csvs_regression/*.csv'))
if not csv_files:
    # An unmatched pattern would otherwise silently produce an empty summary.
    print("No CSV files matched ../../../Data_ML/4_out_csvs_regression/*.csv")

# Initialize a list to store the best model information for each CSV file
best_models_info = []

# Process each CSV file
for csv_file in csv_files:
    best_model_info = process_csv(csv_file)
    best_models_info.append(best_model_info)

# Save the best model information for each CSV file to a CSV file
best_models_df = pd.DataFrame(best_models_info)
best_models_filename = "best_models" + results_columns[0] + "_info.csv"
best_models_df.to_csv(best_models_filename, index=False)

# Report the file that was actually written (the name includes the target column).
print(f"Best models information saved to {best_models_filename}")


Processing Ridge() for ../../Data_ML/out_csvs_regression\output_bfill_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
Ridge RMSE: 1.9764256972076324
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_bfill_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'friedman_mse', 'decisiontreeregressor__max_features': 'sqrt', 'decisiontreeregressor__min_samples_split': 4, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 1.5951951730780909
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_bfill_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.01, 'gradientboostingregressor__loss': 'squared_error', 'gradientboostingregressor__n_estimators': 100, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 1.675527622066697
Processing RandomFore



TensorFlow RMSE: 1.3388222876635398
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_custom_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
Ridge RMSE: 2.325168317076102
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_custom_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'friedman_mse', 'decisiontreeregressor__max_features': 'sqrt', 'decisiontreeregressor__min_samples_split': 5, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 1.7415362667634517
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_custom_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.01, 'gradientboostingregressor__loss': 'squared_error', 'gradientboostingregressor__n_estimators': 100, 'gradientboostingregressor__warm_start': False}
GradientBoostingRegressor RMSE: 



TensorFlow RMSE: 1.3193643886515654
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_ffill_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
Ridge RMSE: 1.9758929750246501
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_ffill_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'squared_error', 'decisiontreeregressor__max_features': 'sqrt', 'decisiontreeregressor__min_samples_split': 4, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 0.9738755210104013
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_ffill_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.01, 'gradientboostingregressor__loss': 'squared_error', 'gradientboostingregressor__n_estimators': 100, 'gradientboostingregressor__warm_start': False}
GradientBoostingRegressor RMSE: 1



TensorFlow RMSE: 1.6406033659198516
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_interpolated.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
Ridge RMSE: 1.968691851276344
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_interpolated.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'squared_error', 'decisiontreeregressor__max_features': 'sqrt', 'decisiontreeregressor__min_samples_split': 4, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 0.9331421211444524
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_interpolated.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.01, 'gradientboostingregressor__loss': 'squared_error', 'gradientboostingregressor__n_estimators': 100, 'gradientboostingregressor__warm_start': False}
GradientBoostingRegressor RMSE: 1.690



TensorFlow RMSE: 1.5151488878942534
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_iterative_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
Ridge RMSE: 1.9158817014567309
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_iterative_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'squared_error', 'decisiontreeregressor__max_features': 'log2', 'decisiontreeregressor__min_samples_split': 3, 'decisiontreeregressor__splitter': 'best'}
DecisionTreeRegressor RMSE: 1.7070929480650237
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_iterative_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.01, 'gradientboostingregressor__loss': 'squared_error', 'gradientboostingregressor__n_estimators': 100, 'gradientboostingregressor__warm_start': False}
GradientBoostingRegress



TensorFlow RMSE: 1.064408409688842
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_knn_imputed_custom.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
Ridge RMSE: 1.9690305169794569
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_knn_imputed_custom.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'squared_error', 'decisiontreeregressor__max_features': 3, 'decisiontreeregressor__min_samples_split': 2, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 0.8155095982034207
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_knn_imputed_custom.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.01, 'gradientboostingregressor__loss': 'squared_error', 'gradientboostingregressor__n_estimators': 100, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor



TensorFlow RMSE: 1.184623504821488
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_mean_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
Ridge RMSE: 1.9534431130473737
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_mean_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'friedman_mse', 'decisiontreeregressor__max_features': 'sqrt', 'decisiontreeregressor__min_samples_split': 3, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 1.3800255432418633
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_mean_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.01, 'gradientboostingregressor__loss': 'squared_error', 'gradientboostingregressor__n_estimators': 100, 'gradientboostingregressor__warm_start': False}
GradientBoostingRegressor RMSE: 1.6781



TensorFlow RMSE: 1.3262497713936254
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_median_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
Ridge RMSE: 1.9747918513896645
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_median_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'absolute_error', 'decisiontreeregressor__max_features': 'sqrt', 'decisiontreeregressor__min_samples_split': 3, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 1.6408662092524993
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_median_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.01, 'gradientboostingregressor__loss': 'squared_error', 'gradientboostingregressor__n_estimators': 100, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE



TensorFlow RMSE: 1.8556704506279744
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_mode_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
Ridge RMSE: 1.995063878293594
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_mode_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'absolute_error', 'decisiontreeregressor__max_features': 'log2', 'decisiontreeregressor__min_samples_split': 5, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 1.0088170406117116
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_mode_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.01, 'gradientboostingregressor__loss': 'squared_error', 'gradientboostingregressor__n_estimators': 100, 'gradientboostingregressor__warm_start': False}
GradientBoostingRegressor RMSE: 1.67



TensorFlow RMSE: 1.2565763181570297
Best models information saved to best_models_info.csv


# PR Benefit

In [3]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge, LinearRegression, RANSACRegressor, TheilSenRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.decomposition import PCA  # Import PCA
import warnings
import joblib
import os

# Input feature columns read from every CSV file, grouped by prefix.
data_columns = (
    ['OF18', 'OF19', 'OF20', 'OF21', 'OF22', 'OF23', 'OF24']  # OF-prefixed features
    + ['F41', 'F48', 'F50', 'F52']                            # F-prefixed features
)

# Target column(s); process_csv reads only the first entry.
results_columns = ['PR_Benefit']

# Define the parameter grid for GridSearchCV
#
# Keys are estimator class names; each value maps "<pipeline step>__<param>"
# to the candidate values to search. Step names are the lower-cased class
# names that make_pipeline generates.
#
# Candidates that can never fit successfully were removed: GridSearchCV scores
# a failed fit as NaN by default, so they cost cross-validation fits without
# ever being selectable.
param_grid = {
    'Ridge': {
        'ridge__alpha': [0.1, 0.5, 1.0],
        # 'lbfgs' dropped: that solver is only usable with positive=True,
        # which the Ridge() instance being tuned does not set.
        'ridge__solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
    },
    'DecisionTreeRegressor': {
        'decisiontreeregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'decisiontreeregressor__splitter': ['best', 'random'],
        # An integer min_samples_split must be at least 2 (1 dropped).
        'decisiontreeregressor__min_samples_split': [2, 3, 4, 5],
        # An integer max_features must be at least 1 (0 dropped).
        'decisiontreeregressor__max_features': [1, 2, 3, 'sqrt', 'log2']
    },
    'RandomForestRegressor': {
        'randomforestregressor__n_estimators': [1, 50, 100],
        'randomforestregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'randomforestregressor__min_samples_split': [2, 5],
        'randomforestregressor__max_features': [1, 3, 'sqrt', 'log2'],
    },
    'GradientBoostingRegressor': {
        'gradientboostingregressor__loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
        'gradientboostingregressor__learning_rate': [0.001, 0.01],
        'gradientboostingregressor__n_estimators': [25, 50, 100],
        'gradientboostingregressor__warm_start': [True, False],
    },
    'AdaBoostRegressor': {
        'adaboostregressor__n_estimators': [1, 20, 50, 100],
        'adaboostregressor__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0, 10],
        'adaboostregressor__loss': ['linear', 'square', 'exponential']
    },
    'KNeighborsRegressor': {
        'kneighborsregressor__n_neighbors': [2, 5, 10, 25],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__algorithm': ['ball_tree', 'kd_tree', 'brute'],
        'kneighborsregressor__leaf_size': [5, 30, 50],
        # 'haversine' dropped: it is defined only for two-column
        # (latitude, longitude) input, and data_columns has more than two.
        # NOTE(review): 'cosine' and 'nan_euclidean' are presumably only
        # accepted by algorithm='brute' - confirm before pruning further.
        'kneighborsregressor__metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'nan_euclidean']
    },
    'MLPRegressor': {
        'mlpregressor__hidden_layer_sizes': [(50, 50, 50), (100, 100, 100), (100, 100, 100, 100)],
        'mlpregressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
        'mlpregressor__solver': ['lbfgs', 'sgd', 'adam'],
        'mlpregressor__learning_rate': ['constant', 'invscaling', 'adaptive'],
    },
    'ElasticNet': {
        'elasticnet__l1_ratio': [0.25, 0.5, 0.75],
        'elasticnet__fit_intercept': [True, False],
        'elasticnet__precompute': [True, False],
        'elasticnet__copy_X': [True, False],
        'elasticnet__warm_start': [True, False],
        'elasticnet__positive': [True, False],
        'elasticnet__selection': ['cyclic', 'random']
    },
    'SGDRegressor': {
        'sgdregressor__loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
        'sgdregressor__penalty': ['l2', 'l1', 'elasticnet', None],
        'sgdregressor__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
        'sgdregressor__warm_start': [True, False],
    },
    # The models list in this cell contains no SVR() instance, so this entry
    # is only used if one is added there.
    'SVR': {
        # 'precomputed' dropped: it expects a kernel matrix, not feature columns.
        'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'svr__degree': [1, 3, 5, 10],
        'svr__gamma': ['scale', 'auto', 1.0, 5.0],
        'svr__shrinking': [True, False]
    },
    'BayesianRidge': {
        'bayesianridge__alpha_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__alpha_2': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_2': [1e-7, 1e-6, 1e-5],
    },
    'KernelRidge': {
        'kernelridge__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0],
        # 'precomputed' dropped: it expects a kernel matrix, not feature columns.
        'kernelridge__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'kernelridge__degree': [1, 2, 3, 5, 10],
        'kernelridge__coef0': [0.0, 0.5, 1.0]
    },
    'LinearRegression': {
        'linearregression__fit_intercept': [True, False],
        'linearregression__copy_X': [True, False],
        'linearregression__positive': [True, False]
    },
    'RANSACRegressor': {
        'ransacregressor__min_samples': [None, 1, 2, 5, 10, 50],
        'ransacregressor__max_trials': [1, 10, 50, 100, 150],
        'ransacregressor__loss': ['absolute_error', 'squared_error']
    },
    'TheilSenRegressor': {
        'theilsenregressor__max_subpopulation': [1, 10, 100, 1000],
        # n_subsamples must be at least n_features + 1 when an intercept is
        # fitted, so 1 can never be valid and was dropped.
        # NOTE(review): 5 and 10 are also below that bound for the column
        # list used in this cell and likely fail too - confirm and prune.
        'theilsenregressor__n_subsamples': [None, 5, 10, 25],
    }
}

# Candidate scikit-learn regressors, in the order they are evaluated.
# Iterative estimators get max_iter=1000.
models = [
    Ridge(),
    DecisionTreeRegressor(),
    GradientBoostingRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    KNeighborsRegressor(),
    MLPRegressor(max_iter=1000),
    ElasticNet(max_iter=1000),
    SGDRegressor(max_iter=1000),
    BayesianRidge(max_iter=1000),
    KernelRidge(),
    LinearRegression(),
    RANSACRegressor(),
    TheilSenRegressor(),
]

# Suppress all Python warnings from this point on.
warnings.filterwarnings("ignore")

# Directory where you want to save your models
model_directory = "PR_Benefit"

# Create the directory if it doesn't exist. exist_ok=True makes this a single
# call that is safe to re-run and has no gap between the check and the create.
os.makedirs(model_directory, exist_ok=True)

# Function to process each CSV file
def _rmse(y_true, y_pred):
    """Return the root-mean-squared error of ``y_pred`` against ``y_true``."""
    # The square root is taken here instead of passing ``squared=False``:
    # that keyword is deprecated in recent scikit-learn releases.
    return mean_squared_error(y_true, y_pred) ** 0.5


def process_csv(file_path):
    """Train every candidate model on one CSV file and report the best one.

    The rows are split 90/10 into train and test sets (``random_state=42``).
    Each estimator in the module-level ``models`` list is wrapped in a
    ``StandardScaler`` pipeline and, when ``param_grid`` has an entry for its
    class name, tuned with 5-fold ``GridSearchCV``. A fixed Keras dense
    network is trained last on separately standardized features.

    Side effects:
        * every fitted pipeline is saved under ``model_directory`` as ``.pkl``;
        * the Keras model is saved as ``.h5`` together with the scaler it was
          trained with, so the saved network can be applied to new data;
        * the test-set predictions of all models are written to
          ``output_<csv file name>_<target>.csv`` in the working directory.

    Args:
        file_path: Path of a CSV file containing every column listed in
            ``data_columns`` plus the target column ``results_columns[0]``.

    Returns:
        dict with the keys ``csv_file``, ``model_name``, ``hyperparameters``
        and ``rmse`` describing the model with the lowest test-set RMSE.
        NOTE(review): the same 10% split both selects the winner and reports
        its RMSE, so the reported value is an optimistic estimate.
    """
    data = pd.read_csv(file_path)
    X = data[data_columns]
    y = data[results_columns[0]]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    csv_name = os.path.basename(file_path)
    actual = y_test.values.flatten()

    best_model_info = {
        'csv_file': csv_name,
        'model_name': None,
        'hyperparameters': None,
        'rmse': float('inf')
    }

    results = []

    for model in models + ['TensorFlow']:  # Add TensorFlow model to the loop
        print(f"Processing {model} for {file_path}")
        if model == 'TensorFlow':
            # Define the TensorFlow model: seven 64-unit ReLU layers, one output
            model_tf = tf.keras.models.Sequential([
                tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
                tf.keras.layers.Dense(64, activation='relu'),
                tf.keras.layers.Dense(64, activation='relu'),
                tf.keras.layers.Dense(64, activation='relu'),
                tf.keras.layers.Dense(64, activation='relu'),
                tf.keras.layers.Dense(64, activation='relu'),
                tf.keras.layers.Dense(64, activation='relu'),
                tf.keras.layers.Dense(1)
            ])

            # Compile the TensorFlow model
            model_tf.compile(optimizer='adam', loss='mean_squared_error')

            # Standardize the data for TensorFlow model (fit on train rows only)
            scaler_tf = StandardScaler()
            X_train_scaled_tf = scaler_tf.fit_transform(X_train)
            X_test_scaled_tf = scaler_tf.transform(X_test)

            # Train the TensorFlow model
            model_tf.fit(X_train_scaled_tf, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)

            # Evaluate the TensorFlow model; predict() returns shape (n, 1)
            y_pred_tf = model_tf.predict(X_test_scaled_tf).flatten()
            rmse_tf = _rmse(y_test, y_pred_tf)
            print(f"TensorFlow RMSE: {rmse_tf}")

            if rmse_tf < best_model_info['rmse']:
                best_model_info.update({
                    'model_name': 'TensorFlow',
                    'hyperparameters': None,
                    'rmse': rmse_tf
                })

            # Save the TensorFlow model
            model_filename = os.path.join(model_directory, f"{csv_name}_TensorFlow_model.h5")
            model_tf.save(model_filename)

            # The network only accepts inputs scaled by scaler_tf, so persist
            # the scaler next to it; without it the saved model is unusable.
            scaler_filename = os.path.join(model_directory, f"{csv_name}_TensorFlow_scaler.pkl")
            joblib.dump(scaler_tf, scaler_filename)

            # Save the predictions and actual values
            results.append(pd.DataFrame({'Actual': actual, 'Predicted': y_pred_tf, 'Model': 'TensorFlow'}))
        else:
            model_name = model.__class__.__name__
            pipeline = make_pipeline(StandardScaler(), model)
            # Perform grid search for hyperparameters
            if model_name in param_grid:
                grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=5, scoring='neg_mean_squared_error')
                grid_search.fit(X_train, y_train)
                best_estimator = grid_search.best_estimator_
                best_params = grid_search.best_params_
                print(f"Best hyperparameters for {model_name}: {best_params}")
            else:
                # No grid defined for this estimator: fit it with its defaults
                pipeline.fit(X_train, y_train)
                best_estimator = pipeline
                best_params = None

            # Make predictions
            y_pred = best_estimator.predict(X_test)
            rmse = _rmse(y_test, y_pred)
            print(f"{model_name} RMSE: {rmse}")

            if rmse < best_model_info['rmse']:
                best_model_info.update({
                    'model_name': model_name,
                    'hyperparameters': best_params,
                    'rmse': rmse
                })

            # Save the model (scaler + estimator pipeline)
            model_filename = os.path.join(model_directory, f"{csv_name}_{model_name}_model.pkl")
            joblib.dump(best_estimator, model_filename)

            # Save the predictions and actual values
            results.append(pd.DataFrame({'Actual': actual, 'Predicted': y_pred.flatten(), 'Model': model_name}))

    # Save the predictions and actual values to a CSV file
    results_df = pd.concat(results, axis=0)
    results_filename = f"output_{csv_name}_{results_columns[0]}.csv"
    results_df.to_csv(results_filename, index=False)

    return best_model_info

# Get the list of CSV files in the directory. glob's ordering depends on the
# file system, so sort to process (and log) the files in a stable order.
csv_files = sorted(glob.glob('../../../Data_ML/4_out_csvs_regression/*.csv'))

if not csv_files:
    # An empty match would otherwise silently produce an empty summary file.
    print("Warning: no CSV files matched the input pattern; the summary will be empty")

# Initialize a list to store the best model information for each CSV file
best_models_info = []

# Process each CSV file
for csv_file in csv_files:
    best_model_info = process_csv(csv_file)
    best_models_info.append(best_model_info)

# Save the best model information for each CSV file to a CSV file
best_models_df = pd.DataFrame(best_models_info)
best_models_filename = "best_models"+results_columns[0]+"_info.csv"
best_models_df.to_csv(best_models_filename, index=False)

# Report the file that was actually written (the name embeds the target column).
print(f"Best models information saved to {best_models_filename}")


Processing Ridge() for ../../Data_ML/out_csvs_regression\output_bfill_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
Ridge RMSE: 1.5983850152198624
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_bfill_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'absolute_error', 'decisiontreeregressor__max_features': 'sqrt', 'decisiontreeregressor__min_samples_split': 5, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 1.4611910788968083
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_bfill_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.01, 'gradientboostingregressor__loss': 'squared_error', 'gradientboostingregressor__n_estimators': 100, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 1.5425507592071632
Processing RandomF



TensorFlow RMSE: 0.9423269933540463
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_custom_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
Ridge RMSE: 1.8498215085333616
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_custom_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'absolute_error', 'decisiontreeregressor__max_features': 'sqrt', 'decisiontreeregressor__min_samples_split': 4, 'decisiontreeregressor__splitter': 'best'}
DecisionTreeRegressor RMSE: 1.160414647771177
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_custom_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.01, 'gradientboostingregressor__loss': 'squared_error', 'gradientboostingregressor__n_estimators': 100, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 1



TensorFlow RMSE: 0.7639010179102226
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_ffill_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
Ridge RMSE: 1.5932409873961457
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_ffill_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'friedman_mse', 'decisiontreeregressor__max_features': 'log2', 'decisiontreeregressor__min_samples_split': 2, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 0.7348835346664188
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_ffill_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.01, 'gradientboostingregressor__loss': 'squared_error', 'gradientboostingregressor__n_estimators': 100, 'gradientboostingregressor__warm_start': False}
GradientBoostingRegressor RMSE: 1.



TensorFlow RMSE: 0.4805591890680098
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_interpolated.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
Ridge RMSE: 1.5973250775957248
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_interpolated.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'absolute_error', 'decisiontreeregressor__max_features': 'log2', 'decisiontreeregressor__min_samples_split': 3, 'decisiontreeregressor__splitter': 'best'}
DecisionTreeRegressor RMSE: 1.532533026133687
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_interpolated.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.01, 'gradientboostingregressor__loss': 'squared_error', 'gradientboostingregressor__n_estimators': 100, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 1.54255



TensorFlow RMSE: 0.6013923530657077
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_iterative_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
Ridge RMSE: 1.5387870896853686
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_iterative_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'friedman_mse', 'decisiontreeregressor__max_features': 3, 'decisiontreeregressor__min_samples_split': 4, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 0.9712485718701409
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_iterative_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.01, 'gradientboostingregressor__loss': 'squared_error', 'gradientboostingregressor__n_estimators': 100, 'gradientboostingregressor__warm_start': False}
GradientBoostingRegressor R



TensorFlow RMSE: 0.4837643843534758
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_knn_imputed_custom.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
Ridge RMSE: 1.594966654951807
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_knn_imputed_custom.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'friedman_mse', 'decisiontreeregressor__max_features': 3, 'decisiontreeregressor__min_samples_split': 2, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 0.920277908336079
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_knn_imputed_custom.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.01, 'gradientboostingregressor__loss': 'squared_error', 'gradientboostingregressor__n_estimators': 100, 'gradientboostingregressor__warm_start': False}
GradientBoostingRegressor 



TensorFlow RMSE: 0.646494211368542
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_mean_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
Ridge RMSE: 1.5784607639760582
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_mean_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'friedman_mse', 'decisiontreeregressor__max_features': 3, 'decisiontreeregressor__min_samples_split': 4, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 0.9346681101816176
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_mean_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.01, 'gradientboostingregressor__loss': 'squared_error', 'gradientboostingregressor__n_estimators': 100, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 1.5425507592



TensorFlow RMSE: 0.9719799555044923
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_median_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
Ridge RMSE: 1.5945442602517494
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_median_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'absolute_error', 'decisiontreeregressor__max_features': 'sqrt', 'decisiontreeregressor__min_samples_split': 4, 'decisiontreeregressor__splitter': 'best'}
DecisionTreeRegressor RMSE: 1.9258764547726364
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_median_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.01, 'gradientboostingregressor__loss': 'squared_error', 'gradientboostingregressor__n_estimators': 100, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 



TensorFlow RMSE: 0.5369005858088968
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_mode_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
Ridge RMSE: 1.5995158200087412
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_mode_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'friedman_mse', 'decisiontreeregressor__max_features': 'sqrt', 'decisiontreeregressor__min_samples_split': 2, 'decisiontreeregressor__splitter': 'best'}
DecisionTreeRegressor RMSE: 1.1358144969265762
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_mode_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.01, 'gradientboostingregressor__loss': 'squared_error', 'gradientboostingregressor__n_estimators': 100, 'gradientboostingregressor__warm_start': False}
GradientBoostingRegressor RMSE: 1.54255



TensorFlow RMSE: 0.4539205866477782
Best models information saved to best_models_info.csv


# SR Benefit

In [4]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge, LinearRegression, RANSACRegressor, TheilSenRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.decomposition import PCA  # Import PCA
import warnings
import joblib
import os

# Input feature columns read from every CSV file, grouped by prefix.
data_columns = (
    ['OF18', 'OF19', 'OF20', 'OF21', 'OF22', 'OF23', 'OF24']  # OF-prefixed features
    + ['F24', 'F28', 'F41', 'F50', 'F52', 'F55']              # F-prefixed features
    + ['S4']                                                  # S-prefixed feature
)

# Target column(s); process_csv reads only the first entry.
results_columns = ['SR_Benefit']

# Define the parameter grid for GridSearchCV
#
# Keys are estimator class names; each value maps "<pipeline step>__<param>"
# to the candidate values to search. Step names are the lower-cased class
# names that make_pipeline generates.
#
# Candidates that can never fit successfully were removed: GridSearchCV scores
# a failed fit as NaN by default, so they cost cross-validation fits without
# ever being selectable.
param_grid = {
    'Ridge': {
        'ridge__alpha': [0.1, 0.5, 1.0],
        # 'lbfgs' dropped: that solver is only usable with positive=True,
        # which the Ridge() instance being tuned does not set.
        'ridge__solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
    },
    'DecisionTreeRegressor': {
        'decisiontreeregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'decisiontreeregressor__splitter': ['best', 'random'],
        # An integer min_samples_split must be at least 2 (1 dropped).
        'decisiontreeregressor__min_samples_split': [2, 3, 4, 5],
        # An integer max_features must be at least 1 (0 dropped).
        'decisiontreeregressor__max_features': [1, 2, 3, 'sqrt', 'log2']
    },
    'RandomForestRegressor': {
        'randomforestregressor__n_estimators': [1, 50, 100],
        'randomforestregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'randomforestregressor__min_samples_split': [2, 5],
        'randomforestregressor__max_features': [1, 3, 'sqrt', 'log2'],
    },
    'GradientBoostingRegressor': {
        'gradientboostingregressor__loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
        'gradientboostingregressor__learning_rate': [0.001, 0.01],
        'gradientboostingregressor__n_estimators': [25, 50, 100],
        'gradientboostingregressor__warm_start': [True, False],
    },
    'AdaBoostRegressor': {
        'adaboostregressor__n_estimators': [1, 20, 50, 100],
        'adaboostregressor__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0, 10],
        'adaboostregressor__loss': ['linear', 'square', 'exponential']
    },
    'KNeighborsRegressor': {
        'kneighborsregressor__n_neighbors': [2, 5, 10, 25],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__algorithm': ['ball_tree', 'kd_tree', 'brute'],
        'kneighborsregressor__leaf_size': [5, 30, 50],
        # 'haversine' dropped: it is defined only for two-column
        # (latitude, longitude) input, and data_columns has more than two.
        # NOTE(review): 'cosine' and 'nan_euclidean' are presumably only
        # accepted by algorithm='brute' - confirm before pruning further.
        'kneighborsregressor__metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'nan_euclidean']
    },
    'MLPRegressor': {
        'mlpregressor__hidden_layer_sizes': [(50, 50, 50), (100, 100, 100), (100, 100, 100, 100)],
        'mlpregressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
        'mlpregressor__solver': ['lbfgs', 'sgd', 'adam'],
        'mlpregressor__learning_rate': ['constant', 'invscaling', 'adaptive'],
    },
    'ElasticNet': {
        'elasticnet__l1_ratio': [0.25, 0.5, 0.75],
        'elasticnet__fit_intercept': [True, False],
        'elasticnet__precompute': [True, False],
        'elasticnet__copy_X': [True, False],
        'elasticnet__warm_start': [True, False],
        'elasticnet__positive': [True, False],
        'elasticnet__selection': ['cyclic', 'random']
    },
    'SGDRegressor': {
        'sgdregressor__loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
        'sgdregressor__penalty': ['l2', 'l1', 'elasticnet', None],
        'sgdregressor__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
        'sgdregressor__warm_start': [True, False],
    },
    # The models list in this cell contains no SVR() instance, so this entry
    # is only used if one is added there.
    'SVR': {
        # 'precomputed' dropped: it expects a kernel matrix, not feature columns.
        'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'svr__degree': [1, 3, 5, 10],
        'svr__gamma': ['scale', 'auto', 1.0, 5.0],
        'svr__shrinking': [True, False]
    },
    'BayesianRidge': {
        'bayesianridge__alpha_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__alpha_2': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_2': [1e-7, 1e-6, 1e-5],
    },
    'KernelRidge': {
        'kernelridge__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0],
        # 'precomputed' dropped: it expects a kernel matrix, not feature columns.
        'kernelridge__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'kernelridge__degree': [1, 2, 3, 5, 10],
        'kernelridge__coef0': [0.0, 0.5, 1.0]
    },
    'LinearRegression': {
        'linearregression__fit_intercept': [True, False],
        'linearregression__copy_X': [True, False],
        'linearregression__positive': [True, False]
    },
    'RANSACRegressor': {
        'ransacregressor__min_samples': [None, 1, 2, 5, 10, 50],
        'ransacregressor__max_trials': [1, 10, 50, 100, 150],
        'ransacregressor__loss': ['absolute_error', 'squared_error']
    },
    'TheilSenRegressor': {
        'theilsenregressor__max_subpopulation': [1, 10, 100, 1000],
        # n_subsamples must be at least n_features + 1 when an intercept is
        # fitted, so 1 can never be valid and was dropped.
        # NOTE(review): 5 and 10 are also below that bound for the column
        # list used in this cell and likely fail too - confirm and prune.
        'theilsenregressor__n_subsamples': [None, 5, 10, 25],
    }
}

# Candidate scikit-learn regressors, in the order they are evaluated.
# Iterative estimators get max_iter=1000.
models = [
    Ridge(),
    DecisionTreeRegressor(),
    GradientBoostingRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    KNeighborsRegressor(),
    MLPRegressor(max_iter=1000),
    ElasticNet(max_iter=1000),
    SGDRegressor(max_iter=1000),
    BayesianRidge(max_iter=1000),
    KernelRidge(),
    LinearRegression(),
    RANSACRegressor(),
    TheilSenRegressor(),
]

# Suppress all Python warnings from this point on.
warnings.filterwarnings("ignore")

# Directory where you want to save your models
model_directory = "SR_Benefit"

# Create the directory if it doesn't exist. exist_ok=True makes this a single
# call that is safe to re-run and has no gap between the check and the create.
os.makedirs(model_directory, exist_ok=True)

# Function to process each CSV file
def _rmse(y_true, y_pred):
    """Return the root-mean-squared error of ``y_pred`` against ``y_true``."""
    # The square root is taken here instead of passing ``squared=False``:
    # that keyword is deprecated in recent scikit-learn releases.
    return mean_squared_error(y_true, y_pred) ** 0.5


def process_csv(file_path):
    """Train every candidate model on one CSV file and report the best one.

    The rows are split 90/10 into train and test sets (``random_state=42``).
    Each estimator in the module-level ``models`` list is wrapped in a
    ``StandardScaler`` pipeline and, when ``param_grid`` has an entry for its
    class name, tuned with 5-fold ``GridSearchCV``. A fixed Keras dense
    network is trained last on separately standardized features.

    Side effects:
        * every fitted pipeline is saved under ``model_directory`` as ``.pkl``;
        * the Keras model is saved as ``.h5`` together with the scaler it was
          trained with, so the saved network can be applied to new data;
        * the test-set predictions of all models are written to
          ``output_<csv file name>_<target>.csv`` in the working directory.

    Args:
        file_path: Path of a CSV file containing every column listed in
            ``data_columns`` plus the target column ``results_columns[0]``.

    Returns:
        dict with the keys ``csv_file``, ``model_name``, ``hyperparameters``
        and ``rmse`` describing the model with the lowest test-set RMSE.
        NOTE(review): the same 10% split both selects the winner and reports
        its RMSE, so the reported value is an optimistic estimate.
    """
    data = pd.read_csv(file_path)
    X = data[data_columns]
    y = data[results_columns[0]]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    csv_name = os.path.basename(file_path)
    actual = y_test.values.flatten()

    best_model_info = {
        'csv_file': csv_name,
        'model_name': None,
        'hyperparameters': None,
        'rmse': float('inf')
    }

    results = []

    for model in models + ['TensorFlow']:  # Add TensorFlow model to the loop
        print(f"Processing {model} for {file_path}")
        if model == 'TensorFlow':
            # Define the TensorFlow model: seven 64-unit ReLU layers, one output
            model_tf = tf.keras.models.Sequential([
                tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
                tf.keras.layers.Dense(64, activation='relu'),
                tf.keras.layers.Dense(64, activation='relu'),
                tf.keras.layers.Dense(64, activation='relu'),
                tf.keras.layers.Dense(64, activation='relu'),
                tf.keras.layers.Dense(64, activation='relu'),
                tf.keras.layers.Dense(64, activation='relu'),
                tf.keras.layers.Dense(1)
            ])

            # Compile the TensorFlow model
            model_tf.compile(optimizer='adam', loss='mean_squared_error')

            # Standardize the data for TensorFlow model (fit on train rows only)
            scaler_tf = StandardScaler()
            X_train_scaled_tf = scaler_tf.fit_transform(X_train)
            X_test_scaled_tf = scaler_tf.transform(X_test)

            # Train the TensorFlow model
            model_tf.fit(X_train_scaled_tf, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)

            # Evaluate the TensorFlow model; predict() returns shape (n, 1)
            y_pred_tf = model_tf.predict(X_test_scaled_tf).flatten()
            rmse_tf = _rmse(y_test, y_pred_tf)
            print(f"TensorFlow RMSE: {rmse_tf}")

            if rmse_tf < best_model_info['rmse']:
                best_model_info.update({
                    'model_name': 'TensorFlow',
                    'hyperparameters': None,
                    'rmse': rmse_tf
                })

            # Save the TensorFlow model
            model_filename = os.path.join(model_directory, f"{csv_name}_TensorFlow_model.h5")
            model_tf.save(model_filename)

            # The network only accepts inputs scaled by scaler_tf, so persist
            # the scaler next to it; without it the saved model is unusable.
            scaler_filename = os.path.join(model_directory, f"{csv_name}_TensorFlow_scaler.pkl")
            joblib.dump(scaler_tf, scaler_filename)

            # Save the predictions and actual values
            results.append(pd.DataFrame({'Actual': actual, 'Predicted': y_pred_tf, 'Model': 'TensorFlow'}))
        else:
            model_name = model.__class__.__name__
            pipeline = make_pipeline(StandardScaler(), model)
            # Perform grid search for hyperparameters
            if model_name in param_grid:
                grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=5, scoring='neg_mean_squared_error')
                grid_search.fit(X_train, y_train)
                best_estimator = grid_search.best_estimator_
                best_params = grid_search.best_params_
                print(f"Best hyperparameters for {model_name}: {best_params}")
            else:
                # No grid defined for this estimator: fit it with its defaults
                pipeline.fit(X_train, y_train)
                best_estimator = pipeline
                best_params = None

            # Make predictions
            y_pred = best_estimator.predict(X_test)
            rmse = _rmse(y_test, y_pred)
            print(f"{model_name} RMSE: {rmse}")

            if rmse < best_model_info['rmse']:
                best_model_info.update({
                    'model_name': model_name,
                    'hyperparameters': best_params,
                    'rmse': rmse
                })

            # Save the model (scaler + estimator pipeline)
            model_filename = os.path.join(model_directory, f"{csv_name}_{model_name}_model.pkl")
            joblib.dump(best_estimator, model_filename)

            # Save the predictions and actual values
            results.append(pd.DataFrame({'Actual': actual, 'Predicted': y_pred.flatten(), 'Model': model_name}))

    # Save the predictions and actual values to a CSV file
    results_df = pd.concat(results, axis=0)
    results_filename = f"output_{csv_name}_{results_columns[0]}.csv"
    results_df.to_csv(results_filename, index=False)

    return best_model_info

# Get the list of CSV files in the directory. glob's ordering depends on the
# file system, so sort to process (and log) the files in a stable order.
csv_files = sorted(glob.glob('../../../Data_ML/4_out_csvs_regression/*.csv'))

if not csv_files:
    # An empty match would otherwise silently produce an empty summary file.
    print("Warning: no CSV files matched the input pattern; the summary will be empty")

# Initialize a list to store the best model information for each CSV file
best_models_info = []

# Process each CSV file
for csv_file in csv_files:
    best_model_info = process_csv(csv_file)
    best_models_info.append(best_model_info)

# Save the best model information for each CSV file to a CSV file
best_models_df = pd.DataFrame(best_models_info)
best_models_filename = "best_models"+results_columns[0]+"_info.csv"
best_models_df.to_csv(best_models_filename, index=False)

# Report the file that was actually written (the name embeds the target column).
print(f"Best models information saved to {best_models_filename}")


Processing Ridge() for ../../Data_ML/out_csvs_regression\output_bfill_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 0.1, 'ridge__solver': 'svd'}
Ridge RMSE: 1.1287751101652872
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_bfill_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'absolute_error', 'decisiontreeregressor__max_features': 1, 'decisiontreeregressor__min_samples_split': 3, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 2.7580886778278004
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_bfill_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'huber', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 3.384579436999508
Processing RandomForestRegressor(



TensorFlow RMSE: 0.8581672155871881
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_custom_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 0.1, 'ridge__solver': 'sparse_cg'}
Ridge RMSE: 1.4723363104253813
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_custom_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'squared_error', 'decisiontreeregressor__max_features': 'log2', 'decisiontreeregressor__min_samples_split': 2, 'decisiontreeregressor__splitter': 'best'}
DecisionTreeRegressor RMSE: 3.5993440143077917
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_custom_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'huber', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': False}
GradientBoostingRegressor RMSE: 3.3



TensorFlow RMSE: 0.4703402620820752
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_ffill_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 0.1, 'ridge__solver': 'sparse_cg'}
Ridge RMSE: 1.116443326477427
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_ffill_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'absolute_error', 'decisiontreeregressor__max_features': 'sqrt', 'decisiontreeregressor__min_samples_split': 3, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 0.9507184250920583
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_ffill_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'huber', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 3.384



TensorFlow RMSE: 0.4090198784300626
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_interpolated.csv
Best hyperparameters for Ridge: {'ridge__alpha': 0.1, 'ridge__solver': 'sparse_cg'}
Ridge RMSE: 1.1147700814830763
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_interpolated.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'squared_error', 'decisiontreeregressor__max_features': 2, 'decisiontreeregressor__min_samples_split': 2, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 1.1299952380852045
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_interpolated.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'huber', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 3.38457943699



TensorFlow RMSE: 0.5793497731856858
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_iterative_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 0.1, 'ridge__solver': 'svd'}
Ridge RMSE: 1.0581495690284433
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_iterative_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'absolute_error', 'decisiontreeregressor__max_features': 3, 'decisiontreeregressor__min_samples_split': 4, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 1.6718572934086837
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_iterative_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'huber', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 3.3



TensorFlow RMSE: 0.602006642884619
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_knn_imputed_custom.csv
Best hyperparameters for Ridge: {'ridge__alpha': 0.1, 'ridge__solver': 'lsqr'}
Ridge RMSE: 1.1295996888248157
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_knn_imputed_custom.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'absolute_error', 'decisiontreeregressor__max_features': 'sqrt', 'decisiontreeregressor__min_samples_split': 3, 'decisiontreeregressor__splitter': 'best'}
DecisionTreeRegressor RMSE: 1.6461768875907883
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_knn_imputed_custom.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'huber', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': False}
GradientBoostingRegressor RM



TensorFlow RMSE: 0.4913290513582919
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_mean_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 0.1, 'ridge__solver': 'svd'}
Ridge RMSE: 1.1000789660598869
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_mean_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'friedman_mse', 'decisiontreeregressor__max_features': 1, 'decisiontreeregressor__min_samples_split': 2, 'decisiontreeregressor__splitter': 'best'}
DecisionTreeRegressor RMSE: 2.6558153801296758
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_mean_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'huber', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': False}
GradientBoostingRegressor RMSE: 3.384579436999508
Pro



TensorFlow RMSE: 0.6408875564044197
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_median_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 0.1, 'ridge__solver': 'sparse_cg'}
Ridge RMSE: 1.1182279126713577
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_median_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'squared_error', 'decisiontreeregressor__max_features': 'sqrt', 'decisiontreeregressor__min_samples_split': 3, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 1.6032701960145617
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_median_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'huber', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': False}
GradientBoostingRegressor RMSE: 3



TensorFlow RMSE: 0.7446507102728493
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_mode_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 0.1, 'ridge__solver': 'sparse_cg'}
Ridge RMSE: 1.1182279126713577
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_mode_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'absolute_error', 'decisiontreeregressor__max_features': 3, 'decisiontreeregressor__min_samples_split': 3, 'decisiontreeregressor__splitter': 'best'}
DecisionTreeRegressor RMSE: 1.6029381430598595
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_mode_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'huber', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 3.384579436999



TensorFlow RMSE: 0.6513328153298242
Best models information saved to best_models_info.csv


# SFST Benefit

In [6]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge, LinearRegression, RANSACRegressor, TheilSenRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.decomposition import PCA  # Import PCA
import warnings
import joblib
import os

# Define the data columns (model inputs) and results columns (regression target)
data_columns = ['OF18', 'OF22', 'OF25', 'OF27', 'OF28', 'F50']

results_columns = ['SFST_Benefit']

# Define the parameter grid for GridSearchCV.
# Top-level keys are estimator class names; inner keys use the
# "<lowercased class name>__<parameter>" form that make_pipeline assigns to
# its steps, so each sub-dict can be passed straight to GridSearchCV.
#
# Values that scikit-learn rejects on every fit have been removed: they can
# never be selected as best parameters and only add failing fits (whose
# warnings are suppressed in this notebook) to the search.
param_grid = {
    'Ridge': {
        'ridge__alpha': [0.1, 0.5, 1.0],
        # 'lbfgs' is omitted: Ridge only accepts it together with positive=True
        'ridge__solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
    },
    'DecisionTreeRegressor': {
        'decisiontreeregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'decisiontreeregressor__splitter': ['best', 'random'],
        # integer min_samples_split must be >= 2
        'decisiontreeregressor__min_samples_split': [2, 3, 4, 5],
        # integer max_features must be >= 1
        'decisiontreeregressor__max_features': [1, 2, 3, 'sqrt', 'log2']
    },
    'RandomForestRegressor': {
        'randomforestregressor__n_estimators': [1, 50, 100],
        'randomforestregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'randomforestregressor__min_samples_split': [2, 5],
        'randomforestregressor__max_features': [1, 3, 'sqrt', 'log2'],
    },
    'GradientBoostingRegressor': {
        'gradientboostingregressor__loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
        'gradientboostingregressor__learning_rate': [0.001, 0.01],
        'gradientboostingregressor__n_estimators': [25, 50, 100],
        'gradientboostingregressor__warm_start': [True, False],
    },
    'AdaBoostRegressor': {
        'adaboostregressor__n_estimators': [1, 20, 50, 100],
        'adaboostregressor__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0, 10],
        'adaboostregressor__loss': ['linear', 'square', 'exponential']
    },
    'KNeighborsRegressor': {
        'kneighborsregressor__n_neighbors': [2, 5, 10, 25],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__algorithm': ['ball_tree', 'kd_tree', 'brute'],
        'kneighborsregressor__leaf_size': [5, 30, 50],
        # NOTE(review): not every metric here is supported by every algorithm
        # above; unsupported pairs presumably just fail inside the search -
        # confirm and prune if the search time matters.
        'kneighborsregressor__metric': ['cityblock', 'cosine', 'euclidean', 'haversine', 'l1', 'l2', 'manhattan', 'nan_euclidean']
    },
    'MLPRegressor': {
        'mlpregressor__hidden_layer_sizes': [(50, 50, 50), (100, 100, 100), (100, 100, 100, 100)],
        'mlpregressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
        'mlpregressor__solver': ['lbfgs', 'sgd', 'adam'],
        'mlpregressor__learning_rate': ['constant', 'invscaling', 'adaptive'],
    },
    'ElasticNet': {
        'elasticnet__l1_ratio': [0.25, 0.5, 0.75],
        'elasticnet__fit_intercept': [True, False],
        'elasticnet__precompute': [True, False],
        'elasticnet__copy_X': [True, False],
        'elasticnet__warm_start': [True, False],
        'elasticnet__positive': [True, False],
        'elasticnet__selection': ['cyclic', 'random']
    },
    'SGDRegressor': {
        'sgdregressor__loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
        'sgdregressor__penalty': ['l2', 'l1', 'elasticnet', None],
        'sgdregressor__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
        'sgdregressor__warm_start': [True, False],
    },
    # SVR is not in the `models` list of this cell, so this entry is unused
    # unless SVR() is added there.
    'SVR': {
        'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
        'svr__degree': [1, 3, 5, 10],
        'svr__gamma': ['scale', 'auto', 1.0, 5.0],
        'svr__shrinking': [True, False]
    },
    'BayesianRidge': {
        'bayesianridge__alpha_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__alpha_2': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_2': [1e-7, 1e-6, 1e-5],
    },
    'KernelRidge': {
        'kernelridge__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0],
        # 'precomputed' is omitted: it expects a square kernel matrix as input,
        # but the pipeline feeds the scaled feature matrix
        'kernelridge__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'kernelridge__degree': [1, 2, 3, 5, 10],
        'kernelridge__coef0': [0.0, 0.5, 1.0]
    },
    'LinearRegression': {
        'linearregression__fit_intercept': [True, False],
        'linearregression__copy_X': [True, False],
        'linearregression__positive': [True, False]
    },
    'RANSACRegressor': {
        'ransacregressor__min_samples': [None, 1, 2, 5, 10, 50],
        'ransacregressor__max_trials': [1, 10, 50, 100, 150],
        'ransacregressor__loss': ['absolute_error', 'squared_error']
    },
    'TheilSenRegressor': {
        'theilsenregressor__max_subpopulation': [1, 10, 100, 1000],
        'theilsenregressor__n_subsamples': [None, 1, 5, 10, 25],
    }
}

# Candidate scikit-learn regressors evaluated for every CSV file.
# Iterative estimators get max_iter=1000 as an explicit iteration cap.
models = [
    Ridge(), DecisionTreeRegressor(), GradientBoostingRegressor(), RandomForestRegressor(), AdaBoostRegressor(),
    KNeighborsRegressor(), MLPRegressor(max_iter=1000), ElasticNet(max_iter=1000), SGDRegressor(max_iter=1000),
    BayesianRidge(max_iter=1000), KernelRidge(), LinearRegression(), RANSACRegressor(),
    TheilSenRegressor()
]

# Silence all warnings for the rest of the session. Note that this also hides
# convergence, deprecation and failed-fit warnings raised during grid search.
warnings.filterwarnings("ignore")

# Directory where you want to save your models
model_directory = "SFST_Benefit"

# Create the directory if it doesn't exist (exist_ok avoids the
# check-then-create race of a separate os.path.exists test)
os.makedirs(model_directory, exist_ok=True)

# Helpers used by process_csv
def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between y_true and y_pred.

    The square root is taken explicitly instead of passing ``squared=False``
    to ``mean_squared_error``: that keyword is deprecated in scikit-learn 1.4
    and removed in 1.6, while this form works on every version.
    """
    return mean_squared_error(y_true, y_pred) ** 0.5


def _keep_if_better(best_model_info, model_name, hyperparameters, rmse):
    """Overwrite best_model_info in place when rmse beats the stored value."""
    if rmse < best_model_info['rmse']:
        best_model_info.update({
            'model_name': model_name,
            'hyperparameters': hyperparameters,
            'rmse': rmse
        })


def _prediction_frame(y_test, y_pred, model_name):
    """Build the per-model frame of actual vs. predicted test values."""
    return pd.DataFrame({'Actual': y_test.values.flatten(), 'Predicted': y_pred.flatten(), 'Model': model_name})


# Function to process each CSV file
def process_csv(file_path):
    """Train, tune and evaluate every candidate model on one CSV file.

    The file is read with pandas, the ``data_columns`` features and the first
    ``results_columns`` target are split 90/10 into train and test sets
    (``random_state=42``), and each estimator in ``models`` plus a Keras
    network is fitted and scored by RMSE on the test set.

    Side effects:
        * prints progress, best hyperparameters and RMSE per model;
        * saves every fitted model into ``model_directory`` (``.pkl`` via
          joblib for scikit-learn pipelines, ``.h5`` for the Keras model);
        * writes ``output_<csv name>_<target>.csv`` in the working directory
          with the actual and predicted test values of every model.

    Args:
        file_path: Path of the CSV file to process.

    Returns:
        dict with keys ``csv_file`` (base name of the file), ``model_name``,
        ``hyperparameters`` (best grid-search parameters, or None) and
        ``rmse`` for the model with the lowest test RMSE.
    """
    data = pd.read_csv(file_path)
    X = data[data_columns]
    y = data[results_columns[0]]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    csv_name = os.path.basename(file_path)

    best_model_info = {
        'csv_file': csv_name,
        'model_name': None,
        'hyperparameters': None,
        'rmse': float('inf')
    }

    results = []

    for model in models + ['TensorFlow']:  # Add TensorFlow model to the loop
        print(f"Processing {model} for {file_path}")
        if model == 'TensorFlow':
            # Define the TensorFlow model: seven 64-unit ReLU layers followed
            # by a single linear output unit
            extra_hidden_layers = [tf.keras.layers.Dense(64, activation='relu') for _ in range(6)]
            model_tf = tf.keras.models.Sequential([
                tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
                *extra_hidden_layers,
                tf.keras.layers.Dense(1)
            ])

            # Compile the TensorFlow model
            model_tf.compile(optimizer='adam', loss='mean_squared_error')

            # Standardize the data for TensorFlow model (scaler is fitted on
            # the training split only)
            scaler_tf = StandardScaler()
            X_train_scaled_tf = scaler_tf.fit_transform(X_train)
            X_test_scaled_tf = scaler_tf.transform(X_test)

            # Train the TensorFlow model
            model_tf.fit(X_train_scaled_tf, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)

            # Evaluate the TensorFlow model
            y_pred_tf = model_tf.predict(X_test_scaled_tf)
            rmse_tf = _rmse(y_test, y_pred_tf)
            print(f"TensorFlow RMSE: {rmse_tf}")

            _keep_if_better(best_model_info, 'TensorFlow', None, rmse_tf)

            # Save the TensorFlow model
            model_filename = os.path.join(model_directory, f"{csv_name}_TensorFlow_model.h5")
            model_tf.save(model_filename)

            # Save the predictions and actual values
            results.append(_prediction_frame(y_test, y_pred_tf, 'TensorFlow'))
        else:
            model_name = model.__class__.__name__
            pipeline = make_pipeline(StandardScaler(), model)
            # Perform grid search for hyperparameters
            if model_name in param_grid:
                grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=5, scoring='neg_mean_squared_error')
                grid_search.fit(X_train, y_train)
                best_estimator = grid_search.best_estimator_
                best_params = grid_search.best_params_
                print(f"Best hyperparameters for {model_name}: {best_params}")
            else:
                # No grid defined for this estimator: fit it with its defaults
                pipeline.fit(X_train, y_train)
                best_estimator = pipeline
                best_params = None

            # Make predictions
            y_pred = best_estimator.predict(X_test)
            rmse = _rmse(y_test, y_pred)
            print(f"{model_name} RMSE: {rmse}")

            _keep_if_better(best_model_info, model_name, best_params, rmse)

            # Save the model
            model_filename = os.path.join(model_directory, f"{csv_name}_{model_name}_model.pkl")
            joblib.dump(best_estimator, model_filename)

            # Save the predictions and actual values
            results.append(_prediction_frame(y_test, y_pred, model_name))

    # Save the predictions and actual values to a CSV file
    results_df = pd.concat(results, axis=0)
    results_filename = f"output_{csv_name}_{results_columns[0]}.csv"
    results_df.to_csv(results_filename, index=False)

    return best_model_info

# Get the list of CSV files in the directory.
# glob returns matches in arbitrary, OS-dependent order, so sort them to keep
# the processing order (and the row order of the summary CSV) reproducible.
csv_files = sorted(glob.glob('../../../Data_ML/4_out_csvs_regression/*.csv'))

# Initialize a list to store the best model information for each CSV file
best_models_info = []

# Process each CSV file; process_csv returns one summary dict per file
for csv_file in csv_files:
    best_model_info = process_csv(csv_file)
    best_models_info.append(best_model_info)

# Save the best model information for each CSV file to a CSV file.
# The file name is kept exactly as before (no separator after "best_models")
# so that anything reading the summary keeps working.
best_models_filename = "best_models"+results_columns[0]+"_info.csv"
best_models_df = pd.DataFrame(best_models_info)
best_models_df.to_csv(best_models_filename, index=False)

# Report the file that was actually written (the old message named a
# different, non-existent file).
print(f"Best models information saved to {best_models_filename}")


Processing Ridge() for ../../Data_ML/out_csvs_regression\output_bfill_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'sag'}
Ridge RMSE: 1.3950050782333883
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_bfill_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'squared_error', 'decisiontreeregressor__max_features': 1, 'decisiontreeregressor__min_samples_split': 5, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 1.8617692166515776
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_bfill_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'huber', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 2.1776267574666877
Processing RandomForestRegressor(



TensorFlow RMSE: 2.047804415322723
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_custom_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'cholesky'}
Ridge RMSE: 1.3934750579470139
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_custom_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'absolute_error', 'decisiontreeregressor__max_features': 1, 'decisiontreeregressor__min_samples_split': 3, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 2.304956502017007
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_custom_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'huber', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 2.1776267



TensorFlow RMSE: 1.8486653588767472
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_ffill_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'sag'}
Ridge RMSE: 1.398134773289042
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_ffill_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'friedman_mse', 'decisiontreeregressor__max_features': 'sqrt', 'decisiontreeregressor__min_samples_split': 5, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 1.7133056324385043
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_ffill_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'huber', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 2.17762675746



TensorFlow RMSE: 1.9635211530837737
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_interpolated.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
Ridge RMSE: 1.3944610261314925
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_interpolated.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'absolute_error', 'decisiontreeregressor__max_features': 'sqrt', 'decisiontreeregressor__min_samples_split': 5, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 1.4744590210520958
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_interpolated.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'huber', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 2.1776267574



TensorFlow RMSE: 2.0467587563928933
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_iterative_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'cholesky'}
Ridge RMSE: 1.3934750579470139
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_iterative_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'squared_error', 'decisiontreeregressor__max_features': 2, 'decisiontreeregressor__min_samples_split': 5, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 2.2532788371298262
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_iterative_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'huber', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE:



TensorFlow RMSE: 2.2336845105710648
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_knn_imputed_custom.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
Ridge RMSE: 1.395144230337694
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_knn_imputed_custom.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'friedman_mse', 'decisiontreeregressor__max_features': 'sqrt', 'decisiontreeregressor__min_samples_split': 4, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 1.3292752764502958
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_knn_imputed_custom.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'huber', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMS



TensorFlow RMSE: 2.3374726566186594
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_mean_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
Ridge RMSE: 1.382456118132558
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_mean_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'absolute_error', 'decisiontreeregressor__max_features': 'log2', 'decisiontreeregressor__min_samples_split': 5, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 2.1398959559709794
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_mean_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'huber', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 2.17762675746



TensorFlow RMSE: 1.6595057959069115
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_median_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'cholesky'}
Ridge RMSE: 1.3934750579470139
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_median_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'friedman_mse', 'decisiontreeregressor__max_features': 'sqrt', 'decisiontreeregressor__min_samples_split': 5, 'decisiontreeregressor__splitter': 'random'}
DecisionTreeRegressor RMSE: 1.4516433197282452
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_median_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'huber', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 2.17



TensorFlow RMSE: 2.6054065701042286
Processing Ridge() for ../../Data_ML/out_csvs_regression\output_mode_imputed.csv
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'sag'}
Ridge RMSE: 1.4066997734955788
Processing DecisionTreeRegressor() for ../../Data_ML/out_csvs_regression\output_mode_imputed.csv
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'friedman_mse', 'decisiontreeregressor__max_features': 2, 'decisiontreeregressor__min_samples_split': 3, 'decisiontreeregressor__splitter': 'best'}
DecisionTreeRegressor RMSE: 2.0829475595209512
Processing GradientBoostingRegressor() for ../../Data_ML/out_csvs_regression\output_mode_imputed.csv
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.001, 'gradientboostingregressor__loss': 'huber', 'gradientboostingregressor__n_estimators': 25, 'gradientboostingregressor__warm_start': True}
GradientBoostingRegressor RMSE: 2.1776267574666877
Pro



TensorFlow RMSE: 2.106899585761205
Best models information saved to best_models_info.csv
