In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
from sklearn.kernel_ridge import KernelRidge

from sklearn.linear_model import LinearRegression, RANSACRegressor, TheilSenRegressor
from sklearn.decomposition import PCA  # Import PCA









# Load the data
file_path = "data_all_numerical_select_reduced.xlsx"
data = pd.read_excel(file_path)

# Define columns
data_columns = [
    'OF2',
    'OF3',
    'OF4',
    'OF5',
    'OF6',
    'OF7',
    'OF8',
    'OF9',
    'OF10',
    'OF11',
    'OF12',
    'OF13',
    'OF14',
    'OF15',
    'OF16',
    'OF17',
    'OF18',
    'OF19',
    'OF20',
    'OF21',
    'OF22',
    'OF23',
    'OF24',
    'OF25',
    'OF26',
    'OF27',
    'OF28',
    'OF29',
    'OF30',
    'OF31',
    'OF32',
    'OF33',
    'OF34',
    'OF37',
    'OF38',
    'F1',
    'F2',
    'F3_1',
    'F3_2',
    'F3_3',
    'F3_4',
    'F3_5',
    'F3_6',
    'F3_7',
    'F4',
    'F5',
    'F6',
    'F7',
    'F8',
    'F9',
    'F10',
    'F11',
    'F12',
    'F13',
    'F14',
    'F15',
    'F16',
    'F17',
    'F18',
    'F19',
    'F20',
    'F21',
    'F22',
    'F23',
    'F24',
    'F25',
    'F26',
    'F27',
    'F28',
    'F29',
    'F30',
    'F31',
    'F32',
    'F33',
    'F34',
    'F35',
    'F36',
    'F37',
    'F38',
    'F39',
    'F40',
    'F41',
    'F42',
    'F43',
    'F44',
    'F45',
    'F46',
    'F47',
    'F48',
    'F49',
    'F50',
    'F51',
    'F52',
    'F53',
    'F54',
    'F55',
    'F56',
    'F57',
    'F58',
    'F59',
    'F62',
    'F63',
    'F64',
    'F65',
    'F66',
    'F67',
    'F68',
    'S1',
    'S2',
    'S3',
    'S4',
    'S5',
    'S6',
    'PC',
    'FC',
    'WRI',
    'SVT',
    'VCHWC',
    'HWCC',
    'MC',
    'PP',
    'ST',
    'SWP',
    'DP',
    'ADLM',
    'ATDO',
    'AOD'
]

results_columns = ['WS']

# Prepare data for regression
X = data[data_columns]
y = data[results_columns[0]]

pca = PCA(n_components='mle')  # Retain 95% of variance
X_pca = pca.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

param_grid = {
    'Ridge': {
        'ridge__alpha': [0.1, 0.5, 1.0],
        'ridge__solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs']
    },
    'DecisionTreeRegressor': {
        'decisiontreeregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'decisiontreeregressor__splitter': ['best', 'random'],
        'decisiontreeregressor__min_samples_split': [1, 2, 3, 4, 5],
        'decisiontreeregressor__max_features': [0, 1, 2, 3, 'sqrt', 'log2']
    },
    'RandomForestRegressor': {
        'randomforestregressor__n_estimators': [1, 50, 100],
        'randomforestregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'randomforestregressor__min_samples_split': [2 , 5,],
        'randomforestregressor__max_features': [1, 3, 'sqrt', 'log2'],
    },
    'GradientBoostingRegressor': {
        'gradientboostingregressor__loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
        'gradientboostingregressor__learning_rate': [0.001, 0.01, 0.1],
        'gradientboostingregressor__n_estimators': [25, 50, 100,250],
        'gradientboostingregressor__warm_start': [True, False],
    },
    'AdaBoostRegressor': {
        'adaboostregressor__n_estimators': [1, 20, 50, 100],
        'adaboostregressor__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0, 10],
        'adaboostregressor__loss': ['linear', 'square', 'exponential']
    },
    'KNeighborsRegressor': {
        'kneighborsregressor__n_neighbors': [2, 5, 10, 25],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__algorithm': ['ball_tree', 'kd_tree', 'brute'],
        'kneighborsregressor__leaf_size': [5, 30, 50],
        'kneighborsregressor__metric': ['cityblock', 'cosine', 'euclidean', 'haversine', 'l1', 'l2', 'manhattan', 'nan_euclidean']
    },
    'MLPRegressor': {
        'mlpregressor__hidden_layer_sizes': [(50, 50, 50), (100, 100, 100), (100, 100, 100, 100),],
        'mlpregressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
        'mlpregressor__solver': ['lbfgs', 'sgd', 'adam'],
        'mlpregressor__learning_rate': ['constant', 'invscaling', 'adaptive'],
    },
    'ElasticNet': {
        'elasticnet__l1_ratio': [ 0.25, 0.5, 0.75,],
        'elasticnet__fit_intercept': [True, False],
        'elasticnet__precompute': [True, False],
        'elasticnet__copy_X': [True, False],
        'elasticnet__warm_start': [True, False],
        'elasticnet__positive': [True, False],
        'elasticnet__selection': ['cyclic', 'random']
    },
    'SGDRegressor': {
        'sgdregressor__loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
        'sgdregressor__penalty': ['l2', 'l1', 'elasticnet', None],
        'sgdregressor__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
        'sgdregressor__warm_start': [True, False],

    },
    'SVR': {
        'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
        'svr__degree': [1, 3, 5, 10],
        'svr__gamma': ['scale', 'auto', 1.0, 5.0],
        'svr__shrinking': [True, False]
    },
    'BayesianRidge': {
        'bayesianridge__alpha_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__alpha_2': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_1': [ 1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_2': [ 1e-7, 1e-6, 1e-5],
    },
    'KernelRidge': {
        'kernelridge__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0],
        'kernelridge__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
        'kernelridge__degree': [1, 2, 3, 5, 10],
        'kernelridge__coef0': [0.0, 0.5, 1.0]
    },
    'LinearRegression': {
        'linearregression__fit_intercept': [True, False],
        'linearregression__copy_X': [True, False],
        'linearregression__positive': [True, False]
    },
    'RANSACRegressor': {
        'ransacregressor__min_samples': [None, 1, 2, 5, 10, 50],
        'ransacregressor__max_trials': [1, 10, 50, 100, 150],
        'ransacregressor__loss': ['absolute_error', 'squared_error']
    },
    'TheilSenRegressor': {
        'theilsenregressor__max_subpopulation': [1, 10, 100, 1000],
        'theilsenregressor__n_subsamples': [None, 1, 5, 10, 25],
    }
}


models = [Ridge(), DecisionTreeRegressor(), GradientBoostingRegressor(), RandomForestRegressor(),AdaBoostRegressor(), KNeighborsRegressor(), MLPRegressor(max_iter=200),ElasticNet(max_iter=1000),SGDRegressor(max_iter=1000),SVR(cache_size=1000),BayesianRidge(max_iter=1000),KernelRidge(),LinearRegression(), RANSACRegressor(), TheilSenRegressor()]

import warnings
warnings.filterwarnings("ignore")



# Train and tune hyperparameters for each model
best_models = {}

for model in models + ['TensorFlow']:  # Add TensorFlow model to the loop
    print(model)
    if model == 'TensorFlow':
        # Define the TensorFlow model
        model_tf = tf.keras.models.Sequential([
            tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(1)
        ])

        # Compile the TensorFlow model
        model_tf.compile(optimizer='adam', loss='mean_squared_error')

        # Standardize the data for TensorFlow model
        scaler_tf = StandardScaler()
        X_train_scaled_tf = scaler_tf.fit_transform(X_train)
        X_test_scaled_tf = scaler_tf.transform(X_test)

        # Train the TensorFlow model
        model_tf.fit(X_train_scaled_tf, y_train, epochs=100, batch_size=32, validation_split=0.2)

        # Evaluate the TensorFlow model
        y_pred_tf = model_tf.predict(X_test_scaled_tf)
        rmse_tf = mean_squared_error(y_test, y_pred_tf, squared=False)
        print(f"TensorFlow RMSE: {rmse_tf}")

        # Add TensorFlow model to best_models
        best_models['TensorFlow'] = model_tf
    else:
        model_name = model.__class__.__name__
        pipeline = make_pipeline(StandardScaler(), model)
        # Perform grid search for hyperparameters
        if model_name in param_grid:
            grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=5, scoring='neg_mean_squared_error')
            grid_search.fit(X_train, y_train)
            best_models[model_name] = grid_search.best_estimator_
            print(f"Best hyperparameters for {model_name}: {grid_search.best_params_}")
        else:
            pipeline.fit(X_train, y_train)
            best_models[model_name] = pipeline



import os
import tensorflow as tf
import joblib

scaler_filename = "WS_scaler_tf.pkl"
joblib.dump(scaler_tf, scaler_filename)


# Directory where you want to save your models
model_directory = "WS"

# Create the directory if it doesn't exist
if not os.path.exists(model_directory):
    os.makedirs(model_directory)

# Make predictions using the best models
for model_name, model in best_models.items():
    print(f"Model: {model_name}")

    # Make predictions
    if model_name == 'TensorFlow':
        y_pred = y_pred_tf  # Use predictions from TensorFlow model
        # Save the TensorFlow model
        model_filename = os.path.join(model_directory, f"{model_name}_model.h5")
        model.save(model_filename)
        #print(f"Model saved as {model_filename}")
    else:
        y_pred = model.predict(X_test)

        # Save the other models using joblib
        model_filename = os.path.join(model_directory, f"{model_name}_model.pkl")
        joblib.dump(model, model_filename)
        #print(f"Model saved as {model_filename}")

    # Calculate and print RMSE
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    print(f"RMSE: {rmse}")

    # Show real and predicted results for the first 5 samples
    #print("Sample predictions:")
   # for i in range(5):
    #    print(f"Sample {i+1}: Real NR = {y_test.iloc[i]}, Predicted NR = {y_pred[i]}")

    print("\n")

Ridge()
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
DecisionTreeRegressor()
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'squared_error', 'decisiontreeregressor__max_features': 1, 'decisiontreeregressor__min_samples_split': 4, 'decisiontreeregressor__splitter': 'random'}
GradientBoostingRegressor()
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.1, 'gradientboostingregressor__loss': 'squared_error', 'gradientboostingregressor__n_estimators': 250, 'gradientboostingregressor__warm_start': True}
RandomForestRegressor()
Best hyperparameters for RandomForestRegressor: {'randomforestregressor__criterion': 'friedman_mse', 'randomforestregressor__max_features': 'sqrt', 'randomforestregressor__min_samples_split': 2, 'randomforestregressor__n_estimators': 50}
AdaBoostRegressor()
Best hyperparameters for AdaBoostRegressor: {'adaboostregressor__learning_rate': 1.0, 'adaboos



RMSE: 1.9347288999067873


Model: KNeighborsRegressor
RMSE: 2.5676767810680814


Model: MLPRegressor
RMSE: 1.4401968251265855


Model: ElasticNet
RMSE: 2.2196443573402043


Model: SGDRegressor
RMSE: 2.880370013194233


Model: SVR
RMSE: 5.973673448454882


Model: BayesianRidge
RMSE: 3.271775032311624


Model: KernelRidge
RMSE: 3.432849644633296


Model: LinearRegression
RMSE: 302.25841010313803


Model: RANSACRegressor
RMSE: 35.5484694937987


Model: TheilSenRegressor
RMSE: 227.11283428149946


Model: TensorFlow
RMSE: 10.981475361698914




In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
from sklearn.kernel_ridge import KernelRidge

from sklearn.linear_model import LinearRegression, RANSACRegressor, TheilSenRegressor
from sklearn.decomposition import PCA  # Import PCA









# Load the data
file_path = "data_all_numerical_select_reduced.xlsx"
data = pd.read_excel(file_path)

# Define columns
data_columns = [
    'OF2',
    'OF3',
    'OF4',
    'OF5',
    'OF6',
    'OF7',
    'OF8',
    'OF9',
    'OF10',
    'OF11',
    'OF12',
    'OF13',
    'OF14',
    'OF15',
    'OF16',
    'OF17',
    'OF18',
    'OF19',
    'OF20',
    'OF21',
    'OF22',
    'OF23',
    'OF24',
    'OF25',
    'OF26',
    'OF27',
    'OF28',
    'OF29',
    'OF30',
    'OF31',
    'OF32',
    'OF33',
    'OF34',
    'OF37',
    'OF38',
    'F1',
    'F2',
    'F3_1',
    'F3_2',
    'F3_3',
    'F3_4',
    'F3_5',
    'F3_6',
    'F3_7',
    'F4',
    'F5',
    'F6',
    'F7',
    'F8',
    'F9',
    'F10',
    'F11',
    'F12',
    'F13',
    'F14',
    'F15',
    'F16',
    'F17',
    'F18',
    'F19',
    'F20',
    'F21',
    'F22',
    'F23',
    'F24',
    'F25',
    'F26',
    'F27',
    'F28',
    'F29',
    'F30',
    'F31',
    'F32',
    'F33',
    'F34',
    'F35',
    'F36',
    'F37',
    'F38',
    'F39',
    'F40',
    'F41',
    'F42',
    'F43',
    'F44',
    'F45',
    'F46',
    'F47',
    'F48',
    'F49',
    'F50',
    'F51',
    'F52',
    'F53',
    'F54',
    'F55',
    'F56',
    'F57',
    'F58',
    'F59',
    'F62',
    'F63',
    'F64',
    'F65',
    'F66',
    'F67',
    'F68',
    'S1',
    'S2',
    'S3',
    'S4',
    'S5',
    'S6',
    'PC',
    'FC',
    'WRI',
    'SVT',
    'VCHWC',
    'HWCC',
    'MC',
    'PP',
    'ST',
    'SWP',
    'DP',
    'ADLM',
    'ATDO',
    'AOD'
]

results_columns = ['SR']

# Prepare data for regression
X = data[data_columns]
y = data[results_columns[0]]

pca = PCA(n_components='mle')  # Retain 95% of variance
X_pca = pca.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

param_grid = {
    'Ridge': {
        'ridge__alpha': [0.1, 0.5, 1.0],
        'ridge__solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs']
    },
    'DecisionTreeRegressor': {
        'decisiontreeregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'decisiontreeregressor__splitter': ['best', 'random'],
        'decisiontreeregressor__min_samples_split': [1, 2, 3, 4, 5],
        'decisiontreeregressor__max_features': [0, 1, 2, 3, 'sqrt', 'log2']
    },
    'RandomForestRegressor': {
        'randomforestregressor__n_estimators': [1, 50, 100],
        'randomforestregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'randomforestregressor__min_samples_split': [2 , 5,],
        'randomforestregressor__max_features': [1, 3, 'sqrt', 'log2'],
    },
    'GradientBoostingRegressor': {
        'gradientboostingregressor__loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
        'gradientboostingregressor__learning_rate': [0.001, 0.01, 0.1],
        'gradientboostingregressor__n_estimators': [25, 50, 100,250],
        'gradientboostingregressor__warm_start': [True, False],
    },
    'AdaBoostRegressor': {
        'adaboostregressor__n_estimators': [1, 20, 50, 100],
        'adaboostregressor__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0, 10],
        'adaboostregressor__loss': ['linear', 'square', 'exponential']
    },
    'KNeighborsRegressor': {
        'kneighborsregressor__n_neighbors': [2, 5, 10, 25],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__algorithm': ['ball_tree', 'kd_tree', 'brute'],
        'kneighborsregressor__leaf_size': [5, 30, 50],
        'kneighborsregressor__metric': ['cityblock', 'cosine', 'euclidean', 'haversine', 'l1', 'l2', 'manhattan', 'nan_euclidean']
    },
    'MLPRegressor': {
        'mlpregressor__hidden_layer_sizes': [(50, 50, 50), (100, 100, 100), (100, 100, 100, 100),],
        'mlpregressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
        'mlpregressor__solver': ['lbfgs', 'sgd', 'adam'],
        'mlpregressor__learning_rate': ['constant', 'invscaling', 'adaptive'],
    },
    'ElasticNet': {
        'elasticnet__l1_ratio': [ 0.25, 0.5, 0.75,],
        'elasticnet__fit_intercept': [True, False],
        'elasticnet__precompute': [True, False],
        'elasticnet__copy_X': [True, False],
        'elasticnet__warm_start': [True, False],
        'elasticnet__positive': [True, False],
        'elasticnet__selection': ['cyclic', 'random']
    },
    'SGDRegressor': {
        'sgdregressor__loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
        'sgdregressor__penalty': ['l2', 'l1', 'elasticnet', None],
        'sgdregressor__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
        'sgdregressor__warm_start': [True, False],

    },
    'SVR': {
        'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
        'svr__degree': [1, 3, 5, 10],
        'svr__gamma': ['scale', 'auto', 1.0, 5.0],
        'svr__shrinking': [True, False]
    },
    'BayesianRidge': {
        'bayesianridge__alpha_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__alpha_2': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_1': [ 1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_2': [ 1e-7, 1e-6, 1e-5],
    },
    'KernelRidge': {
        'kernelridge__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0],
        'kernelridge__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
        'kernelridge__degree': [1, 2, 3, 5, 10],
        'kernelridge__coef0': [0.0, 0.5, 1.0]
    },
    'LinearRegression': {
        'linearregression__fit_intercept': [True, False],
        'linearregression__copy_X': [True, False],
        'linearregression__positive': [True, False]
    },
    'RANSACRegressor': {
        'ransacregressor__min_samples': [None, 1, 2, 5, 10, 50],
        'ransacregressor__max_trials': [1, 10, 50, 100, 150],
        'ransacregressor__loss': ['absolute_error', 'squared_error']
    },
    'TheilSenRegressor': {
        'theilsenregressor__max_subpopulation': [1, 10, 100, 1000],
        'theilsenregressor__n_subsamples': [None, 1, 5, 10, 25],
    }
}


models = [Ridge(), DecisionTreeRegressor(), GradientBoostingRegressor(), RandomForestRegressor(),AdaBoostRegressor(), KNeighborsRegressor(), MLPRegressor(max_iter=200),ElasticNet(max_iter=1000),SGDRegressor(max_iter=1000),SVR(cache_size=1000),BayesianRidge(max_iter=1000),KernelRidge(),LinearRegression(), RANSACRegressor(), TheilSenRegressor()]

import warnings
warnings.filterwarnings("ignore")



# Train and tune hyperparameters for each model
best_models = {}

for model in models + ['TensorFlow']:  # Add TensorFlow model to the loop
    print(model)
    if model == 'TensorFlow':
        # Define the TensorFlow model
        model_tf = tf.keras.models.Sequential([
            tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(1)
        ])

        # Compile the TensorFlow model
        model_tf.compile(optimizer='adam', loss='mean_squared_error')

        # Standardize the data for TensorFlow model
        scaler_tf = StandardScaler()
        X_train_scaled_tf = scaler_tf.fit_transform(X_train)
        X_test_scaled_tf = scaler_tf.transform(X_test)

        # Train the TensorFlow model
        model_tf.fit(X_train_scaled_tf, y_train, epochs=100, batch_size=32, validation_split=0.2)

        # Evaluate the TensorFlow model
        y_pred_tf = model_tf.predict(X_test_scaled_tf)
        rmse_tf = mean_squared_error(y_test, y_pred_tf, squared=False)
        print(f"TensorFlow RMSE: {rmse_tf}")

        # Add TensorFlow model to best_models
        best_models['TensorFlow'] = model_tf
    else:
        model_name = model.__class__.__name__
        pipeline = make_pipeline(StandardScaler(), model)
        # Perform grid search for hyperparameters
        if model_name in param_grid:
            grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=5, scoring='neg_mean_squared_error')
            grid_search.fit(X_train, y_train)
            best_models[model_name] = grid_search.best_estimator_
            print(f"Best hyperparameters for {model_name}: {grid_search.best_params_}")
        else:
            pipeline.fit(X_train, y_train)
            best_models[model_name] = pipeline




scaler_filename = "SR_scaler_tf.pkl"
joblib.dump(scaler_tf, scaler_filename)

import os
import tensorflow as tf
import joblib
# Directory where you want to save your models
model_directory = "SR"

# Create the directory if it doesn't exist
if not os.path.exists(model_directory):
    os.makedirs(model_directory)

# Make predictions using the best models
for model_name, model in best_models.items():
    print(f"Model: {model_name}")

    # Make predictions
    if model_name == 'TensorFlow':
        y_pred = y_pred_tf  # Use predictions from TensorFlow model
        # Save the TensorFlow model
        model_filename = os.path.join(model_directory, f"{model_name}_model.h5")
        model.save(model_filename)
        #print(f"Model saved as {model_filename}")
    else:
        y_pred = model.predict(X_test)

        # Save the other models using joblib
        model_filename = os.path.join(model_directory, f"{model_name}_model.pkl")
        joblib.dump(model, model_filename)
        #print(f"Model saved as {model_filename}")

    # Calculate and print RMSE
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    print(f"RMSE: {rmse}")

    # Show real and predicted results for the first 5 samples
    #print("Sample predictions:")
   # for i in range(5):
    #    print(f"Sample {i+1}: Real NR = {y_test.iloc[i]}, Predicted NR = {y_pred[i]}")

    print("\n")

Ridge()
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
DecisionTreeRegressor()
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'absolute_error', 'decisiontreeregressor__max_features': 'sqrt', 'decisiontreeregressor__min_samples_split': 5, 'decisiontreeregressor__splitter': 'random'}
GradientBoostingRegressor()
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.1, 'gradientboostingregressor__loss': 'huber', 'gradientboostingregressor__n_estimators': 250, 'gradientboostingregressor__warm_start': False}
RandomForestRegressor()
Best hyperparameters for RandomForestRegressor: {'randomforestregressor__criterion': 'friedman_mse', 'randomforestregressor__max_features': 'sqrt', 'randomforestregressor__min_samples_split': 2, 'randomforestregressor__n_estimators': 100}
AdaBoostRegressor()
Best hyperparameters for AdaBoostRegressor: {'adaboostregressor__learning_rate': 1.0, 'adaboos







[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
TensorFlow RMSE: 18.266800915905105
Model: Ridge
RMSE: 2.7416689134378043


Model: DecisionTreeRegressor
RMSE: 3.8820690426033755


Model: GradientBoostingRegressor
RMSE: 1.952565262079683


Model: RandomForestRegressor
RMSE: 2.390240991583788


Model: AdaBoostRegressor
RMSE: 1.7662717602904512


Model: KNeighborsRegressor
RMSE: 2.5483244821008255


Model: MLPRegressor
RMSE: 2.3380229576739384


Model: ElasticNet
RMSE: 2.1932454571632563


Model: SGDRegressor
RMSE: 2.863368512482077


Model: SVR
RMSE: 2.184281599358465


Model: BayesianRidge
RMSE: 2.5230986637158646


Model: KernelRidge
RMSE: 12.71682107465145


Model: LinearRegression
RMSE: 196.9838158074395


Model: RANSACRegressor
RMSE: 749.9827284514041


Model: TheilSenRegressor
RMSE: 86.80863812783315


Model: TensorFlow




RMSE: 18.266800915905105




In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
from sklearn.kernel_ridge import KernelRidge

from sklearn.linear_model import LinearRegression, RANSACRegressor, TheilSenRegressor
from sklearn.decomposition import PCA  # Import PCA









# Load the data
file_path = "data_all_numerical_select_reduced.xlsx"
data = pd.read_excel(file_path)

# Define columns
data_columns = [
    'OF2',
    'OF3',
    'OF4',
    'OF5',
    'OF6',
    'OF7',
    'OF8',
    'OF9',
    'OF10',
    'OF11',
    'OF12',
    'OF13',
    'OF14',
    'OF15',
    'OF16',
    'OF17',
    'OF18',
    'OF19',
    'OF20',
    'OF21',
    'OF22',
    'OF23',
    'OF24',
    'OF25',
    'OF26',
    'OF27',
    'OF28',
    'OF29',
    'OF30',
    'OF31',
    'OF32',
    'OF33',
    'OF34',
    'OF37',
    'OF38',
    'F1',
    'F2',
    'F3_1',
    'F3_2',
    'F3_3',
    'F3_4',
    'F3_5',
    'F3_6',
    'F3_7',
    'F4',
    'F5',
    'F6',
    'F7',
    'F8',
    'F9',
    'F10',
    'F11',
    'F12',
    'F13',
    'F14',
    'F15',
    'F16',
    'F17',
    'F18',
    'F19',
    'F20',
    'F21',
    'F22',
    'F23',
    'F24',
    'F25',
    'F26',
    'F27',
    'F28',
    'F29',
    'F30',
    'F31',
    'F32',
    'F33',
    'F34',
    'F35',
    'F36',
    'F37',
    'F38',
    'F39',
    'F40',
    'F41',
    'F42',
    'F43',
    'F44',
    'F45',
    'F46',
    'F47',
    'F48',
    'F49',
    'F50',
    'F51',
    'F52',
    'F53',
    'F54',
    'F55',
    'F56',
    'F57',
    'F58',
    'F59',
    'F62',
    'F63',
    'F64',
    'F65',
    'F66',
    'F67',
    'F68',
    'S1',
    'S2',
    'S3',
    'S4',
    'S5',
    'S6',
    'PC',
    'FC',
    'WRI',
    'SVT',
    'VCHWC',
    'HWCC',
    'MC',
    'PP',
    'ST',
    'SWP',
    'DP',
    'ADLM',
    'ATDO',
    'AOD'
]

results_columns = ['NR']

# Prepare data for regression
X = data[data_columns]
y = data[results_columns[0]]

pca = PCA(n_components='mle')  # Retain 95% of variance
X_pca = pca.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

param_grid = {
    'Ridge': {
        'ridge__alpha': [0.1, 0.5, 1.0],
        'ridge__solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs']
    },
    'DecisionTreeRegressor': {
        'decisiontreeregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'decisiontreeregressor__splitter': ['best', 'random'],
        'decisiontreeregressor__min_samples_split': [1, 2, 3, 4, 5],
        'decisiontreeregressor__max_features': [0, 1, 2, 3, 'sqrt', 'log2']
    },
    'RandomForestRegressor': {
        'randomforestregressor__n_estimators': [1, 50, 100],
        'randomforestregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'randomforestregressor__min_samples_split': [2 , 5,],
        'randomforestregressor__max_features': [1, 3, 'sqrt', 'log2'],
    },
    'GradientBoostingRegressor': {
        'gradientboostingregressor__loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
        'gradientboostingregressor__learning_rate': [0.001, 0.01, 0.1],
        'gradientboostingregressor__n_estimators': [25, 50, 100,250],
        'gradientboostingregressor__warm_start': [True, False],
    },
    'AdaBoostRegressor': {
        'adaboostregressor__n_estimators': [1, 20, 50, 100],
        'adaboostregressor__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0, 10],
        'adaboostregressor__loss': ['linear', 'square', 'exponential']
    },
    'KNeighborsRegressor': {
        'kneighborsregressor__n_neighbors': [2, 5, 10, 25],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__algorithm': ['ball_tree', 'kd_tree', 'brute'],
        'kneighborsregressor__leaf_size': [5, 30, 50],
        'kneighborsregressor__metric': ['cityblock', 'cosine', 'euclidean', 'haversine', 'l1', 'l2', 'manhattan', 'nan_euclidean']
    },
    'MLPRegressor': {
        'mlpregressor__hidden_layer_sizes': [(50, 50, 50), (100, 100, 100), (100, 100, 100, 100),],
        'mlpregressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
        'mlpregressor__solver': ['lbfgs', 'sgd', 'adam'],
        'mlpregressor__learning_rate': ['constant', 'invscaling', 'adaptive'],
    },
    'ElasticNet': {
        'elasticnet__l1_ratio': [ 0.25, 0.5, 0.75,],
        'elasticnet__fit_intercept': [True, False],
        'elasticnet__precompute': [True, False],
        'elasticnet__copy_X': [True, False],
        'elasticnet__warm_start': [True, False],
        'elasticnet__positive': [True, False],
        'elasticnet__selection': ['cyclic', 'random']
    },
    'SGDRegressor': {
        'sgdregressor__loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
        'sgdregressor__penalty': ['l2', 'l1', 'elasticnet', None],
        'sgdregressor__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
        'sgdregressor__warm_start': [True, False],

    },
    'SVR': {
        'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
        'svr__degree': [1, 3, 5, 10],
        'svr__gamma': ['scale', 'auto', 1.0, 5.0],
        'svr__shrinking': [True, False]
    },
    'BayesianRidge': {
        'bayesianridge__alpha_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__alpha_2': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_1': [ 1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_2': [ 1e-7, 1e-6, 1e-5],
    },
    'KernelRidge': {
        'kernelridge__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0],
        'kernelridge__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
        'kernelridge__degree': [1, 2, 3, 5, 10],
        'kernelridge__coef0': [0.0, 0.5, 1.0]
    },
    'LinearRegression': {
        'linearregression__fit_intercept': [True, False],
        'linearregression__copy_X': [True, False],
        'linearregression__positive': [True, False]
    },
    'RANSACRegressor': {
        'ransacregressor__min_samples': [None, 1, 2, 5, 10, 50],
        'ransacregressor__max_trials': [1, 10, 50, 100, 150],
        'ransacregressor__loss': ['absolute_error', 'squared_error']
    },
    'TheilSenRegressor': {
        'theilsenregressor__max_subpopulation': [1, 10, 100, 1000],
        'theilsenregressor__n_subsamples': [None, 1, 5, 10, 25],
    }
}


models = [Ridge(), DecisionTreeRegressor(), GradientBoostingRegressor(), RandomForestRegressor(),AdaBoostRegressor(), KNeighborsRegressor(), MLPRegressor(max_iter=200),ElasticNet(max_iter=1000),SGDRegressor(max_iter=1000),SVR(cache_size=1000),BayesianRidge(max_iter=1000),KernelRidge(),LinearRegression(), RANSACRegressor(), TheilSenRegressor()]

import warnings
warnings.filterwarnings("ignore")



# Train and tune hyperparameters for each model
best_models = {}

for model in models + ['TensorFlow']:  # Add TensorFlow model to the loop
    print(model)
    if model == 'TensorFlow':
        # Define the TensorFlow model
        model_tf = tf.keras.models.Sequential([
            tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(1)
        ])

        # Compile the TensorFlow model
        model_tf.compile(optimizer='adam', loss='mean_squared_error')

        # Standardize the data for TensorFlow model
        scaler_tf = StandardScaler()
        X_train_scaled_tf = scaler_tf.fit_transform(X_train)
        X_test_scaled_tf = scaler_tf.transform(X_test)

        # Train the TensorFlow model
        model_tf.fit(X_train_scaled_tf, y_train, epochs=100, batch_size=32, validation_split=0.2)

        # Evaluate the TensorFlow model
        y_pred_tf = model_tf.predict(X_test_scaled_tf)
        rmse_tf = mean_squared_error(y_test, y_pred_tf, squared=False)
        print(f"TensorFlow RMSE: {rmse_tf}")

        # Add TensorFlow model to best_models
        best_models['TensorFlow'] = model_tf
    else:
        model_name = model.__class__.__name__
        pipeline = make_pipeline(StandardScaler(), model)
        # Perform grid search for hyperparameters
        if model_name in param_grid:
            grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=5, scoring='neg_mean_squared_error')
            grid_search.fit(X_train, y_train)
            best_models[model_name] = grid_search.best_estimator_
            print(f"Best hyperparameters for {model_name}: {grid_search.best_params_}")
        else:
            pipeline.fit(X_train, y_train)
            best_models[model_name] = pipeline




scaler_filename = "NR_scaler_tf.pkl"
joblib.dump(scaler_tf, scaler_filename)

import os
import tensorflow as tf
import joblib
# Directory where you want to save your models
model_directory = "NR"

# Create the directory if it doesn't exist
if not os.path.exists(model_directory):
    os.makedirs(model_directory)

# Make predictions using the best models
for model_name, model in best_models.items():
    print(f"Model: {model_name}")

    # Make predictions
    if model_name == 'TensorFlow':
        y_pred = y_pred_tf  # Use predictions from TensorFlow model
        # Save the TensorFlow model
        model_filename = os.path.join(model_directory, f"{model_name}_model.h5")
        model.save(model_filename)
        #print(f"Model saved as {model_filename}")
    else:
        y_pred = model.predict(X_test)

        # Save the other models using joblib
        model_filename = os.path.join(model_directory, f"{model_name}_model.pkl")
        joblib.dump(model, model_filename)
        #print(f"Model saved as {model_filename}")

    # Calculate and print RMSE
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    print(f"RMSE: {rmse}")

    # Show real and predicted results for the first 5 samples
    #print("Sample predictions:")
   # for i in range(5):
    #    print(f"Sample {i+1}: Real NR = {y_test.iloc[i]}, Predicted NR = {y_pred[i]}")

    print("\n")

Ridge()
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
DecisionTreeRegressor()
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'squared_error', 'decisiontreeregressor__max_features': 'log2', 'decisiontreeregressor__min_samples_split': 2, 'decisiontreeregressor__splitter': 'best'}
GradientBoostingRegressor()
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.1, 'gradientboostingregressor__loss': 'squared_error', 'gradientboostingregressor__n_estimators': 100, 'gradientboostingregressor__warm_start': False}
RandomForestRegressor()
Best hyperparameters for RandomForestRegressor: {'randomforestregressor__criterion': 'absolute_error', 'randomforestregressor__max_features': 'sqrt', 'randomforestregressor__min_samples_split': 2, 'randomforestregressor__n_estimators': 50}
AdaBoostRegressor()
Best hyperparameters for AdaBoostRegressor: {'adaboostregressor__learning_rate': 1.0, 'a



RMSE: 3.004760053671302


Model: MLPRegressor
RMSE: 1.769447869975285


Model: ElasticNet
RMSE: 2.4341424924551833


Model: SGDRegressor
RMSE: 4.4291122142737605


Model: SVR
RMSE: 6.198189104831161


Model: BayesianRidge
RMSE: 4.439034389572746


Model: KernelRidge
RMSE: 4.441067200757756


Model: LinearRegression
RMSE: 235.53194201784382


Model: RANSACRegressor
RMSE: 5.998286518312221


Model: TheilSenRegressor
RMSE: 183.01128895235414


Model: TensorFlow
RMSE: 14.08778377725483




In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
from sklearn.kernel_ridge import KernelRidge

from sklearn.linear_model import LinearRegression, RANSACRegressor, TheilSenRegressor
from sklearn.decomposition import PCA  # Import PCA









# Load the data
file_path = "data_all_numerical_select_reduced.xlsx"
data = pd.read_excel(file_path)

# Define columns
data_columns = [
    'OF2',
    'OF3',
    'OF4',
    'OF5',
    'OF6',
    'OF7',
    'OF8',
    'OF9',
    'OF10',
    'OF11',
    'OF12',
    'OF13',
    'OF14',
    'OF15',
    'OF16',
    'OF17',
    'OF18',
    'OF19',
    'OF20',
    'OF21',
    'OF22',
    'OF23',
    'OF24',
    'OF25',
    'OF26',
    'OF27',
    'OF28',
    'OF29',
    'OF30',
    'OF31',
    'OF32',
    'OF33',
    'OF34',
    'OF37',
    'OF38',
    'F1',
    'F2',
    'F3_1',
    'F3_2',
    'F3_3',
    'F3_4',
    'F3_5',
    'F3_6',
    'F3_7',
    'F4',
    'F5',
    'F6',
    'F7',
    'F8',
    'F9',
    'F10',
    'F11',
    'F12',
    'F13',
    'F14',
    'F15',
    'F16',
    'F17',
    'F18',
    'F19',
    'F20',
    'F21',
    'F22',
    'F23',
    'F24',
    'F25',
    'F26',
    'F27',
    'F28',
    'F29',
    'F30',
    'F31',
    'F32',
    'F33',
    'F34',
    'F35',
    'F36',
    'F37',
    'F38',
    'F39',
    'F40',
    'F41',
    'F42',
    'F43',
    'F44',
    'F45',
    'F46',
    'F47',
    'F48',
    'F49',
    'F50',
    'F51',
    'F52',
    'F53',
    'F54',
    'F55',
    'F56',
    'F57',
    'F58',
    'F59',
    'F62',
    'F63',
    'F64',
    'F65',
    'F66',
    'F67',
    'F68',
    'S1',
    'S2',
    'S3',
    'S4',
    'S5',
    'S6',
    'PC',
    'FC',
    'WRI',
    'SVT',
    'VCHWC',
    'HWCC',
    'MC',
    'PP',
    'ST',
    'SWP',
    'DP',
    'ADLM',
    'ATDO',
    'AOD'
]

results_columns = ['PR']

# Prepare data for regression
X = data[data_columns]
y = data[results_columns[0]]

pca = PCA(n_components='mle')  # Retain 95% of variance
X_pca = pca.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

param_grid = {
    'Ridge': {
        'ridge__alpha': [0.1, 0.5, 1.0],
        'ridge__solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs']
    },
    'DecisionTreeRegressor': {
        'decisiontreeregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'decisiontreeregressor__splitter': ['best', 'random'],
        'decisiontreeregressor__min_samples_split': [1, 2, 3, 4, 5],
        'decisiontreeregressor__max_features': [0, 1, 2, 3, 'sqrt', 'log2']
    },
    'RandomForestRegressor': {
        'randomforestregressor__n_estimators': [1, 50, 100],
        'randomforestregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'randomforestregressor__min_samples_split': [2 , 5,],
        'randomforestregressor__max_features': [1, 3, 'sqrt', 'log2'],
    },
    'GradientBoostingRegressor': {
        'gradientboostingregressor__loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
        'gradientboostingregressor__learning_rate': [0.001, 0.01, 0.1],
        'gradientboostingregressor__n_estimators': [25, 50, 100,250],
        'gradientboostingregressor__warm_start': [True, False],
    },
    'AdaBoostRegressor': {
        'adaboostregressor__n_estimators': [1, 20, 50, 100],
        'adaboostregressor__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0, 10],
        'adaboostregressor__loss': ['linear', 'square', 'exponential']
    },
    'KNeighborsRegressor': {
        'kneighborsregressor__n_neighbors': [2, 5, 10, 25],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__algorithm': ['ball_tree', 'kd_tree', 'brute'],
        'kneighborsregressor__leaf_size': [5, 30, 50],
        'kneighborsregressor__metric': ['cityblock', 'cosine', 'euclidean', 'haversine', 'l1', 'l2', 'manhattan', 'nan_euclidean']
    },
    'MLPRegressor': {
        'mlpregressor__hidden_layer_sizes': [(50, 50, 50), (100, 100, 100), (100, 100, 100, 100),],
        'mlpregressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
        'mlpregressor__solver': ['lbfgs', 'sgd', 'adam'],
        'mlpregressor__learning_rate': ['constant', 'invscaling', 'adaptive'],
    },
    'ElasticNet': {
        'elasticnet__l1_ratio': [ 0.25, 0.5, 0.75,],
        'elasticnet__fit_intercept': [True, False],
        'elasticnet__precompute': [True, False],
        'elasticnet__copy_X': [True, False],
        'elasticnet__warm_start': [True, False],
        'elasticnet__positive': [True, False],
        'elasticnet__selection': ['cyclic', 'random']
    },
    'SGDRegressor': {
        'sgdregressor__loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
        'sgdregressor__penalty': ['l2', 'l1', 'elasticnet', None],
        'sgdregressor__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
        'sgdregressor__warm_start': [True, False],

    },
    'SVR': {
        'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
        'svr__degree': [1, 3, 5, 10],
        'svr__gamma': ['scale', 'auto', 1.0, 5.0],
        'svr__shrinking': [True, False]
    },
    'BayesianRidge': {
        'bayesianridge__alpha_1': [1e-7, 1e-6, 1e-5],
        'bayesianridge__alpha_2': [1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_1': [ 1e-7, 1e-6, 1e-5],
        'bayesianridge__lambda_2': [ 1e-7, 1e-6, 1e-5],
    },
    'KernelRidge': {
        'kernelridge__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0],
        'kernelridge__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
        'kernelridge__degree': [1, 2, 3, 5, 10],
        'kernelridge__coef0': [0.0, 0.5, 1.0]
    },
    'LinearRegression': {
        'linearregression__fit_intercept': [True, False],
        'linearregression__copy_X': [True, False],
        'linearregression__positive': [True, False]
    },
    'RANSACRegressor': {
        'ransacregressor__min_samples': [None, 1, 2, 5, 10, 50],
        'ransacregressor__max_trials': [1, 10, 50, 100, 150],
        'ransacregressor__loss': ['absolute_error', 'squared_error']
    },
    'TheilSenRegressor': {
        'theilsenregressor__max_subpopulation': [1, 10, 100, 1000],
        'theilsenregressor__n_subsamples': [None, 1, 5, 10, 25],
    }
}


models = [Ridge(), DecisionTreeRegressor(), GradientBoostingRegressor(), RandomForestRegressor(),AdaBoostRegressor(), KNeighborsRegressor(), MLPRegressor(max_iter=200),ElasticNet(max_iter=1000),SGDRegressor(max_iter=1000),SVR(cache_size=1000),BayesianRidge(max_iter=1000),KernelRidge(),LinearRegression(), RANSACRegressor(), TheilSenRegressor()]

import warnings
warnings.filterwarnings("ignore")



# Train and tune hyperparameters for each model
best_models = {}

for model in models + ['TensorFlow']:  # Add TensorFlow model to the loop
    print(model)
    if model == 'TensorFlow':
        # Define the TensorFlow model
        model_tf = tf.keras.models.Sequential([
            tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(1)
        ])

        # Compile the TensorFlow model
        model_tf.compile(optimizer='adam', loss='mean_squared_error')

        # Standardize the data for TensorFlow model
        scaler_tf = StandardScaler()
        X_train_scaled_tf = scaler_tf.fit_transform(X_train)
        X_test_scaled_tf = scaler_tf.transform(X_test)

        # Train the TensorFlow model
        model_tf.fit(X_train_scaled_tf, y_train, epochs=100, batch_size=32, validation_split=0.2)

        # Evaluate the TensorFlow model
        y_pred_tf = model_tf.predict(X_test_scaled_tf)
        rmse_tf = mean_squared_error(y_test, y_pred_tf, squared=False)
        print(f"TensorFlow RMSE: {rmse_tf}")

        # Add TensorFlow model to best_models
        best_models['TensorFlow'] = model_tf
    else:
        model_name = model.__class__.__name__
        pipeline = make_pipeline(StandardScaler(), model)
        # Perform grid search for hyperparameters
        if model_name in param_grid:
            grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=5, scoring='neg_mean_squared_error')
            grid_search.fit(X_train, y_train)
            best_models[model_name] = grid_search.best_estimator_
            print(f"Best hyperparameters for {model_name}: {grid_search.best_params_}")
        else:
            pipeline.fit(X_train, y_train)
            best_models[model_name] = pipeline




scaler_filename = "PR_scaler_tf.pkl"
joblib.dump(scaler_tf, scaler_filename)

import os
import tensorflow as tf
import joblib
# Directory where you want to save your models
model_directory = "PR"

# Create the directory if it doesn't exist
if not os.path.exists(model_directory):
    os.makedirs(model_directory)

# Make predictions using the best models
for model_name, model in best_models.items():
    print(f"Model: {model_name}")

    # Make predictions
    if model_name == 'TensorFlow':
        y_pred = y_pred_tf  # Use predictions from TensorFlow model
        # Save the TensorFlow model
        model_filename = os.path.join(model_directory, f"{model_name}_model.h5")
        model.save(model_filename)
        #print(f"Model saved as {model_filename}")
    else:
        y_pred = model.predict(X_test)

        # Save the other models using joblib
        model_filename = os.path.join(model_directory, f"{model_name}_model.pkl")
        joblib.dump(model, model_filename)
        #print(f"Model saved as {model_filename}")

    # Calculate and print RMSE
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    print(f"RMSE: {rmse}")

    # Show real and predicted results for the first 5 samples
    #print("Sample predictions:")
   # for i in range(5):
    #    print(f"Sample {i+1}: Real NR = {y_test.iloc[i]}, Predicted NR = {y_pred[i]}")

    print("\n")

Ridge()
Best hyperparameters for Ridge: {'ridge__alpha': 1.0, 'ridge__solver': 'saga'}
DecisionTreeRegressor()
Best hyperparameters for DecisionTreeRegressor: {'decisiontreeregressor__criterion': 'poisson', 'decisiontreeregressor__max_features': 1, 'decisiontreeregressor__min_samples_split': 4, 'decisiontreeregressor__splitter': 'best'}
GradientBoostingRegressor()
Best hyperparameters for GradientBoostingRegressor: {'gradientboostingregressor__learning_rate': 0.1, 'gradientboostingregressor__loss': 'squared_error', 'gradientboostingregressor__n_estimators': 250, 'gradientboostingregressor__warm_start': False}
RandomForestRegressor()
Best hyperparameters for RandomForestRegressor: {'randomforestregressor__criterion': 'friedman_mse', 'randomforestregressor__max_features': 'sqrt', 'randomforestregressor__min_samples_split': 2, 'randomforestregressor__n_estimators': 50}
AdaBoostRegressor()
Best hyperparameters for AdaBoostRegressor: {'adaboostregressor__learning_rate': 1.0, 'adaboostregres



RMSE: 1.2807909634958359


Model: KNeighborsRegressor
RMSE: 2.298001014830678


Model: MLPRegressor
RMSE: 6.117177470889234


Model: ElasticNet
RMSE: 3.549001453184378


Model: SGDRegressor
RMSE: 6.391009139931659


Model: SVR
RMSE: 8.839395178331733


Model: BayesianRidge
RMSE: 6.3694919478125795


Model: KernelRidge
RMSE: 6.412290432125347


Model: LinearRegression
RMSE: 282.72326900379227


Model: RANSACRegressor
RMSE: 8.270247892935153


Model: TheilSenRegressor
RMSE: 181.75746324270537


Model: TensorFlow
RMSE: 15.420255381824443




In [None]:
# After fitting PCA
joblib.dump(pca, 'pca_model.pkl')

# In other code
pca = joblib.load('pca_model.pkl')