In [24]:
import pandas as pd
import ast
import os
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, ElasticNet, SGDRegressor, BayesianRidge, LinearRegression, RANSACRegressor, TheilSenRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA, TruncatedSVD, FastICA, KernelPCA
from sklearn.random_projection import GaussianRandomProjection
import numpy as np

# Define the data columns and results columns
# data_columns: names of the input columns selected from each CSV to form the
# feature matrix X. The OF*/F*/S* numbering is not contiguous (for example
# OF12, F11 and S3 are absent) and F3 appears as sub-columns F3_a..F3_g.
# NOTE(review): presumably the gaps are deliberate exclusions -- confirm before
# "filling in" any missing names.
data_columns = [
    'OF2', 'OF3', 'OF4', 'OF5', 'OF6', 'OF7', 'OF8', 'OF9', 'OF10', 'OF11', 'OF13', 'OF14', 'OF15', 'OF16', 'OF17',
    'OF18', 'OF19', 'OF20', 'OF21', 'OF22', 'OF23', 'OF24', 'OF25', 'OF26', 'OF27', 'OF28', 'OF30', 'OF31',
    'OF33', 'OF34', 'OF37', 'OF38', 'F1', 'F2', 'F3_a', 'F3_b', 'F3_c', 'F3_d', 'F3_e', 'F3_f', 'F3_g', 'F4', 'F5', 'F6',
    'F7', 'F8', 'F9', 'F10', 'F12', 'F13', 'F14', 'F15', 'F16', 'F17', 'F18', 'F19', 'F20', 'F21', 'F22', 'F23',
    'F24', 'F25', 'F28', 'F29', 'F30', 'F31', 'F32', 'F33', 'F34', 'F35', 'F36', 'F37', 'F38', 'F39', 'F40',
    'F41', 'F43', 'F44', 'F45', 'F46', 'F47', 'F48', 'F49', 'F50', 'F51', 'F52', 'F53', 'F54', 'F55', 'F56', 'F57',
    'F58', 'F59', 'F62', 'F63', 'F64', 'F65', 'F67', 'F68', 'S1', 'S2', 'S4', 'S5'
]

# results_columns: target column name(s). Only the first entry is read when
# building the target vector y in retrain_best_models.
results_columns = ['WS']

# Mapping from model names to model classes. Each key is the class's own
# __name__, so the lookup string is always spelled exactly like the class.
model_mapping = {
    regressor.__name__: regressor
    for regressor in (
        Ridge,
        DecisionTreeRegressor,
        GradientBoostingRegressor,
        RandomForestRegressor,
        AdaBoostRegressor,
        KNeighborsRegressor,
        MLPRegressor,
        ElasticNet,
        SGDRegressor,
        SVR,
        BayesianRidge,
        KernelRidge,
        LinearRegression,
        RANSACRegressor,
        TheilSenRegressor,
    )
}

# Function to retrain the best models
def retrain_best_models(csv_file, model_name, hyperparameters, reduction_technique, n_components):
    """Retrain one model on dimensionality-reduced data and report its test RMSE.

    The CSV is loaded, split 90/10 into train/test (fixed ``random_state=42``),
    standardized and reduced by a scaler+reducer pipeline fitted on the training
    split only, and then used to fit and score the requested model.

    Args:
        csv_file: Path of the CSV file holding the ``data_columns`` features
            and the ``results_columns[0]`` target.
        model_name: Key into ``model_mapping`` selecting the regressor class.
        hyperparameters: Dict of ``'<step>__<param>'`` keys to values. Only keys
            that start with ``model_name.lower()`` are used; the part after the
            first ``'__'`` is passed to the model constructor.
        reduction_technique: ``'PCA'`` or ``'Random Projection'``.
        n_components: Number of components to keep; cast to ``int``.

    Returns:
        The root-mean-squared error of the model on the held-out test split.

    Raises:
        ValueError: If ``reduction_technique`` is not a supported technique.
        KeyError: If ``model_name`` is not a key of ``model_mapping``.
    """
    # Resolve the reducer first so an unsupported technique fails with a clear
    # message instead of an UnboundLocalError further down.
    reducer_classes = {
        'PCA': PCA,
        'Random Projection': GaussianRandomProjection,
        # Add other reduction techniques as needed
    }
    if reduction_technique not in reducer_classes:
        raise ValueError(
            f"Unsupported reduction technique: {reduction_technique!r}; "
            f"expected one of {sorted(reducer_classes)}"
        )
    # n_components may arrive as a float (e.g. 2.0), so cast it to an integer.
    reducer = reducer_classes[reduction_technique](n_components=int(n_components))

    data = pd.read_csv(csv_file)
    X = data[data_columns]
    y = data[results_columns[0]]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    # Scaling happens inside the pipeline, so no separate StandardScaler pass
    # is needed; both steps are fitted on the training split only.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('reducer', reducer)
    ])
    X_train_reduced = pipeline.fit_transform(X_train)
    X_test_reduced = pipeline.transform(X_test)

    # Select the model class
    model_class = model_mapping[model_name]

    # Keep only this model's hyperparameters and strip the '<step>__' prefix.
    # Keys without a '__' separator are skipped rather than raising IndexError.
    prefix = model_name.lower()
    model_hyperparameters = {}
    for key, value in hyperparameters.items():
        _, separator, param = key.partition('__')
        if separator and key.startswith(prefix):
            model_hyperparameters[param] = value

    # Instantiate and train the model
    model = model_class(**model_hyperparameters)
    model.fit(X_train_reduced, y_train)

    # Make predictions
    y_pred = model.predict(X_test_reduced)

    # Calculate RMSE. The `squared=False` argument of mean_squared_error is
    # deprecated/removed in recent scikit-learn releases, so take the root here.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"Model: {model_name}, Reduction Technique: {reduction_technique}, Components: {n_components}, RMSE: {rmse}")
    return rmse

# Load the best-models table and discard rows that have no recorded
# hyperparameters (NaN in the 'hyperparameters' column).
best_models_df = pd.read_csv("dimensionality_reduction_rmses_WS.csv").dropna(
    subset=['hyperparameters']
)

# Retrain and score every remaining configuration, one row at a time.
for _, best in best_models_df.iterrows():
    retrain_best_models(
        csv_file=best['csv_file'],
        model_name=best['model_name'],
        # The hyperparameters are stored as the text of a Python literal.
        hyperparameters=ast.literal_eval(best['hyperparameters']),
        reduction_technique=best['technique'],
        n_components=best['n_components'],
    )



Model: AdaBoostRegressor, Reduction Technique: PCA, Components: 2.0, RMSE: 1.408752995804394
Model: AdaBoostRegressor, Reduction Technique: PCA, Components: 3.0, RMSE: 1.3553264020616125
Model: AdaBoostRegressor, Reduction Technique: PCA, Components: 4.0, RMSE: 1.3882368225963064
Model: AdaBoostRegressor, Reduction Technique: PCA, Components: 5.0, RMSE: 1.4382152053036437
Model: AdaBoostRegressor, Reduction Technique: PCA, Components: 6.0, RMSE: 1.345787318574354
Model: AdaBoostRegressor, Reduction Technique: PCA, Components: 7.0, RMSE: 1.4414923917024591
Model: AdaBoostRegressor, Reduction Technique: PCA, Components: 8.0, RMSE: 1.5151029743160391
Model: AdaBoostRegressor, Reduction Technique: PCA, Components: 9.0, RMSE: 1.3732042932730233
Model: AdaBoostRegressor, Reduction Technique: PCA, Components: 10.0, RMSE: 1.4638506731664875
Model: AdaBoostRegressor, Reduction Technique: PCA, Components: 11.0, RMSE: 1.3451381905524458
Model: AdaBoostRegressor, Reduction Technique: PCA, Componen