In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA, FastICA, KernelPCA, TruncatedSVD
from sklearn.manifold import TSNE, Isomap, LocallyLinearEmbedding
from sklearn.random_projection import GaussianRandomProjection
from sklearn.metrics import mean_squared_error
from umap import UMAP
from sklearn.linear_model import Ridge, ElasticNet, SGDRegressor, BayesianRidge, LinearRegression, RANSACRegressor, TheilSenRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge

# Load the data
file_path = "data_all_numerical_select_reduced.xlsx"
data = pd.read_excel(file_path)

# Define columns
data_columns = [
    'OF22',
    'OF25',
    'OF26',
    'F3_1',
    'F3_2',
    'F3_3',
    'F3_4',
    'F3_5',
    'F3_6',
    'F3_7',
    'F20',
    'F21',
    'F22',
    'F28',
    'F31',
    'F41',
    'F42',
    'F44',
    'F48',
    'F49',
    'PC',
    'FC',
    'WRI',
    'SVT',
    'VCHWC',
    'HWCC',
    'MC',
    'PP',
    'ST',
    'SWP',
    'DP',
    'ADLM',
    'ATDO',
    'AOD'
]

results_columns = ['WS']

# Prepare data for regression
X = data[data_columns]
y = data[results_columns[0]]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Implement dimensionality reduction techniques
dimensionality_reduction_techniques = {
    "PCA": PCA(n_components=10),
    "t-SNE": TSNE(n_components=2),
    "UMAP": UMAP(n_components=10),
    "Isomap": Isomap(n_components=10),
    "LLE": LocallyLinearEmbedding(n_components=10),
    "Autoencoders": TruncatedSVD(n_components=10),  # Assuming TruncatedSVD as a simple autoencoder
    "ICA": FastICA(n_components=10),
    "Kernel PCA": KernelPCA(n_components=10),
    "Random Projection": GaussianRandomProjection(n_components=10)  # Assuming GaussianRandomProjection
}

# Define the models
models = [
    Ridge(), DecisionTreeRegressor(), GradientBoostingRegressor(),
    RandomForestRegressor(), AdaBoostRegressor(), KNeighborsRegressor(),
    MLPRegressor(max_iter=200), ElasticNet(max_iter=1000), SGDRegressor(max_iter=1000),
    SVR(cache_size=1000), BayesianRidge(max_iter=1000), KernelRidge(),
    LinearRegression(), RANSACRegressor(), TheilSenRegressor()
]

# Train and evaluate each model with each dimensionality reduction technique
results = {}
predictions = {}
for name, reducer in dimensionality_reduction_techniques.items():
    results[name] = {}
    predictions[name] = {}
    for model in models:
        # Transform the data
        if name == "LDA":
            X_train_reduced = reducer.fit_transform(X_train_scaled, y_train)
            X_test_reduced = reducer.transform(X_test_scaled)
        elif name == "t-SNE":
            X_concatenated = np.concatenate((X_train_scaled, X_test_scaled), axis=0)
            X_reduced = reducer.fit_transform(X_concatenated)
            X_train_reduced = X_reduced[:len(X_train_scaled)]
            X_test_reduced = X_reduced[len(X_train_scaled):]
        else:
            X_train_reduced = reducer.fit_transform(X_train_scaled)
            X_test_reduced = reducer.transform(X_test_scaled)
        
        # Train the model
        model.fit(X_train_reduced, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test_reduced)
        
        # Calculate MSE
        mse = mean_squared_error(y_test, y_pred)
        
        # Store results
        results[name][model.__class__.__name__] = mse
        predictions[name][model.__class__.__name__] = y_pred

# Print results
for name, model_results in results.items():
    print(f"Results using {name}:")
    for model, mse in model_results.items():
        print(f"MSE using {model}: {mse}")

# Show 5 predictions from each reduction algorithm along with the expected value
for name, preds in predictions.items():
    print(f"Predictions using {name}:")
    for model, y_pred in preds.items():
        print(f"Model: {model}")
        for i in range(5):
            print(f"Example {i+1}: Predicted: {y_pred[i]}, Expected: {y_test.values[i]}")


  from .autonotebook import tqdm as notebook_tqdm


Results using PCA:
MSE using Ridge: 3.203280700492372
MSE using DecisionTreeRegressor: 5.974375449873341
MSE using GradientBoostingRegressor: 4.0631247171475575
MSE using RandomForestRegressor: 3.6579407081753836
MSE using AdaBoostRegressor: 3.7552496889077767
MSE using KNeighborsRegressor: 3.0511458662382296
MSE using MLPRegressor: 3.2225993520680096
MSE using ElasticNet: 4.167216730657616
MSE using SGDRegressor: 3.1712959534124785
MSE using SVR: 2.6162487500799423
MSE using BayesianRidge: 3.178747060978093
MSE using KernelRidge: 20.626597424736392
MSE using LinearRegression: 3.2055575824590496
MSE using RANSACRegressor: 3.6072331626032548
MSE using TheilSenRegressor: 3.2845708529591597
Results using t-SNE:
MSE using Ridge: 7.105321451496037
MSE using DecisionTreeRegressor: 5.839443506126851
MSE using GradientBoostingRegressor: 3.7243159833592823
MSE using RandomForestRegressor: 2.663033108859359
MSE using AdaBoostRegressor: 4.032850999340669
MSE using KNeighborsRegressor: 2.930379813

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA, FastICA, KernelPCA, TruncatedSVD
from sklearn.manifold import TSNE, Isomap, LocallyLinearEmbedding
from sklearn.random_projection import GaussianRandomProjection
from sklearn.metrics import mean_squared_error
from umap import UMAP
from sklearn.linear_model import Ridge, ElasticNet, SGDRegressor, BayesianRidge, LinearRegression, RANSACRegressor, TheilSenRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge

# Load the data
file_path = "data_all_numerical_select_reduced.xlsx"
data = pd.read_excel(file_path)

# Define columns
data_columns = [
    'OF9',
    'OF10',
    'OF11',
    'OF19',
    'OF20',
    'OF21',
    'OF22',
    'OF23',
    'OF24',
    'F13',
    'F40',
    'F50',
    'F51',
    'F52',
    'F66',
    'S2',
    'PC',
    'FC',
    'WRI',
    'SVT',
    'VCHWC',
    'HWCC',
    'MC',
    'PP',
    'ST',
    'SWP',
    'DP',
    'ADLM',
    'ATDO',
    'AOD'
]
results_columns = ['NR']

# Prepare data for regression
X = data[data_columns]
y = data[results_columns[0]]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Implement dimensionality reduction techniques
dimensionality_reduction_techniques = {
    "PCA": PCA(n_components=10),
    "t-SNE": TSNE(n_components=2),
    "UMAP": UMAP(n_components=10),
    "Isomap": Isomap(n_components=10),
    "LLE": LocallyLinearEmbedding(n_components=10),
    "Autoencoders": TruncatedSVD(n_components=10),  # Assuming TruncatedSVD as a simple autoencoder
    "ICA": FastICA(n_components=10),
    "Kernel PCA": KernelPCA(n_components=10),
    "Random Projection": GaussianRandomProjection(n_components=10)  # Assuming GaussianRandomProjection
}

# Define the models
models = [
    Ridge(), DecisionTreeRegressor(), GradientBoostingRegressor(),
    RandomForestRegressor(), AdaBoostRegressor(), KNeighborsRegressor(),
    MLPRegressor(max_iter=200), ElasticNet(max_iter=1000), SGDRegressor(max_iter=1000),
    SVR(cache_size=1000), BayesianRidge(max_iter=1000), KernelRidge(),
    LinearRegression(), RANSACRegressor(), TheilSenRegressor()
]

# Train and evaluate each model with each dimensionality reduction technique
results = {}
predictions = {}
for name, reducer in dimensionality_reduction_techniques.items():
    results[name] = {}
    predictions[name] = {}
    for model in models:
        # Transform the data
        if name == "LDA":
            X_train_reduced = reducer.fit_transform(X_train_scaled, y_train)
            X_test_reduced = reducer.transform(X_test_scaled)
        elif name == "t-SNE":
            X_concatenated = np.concatenate((X_train_scaled, X_test_scaled), axis=0)
            X_reduced = reducer.fit_transform(X_concatenated)
            X_train_reduced = X_reduced[:len(X_train_scaled)]
            X_test_reduced = X_reduced[len(X_train_scaled):]
        else:
            X_train_reduced = reducer.fit_transform(X_train_scaled)
            X_test_reduced = reducer.transform(X_test_scaled)
        
        # Train the model
        model.fit(X_train_reduced, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test_reduced)
        
        # Calculate MSE
        mse = mean_squared_error(y_test, y_pred)
        
        # Store results
        results[name][model.__class__.__name__] = mse
        predictions[name][model.__class__.__name__] = y_pred

# Print results
for name, model_results in results.items():
    print(f"Results using {name}:")
    for model, mse in model_results.items():
        print(f"MSE using {model}: {mse}")

# Show 5 predictions from each reduction algorithm along with the expected value
for name, preds in predictions.items():
    print(f"Predictions using {name}:")
    for model, y_pred in preds.items():
        print(f"Model: {model}")
        for i in range(5):
            print(f"Example {i+1}: Predicted: {y_pred[i]}, Expected: {y_test.values[i]}")




Results using PCA:
MSE using Ridge: 5.894419484038834
MSE using DecisionTreeRegressor: 10.291706199514929
MSE using GradientBoostingRegressor: 7.26525408365257
MSE using RandomForestRegressor: 5.514522107823194
MSE using AdaBoostRegressor: 7.789964976301165
MSE using KNeighborsRegressor: 5.765323474320961
MSE using MLPRegressor: 6.8279068978208
MSE using ElasticNet: 6.810105555239534
MSE using SGDRegressor: 5.903276262286324
MSE using SVR: 5.75569019664632
MSE using BayesianRidge: 6.004181704410028
MSE using KernelRidge: 31.77350138335883
MSE using LinearRegression: 5.894372137158415
MSE using RANSACRegressor: 66.21828401006117
MSE using TheilSenRegressor: 5.584881505249756
Results using t-SNE:
MSE using Ridge: 10.361240821968105
MSE using DecisionTreeRegressor: 15.861778363652826
MSE using GradientBoostingRegressor: 8.149182314575668
MSE using RandomForestRegressor: 5.4191363396598335
MSE using AdaBoostRegressor: 7.9760250344314985
MSE using KNeighborsRegressor: 4.7215911156497805
MSE

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA, FastICA, KernelPCA, TruncatedSVD
from sklearn.manifold import TSNE, Isomap, LocallyLinearEmbedding
from sklearn.random_projection import GaussianRandomProjection
from sklearn.metrics import mean_squared_error
from umap import UMAP
from sklearn.linear_model import Ridge, ElasticNet, SGDRegressor, BayesianRidge, LinearRegression, RANSACRegressor, TheilSenRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge

# Load the data
file_path = "data_all_numerical_select_reduced.xlsx"
data = pd.read_excel(file_path)

# Define columns
data_columns = [
    'OF22',
    'OF26',
    'OF27',
    'F17',
    'F20',
    'F21',
    'F23',
    'F24',
    'F28',
    'F29',
    'F33',
    'F34',
    'F36',
    'F38',
    'F41',
    'F42',
    'F44',
    'F49',
    'F63',
    'F65',
    'PC',
    'FC',
    'WRI',
    'SVT',
    'VCHWC',
    'HWCC',
    'MC',
    'PP',
    'ST',
    'SWP',
    'DP',
    'ADLM',
    'ATDO',
    'AOD'
]

results_columns = ['PR']

# Prepare data for regression
X = data[data_columns]
y = data[results_columns[0]]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Implement dimensionality reduction techniques
dimensionality_reduction_techniques = {
    "PCA": PCA(n_components=10),
    "t-SNE": TSNE(n_components=2),
    "UMAP": UMAP(n_components=10),
    "Isomap": Isomap(n_components=10),
    "LLE": LocallyLinearEmbedding(n_components=10),
    "Autoencoders": TruncatedSVD(n_components=10),  # Assuming TruncatedSVD as a simple autoencoder
    "ICA": FastICA(n_components=10),
    "Kernel PCA": KernelPCA(n_components=10),
    "Random Projection": GaussianRandomProjection(n_components=10)  # Assuming GaussianRandomProjection
}

# Define the models
models = [
    Ridge(), DecisionTreeRegressor(), GradientBoostingRegressor(),
    RandomForestRegressor(), AdaBoostRegressor(), KNeighborsRegressor(),
    MLPRegressor(max_iter=200), ElasticNet(max_iter=1000), SGDRegressor(max_iter=1000),
    SVR(cache_size=1000), BayesianRidge(max_iter=1000), KernelRidge(),
    LinearRegression(), RANSACRegressor(), TheilSenRegressor()
]

# Train and evaluate each model with each dimensionality reduction technique
results = {}
predictions = {}
for name, reducer in dimensionality_reduction_techniques.items():
    results[name] = {}
    predictions[name] = {}
    for model in models:
        # Transform the data
        if name == "LDA":
            X_train_reduced = reducer.fit_transform(X_train_scaled, y_train)
            X_test_reduced = reducer.transform(X_test_scaled)
        elif name == "t-SNE":
            X_concatenated = np.concatenate((X_train_scaled, X_test_scaled), axis=0)
            X_reduced = reducer.fit_transform(X_concatenated)
            X_train_reduced = X_reduced[:len(X_train_scaled)]
            X_test_reduced = X_reduced[len(X_train_scaled):]
        else:
            X_train_reduced = reducer.fit_transform(X_train_scaled)
            X_test_reduced = reducer.transform(X_test_scaled)
        
        # Train the model
        model.fit(X_train_reduced, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test_reduced)
        
        # Calculate MSE
        mse = mean_squared_error(y_test, y_pred)
        
        # Store results
        results[name][model.__class__.__name__] = mse
        predictions[name][model.__class__.__name__] = y_pred

# Print results
for name, model_results in results.items():
    print(f"Results using {name}:")
    for model, mse in model_results.items():
        print(f"MSE using {model}: {mse}")

# Show 5 predictions from each reduction algorithm along with the expected value
for name, preds in predictions.items():
    print(f"Predictions using {name}:")
    for model, y_pred in preds.items():
        print(f"Model: {model}")
        for i in range(5):
            print(f"Example {i+1}: Predicted: {y_pred[i]}, Expected: {y_test.values[i]}")




Results using PCA:
MSE using Ridge: 2.085976252607282
MSE using DecisionTreeRegressor: 8.819105400137866
MSE using GradientBoostingRegressor: 2.9731114504514236
MSE using RandomForestRegressor: 2.8144431751195644
MSE using AdaBoostRegressor: 2.3005531044674776
MSE using KNeighborsRegressor: 1.3983402672424905
MSE using MLPRegressor: 2.9369449283452065
MSE using ElasticNet: 2.250254814708044
MSE using SGDRegressor: 2.0920811572829825
MSE using SVR: 1.6758684034987401
MSE using BayesianRidge: 2.0704326067789496
MSE using KernelRidge: 43.286239109598206
MSE using LinearRegression: 2.0869981006481124
MSE using RANSACRegressor: 1.775726164602477
MSE using TheilSenRegressor: 1.9860835485091493
Results using t-SNE:
MSE using Ridge: 3.1143187264970065
MSE using DecisionTreeRegressor: 2.137209592808931
MSE using GradientBoostingRegressor: 1.728738305102088
MSE using RandomForestRegressor: 1.4192253222111002
MSE using AdaBoostRegressor: 2.5370439773066926
MSE using KNeighborsRegressor: 1.6013225

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA, FastICA, KernelPCA, TruncatedSVD
from sklearn.manifold import TSNE, Isomap, LocallyLinearEmbedding
from sklearn.random_projection import GaussianRandomProjection
from sklearn.metrics import mean_squared_error
from umap import UMAP
from sklearn.linear_model import Ridge, ElasticNet, SGDRegressor, BayesianRidge, LinearRegression, RANSACRegressor, TheilSenRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge

# Load the data
file_path = "data_all_numerical_select_reduced.xlsx"
data = pd.read_excel(file_path)

# Define columns
data_columns = [
    'OF22',
    'OF26',
    'OF27',
    'F17',
    'F20',
    'F22',
    'F28',
    'F29',
    'F31',
    'F33',
    'F34',
    'F35',
    'F36',
    'F41',
    'F42',
    'F44',
    'F49',
    'S5',
    'PC',
    'FC',
    'WRI',
    'SVT',
    'VCHWC',
    'HWCC',
    'MC',
    'PP',
    'ST',
    'SWP',
    'DP',
    'ADLM',
    'ATDO',
    'AOD'
]
results_columns = ['SR']

# Prepare data for regression
X = data[data_columns]
y = data[results_columns[0]]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Implement dimensionality reduction techniques
dimensionality_reduction_techniques = {
    "PCA": PCA(n_components=10),
    "t-SNE": TSNE(n_components=2),
    "UMAP": UMAP(n_components=10),
    "Isomap": Isomap(n_components=10),
    "LLE": LocallyLinearEmbedding(n_components=10),
    "Autoencoders": TruncatedSVD(n_components=10),  # Assuming TruncatedSVD as a simple autoencoder
    "ICA": FastICA(n_components=10),
    "Kernel PCA": KernelPCA(n_components=10),
    "Random Projection": GaussianRandomProjection(n_components=10)  # Assuming GaussianRandomProjection
}

# Define the models
models = [
    Ridge(), DecisionTreeRegressor(), GradientBoostingRegressor(),
    RandomForestRegressor(), AdaBoostRegressor(), KNeighborsRegressor(),
    MLPRegressor(max_iter=200), ElasticNet(max_iter=1000), SGDRegressor(max_iter=1000),
    SVR(cache_size=1000), BayesianRidge(max_iter=1000), KernelRidge(),
    LinearRegression(), RANSACRegressor(), TheilSenRegressor()
]

# Train and evaluate each model with each dimensionality reduction technique
results = {}
predictions = {}
for name, reducer in dimensionality_reduction_techniques.items():
    results[name] = {}
    predictions[name] = {}
    for model in models:
        # Transform the data
        if name == "LDA":
            X_train_reduced = reducer.fit_transform(X_train_scaled, y_train)
            X_test_reduced = reducer.transform(X_test_scaled)
        elif name == "t-SNE":
            X_concatenated = np.concatenate((X_train_scaled, X_test_scaled), axis=0)
            X_reduced = reducer.fit_transform(X_concatenated)
            X_train_reduced = X_reduced[:len(X_train_scaled)]
            X_test_reduced = X_reduced[len(X_train_scaled):]
        else:
            X_train_reduced = reducer.fit_transform(X_train_scaled)
            X_test_reduced = reducer.transform(X_test_scaled)
        
        # Train the model
        model.fit(X_train_reduced, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test_reduced)
        
        # Calculate MSE
        mse = mean_squared_error(y_test, y_pred)
        
        # Store results
        results[name][model.__class__.__name__] = mse
        predictions[name][model.__class__.__name__] = y_pred

# Print results
for name, model_results in results.items():
    print(f"Results using {name}:")
    for model, mse in model_results.items():
        print(f"MSE using {model}: {mse}")

# Show 5 predictions from each reduction algorithm along with the expected value
for name, preds in predictions.items():
    print(f"Predictions using {name}:")
    for model, y_pred in preds.items():
        print(f"Model: {model}")
        for i in range(5):
            print(f"Example {i+1}: Predicted: {y_pred[i]}, Expected: {y_test.values[i]}")




Results using PCA:
MSE using Ridge: 5195.957423785418
MSE using DecisionTreeRegressor: 10.257559234162507
MSE using GradientBoostingRegressor: 4.555448597523405
MSE using RandomForestRegressor: 3.69867771499331
MSE using AdaBoostRegressor: 3.505702711155886
MSE using KNeighborsRegressor: 4.11799236210327
MSE using MLPRegressor: 70227.58262376025
MSE using ElasticNet: 2682.4608279647273
MSE using SGDRegressor: 5557.353860837162
MSE using SVR: 3.1261856987667964
MSE using BayesianRidge: 5303.183503974189
MSE using KernelRidge: 5113.757484230677
MSE using LinearRegression: 5192.754832105871
MSE using RANSACRegressor: 2537.889093941545
MSE using TheilSenRegressor: 5781.379327730146
Results using t-SNE:
MSE using Ridge: 4.483578084715555
MSE using DecisionTreeRegressor: 8.200248282847113
MSE using GradientBoostingRegressor: 3.455679604782507
MSE using RandomForestRegressor: 3.014227589565993
MSE using AdaBoostRegressor: 5.423698871635058
MSE using KNeighborsRegressor: 4.759617314151596
MSE 