In [None]:
# Training-set ratio: 0.3 (the next cell holds out the remaining 0.7 as the test set)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
import xgboost as xgb
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")

# Load the dataset: the first column is used as the regression target,
# every remaining column as an input feature.
data = pd.read_excel('EL.xlsx')
X = data.iloc[:, 1:]
Y = data.iloc[:, 0]

# One metrics record is appended here for every (seed, model) pair.
results = []

# Random seeds; each seed produces a different train/test split.
seeds = range(100)

# Candidate regressors, keyed by the name that appears in the results table.
# Estimators with internal randomness get a fixed random_state so that
# re-running the cell reproduces the same numbers.
models = {
    'LinearRegression': LinearRegression(),
    # Polynomial feature expansion followed by ordinary least squares.
    'ResponseSurface': Pipeline([
        ('poly', PolynomialFeatures(degree=2)),
        ('linear', LinearRegression())
    ]),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'KNN': KNeighborsRegressor(),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': xgb.sklearn.XGBRegressor(learning_rate=0.1, n_estimators=100, max_depth=3),
    # random_state fixes the starting points of the kernel-optimizer restarts.
    'GaussianProcess': GaussianProcessRegressor(kernel=C(0.1, (0.001, 0.1)) * RBF(0.5, (1e-4, 10)), n_restarts_optimizer=10, alpha=0.1, random_state=42),
    'AdaBoost': AdaBoostRegressor(n_estimators=50, random_state=42)
}

# Hyper-parameter grids for the models that are tuned with a grid search.
# Models without an entry are fitted with the settings given above.
param_grids = {
    'ResponseSurface': {
        'poly__degree': [2, 3, 4, 5]
    },
    'Ridge': {
        'alpha': [0.1, 1.0, 10.0]
    },
    'Lasso': {
        'alpha': [0.1, 1.0, 10.0]
    },
    'KNN': {'n_neighbors': np.arange(1, 100)},
    'RandomForest': {
        'n_estimators': [50, 75, 100],
        # 1.0 (use every feature) replaces 'auto': for regressors 'auto' meant
        # the same thing, but scikit-learn 1.3 removed it, so those candidates
        # would fail to fit.
        'max_features': [1.0, 'sqrt'],
        'max_depth': [3, 4, 5, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'bootstrap': [True]
    },
    'XGBoost': {
        'learning_rate': [0.1],
        'n_estimators': [50, 75, 100],
        'max_depth': [1, 2, 3]
    }
}

# Number of cross-validation folds used by every grid search.
CV_FOLDS = 10

# Evaluate every model once per seed, each seed giving a different split.
for seed in seeds:
    # Split the data; random_state=seed makes the split reproducible.
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.7, random_state=seed)

    # Size of the smallest training fold inside the K-fold search:
    # the training set minus the largest validation fold.
    n_train = len(x_train)
    min_fold_train_size = n_train - (n_train + CV_FOLDS - 1) // CV_FOLDS

    for model_name, model in models.items():
        if model_name in param_grids:
            grid_spec = param_grids[model_name]
            if 'n_neighbors' in grid_spec:
                # A k-NN model cannot return more neighbours than the number
                # of samples it was fitted on, so larger candidates can only
                # fail (silently, since warnings are suppressed). Drop them.
                candidate_k = np.asarray(grid_spec['n_neighbors'])
                grid_spec = {**grid_spec, 'n_neighbors': candidate_k[candidate_k <= min_fold_train_size]}
            # Tune on the training split only; the fitted search object
            # predicts with the best estimator refitted on all of x_train.
            grid = GridSearchCV(model, grid_spec, cv=CV_FOLDS, n_jobs=-1)
            model = grid.fit(x_train, y_train)
            best_params = model.best_params_
        else:
            # No grid defined: fit the model with its preset parameters.
            model.fit(x_train, y_train)
            best_params = {}

        # Score the fitted model on both splits.
        y_train_pred = model.predict(x_train)
        y_test_pred = model.predict(x_test)

        train_mse = mean_squared_error(y_train, y_train_pred)
        train_rmse = np.sqrt(train_mse)
        train_r2 = r2_score(y_train, y_train_pred)

        test_mse = mean_squared_error(y_test, y_test_pred)
        test_rmse = np.sqrt(test_mse)
        test_r2 = r2_score(y_test, y_test_pred)

        # Record the metrics for this (seed, model) pair.
        results.append({
            'Seed': seed,
            'Model': model_name,
            'Train_RMSE': train_rmse,
            'Train_R2': train_r2,
            'Test_RMSE': test_rmse,
            'Test_R2': test_r2,
            'Best_Params': best_params
        })

# Turn the per-run records into a table.
results_df = pd.DataFrame(results)

# Save the raw per-seed metrics to Excel.
results_df.to_excel('EL_model_evaluation_results_0.3.xlsx', index=False)

# Mean and standard deviation of R² for each model across all seeds.
summary_df = results_df.groupby('Model').agg(
    Train_R2_Mean=('Train_R2', 'mean'),
    Train_R2_Std=('Train_R2', 'std'),
    Test_R2_Mean=('Test_R2', 'mean'),
    Test_R2_Std=('Test_R2', 'std'),
).reset_index()

# Print the summary table.
print(summary_df)

# Grouped bar chart: train and test R² side by side, std as error bars.
positions = np.arange(len(summary_df))
bar_specs = [
    (-0.2, 'Train_R2_Mean', 'Train_R2_Std', 'Train R²'),
    (0.2, 'Test_R2_Mean', 'Test_R2_Std', 'Test R²'),
]

plt.figure(dpi=300, figsize=(14, 8))
for shift, mean_col, std_col, legend_label in bar_specs:
    plt.bar(positions + shift, summary_df[mean_col], yerr=summary_df[std_col], width=0.4, label=legend_label)

plt.xlabel('Models')
plt.ylabel('R²')
plt.title('Model Evaluation: Train and Test R²')
plt.xticks(ticks=positions, labels=summary_df['Model'], rotation=45)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Training-set ratio: 0.4 (the next cell holds out the remaining 0.6 as the test set)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
import xgboost as xgb
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")

# Load the dataset: the first column is used as the regression target,
# every remaining column as an input feature.
data = pd.read_excel('EL.xlsx')
X = data.iloc[:, 1:]
Y = data.iloc[:, 0]

# One metrics record is appended here for every (seed, model) pair.
results = []

# Random seeds; each seed produces a different train/test split.
seeds = range(100)

# Candidate regressors, keyed by the name that appears in the results table.
# Estimators with internal randomness get a fixed random_state so that
# re-running the cell reproduces the same numbers.
models = {
    'LinearRegression': LinearRegression(),
    # Polynomial feature expansion followed by ordinary least squares.
    'ResponseSurface': Pipeline([
        ('poly', PolynomialFeatures(degree=2)),
        ('linear', LinearRegression())
    ]),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'KNN': KNeighborsRegressor(),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': xgb.sklearn.XGBRegressor(learning_rate=0.1, n_estimators=100, max_depth=3),
    # random_state fixes the starting points of the kernel-optimizer restarts.
    'GaussianProcess': GaussianProcessRegressor(kernel=C(0.1, (0.001, 0.1)) * RBF(0.5, (1e-4, 10)), n_restarts_optimizer=10, alpha=0.1, random_state=42),
    'AdaBoost': AdaBoostRegressor(n_estimators=50, random_state=42)
}

# Hyper-parameter grids for the models that are tuned with a grid search.
# Models without an entry are fitted with the settings given above.
param_grids = {
    'ResponseSurface': {
        'poly__degree': [2, 3, 4, 5]
    },
    'Ridge': {
        'alpha': [0.1, 1.0, 10.0]
    },
    'Lasso': {
        'alpha': [0.1, 1.0, 10.0]
    },
    'KNN': {'n_neighbors': np.arange(1, 100)},
    'RandomForest': {
        'n_estimators': [50, 75, 100],
        # 1.0 (use every feature) replaces 'auto': for regressors 'auto' meant
        # the same thing, but scikit-learn 1.3 removed it, so those candidates
        # would fail to fit.
        'max_features': [1.0, 'sqrt'],
        'max_depth': [3, 4, 5, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'bootstrap': [True]
    },
    'XGBoost': {
        'learning_rate': [0.1],
        'n_estimators': [50, 75, 100],
        'max_depth': [1, 2, 3]
    }
}

# Number of cross-validation folds used by every grid search.
CV_FOLDS = 10

# Evaluate every model once per seed, each seed giving a different split.
for seed in seeds:
    # Split the data; random_state=seed makes the split reproducible.
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.6, random_state=seed)

    # Size of the smallest training fold inside the K-fold search:
    # the training set minus the largest validation fold.
    n_train = len(x_train)
    min_fold_train_size = n_train - (n_train + CV_FOLDS - 1) // CV_FOLDS

    for model_name, model in models.items():
        if model_name in param_grids:
            grid_spec = param_grids[model_name]
            if 'n_neighbors' in grid_spec:
                # A k-NN model cannot return more neighbours than the number
                # of samples it was fitted on, so larger candidates can only
                # fail (silently, since warnings are suppressed). Drop them.
                candidate_k = np.asarray(grid_spec['n_neighbors'])
                grid_spec = {**grid_spec, 'n_neighbors': candidate_k[candidate_k <= min_fold_train_size]}
            # Tune on the training split only; the fitted search object
            # predicts with the best estimator refitted on all of x_train.
            grid = GridSearchCV(model, grid_spec, cv=CV_FOLDS, n_jobs=-1)
            model = grid.fit(x_train, y_train)
            best_params = model.best_params_
        else:
            # No grid defined: fit the model with its preset parameters.
            model.fit(x_train, y_train)
            best_params = {}

        # Score the fitted model on both splits.
        y_train_pred = model.predict(x_train)
        y_test_pred = model.predict(x_test)

        train_mse = mean_squared_error(y_train, y_train_pred)
        train_rmse = np.sqrt(train_mse)
        train_r2 = r2_score(y_train, y_train_pred)

        test_mse = mean_squared_error(y_test, y_test_pred)
        test_rmse = np.sqrt(test_mse)
        test_r2 = r2_score(y_test, y_test_pred)

        # Record the metrics for this (seed, model) pair.
        results.append({
            'Seed': seed,
            'Model': model_name,
            'Train_RMSE': train_rmse,
            'Train_R2': train_r2,
            'Test_RMSE': test_rmse,
            'Test_R2': test_r2,
            'Best_Params': best_params
        })

# Turn the per-run records into a table.
results_df = pd.DataFrame(results)

# Save the raw per-seed metrics to Excel.
results_df.to_excel('EL_model_evaluation_results_0.4.xlsx', index=False)

# Mean and standard deviation of R² for each model across all seeds.
summary_df = results_df.groupby('Model').agg(
    Train_R2_Mean=('Train_R2', 'mean'),
    Train_R2_Std=('Train_R2', 'std'),
    Test_R2_Mean=('Test_R2', 'mean'),
    Test_R2_Std=('Test_R2', 'std'),
).reset_index()

# Print the summary table.
print(summary_df)

# Grouped bar chart: train and test R² side by side, std as error bars.
positions = np.arange(len(summary_df))
bar_specs = [
    (-0.2, 'Train_R2_Mean', 'Train_R2_Std', 'Train R²'),
    (0.2, 'Test_R2_Mean', 'Test_R2_Std', 'Test R²'),
]

plt.figure(dpi=300, figsize=(14, 8))
for shift, mean_col, std_col, legend_label in bar_specs:
    plt.bar(positions + shift, summary_df[mean_col], yerr=summary_df[std_col], width=0.4, label=legend_label)

plt.xlabel('Models')
plt.ylabel('R²')
plt.title('Model Evaluation: Train and Test R²')
plt.xticks(ticks=positions, labels=summary_df['Model'], rotation=45)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Training-set ratio: 0.5 (the next cell holds out the remaining 0.5 as the test set)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
import xgboost as xgb
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")

# Load the dataset: the first column is used as the regression target,
# every remaining column as an input feature.
data = pd.read_excel('EL.xlsx')
X = data.iloc[:, 1:]
Y = data.iloc[:, 0]

# One metrics record is appended here for every (seed, model) pair.
results = []

# Random seeds; each seed produces a different train/test split.
seeds = range(100)

# Candidate regressors, keyed by the name that appears in the results table.
# Estimators with internal randomness get a fixed random_state so that
# re-running the cell reproduces the same numbers.
models = {
    'LinearRegression': LinearRegression(),
    # Polynomial feature expansion followed by ordinary least squares.
    'ResponseSurface': Pipeline([
        ('poly', PolynomialFeatures(degree=2)),
        ('linear', LinearRegression())
    ]),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'KNN': KNeighborsRegressor(),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': xgb.sklearn.XGBRegressor(learning_rate=0.1, n_estimators=100, max_depth=3),
    # random_state fixes the starting points of the kernel-optimizer restarts.
    'GaussianProcess': GaussianProcessRegressor(kernel=C(0.1, (0.001, 0.1)) * RBF(0.5, (1e-4, 10)), n_restarts_optimizer=10, alpha=0.1, random_state=42),
    'AdaBoost': AdaBoostRegressor(n_estimators=50, random_state=42)
}

# Hyper-parameter grids for the models that are tuned with a grid search.
# Models without an entry are fitted with the settings given above.
param_grids = {
    'ResponseSurface': {
        'poly__degree': [2, 3, 4, 5]
    },
    'Ridge': {
        'alpha': [0.1, 1.0, 10.0]
    },
    'Lasso': {
        'alpha': [0.1, 1.0, 10.0]
    },
    'KNN': {'n_neighbors': np.arange(1, 100)},
    'RandomForest': {
        'n_estimators': [50, 75, 100],
        # 1.0 (use every feature) replaces 'auto': for regressors 'auto' meant
        # the same thing, but scikit-learn 1.3 removed it, so those candidates
        # would fail to fit.
        'max_features': [1.0, 'sqrt'],
        'max_depth': [3, 4, 5, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'bootstrap': [True]
    },
    'XGBoost': {
        'learning_rate': [0.1],
        'n_estimators': [50, 75, 100],
        'max_depth': [1, 2, 3]
    }
}

# Number of cross-validation folds used by every grid search.
CV_FOLDS = 10

# Evaluate every model once per seed, each seed giving a different split.
for seed in seeds:
    # Split the data; random_state=seed makes the split reproducible.
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.5, random_state=seed)

    # Size of the smallest training fold inside the K-fold search:
    # the training set minus the largest validation fold.
    n_train = len(x_train)
    min_fold_train_size = n_train - (n_train + CV_FOLDS - 1) // CV_FOLDS

    for model_name, model in models.items():
        if model_name in param_grids:
            grid_spec = param_grids[model_name]
            if 'n_neighbors' in grid_spec:
                # A k-NN model cannot return more neighbours than the number
                # of samples it was fitted on, so larger candidates can only
                # fail (silently, since warnings are suppressed). Drop them.
                candidate_k = np.asarray(grid_spec['n_neighbors'])
                grid_spec = {**grid_spec, 'n_neighbors': candidate_k[candidate_k <= min_fold_train_size]}
            # Tune on the training split only; the fitted search object
            # predicts with the best estimator refitted on all of x_train.
            grid = GridSearchCV(model, grid_spec, cv=CV_FOLDS, n_jobs=-1)
            model = grid.fit(x_train, y_train)
            best_params = model.best_params_
        else:
            # No grid defined: fit the model with its preset parameters.
            model.fit(x_train, y_train)
            best_params = {}

        # Score the fitted model on both splits.
        y_train_pred = model.predict(x_train)
        y_test_pred = model.predict(x_test)

        train_mse = mean_squared_error(y_train, y_train_pred)
        train_rmse = np.sqrt(train_mse)
        train_r2 = r2_score(y_train, y_train_pred)

        test_mse = mean_squared_error(y_test, y_test_pred)
        test_rmse = np.sqrt(test_mse)
        test_r2 = r2_score(y_test, y_test_pred)

        # Record the metrics for this (seed, model) pair.
        results.append({
            'Seed': seed,
            'Model': model_name,
            'Train_RMSE': train_rmse,
            'Train_R2': train_r2,
            'Test_RMSE': test_rmse,
            'Test_R2': test_r2,
            'Best_Params': best_params
        })

# Turn the per-run records into a table.
results_df = pd.DataFrame(results)

# Save the raw per-seed metrics to Excel.
results_df.to_excel('EL_model_evaluation_results_0.5.xlsx', index=False)

# Mean and standard deviation of R² for each model across all seeds.
summary_df = results_df.groupby('Model').agg(
    Train_R2_Mean=('Train_R2', 'mean'),
    Train_R2_Std=('Train_R2', 'std'),
    Test_R2_Mean=('Test_R2', 'mean'),
    Test_R2_Std=('Test_R2', 'std'),
).reset_index()

# Print the summary table.
print(summary_df)

# Grouped bar chart: train and test R² side by side, std as error bars.
positions = np.arange(len(summary_df))
bar_specs = [
    (-0.2, 'Train_R2_Mean', 'Train_R2_Std', 'Train R²'),
    (0.2, 'Test_R2_Mean', 'Test_R2_Std', 'Test R²'),
]

plt.figure(dpi=300, figsize=(14, 8))
for shift, mean_col, std_col, legend_label in bar_specs:
    plt.bar(positions + shift, summary_df[mean_col], yerr=summary_df[std_col], width=0.4, label=legend_label)

plt.xlabel('Models')
plt.ylabel('R²')
plt.title('Model Evaluation: Train and Test R²')
plt.xticks(ticks=positions, labels=summary_df['Model'], rotation=45)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Training-set ratio: 0.6 (the next cell holds out the remaining 0.4 as the test set)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
import xgboost as xgb
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")

# Load the dataset: the first column is used as the regression target,
# every remaining column as an input feature.
data = pd.read_excel('EL.xlsx')
X = data.iloc[:, 1:]
Y = data.iloc[:, 0]

# One metrics record is appended here for every (seed, model) pair.
results = []

# Random seeds; each seed produces a different train/test split.
seeds = range(100)

# Candidate regressors, keyed by the name that appears in the results table.
# Estimators with internal randomness get a fixed random_state so that
# re-running the cell reproduces the same numbers.
models = {
    'LinearRegression': LinearRegression(),
    # Polynomial feature expansion followed by ordinary least squares.
    'ResponseSurface': Pipeline([
        ('poly', PolynomialFeatures(degree=2)),
        ('linear', LinearRegression())
    ]),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'KNN': KNeighborsRegressor(),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': xgb.sklearn.XGBRegressor(learning_rate=0.1, n_estimators=100, max_depth=3),
    # random_state fixes the starting points of the kernel-optimizer restarts.
    'GaussianProcess': GaussianProcessRegressor(kernel=C(0.1, (0.001, 0.1)) * RBF(0.5, (1e-4, 10)), n_restarts_optimizer=10, alpha=0.1, random_state=42),
    'AdaBoost': AdaBoostRegressor(n_estimators=50, random_state=42)
}

# Hyper-parameter grids for the models that are tuned with a grid search.
# Models without an entry are fitted with the settings given above.
param_grids = {
    'ResponseSurface': {
        'poly__degree': [2, 3, 4, 5]
    },
    'Ridge': {
        'alpha': [0.1, 1.0, 10.0]
    },
    'Lasso': {
        'alpha': [0.1, 1.0, 10.0]
    },
    'KNN': {'n_neighbors': np.arange(1, 100)},
    'RandomForest': {
        'n_estimators': [50, 75, 100],
        # 1.0 (use every feature) replaces 'auto': for regressors 'auto' meant
        # the same thing, but scikit-learn 1.3 removed it, so those candidates
        # would fail to fit.
        'max_features': [1.0, 'sqrt'],
        'max_depth': [3, 4, 5, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'bootstrap': [True]
    },
    'XGBoost': {
        'learning_rate': [0.1],
        'n_estimators': [50, 75, 100],
        'max_depth': [1, 2, 3]
    }
}

# Number of cross-validation folds used by every grid search.
CV_FOLDS = 10

# Evaluate every model once per seed, each seed giving a different split.
for seed in seeds:
    # Split the data; random_state=seed makes the split reproducible.
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=seed)

    # Size of the smallest training fold inside the K-fold search:
    # the training set minus the largest validation fold.
    n_train = len(x_train)
    min_fold_train_size = n_train - (n_train + CV_FOLDS - 1) // CV_FOLDS

    for model_name, model in models.items():
        if model_name in param_grids:
            grid_spec = param_grids[model_name]
            if 'n_neighbors' in grid_spec:
                # A k-NN model cannot return more neighbours than the number
                # of samples it was fitted on, so larger candidates can only
                # fail (silently, since warnings are suppressed). Drop them.
                candidate_k = np.asarray(grid_spec['n_neighbors'])
                grid_spec = {**grid_spec, 'n_neighbors': candidate_k[candidate_k <= min_fold_train_size]}
            # Tune on the training split only; the fitted search object
            # predicts with the best estimator refitted on all of x_train.
            grid = GridSearchCV(model, grid_spec, cv=CV_FOLDS, n_jobs=-1)
            model = grid.fit(x_train, y_train)
            best_params = model.best_params_
        else:
            # No grid defined: fit the model with its preset parameters.
            model.fit(x_train, y_train)
            best_params = {}

        # Score the fitted model on both splits.
        y_train_pred = model.predict(x_train)
        y_test_pred = model.predict(x_test)

        train_mse = mean_squared_error(y_train, y_train_pred)
        train_rmse = np.sqrt(train_mse)
        train_r2 = r2_score(y_train, y_train_pred)

        test_mse = mean_squared_error(y_test, y_test_pred)
        test_rmse = np.sqrt(test_mse)
        test_r2 = r2_score(y_test, y_test_pred)

        # Record the metrics for this (seed, model) pair.
        results.append({
            'Seed': seed,
            'Model': model_name,
            'Train_RMSE': train_rmse,
            'Train_R2': train_r2,
            'Test_RMSE': test_rmse,
            'Test_R2': test_r2,
            'Best_Params': best_params
        })

# Turn the per-run records into a table.
results_df = pd.DataFrame(results)

# Save the raw per-seed metrics to Excel.
results_df.to_excel('EL_model_evaluation_results_0.6.xlsx', index=False)

# Mean and standard deviation of R² for each model across all seeds.
summary_df = results_df.groupby('Model').agg(
    Train_R2_Mean=('Train_R2', 'mean'),
    Train_R2_Std=('Train_R2', 'std'),
    Test_R2_Mean=('Test_R2', 'mean'),
    Test_R2_Std=('Test_R2', 'std'),
).reset_index()

# Print the summary table.
print(summary_df)

# Grouped bar chart: train and test R² side by side, std as error bars.
positions = np.arange(len(summary_df))
bar_specs = [
    (-0.2, 'Train_R2_Mean', 'Train_R2_Std', 'Train R²'),
    (0.2, 'Test_R2_Mean', 'Test_R2_Std', 'Test R²'),
]

plt.figure(dpi=300, figsize=(14, 8))
for shift, mean_col, std_col, legend_label in bar_specs:
    plt.bar(positions + shift, summary_df[mean_col], yerr=summary_df[std_col], width=0.4, label=legend_label)

plt.xlabel('Models')
plt.ylabel('R²')
plt.title('Model Evaluation: Train and Test R²')
plt.xticks(ticks=positions, labels=summary_df['Model'], rotation=45)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Training-set ratio: 0.7 (the next cell holds out the remaining 0.3 as the test set)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
import xgboost as xgb
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")

# Load the dataset: the first column is used as the regression target,
# every remaining column as an input feature.
data = pd.read_excel('EL.xlsx')
X = data.iloc[:, 1:]
Y = data.iloc[:, 0]

# One metrics record is appended here for every (seed, model) pair.
results = []

# Random seeds; each seed produces a different train/test split.
seeds = range(100)

# Candidate regressors, keyed by the name that appears in the results table.
# Estimators with internal randomness get a fixed random_state so that
# re-running the cell reproduces the same numbers.
models = {
    'LinearRegression': LinearRegression(),
    # Polynomial feature expansion followed by ordinary least squares.
    'ResponseSurface': Pipeline([
        ('poly', PolynomialFeatures(degree=2)),
        ('linear', LinearRegression())
    ]),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'KNN': KNeighborsRegressor(),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': xgb.sklearn.XGBRegressor(learning_rate=0.1, n_estimators=100, max_depth=3),
    # random_state fixes the starting points of the kernel-optimizer restarts.
    'GaussianProcess': GaussianProcessRegressor(kernel=C(0.1, (0.001, 0.1)) * RBF(0.5, (1e-4, 10)), n_restarts_optimizer=10, alpha=0.1, random_state=42),
    'AdaBoost': AdaBoostRegressor(n_estimators=50, random_state=42)
}

# Hyper-parameter grids for the models that are tuned with a grid search.
# Models without an entry are fitted with the settings given above.
param_grids = {
    'ResponseSurface': {
        'poly__degree': [2, 3, 4, 5]
    },
    'Ridge': {
        'alpha': [0.1, 1.0, 10.0]
    },
    'Lasso': {
        'alpha': [0.1, 1.0, 10.0]
    },
    'KNN': {'n_neighbors': np.arange(1, 100)},
    'RandomForest': {
        'n_estimators': [50, 75, 100],
        # 1.0 (use every feature) replaces 'auto': for regressors 'auto' meant
        # the same thing, but scikit-learn 1.3 removed it, so those candidates
        # would fail to fit.
        'max_features': [1.0, 'sqrt'],
        'max_depth': [3, 4, 5, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'bootstrap': [True]
    },
    'XGBoost': {
        'learning_rate': [0.1],
        'n_estimators': [50, 75, 100],
        'max_depth': [1, 2, 3]
    }
}

# Number of cross-validation folds used by every grid search.
CV_FOLDS = 10

# Evaluate every model once per seed, each seed giving a different split.
for seed in seeds:
    # Split the data; random_state=seed makes the split reproducible.
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=seed)

    # Size of the smallest training fold inside the K-fold search:
    # the training set minus the largest validation fold.
    n_train = len(x_train)
    min_fold_train_size = n_train - (n_train + CV_FOLDS - 1) // CV_FOLDS

    for model_name, model in models.items():
        if model_name in param_grids:
            grid_spec = param_grids[model_name]
            if 'n_neighbors' in grid_spec:
                # A k-NN model cannot return more neighbours than the number
                # of samples it was fitted on, so larger candidates can only
                # fail (silently, since warnings are suppressed). Drop them.
                candidate_k = np.asarray(grid_spec['n_neighbors'])
                grid_spec = {**grid_spec, 'n_neighbors': candidate_k[candidate_k <= min_fold_train_size]}
            # Tune on the training split only; the fitted search object
            # predicts with the best estimator refitted on all of x_train.
            grid = GridSearchCV(model, grid_spec, cv=CV_FOLDS, n_jobs=-1)
            model = grid.fit(x_train, y_train)
            best_params = model.best_params_
        else:
            # No grid defined: fit the model with its preset parameters.
            model.fit(x_train, y_train)
            best_params = {}

        # Score the fitted model on both splits.
        y_train_pred = model.predict(x_train)
        y_test_pred = model.predict(x_test)

        train_mse = mean_squared_error(y_train, y_train_pred)
        train_rmse = np.sqrt(train_mse)
        train_r2 = r2_score(y_train, y_train_pred)

        test_mse = mean_squared_error(y_test, y_test_pred)
        test_rmse = np.sqrt(test_mse)
        test_r2 = r2_score(y_test, y_test_pred)

        # Record the metrics for this (seed, model) pair.
        results.append({
            'Seed': seed,
            'Model': model_name,
            'Train_RMSE': train_rmse,
            'Train_R2': train_r2,
            'Test_RMSE': test_rmse,
            'Test_R2': test_r2,
            'Best_Params': best_params
        })

# Turn the per-run records into a table.
results_df = pd.DataFrame(results)

# Save the raw per-seed metrics to Excel.
results_df.to_excel('EL_model_evaluation_results_0.7.xlsx', index=False)

# Mean and standard deviation of R² for each model across all seeds.
summary_df = results_df.groupby('Model').agg(
    Train_R2_Mean=('Train_R2', 'mean'),
    Train_R2_Std=('Train_R2', 'std'),
    Test_R2_Mean=('Test_R2', 'mean'),
    Test_R2_Std=('Test_R2', 'std'),
).reset_index()

# Print the summary table.
print(summary_df)

# Grouped bar chart: train and test R² side by side, std as error bars.
positions = np.arange(len(summary_df))
bar_specs = [
    (-0.2, 'Train_R2_Mean', 'Train_R2_Std', 'Train R²'),
    (0.2, 'Test_R2_Mean', 'Test_R2_Std', 'Test R²'),
]

plt.figure(dpi=300, figsize=(14, 8))
for shift, mean_col, std_col, legend_label in bar_specs:
    plt.bar(positions + shift, summary_df[mean_col], yerr=summary_df[std_col], width=0.4, label=legend_label)

plt.xlabel('Models')
plt.ylabel('R²')
plt.title('Model Evaluation: Train and Test R²')
plt.xticks(ticks=positions, labels=summary_df['Model'], rotation=45)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Training-set ratio: 0.8 (the next cell holds out the remaining 0.2 as the test set)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
import xgboost as xgb
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")

# Load the dataset: the first column is used as the regression target,
# every remaining column as an input feature.
data = pd.read_excel('EL.xlsx')
X = data.iloc[:, 1:]
Y = data.iloc[:, 0]

# One metrics record is appended here for every (seed, model) pair.
results = []

# Random seeds; each seed produces a different train/test split.
seeds = range(100)

# Candidate regressors, keyed by the name that appears in the results table.
# Estimators with internal randomness get a fixed random_state so that
# re-running the cell reproduces the same numbers.
models = {
    'LinearRegression': LinearRegression(),
    # Polynomial feature expansion followed by ordinary least squares.
    'ResponseSurface': Pipeline([
        ('poly', PolynomialFeatures(degree=2)),
        ('linear', LinearRegression())
    ]),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'KNN': KNeighborsRegressor(),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': xgb.sklearn.XGBRegressor(learning_rate=0.1, n_estimators=100, max_depth=3),
    # random_state fixes the starting points of the kernel-optimizer restarts.
    'GaussianProcess': GaussianProcessRegressor(kernel=C(0.1, (0.001, 0.1)) * RBF(0.5, (1e-4, 10)), n_restarts_optimizer=10, alpha=0.1, random_state=42),
    'AdaBoost': AdaBoostRegressor(n_estimators=50, random_state=42)
}

# Hyper-parameter grids for the models that are tuned with a grid search.
# Models without an entry are fitted with the settings given above.
param_grids = {
    'ResponseSurface': {
        'poly__degree': [2, 3, 4, 5]
    },
    'Ridge': {
        'alpha': [0.1, 1.0, 10.0]
    },
    'Lasso': {
        'alpha': [0.1, 1.0, 10.0]
    },
    'KNN': {'n_neighbors': np.arange(1, 100)},
    'RandomForest': {
        'n_estimators': [50, 75, 100],
        # 1.0 (use every feature) replaces 'auto': for regressors 'auto' meant
        # the same thing, but scikit-learn 1.3 removed it, so those candidates
        # would fail to fit.
        'max_features': [1.0, 'sqrt'],
        'max_depth': [3, 4, 5, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'bootstrap': [True]
    },
    'XGBoost': {
        'learning_rate': [0.1],
        'n_estimators': [50, 75, 100],
        'max_depth': [1, 2, 3]
    }
}

# Number of cross-validation folds used by every grid search.
CV_FOLDS = 10

# Evaluate every model once per seed, each seed giving a different split.
for seed in seeds:
    # Split the data; random_state=seed makes the split reproducible.
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=seed)

    # Size of the smallest training fold inside the K-fold search:
    # the training set minus the largest validation fold.
    n_train = len(x_train)
    min_fold_train_size = n_train - (n_train + CV_FOLDS - 1) // CV_FOLDS

    for model_name, model in models.items():
        if model_name in param_grids:
            grid_spec = param_grids[model_name]
            if 'n_neighbors' in grid_spec:
                # A k-NN model cannot return more neighbours than the number
                # of samples it was fitted on, so larger candidates can only
                # fail (silently, since warnings are suppressed). Drop them.
                candidate_k = np.asarray(grid_spec['n_neighbors'])
                grid_spec = {**grid_spec, 'n_neighbors': candidate_k[candidate_k <= min_fold_train_size]}
            # Tune on the training split only; the fitted search object
            # predicts with the best estimator refitted on all of x_train.
            grid = GridSearchCV(model, grid_spec, cv=CV_FOLDS, n_jobs=-1)
            model = grid.fit(x_train, y_train)
            best_params = model.best_params_
        else:
            # No grid defined: fit the model with its preset parameters.
            model.fit(x_train, y_train)
            best_params = {}

        # Score the fitted model on both splits.
        y_train_pred = model.predict(x_train)
        y_test_pred = model.predict(x_test)

        train_mse = mean_squared_error(y_train, y_train_pred)
        train_rmse = np.sqrt(train_mse)
        train_r2 = r2_score(y_train, y_train_pred)

        test_mse = mean_squared_error(y_test, y_test_pred)
        test_rmse = np.sqrt(test_mse)
        test_r2 = r2_score(y_test, y_test_pred)

        # Record the metrics for this (seed, model) pair.
        results.append({
            'Seed': seed,
            'Model': model_name,
            'Train_RMSE': train_rmse,
            'Train_R2': train_r2,
            'Test_RMSE': test_rmse,
            'Test_R2': test_r2,
            'Best_Params': best_params
        })

# Turn the per-run records into a table.
results_df = pd.DataFrame(results)

# Save the raw per-seed metrics to Excel.
results_df.to_excel('EL_model_evaluation_results_0.8.xlsx', index=False)

# Mean and standard deviation of R² for each model across all seeds.
summary_df = results_df.groupby('Model').agg(
    Train_R2_Mean=('Train_R2', 'mean'),
    Train_R2_Std=('Train_R2', 'std'),
    Test_R2_Mean=('Test_R2', 'mean'),
    Test_R2_Std=('Test_R2', 'std'),
).reset_index()

# Print the summary table.
print(summary_df)

# Grouped bar chart: train and test R² side by side, std as error bars.
positions = np.arange(len(summary_df))
bar_specs = [
    (-0.2, 'Train_R2_Mean', 'Train_R2_Std', 'Train R²'),
    (0.2, 'Test_R2_Mean', 'Test_R2_Std', 'Test R²'),
]

plt.figure(dpi=300, figsize=(14, 8))
for shift, mean_col, std_col, legend_label in bar_specs:
    plt.bar(positions + shift, summary_df[mean_col], yerr=summary_df[std_col], width=0.4, label=legend_label)

plt.xlabel('Models')
plt.ylabel('R²')
plt.title('Model Evaluation: Train and Test R²')
plt.xticks(ticks=positions, labels=summary_df['Model'], rotation=45)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Training-set ratio: 0.9 (the next cell holds out the remaining 0.1 as the test set)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
import xgboost as xgb
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")

# Load the dataset: the first column is used as the regression target,
# every remaining column as an input feature.
data = pd.read_excel('EL.xlsx')
X = data.iloc[:, 1:]
Y = data.iloc[:, 0]

# One metrics record is appended here for every (seed, model) pair.
results = []

# Random seeds; each seed produces a different train/test split.
seeds = range(100)

# Candidate regressors, keyed by the name that appears in the results table.
# Estimators with internal randomness get a fixed random_state so that
# re-running the cell reproduces the same numbers.
models = {
    'LinearRegression': LinearRegression(),
    # Polynomial feature expansion followed by ordinary least squares.
    'ResponseSurface': Pipeline([
        ('poly', PolynomialFeatures(degree=2)),
        ('linear', LinearRegression())
    ]),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'KNN': KNeighborsRegressor(),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': xgb.sklearn.XGBRegressor(learning_rate=0.1, n_estimators=100, max_depth=3),
    # random_state fixes the starting points of the kernel-optimizer restarts.
    'GaussianProcess': GaussianProcessRegressor(kernel=C(0.1, (0.001, 0.1)) * RBF(0.5, (1e-4, 10)), n_restarts_optimizer=10, alpha=0.1, random_state=42),
    'AdaBoost': AdaBoostRegressor(n_estimators=50, random_state=42)
}

# Hyper-parameter grids for the models that are tuned with a grid search.
# Models without an entry are fitted with the settings given above.
param_grids = {
    'ResponseSurface': {
        'poly__degree': [2, 3, 4, 5]
    },
    'Ridge': {
        'alpha': [0.1, 1.0, 10.0]
    },
    'Lasso': {
        'alpha': [0.1, 1.0, 10.0]
    },
    'KNN': {'n_neighbors': np.arange(1, 100)},
    'RandomForest': {
        'n_estimators': [50, 75, 100],
        # 1.0 (use every feature) replaces 'auto': for regressors 'auto' meant
        # the same thing, but scikit-learn 1.3 removed it, so those candidates
        # would fail to fit.
        'max_features': [1.0, 'sqrt'],
        'max_depth': [3, 4, 5, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'bootstrap': [True]
    },
    'XGBoost': {
        'learning_rate': [0.1],
        'n_estimators': [50, 75, 100],
        'max_depth': [1, 2, 3]
    }
}

# Number of cross-validation folds used by every grid search.
CV_FOLDS = 10

# Evaluate every model once per seed, each seed giving a different split.
for seed in seeds:
    # Split the data; random_state=seed makes the split reproducible.
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=seed)

    # Size of the smallest training fold inside the K-fold search:
    # the training set minus the largest validation fold.
    n_train = len(x_train)
    min_fold_train_size = n_train - (n_train + CV_FOLDS - 1) // CV_FOLDS

    for model_name, model in models.items():
        if model_name in param_grids:
            grid_spec = param_grids[model_name]
            if 'n_neighbors' in grid_spec:
                # A k-NN model cannot return more neighbours than the number
                # of samples it was fitted on, so larger candidates can only
                # fail (silently, since warnings are suppressed). Drop them.
                candidate_k = np.asarray(grid_spec['n_neighbors'])
                grid_spec = {**grid_spec, 'n_neighbors': candidate_k[candidate_k <= min_fold_train_size]}
            # Tune on the training split only; the fitted search object
            # predicts with the best estimator refitted on all of x_train.
            grid = GridSearchCV(model, grid_spec, cv=CV_FOLDS, n_jobs=-1)
            model = grid.fit(x_train, y_train)
            best_params = model.best_params_
        else:
            # No grid defined: fit the model with its preset parameters.
            model.fit(x_train, y_train)
            best_params = {}

        # Score the fitted model on both splits.
        y_train_pred = model.predict(x_train)
        y_test_pred = model.predict(x_test)

        train_mse = mean_squared_error(y_train, y_train_pred)
        train_rmse = np.sqrt(train_mse)
        train_r2 = r2_score(y_train, y_train_pred)

        test_mse = mean_squared_error(y_test, y_test_pred)
        test_rmse = np.sqrt(test_mse)
        test_r2 = r2_score(y_test, y_test_pred)

        # Record the metrics for this (seed, model) pair.
        results.append({
            'Seed': seed,
            'Model': model_name,
            'Train_RMSE': train_rmse,
            'Train_R2': train_r2,
            'Test_RMSE': test_rmse,
            'Test_R2': test_r2,
            'Best_Params': best_params
        })

# Turn the per-run records into a table.
results_df = pd.DataFrame(results)

# Save the raw per-seed metrics to Excel.
results_df.to_excel('EL_model_evaluation_results_0.9.xlsx', index=False)

# Mean and standard deviation of R² for each model across all seeds.
summary_df = results_df.groupby('Model').agg(
    Train_R2_Mean=('Train_R2', 'mean'),
    Train_R2_Std=('Train_R2', 'std'),
    Test_R2_Mean=('Test_R2', 'mean'),
    Test_R2_Std=('Test_R2', 'std'),
).reset_index()

# Print the summary table.
print(summary_df)

# Grouped bar chart: train and test R² side by side, std as error bars.
positions = np.arange(len(summary_df))
bar_specs = [
    (-0.2, 'Train_R2_Mean', 'Train_R2_Std', 'Train R²'),
    (0.2, 'Test_R2_Mean', 'Test_R2_Std', 'Test R²'),
]

plt.figure(dpi=300, figsize=(14, 8))
for shift, mean_col, std_col, legend_label in bar_specs:
    plt.bar(positions + shift, summary_df[mean_col], yerr=summary_df[std_col], width=0.4, label=legend_label)

plt.xlabel('Models')
plt.ylabel('R²')
plt.title('Model Evaluation: Train and Test R²')
plt.xticks(ticks=positions, labels=summary_df['Model'], rotation=45)
plt.legend()
plt.tight_layout()
plt.show()
