- 我们生成只在一个区间相关的数据集（且不是线性相关）

In [21]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sko.DE import DE
# Uniform noise below the correlated range: x in [0, 1), y in [0, 1000**3).
random_head = pd.DataFrame(np.random.random(size=(1000, 2)) * np.array([1, 1000**3]))
# Uniform noise above the correlated range: same y spread, x shifted to [1000, 1001).
random_tail = pd.DataFrame(
    np.random.random(size=(1000, 2)) * np.array([1, 1000**3]) + np.array([1000, 0])
)
# The correlated (but non-linear) middle part: y = x**3 for x = 0 .. 999.
regular_mid = pd.DataFrame([(x, x**3) for x in range(1000)])
# Stack the three parts and index the rows by their x value (column 0).
sample = pd.concat([random_head, regular_mid, random_tail])
sample.index = sample[0]

- 排序和找`x`最大最小值，方便后续处理
- 这里最值要在`DE`的`ub, lb`使用

In [22]:
# Drop incomplete rows, then order the rows by the index.
sample = sample.dropna().sort_index()
# Smallest and largest value found in column 0 (x) of the cleaned sample.
min_x = sample.values[:, 0].min()
max_x = sample.values[:, 0].max()

- 构造待优化目标
  - 这里输入
    - 数据集`sample`
    - 本次求值时假定“相关”的开区间 `(low, high)` 的下界与上界 `low, high`
    - 幂次`alpha`
  - 只考查这一认为“相关”的区间，进行拟合
  - 区间之外的样本不参与拟合：评估全局得分时，把它们的自变量统一替换为常数 $(\bar y_{\text{out}} - \hat a)/\hat b$，使 $\hat y = \hat b x + \hat a$ 在区间外恒等于区间外 $y$ 的均值（即平凡预测），这显然是合理做法
  - 注意特判`inside`或`outside`无数据的情况

In [25]:
def get_info_from_hyperparams(sample, low, high, alpha):
    """Score the model ``y = b * x**alpha + a`` fitted on the open interval ``(low, high)``.

    Rows of ``sample.values`` are read as ``(x, y)`` pairs from columns 0 and 1.
    A ``LinearRegression`` is fitted on the rows with ``low < x < high`` using
    ``x**alpha`` as the single feature.  For the global score, every row outside
    the interval gets its feature replaced by the constant
    ``(mean(y_outside) - a) / b`` so that the model predicts the mean of the
    outside ``y`` values there.

    Args:
        sample: Object whose ``.values`` is a 2-D array with x in column 0 and
            y in column 1.
        low: Exclusive lower bound of the interval.
        high: Exclusive upper bound of the interval.
        alpha: Exponent applied to x before the linear fit.

    Returns:
        A tuple ``(local_score, global_score, b, a)`` where the scores come from
        ``model.score`` on the inside rows and on all rows respectively, ``b``
        is ``model.coef_`` (a one-element array) and ``a`` is
        ``model.intercept_``.

        ``(0, 0, nan, nan)`` is returned when fewer than two rows fall inside
        the interval, because no line can be fitted and scored on them.
        When rows exist outside the interval but the fitted slope cannot map
        them to their mean (zero or non-finite slope), ``global_score`` is 0.
    """
    full_array = sample.values
    x_all, y_all = full_array[:, 0], full_array[:, 1]

    inside_range_mask = (x_all > low) & (x_all < high)
    # With zero rows there is nothing to fit; with a single row the slope is 0
    # and the score is undefined, which used to blow up in the division below.
    if inside_range_mask.sum() < 2:
        return 0, 0, np.nan, np.nan

    powered_x = np.power(x_all, alpha)
    inside_x = powered_x[inside_range_mask].reshape(-1, 1)
    inside_y = y_all[inside_range_mask]
    model = LinearRegression()
    model.fit(inside_x, inside_y)
    local_score = model.score(inside_x, inside_y)
    b, a = model.coef_, model.intercept_
    slope = b[0]

    outside_range_mask = ~inside_range_mask
    if outside_range_mask.any():
        outside_y_mean = y_all[outside_range_mask].mean()
        # Feature value at which the fitted line predicts exactly outside_y_mean.
        with np.errstate(divide="ignore", invalid="ignore"):
            outside_x = (outside_y_mean - a) / slope
        if not (np.isfinite(slope) and np.isfinite(outside_x)):
            # A flat (or broken) fit cannot be steered to the outside mean;
            # report the global fit as worthless instead of crashing.
            return local_score, 0, b, a
        processed_sample = np.where(inside_range_mask, powered_x, outside_x)
    else:
        processed_sample = powered_x

    x_after_mask, y_after_mask = processed_sample.reshape(-1, 1), y_all

    # Sanity check: a least-squares fit with intercept reproduces the mean of y
    # inside the interval, and the constant feature reproduces it outside, so
    # the mean prediction must equal the overall mean of y.  The check is
    # two-sided, and its tolerance grows with the magnitude of the terms so
    # that rounding in large values does not trip it.
    predicted_mean = processed_sample.mean() * slope + a
    actual_mean = y_after_mask.mean()
    magnitude = max(abs(actual_mean), abs(a), abs(predicted_mean - a))
    assert abs(predicted_mean - actual_mean) <= max(1e-3, 1e-9 * magnitude)

    global_score = model.score(x_after_mask, y_after_mask)
    return local_score, global_score, b, a

# Try three (low, high, alpha) settings: the whole x range with the true
# exponent, the regular middle part with a wrong exponent, and the regular
# middle part with the true exponent.
for _demo_params in ((0, 2000, 3), (2, 999, 1.2), (2, 999, 3)):
    print(get_info_from_hyperparams(sample, *_demo_params))

(0.0876056988017686, 0.08815998180886686, array([0.1995945]), 331974203.0372608)
(0.8807328168235553, 0.39046208211956335, array([225738.61937446]), -159277903.83746284)
(1.0, 0.42382317700424943, array([1.]), 2.086162567138672e-07)


- 根据接口要求，定义由三元组到结果（且越小越好）的待优化目标，并进行优化
- 这里当出现更好结果时每次进行输出，避免你傻等啥也看不到

In [None]:
# Best (smallest) objective value seen so far across all calls to objective().
min_z = float('inf')


def objective(p):
    """Return the negated global score for the triple ``p = (low, high, alpha)``.

    The optimizer minimizes, so the global score is negated.  Each time a new
    best value is reached it is stored in the module-level ``min_z`` and
    printed, which gives visible progress during a long run.
    """
    global min_z
    low, high, alpha = p
    _, global_score, _, _ = get_info_from_hyperparams(sample, low, high, alpha)
    z = -global_score
    if z < min_z:
        min_z = z
        print(min_z)
    return z

# Search over (low, high, alpha): both interval bounds range over the observed
# x values, and the exponent ranges over [0.01, 10].
de = DE(
    func=objective,
    n_dim=3,
    size_pop=50,
    max_iter=100,
    lb=[min_x, min_x, 0.01],
    ub=[max_x, max_x, 10],
)
best_x, best_y = de.run()
print(best_x, best_y)