In [21]:
import pandas as pd
import numpy as np

In [22]:
data = pd.read_csv("./data/train_dataset.csv")

data['is_train'] = True
evaluation = pd.read_csv("./data/evaluation_public.csv")
evaluation['is_train'] = False
sample = pd.read_csv("./data/sample_submission.csv")

data = pd.concat([data, evaluation]).reset_index(drop=True)

# 特征处理

In [23]:
data['date'] = pd.to_datetime(data['time'])
data['hour'] = data['date'].dt.hour
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
data['minute'] = data['date'].dt.minute
data['weekday'] = data['date'].dt.weekday
data['day'] = data['date'].dt.day
data['hour'] = data['date'].dt.hour
# data['ts'] = data['hour']*3600 + data['minute']*60 + data['date'].dt.second
data['ts'] = data['hour']*60 + data['minute']


In [24]:
import gc
features = [ 'JS_NH3', 'CS_NH3', 'JS_TN', 'CS_TN', 'JS_LL', 'CS_LL', 'MCCS_NH4', 'MCCS_NO3', 'JS_COD', 'CS_COD', 'JS_SW', 'CS_SW', 'B_HYC_NH4', 'B_HYC_XD', 'B_HYC_MLSS', 'B_HYC_JS_DO', 'B_HYC_DO', 'B_CS_MQ_SSLL', 'B_QY_ORP', 'N_HYC_NH4', 'N_HYC_XD', 'N_HYC_MLSS', 'N_HYC_JS_DO', 'N_HYC_DO', 'N_CS_MQ_SSLL', 'N_QY_ORP','weekday','hour', 'ts']

filter_set = {'time', 'Label1', 'Label2','CS_LL','CS_NH3', 'JS_SW', 'B_QY_ORP'}
features = [f for f in features if f not in filter_set] #
for f in features:
    data[f] = data[f].fillna(method='ffill')

# 提取特征的diff值

In [25]:
import warnings
warnings.filterwarnings("ignore")
df_list = []
i = 0
add_featuers = []
length = 0
# JS_TN, MCCS_NO3,MCCS_NH4,'B_QY_ORP'
filter_set = {'weekday', 'hour', 'ts', 'JS_TN', 'CS_SW', 'MCCS_NH4', 'N_HYC_JS_DO', 'MCCS_NO3'}
for f in features:
    if f in filter_set: continue
    for r in [15]:
        train_rolling = data[f].rolling(window=r, center=False)
        f_mean_name = 'rolling{}_{}_mean'.format(r,f)
        f_max_name = 'rolling{}_{}_max'.format(r,f)
        f_min_name = 'rolling{}_{}_min'.format(r,f)
        f_std_name = 'rolling{}_{}_std'.format(r,f)
        f_corr_name = 'rolling{}_{}_corr'.format(r,f)
        f_cov_name = 'rolling{}_{}_cov'.format(r,f)
        f_skew_name = 'rolling{}_{}_skew'.format(r,f)
        f_kurt_name = 'rolling{}_{}_kurt'.format(r,f)
        data[f_mean_name] = train_rolling.mean().fillna(0).values
        data[f_max_name] = train_rolling.max().fillna(0).values
        data[f_min_name] = train_rolling.min().fillna(0).values
        data[f_std_name] = train_rolling.std().fillna(0).values
        data[f_corr_name] = train_rolling.corr().fillna(0).values
        data[f_skew_name] = train_rolling.skew().fillna(0).values

        data[f'{f}_{f_mean_name}_cha'] = data[f] - data[f_mean_name]
        data[f'{f}_{f_max_name}_cha'] = data[f] - data[f_max_name]
        data[f'{f}_{f_min_name}_cha'] = data[f] - data[f_min_name]

        data[f'{f}_{f_min_name}_radio'] = data[f'{f}_{f_mean_name}_cha']/data[f]
        data[f'{f}_{f_max_name}_radio'] = data[f'{f}_{f_max_name}_cha']/data[f]
        data[f'{f}_{f_mean_name}_radio'] = data[f'{f}_{f_min_name}_cha']/data[f]
        data[f'{f}_diff'] = data[f].diff()

        if i == 0:
            add_featuers.append(f_mean_name)
            add_featuers.append(f_max_name)
            add_featuers.append(f_min_name)
            add_featuers.append(f_std_name)
            add_featuers.append(f_corr_name)
            add_featuers.append(f_skew_name)
            add_featuers.append(f'{f}_{f_mean_name}_cha')
            add_featuers.append(f'{f}_{f_max_name}_cha')
            add_featuers.append(f'{f}_{f_min_name}_cha')
            add_featuers.append(f'{f}_diff')

features.extend(add_featuers)


gc.collect()
# 对所有的特征进行划分
for f in features:
    if f not in ['weekday','hour', 'ts']:
        q = len(data[f].drop_duplicates())

        data[f] = pd.qcut(data[f], q=int(q/10), labels=False, duplicates="drop")

all_train = data[data['is_train']].reset_index(drop=True)
test = data[~data['is_train']].reset_index(drop=True)

# 删除分布不均衡的特征

In [26]:
all_train = all_train.dropna(subset=['Label1', 'Label2']).reset_index(drop=True)
def transform(x: pd.Series, c=20):
    return np.log1p(x/c)

def inverse_transform(x: pd.Series, c = 20):
    return np.expm1(x)*c

all_train = all_train.dropna(subset=['Label1', 'Label2']).reset_index(drop=True)
all_train['Label1'] = transform(all_train['Label1'])
label_c = 8
all_train['Label2'] = transform(all_train['Label2'], c=label_c)

In [27]:
import matplotlib.pyplot as plt
def plot(y1, y2, x_title, y_title, title):
    plt.figure(figsize=(16,8))
    plt.plot(range(len(y1)), y1)
    plt.plot(range(len(y1)), y2)
    plt.legend()
    plt.xlabel(x_title)
    plt.ylabel(y_title)
    plt.title(title)
    plt.show()

In [33]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
score_list = []
labels = ['Label1','Label2']
test_size = 4000

for label in labels:
    X_train = all_train[features][:-test_size]
    y_train = all_train[label][:-test_size]

    X_test = all_train[features][-test_size:]
    y_test = all_train[label][-test_size:]
    if label == "Label1":
        model = xgb.XGBRegressor(
            max_bin=120,
            boosting="gbdt",
            max_depth=4,
            learning_rate=0.05,
            n_estimators=10000,
            subsample = 0.8,
            colsample_bytree=0.3,
            min_child_weight=0.01,
            bagging_seed=12,
            reg_alpha=1.2,
            reg_lambda=1.2,  # 此处不改了
            gpu_id=0,
            tree_method='gpu_hist',
            random_state=3333
        )
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], eval_metric=['rmse'],
              early_stopping_rounds=150, verbose=200)

        test_pred = model.predict(X_test)
        score_list.append(np.sqrt(mean_squared_error(inverse_transform(y_test), inverse_transform(test_pred))))
        print(f"score_list = {score_list}")
        model2 = xgb.XGBRegressor(
            max_bin=120,
            boosting="gbdt",
            max_depth=4,
            learning_rate=0.05,
            n_estimators=int(1.2*model.best_iteration),
            subsample = 0.8,
            colsample_bytree=0.3,
            min_child_weight=0.01,
            bagging_seed=12,
            reg_alpha=1.2,
            reg_lambda=1.2,  # 此处不改了
            gpu_id=0,
            tree_method='gpu_hist',
            random_state=3333
        )
        model2.fit(all_train[features], all_train[label])
        test[label] = inverse_transform(model2.predict(test[features]))
    else:
        model = xgb.XGBRegressor(
            max_bin=100,
            boosting="gbdt",
            max_depth=4,
            learning_rate=0.03,
            n_estimators=10000,
            subsample = 0.7,
            colsample_bytree=0.4,
            min_child_weight=0.01,
            bagging_seed=1,
            reg_alpha=1.,
            reg_lambda=1.,  # 此处不改了
            gpu_id=0,
            tree_method='gpu_hist',
            random_state=1212
        )

        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], eval_metric=['rmse'],
              early_stopping_rounds=150, verbose=200)
        test_pred = model.predict(X_test)
        score_list.append(np.sqrt(mean_squared_error(inverse_transform(y_test, c=label_c), inverse_transform(test_pred, c=label_c))))
        print(f"score_list = {score_list}")
        model2 = xgb.XGBRegressor(
            max_bin=100,
            boosting="gbdt",
            max_depth=4,
            learning_rate=0.03,
            n_estimators=int(1.2*model.best_iteration),
            subsample = 0.7,
            colsample_bytree=0.4,
            min_child_weight=0.01,
            bagging_seed=1,
            reg_alpha=1.,
            reg_lambda=1.,  # 此处不改了
            gpu_id=0,
            tree_method='gpu_hist',
            random_state=1212
        )
        model2.fit(all_train[features], all_train[label])

        test[label] = inverse_transform(model2.predict(test[features]), c=label_c)
loss = np.mean(score_list)
score = 1000/(1+loss)
print(score)

Parameters: { "bagging_seed", "boosting" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-rmse:5.70356	validation_1-rmse:5.64786
[200]	validation_0-rmse:0.09191	validation_1-rmse:0.10653
[400]	validation_0-rmse:0.07775	validation_1-rmse:0.10456
[600]	validation_0-rmse:0.06880	validation_1-rmse:0.10326
[800]	validation_0-rmse:0.06316	validation_1-rmse:0.10263
[1000]	validation_0-rmse:0.05907	validation_1-rmse:0.10200
[1200]	validation_0-rmse:0.05586	validation_1-rmse:0.10193
[1400]	validation_0-rmse:0.05321	validation_1-rmse:0.10186
[1600]	validation_0-rmse:0.05112	validation_1-rmse:0.10186
[1724]	validation_0-rmse:0.04989	validation_1-rmse:0.10194
score_list = [1324.9579942013347]
Parameters: { "bagging_seed", "boosting" } might not

KeyboardInterrupt: 

In [29]:
loss = np.mean(score_list)
score = 1000/(1+loss)
print(score)

0.6368308776775738


In [30]:
test[['time'] + labels].to_csv(f"./res/xgboost_res.csv", index=False)

In [31]:
loss = np.mean(score_list)
score = 1000/(1+loss)
print(score)

0.6368308776775738


In [32]:
test[['time'] + labels]

Unnamed: 0,time,Label1,Label2
0,2022/7/18 2:40,9893.540039,9087.713867
1,2022/7/18 2:42,9907.083008,9334.085938
2,2022/7/18 2:44,10312.958984,9408.533203
3,2022/7/18 2:46,10249.574219,9551.273438
4,2022/7/18 2:48,10107.681641,9286.794922
...,...,...,...
9995,2022/7/31 23:50,10742.359375,11269.000977
9996,2022/7/31 23:52,10797.137695,11517.797852
9997,2022/7/31 23:54,10866.143555,11444.873047
9998,2022/7/31 23:56,11019.594727,11510.990234
