In [50]:
import pandas as pd
import numpy as np

In [51]:
# Read the training and evaluation CSVs and tag every row with its origin so the
# two sets can be split apart again after they have been processed together.
data = pd.read_csv("./data/train_dataset.csv").assign(is_train=True)
evaluation = pd.read_csv("./data/evaluation_public.csv").assign(is_train=False)
sample = pd.read_csv("./data/sample_submission.csv")

# Stack training rows on top of evaluation rows under a fresh 0..n-1 index.
data = pd.concat([data, evaluation], ignore_index=True)

# 特征处理

In [52]:
# Parse the raw 'time' column once, then expose its calendar parts as columns.
data['date'] = pd.to_datetime(data['time'])
stamp = data['date'].dt
for part in ('hour', 'year', 'month', 'minute', 'weekday', 'day'):
    data[part] = getattr(stamp, part)
# Seconds elapsed since midnight.
data['ts'] = 3600 * data['hour'] + 60 * data['minute'] + stamp.second

In [53]:
import gc

# Candidate model inputs: the raw measurement columns plus the calendar
# features ('weekday', 'hour', 'ts') derived from the timestamp.
features = [ 'JS_NH3', 'CS_NH3', 'JS_TN', 'CS_TN', 'JS_LL', 'CS_LL', 'MCCS_NH4', 'MCCS_NO3', 'JS_COD', 'CS_COD', 'JS_SW', 'CS_SW', 'B_HYC_NH4', 'B_HYC_XD', 'B_HYC_MLSS', 'B_HYC_JS_DO', 'B_HYC_DO', 'B_CS_MQ_SSLL', 'B_QY_ORP', 'N_HYC_NH4', 'N_HYC_XD', 'N_HYC_MLSS', 'N_HYC_JS_DO', 'N_HYC_DO', 'N_CS_MQ_SSLL', 'N_QY_ORP','weekday','hour', 'ts']

# Columns that must never be used as inputs. Of these only 'CS_LL', 'CS_NH3'
# and 'JS_SW' actually appear in the list above; the others are a safety net.
excluded_columns = ['time', 'Label1', 'Label2', 'CS_LL', 'CS_NH3', 'JS_SW']
features = [name for name in features if name not in excluded_columns]

# Forward-fill gaps column by column (leading NaNs stay NaN).
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
# NOTE(review): `data` holds train and evaluation rows stacked together, so the
# first evaluation rows can inherit values from the last training rows.
for name in features:
    data[name] = data[name].ffill()

# 提取特征的diff值

In [54]:
# Trailing-window statistics for a subset of the measurement columns.
# `add_featuers` (original spelling kept) collects the names of the engineered
# columns in the order they are appended to `features`.
add_featuers = []
rolling_stats = ('mean', 'max', 'min', 'std', 'corr', 'skew')
for f in ['JS_NH3', 'CS_TN', 'JS_LL', 'JS_COD', 'CS_COD', 'B_HYC_NH4', 'B_HYC_XD', 'B_HYC_MLSS', 'B_HYC_JS_DO', 'B_HYC_DO', 'B_CS_MQ_SSLL',
          'N_HYC_NH4', 'N_HYC_XD', 'N_HYC_MLSS', 'N_HYC_DO', 'N_CS_MQ_SSLL', 'N_QY_ORP']:
    for r in [15]:
        train_rolling = data[f].rolling(window=r, center=False)

        # One column per statistic; windows that yield NaN (including the
        # first r-1 rows) are filled with 0.
        # NOTE(review): corr() is called without a second series, so it
        # presumably correlates each window with itself -- confirm this
        # feature is intended.
        stat_names = {stat: 'rolling{}_{}_{}'.format(r, f, stat) for stat in rolling_stats}
        for stat, column in stat_names.items():
            data[column] = getattr(train_rolling, stat)().fillna(0).values

        # Offset of the current value from the window mean / max / min.
        diff_names = []
        for stat in ('mean', 'max', 'min'):
            column = f'{f}_{stat_names[stat]}_cha'
            data[column] = data[f] - data[stat_names[stat]]
            diff_names.append(column)

        add_featuers.extend(stat_names.values())
        add_featuers.extend(diff_names)
features.extend(add_featuers)

# Quantile-bin every feature except the calendar ones: the requested number of
# bins is one tenth of the column's distinct-value count, and duplicate bin
# edges are dropped.
for f in features:
    if f not in ['weekday','hour', 'ts']:
        q = len(data[f].drop_duplicates())
        data[f] = pd.qcut(data[f], q=int(q/10), labels=False, duplicates="drop")

# Split the jointly processed frame back into its training and evaluation parts.
all_train = data[data['is_train']].reset_index(drop=True)
test = data[~data['is_train']].reset_index(drop=True)


# 删除分布不均衡的特征

In [55]:
# Training rows with no Label1 are set aside as the pseudo-labelling pool;
# the rest of the notebook trains only on rows where both labels are present.
label1_missing = all_train['Label1'].isna()
pseudo = all_train.loc[label1_missing].copy().reset_index(drop=True)
all_train = all_train.dropna(subset=['Label1', 'Label2']).reset_index(drop=True)
def transform(x: pd.Series, c=20):
    """Compress a target into log space: ``log(1 + x / c)``.

    Args:
        x: Values to transform (anything NumPy can divide and pass to log1p).
        c: Scale applied before the logarithm.

    Returns:
        The transformed values, in the same container type NumPy returns for ``x``.
    """
    scaled = x / c
    return np.log1p(scaled)

def inverse_transform(x: pd.Series, c = 20):
    """Undo ``transform``: map log-space values back via ``(exp(x) - 1) * c``.

    Args:
        x: Log-space values (anything NumPy can pass to expm1).
        c: Scale that was used in the forward transform.

    Returns:
        The values on the original scale.
    """
    expanded = np.expm1(x)
    return expanded * c

# Scale used for Label2 in both the forward and the inverse transform.
label_c = 8

# Keep only fully labelled rows, then move both targets into log space
# (Label1 with transform's default scale, Label2 with label_c).
all_train = (
    all_train
    .dropna(subset=['Label1', 'Label2'])
    .reset_index(drop=True)
    .assign(
        Label1=lambda frame: transform(frame['Label1']),
        Label2=lambda frame: transform(frame['Label2'], c=label_c),
    )
)

In [56]:
# inverse_transform(pd.Series([6,6.1]))

# 伪标签进行预测

In [80]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
labels = ['Label1', 'Label2']
score_list = []
test_size = 10000

# Per-label LightGBM hyper-parameters. Both models share the same training
# procedure below; only these settings and the inverse-transform scale differ.
# NOTE(review): `subsample` is set without a bagging frequency; per the LightGBM
# parameter docs bagging is only active when that frequency is > 0 -- confirm.
model_params = {
    'Label1': dict(
        boosting="gbdt",
        max_depth=4,
        learning_rate=0.01,
        n_estimators=10000,
        min_data_in_leaf=5,
        subsample=0.8,
        feature_fraction=0.3,
        bagging_seed=1,
        reg_alpha=1,
        reg_lambda=1,  # left unchanged on purpose
        min_sum_hessian_in_leaf=1e-8,
        random_state=1212,
    ),
    'Label2': dict(
        boosting="gbdt",
        max_depth=4,
        learning_rate=0.02,
        n_estimators=10000,
        min_data_in_leaf=40,
        subsample=0.4,
        feature_fraction=0.3,
        bagging_seed=11,
        reg_alpha=.1,
        reg_lambda=.1,  # left unchanged on purpose
        min_sum_hessian_in_leaf=1e-8,
        random_state=1589,
    ),
}

# Extra keyword arguments for inverse_transform, mirroring how each label was
# transformed: Label1 used the default scale, Label2 used label_c.
inverse_kwargs = {'Label1': {}, 'Label2': {'c': label_c}}

# Hold out the last `test_size` labelled rows for validation. The feature
# slices do not depend on the label, so they are built once.
X_train = all_train[features][:-test_size]
X_test = all_train[features][-test_size:]

for label in labels:
    y_train = all_train[label][:-test_size]
    y_test = all_train[label][-test_size:]

    model = lgb.LGBMRegressor(**model_params[label])
    # NOTE(review): `early_stopping_rounds` / `verbose` are fit() keywords of
    # older LightGBM releases; newer releases expect callbacks instead --
    # confirm against the installed version before upgrading.
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], eval_metric=['rmse'],
              early_stopping_rounds=150, verbose=200)
    test_pred = model.predict(X_test)

    # RMSE on the original label scale (predictions and targets are in log space).
    # NOTE(review): the same hold-out drives early stopping and this score.
    holdout_rmse = np.sqrt(mean_squared_error(
        inverse_transform(y_test, **inverse_kwargs[label]),
        inverse_transform(test_pred, **inverse_kwargs[label]),
    ))
    score_list.append(holdout_rmse)
    print(f"score_list = {score_list}")

    # Pseudo labels are stored in log space, i.e. on the same scale as
    # all_train[label] after the transform.
    pseudo[label] = model.predict(pseudo[features])

# Summary score: mean per-label RMSE mapped through 1000 / (1 + loss).
loss = np.mean(score_list)
score = 1000/(1+loss)
print(score)

Training until validation scores don't improve for 150 rounds
[200]	training's rmse: 0.133487	training's l2: 0.0178188	valid_1's rmse: 0.11992	valid_1's l2: 0.0143807
[400]	training's rmse: 0.108866	training's l2: 0.0118519	valid_1's rmse: 0.116823	valid_1's l2: 0.0136476
Early stopping, best iteration is:
[362]	training's rmse: 0.111394	training's l2: 0.0124087	valid_1's rmse: 0.116735	valid_1's l2: 0.0136271
score_list = [1532.3947407984178]
Training until validation scores don't improve for 150 rounds
[200]	training's rmse: 0.107732	training's l2: 0.0116063	valid_1's rmse: 0.131644	valid_1's l2: 0.0173301
Early stopping, best iteration is:
[206]	training's rmse: 0.107197	training's l2: 0.0114912	valid_1's rmse: 0.131471	valid_1's l2: 0.0172846
score_list = [1532.3947407984178, 1670.9157593161995]
0.6239645113721378


In [58]:
# max(train_pseudo['Label2'])

In [59]:
# Recompute the summary score from the current contents of score_list:
# the mean of the collected values mapped through 1000 / (1 + loss).
loss = np.asarray(score_list).mean()
score = 1000 / (loss + 1)
print(score)

0.6000426182480831


# 对加入伪标签的数据进行训练

In [60]:
# train_pseudo = pd.concat([all_train[:test_size], pseudo])
# train_pseudo = train_pseudo.sample(len(train_pseudo)).reset_index(drop=True)

In [61]:
# import lightgbm as lgb
# from sklearn.metrics import mean_squared_error
# labels = ['Label1', 'Label2']
# score_list = []
# for label in reversed(labels):
#     X_train = train_pseudo[features]
#     y_train = train_pseudo[label]
#
#     X_test = all_train[features][-test_size:]
#     y_test = all_train[label][-test_size:]
#     if label == "Label1":
#         model = lgb.LGBMRegressor(
#             boosting="gbdt",
#             max_depth=6,
#             learning_rate=0.03,
#             n_estimators=10000,
#             min_data_in_leaf=50,
#             subsample = 0.8,
#             feature_fraction=0.3,
#             bagging_seed=1,
#             reg_alpha=1,
#             reg_lambda=1,  # 此处不改了
#             min_sum_hessian_in_leaf=0.1,
#             random_state=1212
#         )
#         model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], eval_metric=['rmse'],
#               early_stopping_rounds=150, verbose=200)
#         test_pred = model.predict(X_test)
#
#         score_list.append(np.sqrt(mean_squared_error(inverse_transform(y_test), inverse_transform(test_pred))))
#         print(f"score_list = {score_list}")
#
#     else:
#         model = lgb.LGBMRegressor(
#             boosting="gbdt",
#             max_depth=4,
#             learning_rate=0.02,
#             n_estimators=10000,
#             min_child_weight=1,
#             min_data_in_leaf=40,
#             subsample = 0.4,
#             feature_fraction=0.3,
#             bagging_seed=11,
#             reg_alpha=1,
#             reg_lambda=1,  # 此处不改了
#             min_sum_hessian_in_leaf=0.01,
#             random_state=222
#         )
#
#         model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], eval_metric=['rmse'],
#               early_stopping_rounds=150, verbose=200)
#         test_pred = model.predict(X_test)
#         score_list.append(np.sqrt(mean_squared_error(inverse_transform(y_test, c=label_c), inverse_transform(test_pred, c=label_c))))
#         print(f"score_list = {score_list}")
#
# loss = np.mean(score_list)
# score = 1000/(1+loss)
# print(score)

Training until validation scores don't improve for 150 rounds
[200]	training's rmse: 0.0528962	training's l2: 0.002798	valid_1's rmse: 0.160047	valid_1's l2: 0.0256152
Early stopping, best iteration is:
[83]	training's rmse: 0.0860057	training's l2: 0.00739699	valid_1's rmse: 0.15818	valid_1's l2: 0.0250208
score_list = [2008.1427877617364]
Training until validation scores don't improve for 300 rounds
[200]	training's rmse: 0.0456254	training's l2: 0.00208168	valid_1's rmse: 0.117215	valid_1's l2: 0.0137392
[400]	training's rmse: 0.0400512	training's l2: 0.0016041	valid_1's rmse: 0.116002	valid_1's l2: 0.0134564
[600]	training's rmse: 0.0371497	training's l2: 0.0013801	valid_1's rmse: 0.115441	valid_1's l2: 0.0133267
[800]	training's rmse: 0.035104	training's l2: 0.00123229	valid_1's rmse: 0.115165	valid_1's l2: 0.0132631
[1000]	training's rmse: 0.0333195	training's l2: 0.00111019	valid_1's rmse: 0.114889	valid_1's l2: 0.0131995
[1200]	training's rmse: 0.0317922	training's l2: 0.001010