In [11]:
import pandas as pd
import numpy as np

In [12]:
# Load the training and evaluation tables, tag every row with its origin, and
# stack them into one frame so the feature steps below run over both at once.
data = pd.read_csv("./data/train_dataset.csv").assign(is_train=True)
evaluation = pd.read_csv("./data/evaluation_public.csv").assign(is_train=False)
# Sample submission file; loaded here but not referenced again in this section.
sample = pd.read_csv("./data/sample_submission.csv")

# ignore_index=True gives the stacked frame a fresh 0..n-1 index.
data = pd.concat([data, evaluation], ignore_index=True)

# 特征处理

In [13]:
# Parse the raw `time` column and expand it into calendar / time-of-day parts.
data['date'] = pd.to_datetime(data['time'])
stamp = data['date'].dt
for part in ('hour', 'year', 'month', 'minute', 'weekday', 'day'):
    data[part] = getattr(stamp, part)
# Seconds elapsed since midnight.
data['ts'] = data['hour']*3600 + data['minute']*60 + stamp.second

In [14]:
import gc

# Candidate model inputs: the raw measurement columns plus three of the
# time-derived columns created above.
features = [ 'JS_NH3', 'CS_NH3', 'JS_TN', 'CS_TN', 'JS_LL', 'CS_LL', 'MCCS_NH4', 'MCCS_NO3', 'JS_COD', 'CS_COD', 'JS_SW', 'CS_SW', 'B_HYC_NH4', 'B_HYC_XD', 'B_HYC_MLSS', 'B_HYC_JS_DO', 'B_HYC_DO', 'B_CS_MQ_SSLL', 'B_QY_ORP', 'N_HYC_NH4', 'N_HYC_XD', 'N_HYC_MLSS', 'N_HYC_JS_DO', 'N_HYC_DO', 'N_CS_MQ_SSLL', 'N_QY_ORP','weekday','hour', 'ts']
# Names that must never be used as model inputs. Only 'CS_LL', 'CS_NH3' and
# 'JS_SW' actually occur in the list above; the others are a safety net.
excluded = ['time', 'Label1', 'Label2', 'CS_LL', 'CS_NH3', 'JS_SW']
features = [f for f in features if f not in excluded]

# Forward-fill gaps in every selected column. `Series.ffill()` replaces the
# deprecated `fillna(method='ffill')` spelling and fills identically.
# Leading missing values (nothing earlier to copy from) stay missing.
# NOTE(review): `data` is the training rows followed by the evaluation rows, so
# a gap at the start of the evaluation part is filled from the last training
# row — confirm this is intended.
for f in features:
    data[f] = data[f].ffill()

# 提取滚动窗口统计特征，以及原始特征与滚动统计量的差值（diff）

In [15]:
# Rolling-window statistics for a subset of the measurement columns, plus the
# gap ("cha") between each raw reading and its rolling mean / max / min.
# Every new column name is appended to `features`.
# NOTE: re-running this cell appends the same names to `features` again.
rolling_sources = ['JS_NH3', 'CS_TN', 'JS_LL', 'JS_COD', 'CS_COD', 'B_HYC_NH4', 'B_HYC_XD', 'B_HYC_MLSS', 'B_HYC_JS_DO', 'B_HYC_DO', 'B_CS_MQ_SSLL',
                   'N_HYC_NH4', 'N_HYC_XD', 'N_HYC_MLSS', 'N_HYC_DO', 'N_CS_MQ_SSLL', 'N_QY_ORP']
rolling_windows = [15]
# Order matters: it fixes the column order in `data` and in `features`.
rolling_stats = ['mean', 'max', 'min', 'std', 'corr', 'skew']
diff_stats = ['mean', 'max', 'min']

add_featuers = []  # (sic) spelling kept in case other cells refer to this name
for f in rolling_sources:
    for r in rolling_windows:
        train_rolling = data[f].rolling(window=r, center=False)
        stat_cols = {stat: 'rolling{}_{}_{}'.format(r, f, stat) for stat in rolling_stats}

        # Rows without a full window (and any other undefined result) become 0.
        # NOTE(review): `corr()` is called without an `other` series, so it
        # presumably correlates each window with itself — confirm this
        # feature carries any signal.
        for stat, col in stat_cols.items():
            data[col] = getattr(train_rolling, stat)().fillna(0).values

        # Distance of the current reading from its rolling mean / max / min.
        diff_cols = []
        for stat in diff_stats:
            diff_col = f'{f}_{stat_cols[stat]}_cha'
            data[diff_col] = data[f] - data[stat_cols[stat]]
            diff_cols.append(diff_col)

        add_featuers.extend(stat_cols.values())
        add_featuers.extend(diff_cols)
features.extend(add_featuers)

# Discretise every feature except the calendar ones into quantile bins. The
# requested bin count is one tenth of the column's number of distinct values
# (a missing value counts as one distinct value); duplicate bin edges are dropped.
# NOTE(review): the requested bin count is 0 for a column with fewer than 10
# distinct values — confirm how pd.qcut treats q=0 for such columns.
calendar_cols = ('weekday', 'hour', 'ts')
for col in (name for name in features if name not in calendar_cols):
    n_distinct = data[col].nunique(dropna=False)
    data[col] = pd.qcut(data[col], q=n_distinct // 10, labels=False, duplicates="drop")

# Split the combined frame back into its training and evaluation parts using
# the `is_train` flag; each part gets a fresh 0..n-1 index.
all_train = data[data['is_train']].reset_index(drop=True)
test = data[~data['is_train']].reset_index(drop=True)


# 标签预处理：删除缺失标签的样本，并对标签做 log1p 变换

In [16]:
# Keep only the rows where both targets are present, then renumber the index.
has_both_labels = all_train[['Label1', 'Label2']].notna().all(axis=1)
all_train = all_train[has_both_labels].reset_index(drop=True)
def transform(x: pd.Series, c=20):
    """Compress values onto a log scale: ``log(1 + x / c)``.

    Args:
        x: Values to transform. Annotated as a Series; the body is plain
            NumPy arithmetic.
        c: Divisor applied to ``x`` before the logarithm.

    Returns:
        ``np.log1p(x / c)``, element-wise.
    """
    scaled = x / c
    return np.log1p(scaled)

def inverse_transform(x: pd.Series, c = 20):
    """Map log-scale values back to the original scale: ``(exp(x) - 1) * c``.

    Args:
        x: Log-scale values. Annotated as a Series; this notebook also passes
            the output of ``model.predict`` here.
        c: Scale factor; use the same value that was given to ``transform``.

    Returns:
        ``np.expm1(x) * c``, element-wise.
    """
    return c * np.expm1(x)

# Rows with a missing target were already dropped by the first statement of
# this cell, so only the target transform remains here.
# NOTE: the label columns are overwritten in place — re-running this cell
# applies the log transform a second time.
all_train['Label1'] = transform(all_train['Label1'])  # uses transform's default divisor
label_c = 8  # divisor for Label2; reused when Label2 predictions are mapped back
all_train['Label2'] = transform(all_train['Label2'], c=label_c)

In [17]:
# inverse_transform(pd.Series([6,6.1]))

# 训练 LightGBM 模型并对测试集进行预测

In [18]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
# Train one LightGBM regressor per target.
#   1. Fit on everything except the last `test_size` labelled rows, with early
#      stopping on that hold-out slice, and record the hold-out RMSE measured
#      on the original (inverse-transformed) label scale.
#   2. Refit on all labelled rows with 1.5x the early-stopped number of trees
#      and write the inverse-transformed predictions into `test[label]`.
labels = ['Label1', 'Label2']
score_list = []
test_size = 4000

# Settings shared by every model built in this cell.
# NOTE(review): `subsample` is set per target below, but no bagging frequency is
# set anywhere — confirm that row subsampling is actually active.
common_params = dict(
    boosting="gbdt",
    max_depth=4,
    feature_fraction=0.3,
    reg_alpha=1,
    reg_lambda=1,  # intentionally left unchanged
    min_sum_hessian_in_leaf=1e-8,
)

# Per-target settings.
#   params             - used by both the early-stopping model and the refit model
#   search_only_params - used by the early-stopping model only
#   inverse_kwargs     - extra arguments for inverse_transform
# NOTE(review): `min_child_weight=1` is given only to the Label2 early-stopping
# model and not to its refit model; kept as found — confirm whether intended.
label_settings = {
    'Label1': dict(
        params=dict(learning_rate=0.03, min_data_in_leaf=5, subsample=0.8,
                    bagging_seed=1, random_state=1212),
        search_only_params={},
        inverse_kwargs={},  # Label1 was transformed with the default divisor
    ),
    'Label2': dict(
        params=dict(learning_rate=0.02, min_data_in_leaf=40, subsample=0.4,
                    bagging_seed=11, random_state=222),
        search_only_params=dict(min_child_weight=1),
        inverse_kwargs=dict(c=label_c),
    ),
}


def _early_stopping_fit_kwargs(stopping_rounds=150, log_period=200):
    """Return the ``fit()`` keywords for early stopping and periodic logging.

    LightGBM 4.0 removed the ``early_stopping_rounds`` / ``verbose`` keywords
    of ``fit()`` in favour of callbacks. Releases that provide
    ``lgb.log_evaluation`` get the callback form; older ones get the original
    keywords.
    """
    if hasattr(lgb, 'log_evaluation'):
        # Fresh callback objects for every fit.
        return dict(callbacks=[lgb.early_stopping(stopping_rounds),
                               lgb.log_evaluation(log_period)])
    return dict(early_stopping_rounds=stopping_rounds, verbose=log_period)


for label in labels:
    settings = label_settings[label]
    inverse_kwargs = settings['inverse_kwargs']

    # Positional split: the last `test_size` labelled rows are the hold-out set.
    X_train = all_train[features][:-test_size]
    y_train = all_train[label][:-test_size]

    X_test = all_train[features][-test_size:]
    y_test = all_train[label][-test_size:]

    # Step 1: large tree budget, trimmed by early stopping on the hold-out set.
    model = lgb.LGBMRegressor(
        n_estimators=10000,
        **common_params,
        **settings['params'],
        **settings['search_only_params'],
    )
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], eval_metric=['rmse'],
              **_early_stopping_fit_kwargs())
    test_pred = model.predict(X_test)

    # RMSE on the original label scale.
    score_list.append(np.sqrt(mean_squared_error(inverse_transform(y_test, **inverse_kwargs),
                                                 inverse_transform(test_pred, **inverse_kwargs))))
    print(f"score_list = {score_list}")

    # Step 2: refit on all labelled rows; no hold-out set is left, so the tree
    # count is fixed at 1.5x the early-stopped iteration.
    model2 = lgb.LGBMRegressor(
        n_estimators=int(1.5*model.best_iteration_),
        **common_params,
        **settings['params'],
    )
    model2.fit(all_train[features], all_train[label])
    test[label] = inverse_transform(model2.predict(test[features]), **inverse_kwargs)

# Summary score: mean hold-out RMSE mapped through 1000 / (1 + loss).
loss = np.mean(score_list)
score = 1000/(1+loss)
print(score)

Training until validation scores don't improve for 150 rounds
[200]	training's rmse: 0.0996878	training's l2: 0.00993766	valid_1's rmse: 0.103755	valid_1's l2: 0.0107651
[400]	training's rmse: 0.0867563	training's l2: 0.00752665	valid_1's rmse: 0.0999986	valid_1's l2: 0.00999973
[600]	training's rmse: 0.0789965	training's l2: 0.00624044	valid_1's rmse: 0.0984652	valid_1's l2: 0.0096954
[800]	training's rmse: 0.0734786	training's l2: 0.00539911	valid_1's rmse: 0.0979401	valid_1's l2: 0.00959227
[1000]	training's rmse: 0.0686689	training's l2: 0.00471542	valid_1's rmse: 0.097558	valid_1's l2: 0.00951756
[1200]	training's rmse: 0.0653416	training's l2: 0.00426952	valid_1's rmse: 0.0973765	valid_1's l2: 0.00948219
Early stopping, best iteration is:
[1115]	training's rmse: 0.0667691	training's l2: 0.00445811	valid_1's rmse: 0.0973566	valid_1's l2: 0.00947831
score_list = [1271.3483094044834]
Training until validation scores don't improve for 150 rounds
[200]	training's rmse: 0.105907	traini

In [19]:
# Summary score: mean of the per-target hold-out RMSEs mapped through
# 1000 / (1 + loss), so a lower loss gives a higher score.
loss = np.asarray(score_list).mean()
score = 1000 / (loss + 1)
print(score)

0.8508814693129042


In [20]:
# Write the `time` column and the predicted targets as the result file.
import os

output_path = "./res/lightgbm_res.csv"
# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
test[['time'] + labels].to_csv(output_path, index=False)