In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Load the training rows and the evaluation rows, tagging each frame with an
# `is_train` flag so the two can be separated again after shared processing.
data = pd.read_csv("./data/train_dataset.csv").assign(is_train=True)
evaluation = pd.read_csv("./data/evaluation_public.csv").assign(is_train=False)
sample = pd.read_csv("./data/sample_submission.csv")

# Stack training rows on top of evaluation rows under a fresh 0..n-1 index.
data = pd.concat([data, evaluation], ignore_index=True)

# 特征处理

In [3]:
# Parse the raw `time` column and derive calendar / clock features from it.
data['date'] = pd.to_datetime(data['time'])
clock = data['date'].dt
for part in ('hour', 'year', 'month', 'minute', 'weekday', 'day'):
    data[part] = getattr(clock, part)

# Seconds elapsed since midnight.
data['ts'] = clock.hour * 3600 + clock.minute * 60 + clock.second

In [4]:
import gc
# Candidate model inputs: raw input columns plus the clock features
# (`weekday`, `hour`, `ts`) derived from `time`.
features = [ 'JS_NH3', 'CS_NH3', 'JS_TN', 'CS_TN', 'JS_LL', 'CS_LL', 'MCCS_NH4', 'MCCS_NO3', 'JS_COD', 'CS_COD', 'JS_SW', 'CS_SW', 'B_HYC_NH4', 'B_HYC_XD', 'B_HYC_MLSS', 'B_HYC_JS_DO', 'B_HYC_DO', 'B_CS_MQ_SSLL', 'B_QY_ORP', 'N_HYC_NH4', 'N_HYC_XD', 'N_HYC_MLSS', 'N_HYC_JS_DO', 'N_HYC_DO', 'N_CS_MQ_SSLL', 'N_QY_ORP','weekday','hour', 'ts']

# Names that must never be used as model inputs; of these only 'CS_LL',
# 'CS_NH3' and 'JS_SW' actually occur in the list above.
excluded = ['time', 'Label1', 'Label2', 'CS_LL', 'CS_NH3', 'JS_SW']
features = [f for f in features if f not in excluded]

# Forward-fill gaps in every feature column. `fillna(method='ffill')` is
# deprecated in recent pandas releases, so `ffill()` is called directly.
# NOTE(review): `data` holds training rows followed by evaluation rows, so a
# gap at the start of the evaluation rows is filled from the last training row.
data[features] = data[features].ffill()

# 提取特征的diff值

In [5]:
# Rolling-window statistics are computed for these columns only.
ROLLING_WINDOW = 15
rolling_source_columns = [
    'JS_NH3', 'CS_TN', 'JS_LL', 'JS_COD', 'CS_COD', 'B_HYC_NH4', 'B_HYC_XD',
    'B_HYC_MLSS', 'B_HYC_JS_DO', 'B_HYC_DO', 'B_CS_MQ_SSLL',
    'N_HYC_NH4', 'N_HYC_XD', 'N_HYC_MLSS', 'N_HYC_DO', 'N_CS_MQ_SSLL', 'N_QY_ORP',
]

# Names of all engineered columns, in the order they are created.
add_featuers = []
for f in rolling_source_columns:
    train_rolling = data[f].rolling(window=ROLLING_WINDOW, center=False)
    prefix = 'rolling{}_{}'.format(ROLLING_WINDOW, f)

    # NOTE(review): corr() is called without `other`, so each window is
    # correlated with itself; kept to leave the feature set unchanged, but the
    # column is unlikely to carry signal -- confirm whether it is wanted.
    rolling_stats = {
        f'{prefix}_mean': train_rolling.mean(),
        f'{prefix}_max': train_rolling.max(),
        f'{prefix}_min': train_rolling.min(),
        f'{prefix}_std': train_rolling.std(),
        f'{prefix}_corr': train_rolling.corr(),
        f'{prefix}_skew': train_rolling.skew(),
    }
    # Windows that cannot produce a value (e.g. the first rows) become 0.
    for stat_name, stat_values in rolling_stats.items():
        data[stat_name] = stat_values.fillna(0).values
        add_featuers.append(stat_name)

    # Difference between the current value and its rolling mean / max / min.
    for stat in ('mean', 'max', 'min'):
        cha_name = f'{f}_{prefix}_{stat}_cha'
        data[cha_name] = data[f] - data[f'{prefix}_{stat}']
        add_featuers.append(cha_name)

features.extend(add_featuers)

# Quantile-bin every feature except the clock features, using roughly one bin
# per ten distinct values.
for f in features:
    if f in ('weekday', 'hour', 'ts'):
        continue
    n_unique = data[f].nunique(dropna=False)
    # Columns with fewer than ten distinct values would otherwise request zero
    # bins, which is not a valid quantile count; fall back to a single bin.
    n_bins = max(1, n_unique // 10)
    data[f] = pd.qcut(data[f], q=n_bins, labels=False, duplicates="drop")

# Split the processed frame back into training rows and evaluation rows.
all_train = data[data['is_train']].reset_index(drop=True)
test = data[~data['is_train']].reset_index(drop=True)


# 删除分布不均衡的特征

In [6]:
# Keep only training rows where both targets are present, then renumber the index.
all_train = all_train.dropna(subset=['Label1', 'Label2']).reset_index(drop=True)
def transform(x: pd.Series, c=20):
    """Map values onto a log scale as ``log(1 + x / c)``.

    Args:
        x: Values to transform.
        c: Divisor applied to ``x`` before taking the logarithm.

    Returns:
        The element-wise result of ``log1p(x / c)``.
    """
    scaled = x / c
    return np.log1p(scaled)

def inverse_transform(x: pd.Series, c = 20):
    """Undo ``transform``: map log-scale values back as ``(exp(x) - 1) * c``.

    Args:
        x: Log-scale values to convert back.
        c: Scale factor; must match the ``c`` used in the forward transform
            for the round trip to recover the original values.

    Returns:
        The element-wise result of ``expm1(x) * c``.
    """
    unscaled = np.expm1(x)
    return unscaled * c

# Scale constant for Label2; Label1 uses the default of `transform`.
label_c = 8

# Rows with a missing label were already dropped at the top of this cell, so
# both targets can be moved onto the log scale directly.
all_train['Label1'] = transform(all_train['Label1'])
all_train['Label2'] = transform(all_train['Label2'], c=label_c)

In [7]:
# inverse_transform(pd.Series([6,6.1]))

# 伪标签进行预测

In [11]:
import lightgbm as lgb
import catboost as cbt
from sklearn.metrics import mean_squared_error
labels = ['Label1', 'Label2']
score_list = []
test_size = 4000  # trailing rows of all_train held out for validation

# Early-stopping patience passed to fit(); this single value replaces the
# conflicting constructor/fit settings used previously.
EARLY_STOPPING_ROUNDS = 150

# When True, each label is refit on all training rows and predicted for the
# evaluation rows, so that `test[labels]` holds predictions rather than NaN
# when the result file is written. Set to False for a validation-only run.
REFIT_AND_PREDICT = True

# Hyper-parameters shared by both targets.
shared_params = dict(
    depth=7,
    learning_rate=0.03,
    l2_leaf_reg=1,
    colsample_bylevel=0.3,
    random_seed=1212,
)

# Per-target overrides, plus the scale constant `c` that was used when the
# target was log-transformed (Label1: default 20, Label2: label_c).
label_settings = {
    'Label1': {
        'params': dict(min_data_in_leaf=30, subsample=0.6, border_count=512),
        'c': 20,
    },
    'Label2': {
        'params': dict(min_data_in_leaf=5, subsample=0.8),
        'c': label_c,
    },
}

for idx, label in enumerate(labels, start=1):
    settings = label_settings[label]
    model_params = {**shared_params, **settings['params']}
    label_scale = settings['c']

    # Hold out the last `test_size` training rows for validation.
    X_train = all_train[features][:-test_size]
    y_train = all_train[label][:-test_size]

    X_test = all_train[features][-test_size:]
    y_test = all_train[label][-test_size:]

    model = cbt.CatBoostRegressor(num_trees=50000, **model_params)
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)],
              early_stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=200)
    test_pred = model.predict(X_test)

    # RMSE on the original (un-transformed) label scale.
    score_list.append(np.sqrt(mean_squared_error(
        inverse_transform(y_test, c=label_scale),
        inverse_transform(test_pred, c=label_scale),
    )))
    print(f"score_list{idx} = {score_list}")

    if REFIT_AND_PREDICT:
        # Refit on every training row with 1.5x the early-stopped iteration
        # count; there is no eval set here, so no early stopping is requested.
        model2 = cbt.CatBoostRegressor(
            n_estimators=max(1, int(1.5 * model.best_iteration_)),
            **model_params,
        )
        model2.fit(all_train[features], all_train[label], verbose=200)
        test[label] = inverse_transform(model2.predict(test[features]), c=label_scale)

# Summary score: 1000 / (1 + mean hold-out RMSE over both labels).
loss = np.mean(score_list)
score = 1000/(1+loss)
print(score)

0:	learn: 0.2919759	test: 0.2919759	test1: 0.1764782	best: 0.1764782 (0)	total: 20.6ms	remaining: 17m 9s
200:	learn: 0.0945736	test: 0.0945736	test1: 0.1051370	best: 0.1051270 (199)	total: 5.39s	remaining: 22m 16s
400:	learn: 0.0785842	test: 0.0785842	test1: 0.1020354	best: 0.1020339 (399)	total: 10.9s	remaining: 22m 29s
600:	learn: 0.0685135	test: 0.0685135	test1: 0.1005082	best: 0.1005082 (600)	total: 15.9s	remaining: 21m 43s
800:	learn: 0.0617937	test: 0.0617937	test1: 0.1002724	best: 0.1001541 (775)	total: 20.7s	remaining: 21m 10s
1000:	learn: 0.0569439	test: 0.0569439	test1: 0.0996527	best: 0.0996459 (999)	total: 25.6s	remaining: 20m 51s
1200:	learn: 0.0532525	test: 0.0532525	test1: 0.0993306	best: 0.0993306 (1200)	total: 30.5s	remaining: 20m 37s
Stopped by overfitting detector  (150 iterations wait)

bestTest = 0.09924444599
bestIteration = 1236

Shrink model to first 1237 iterations.
score_list1 = [1298.1151973645096]
0:	learn: 0.2810596	total: 18.3ms	remaining: 34s
200:	learn: 

In [12]:
# Recompute the summary score from the hold-out RMSE values collected in
# `score_list`: 1000 / (1 + mean RMSE).
loss = np.asarray(score_list).mean()
score = 1000 / (1 + loss)
print(score)

0.8471957605879503


In [13]:
# Write the `time` column and both label columns of the evaluation rows.
# NOTE(review): the label columns only hold predictions if an earlier cell
# assigned them to `test`; otherwise they are whatever the evaluation rows
# carried after the train/evaluation concat -- confirm before submitting.
os.makedirs("./res", exist_ok=True)  # to_csv does not create missing directories
test[['time'] + labels].to_csv("./res/catboost_res.csv", index=False)