In [27]:
import pandas as pd
import numpy as np

In [28]:
data = pd.read_csv("./data/train_dataset.csv")

data['is_train'] = True
evaluation = pd.read_csv("./data/evaluation_public.csv")
evaluation['is_train'] = False
sample = pd.read_csv("./data/sample_submission.csv")

data = pd.concat([data, evaluation]).reset_index(drop=True)

# 特征处理

In [29]:
data['date'] = pd.to_datetime(data['time'])
data['hour'] = data['date'].dt.hour
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
data['minute'] = data['date'].dt.minute
data['weekday'] = data['date'].dt.weekday
data['day'] = data['date'].dt.day
data['hour'] = data['date'].dt.hour
data['ts'] = data['hour']*3600 + data['minute']*60 + data['date'].dt.second

In [30]:
import gc
features = [ 'JS_NH3', 'CS_NH3', 'JS_TN', 'CS_TN', 'JS_LL', 'CS_LL', 'MCCS_NH4', 'MCCS_NO3', 'JS_COD', 'CS_COD', 'JS_SW', 'CS_SW', 'B_HYC_NH4', 'B_HYC_XD', 'B_HYC_MLSS', 'B_HYC_JS_DO', 'B_HYC_DO', 'B_CS_MQ_SSLL', 'B_QY_ORP', 'N_HYC_NH4', 'N_HYC_XD', 'N_HYC_MLSS', 'N_HYC_JS_DO', 'N_HYC_DO', 'N_CS_MQ_SSLL', 'N_QY_ORP','weekday','hour', 'ts']
features = [f for f in features if f not in ['time', 'Label1', 'Label2','CS_LL','CS_NH3', 'JS_SW']] #

for f in features:
    data[f] = data[f].fillna(method='ffill')

# 提取特征的diff值

In [31]:
df_list = []
i = 0
add_featuers = []
length = 0
for f in ['JS_NH3', 'CS_TN', 'JS_LL', 'JS_COD', 'CS_COD', 'B_HYC_NH4', 'B_HYC_XD', 'B_HYC_MLSS', 'B_HYC_JS_DO', 'B_HYC_DO', 'B_CS_MQ_SSLL',
          'N_HYC_NH4', 'N_HYC_XD', 'N_HYC_MLSS', 'N_HYC_DO', 'N_CS_MQ_SSLL', 'N_QY_ORP']:
    for r in [15]:
        train_rolling = data[f].rolling(window=r, center=False)
        f_mean_name = 'rolling{}_{}_mean'.format(r,f)
        f_max_name = 'rolling{}_{}_max'.format(r,f)
        f_min_name = 'rolling{}_{}_min'.format(r,f)
        f_std_name = 'rolling{}_{}_std'.format(r,f)
        f_corr_name = 'rolling{}_{}_corr'.format(r,f)
        f_cov_name = 'rolling{}_{}_cov'.format(r,f)
        f_skew_name = 'rolling{}_{}_skew'.format(r,f)
        f_kurt_name = 'rolling{}_{}_kurt'.format(r,f)
        data[f_mean_name] = train_rolling.mean().fillna(0).values
        data[f_max_name] = train_rolling.max().fillna(0).values
        data[f_min_name] = train_rolling.min().fillna(0).values
        data[f_std_name] = train_rolling.std().fillna(0).values
        data[f_corr_name] = train_rolling.corr().fillna(0).values
        data[f_skew_name] = train_rolling.skew().fillna(0).values

        data[f'{f}_{f_mean_name}_cha'] = data[f] - data[f_mean_name]
        data[f'{f}_{f_max_name}_cha'] = data[f] - data[f_max_name]
        data[f'{f}_{f_min_name}_cha'] = data[f] - data[f_min_name]
        data[f'{f}_diff'] = data[f].diff()
        if i == 0:
            add_featuers.append(f_mean_name)
            add_featuers.append(f_max_name)
            add_featuers.append(f_min_name)
            add_featuers.append(f_std_name)
            add_featuers.append(f_corr_name)
            add_featuers.append(f_skew_name)
            add_featuers.append(f'{f}_{f_mean_name}_cha')
            add_featuers.append(f'{f}_{f_max_name}_cha')
            add_featuers.append(f'{f}_{f_min_name}_cha')
            add_featuers.append(f'{f}_diff')
features.extend(add_featuers)

# 对所有的特征进行划分
for f in features:
    if f not in ['weekday','hour', 'ts']:
        q = len(data[f].drop_duplicates())
        data[f] = pd.qcut(data[f], q=int(q/10), labels=False, duplicates="drop")

all_train = data[data['is_train']].reset_index(drop=True)
test = data[~data['is_train']].reset_index(drop=True)

  data[f'{f}_diff'] = data[f].diff()
  data[f_mean_name] = train_rolling.mean().fillna(0).values
  data[f_max_name] = train_rolling.max().fillna(0).values
  data[f_min_name] = train_rolling.min().fillna(0).values
  data[f_std_name] = train_rolling.std().fillna(0).values
  data[f_corr_name] = train_rolling.corr().fillna(0).values
  data[f_skew_name] = train_rolling.skew().fillna(0).values
  data[f'{f}_{f_mean_name}_cha'] = data[f] - data[f_mean_name]
  data[f'{f}_{f_max_name}_cha'] = data[f] - data[f_max_name]
  data[f'{f}_{f_min_name}_cha'] = data[f] - data[f_min_name]


# 删除分布不均衡的特征

In [32]:
all_train = all_train.dropna(subset=['Label1', 'Label2']).reset_index(drop=True)
def transform(x: pd.Series, c=20):
    return np.log1p(x/c)

def inverse_transform(x: pd.Series, c = 20):
    return np.expm1(x)*c

all_train = all_train.dropna(subset=['Label1', 'Label2']).reset_index(drop=True)
all_train['Label1'] = transform(all_train['Label1'])
label_c = 8
all_train['Label2'] = transform(all_train['Label2'], c=label_c)

In [33]:
# inverse_transform(pd.Series([6,6.1]))

# 伪标签进行预测

In [34]:
import lightgbm as lgb
import catboost as cbt
from sklearn.metrics import mean_squared_error
labels = ['Label1','Label2']
score_list = []
test_size = 4000

for label in labels:
    X_train = all_train[features][:-test_size]
    y_train = all_train[label][:-test_size]

    X_test = all_train[features][-test_size:]
    y_test = all_train[label][-test_size:]
    if label == "Label1":
        model = cbt.CatBoostRegressor(
            depth=7,
            learning_rate=0.03,
            l2_leaf_reg=1,
            early_stopping_rounds=100,
            num_trees=50000,
            min_data_in_leaf=5,
            subsample = 0.5,
            colsample_bylevel  = 0.3,
            border_count=128,
            random_seed=1212
        )
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)],
              early_stopping_rounds=150, verbose=200)
        test_pred = model.predict(X_test)

        score_list.append(np.sqrt(mean_squared_error(inverse_transform(y_test), inverse_transform(test_pred))))
        print(f"score_list1 = {score_list}")
        #
        model2 = cbt.CatBoostRegressor(
            depth=7,
            learning_rate=0.03,
            l2_leaf_reg=1,
            early_stopping_rounds=100,
            num_trees=int(1.2*model.best_iteration_),
            min_data_in_leaf=5,
            subsample = 0.5,
            colsample_bylevel  = 0.3,
            border_count=128,
            random_seed=1212
        )
        model2.fit(all_train[features], all_train[label], verbose=200)
        test[label] = inverse_transform(model2.predict(test[features]))
    else:
        model = cbt.CatBoostRegressor(
            depth=7,
            learning_rate=0.03,
            l2_leaf_reg=1,
            early_stopping_rounds=100,
            num_trees=50000,
            min_data_in_leaf=5,
            subsample = 0.4,
            colsample_bylevel  = 0.2,
            border_count=160,
            random_seed=1212
        )

        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)],
              early_stopping_rounds=150, verbose=200)
        test_pred = model.predict(X_test)
        score_list.append(np.sqrt(mean_squared_error(inverse_transform(y_test, c=label_c), inverse_transform(test_pred, c=label_c))))
        print(f"score_list2 = {score_list}")
        #
        model2 = cbt.CatBoostRegressor(
            depth=7,
            learning_rate=0.03,
            l2_leaf_reg=1,
            early_stopping_rounds=100,
            num_trees=int(1.2*model.best_iteration_),
            min_data_in_leaf=5,
            subsample = 0.4,
            colsample_bylevel  = 0.2,
            border_count=160,
            random_seed=1212
        )
        model2.fit(all_train[features], all_train[label], verbose=200)
        #
        test[label] = inverse_transform(model2.predict(test[features]), c=label_c)
loss = np.mean(score_list)
score = 1000/(1+loss)
print(score)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 0.2918059	test: 0.2918059	test1: 0.1768811	best: 0.1768811 (0)	total: 9.35ms	remaining: 7m 47s
200:	learn: 0.0945039	test: 0.0945039	test1: 0.1051219	best: 0.1051219 (200)	total: 2.8s	remaining: 11m 33s
400:	learn: 0.0785016	test: 0.0785016	test1: 0.1005100	best: 0.1005100 (400)	total: 5.58s	remaining: 11m 29s
600:	learn: 0.0683097	test: 0.0683097	test1: 0.0986885	best: 0.0986885 (600)	total: 8.3s	remaining: 11m 22s
800:	learn: 0.0615103	test: 0.0615103	test1: 0.0981172	best: 0.0981138 (798)	total: 11s	remaining: 11m 14s
1000:	learn: 0.0565514	test: 0.0565514	test1: 0.0976066	best: 0.0976011 (999)	total: 13.7s	remaining: 11m 9s
1200:	learn: 0.0526072	test: 0.0526072	test1: 0.0973019	best: 0.0971779 (1193)	total: 16.7s	remaining: 11m 19s
1400:	learn: 0.0494637	test: 0.0494637	test1: 0.0971701	best: 0.0971590 (1282)	total: 19.6s	remaining: 11m 18s
Stopped by overfitting detector  (150 iterations wait)

bestTest = 0.09711914332
bestIteration = 1443

Shrink model to first 1444 it

In [35]:
loss = np.mean(score_list)
score = 1000/(1+loss)
print(score)

0.8603140252290207


In [36]:
test[['time'] + labels].to_csv(f"./res/catboost_res.csv", index=False)