In [1]:
import warnings
warnings.simplefilter('ignore')

import gc

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from tqdm.auto import tqdm

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

import lightgbm as lgb

In [2]:
# Load the training table from the local data directory.
train = pd.read_csv('data/train_dataset.csv')
# train  # uncomment to preview the frame

In [3]:
# Load the evaluation table that predictions are produced for.
test = pd.read_csv('data/evaluation_public.csv')
# test  # uncomment to preview the frame

In [4]:
# Stack train on top of test so the rolling features are computed over one frame.
# ignore_index=True gives every row a unique label; without it the two default
# 0..n-1 indexes from read_csv overlap, which makes any later label-aligned
# operation on `df` ambiguous. Row order is unchanged.
df = pd.concat([train, test], ignore_index=True)

In [5]:
# Columns that receive rolling-window features (order defines the output column order).
roll_cols = [
    # columns without a B_/N_ prefix
    'JS_NH3', 'CS_NH3', 'JS_TN', 'CS_TN', 'JS_LL', 'CS_LL',
    'MCCS_NH4', 'MCCS_NO3', 'JS_COD', 'CS_COD', 'JS_SW', 'CS_SW',
    # B_* columns
    'B_HYC_NH4', 'B_HYC_XD', 'B_HYC_MLSS', 'B_HYC_JS_DO',
    'B_HYC_DO', 'B_CS_MQ_SSLL', 'B_QY_ORP',
    # N_* columns
    'N_HYC_NH4', 'N_HYC_XD', 'N_HYC_MLSS', 'N_HYC_JS_DO',
    'N_HYC_DO', 'N_CS_MQ_SSLL', 'N_QY_ORP',
]


def _suffixed(suffix):
    """Return the roll_cols names with `suffix` appended, in the same order."""
    return [name + suffix for name in roll_cols]


df['time'] = pd.to_datetime(df['time'])

# Row-to-row change of the trailing-window sum, for windows of 1 to 4 rows.
# NOTE: the column suffix says "mean", but the differenced quantity is the rolling sum.
for window in range(1, 5):
    window_sum = df[roll_cols].rolling(window, min_periods=1).sum()
    df[_suffixed(f'_roll_{window}_mean_diff')] = window_sum.diff()

# Trailing means over the last 8 and 16 rows (shorter windows at the start of the frame).
df[_suffixed('_roll_8_mean')] = df[roll_cols].rolling(8, min_periods=1).mean()
df[_suffixed('_roll_16_mean')] = df[roll_cols].rolling(16, min_periods=1).mean()

# Row-to-row change of those trailing means.
df[_suffixed('_roll_16_mean_diff')] = df[_suffixed('_roll_16_mean')].diff()
df[_suffixed('_roll_8_mean_diff')] = df[_suffixed('_roll_8_mean')].diff()

# Trailing standard deviation over the last 8 rows.
df[_suffixed('_roll_8_std')] = df[roll_cols].rolling(8, min_periods=1).std()



In [6]:
# Split the engineered frame back into its train / test parts by row position.
# .copy() turns both parts into independent frames, so the column assignments made
# on them afterwards write to real data instead of to a slice of `df`
# (which is what triggers pandas' SettingWithCopy warning).
n_train = train.shape[0]
train = df.iloc[:n_train].copy()
test = df.iloc[n_train:].copy()

In [7]:
# B_* and N_* columns share the same suffixes; A_* columns hold their element-wise ratio.
B_col = [
    'B_HYC_NH4', 'B_HYC_XD', 'B_HYC_MLSS', 'B_HYC_JS_DO',
    'B_HYC_DO', 'B_CS_MQ_SSLL', 'B_QY_ORP',
]
N_col = ['N_' + name[2:] for name in B_col]
NB_col = ['A_' + name[2:] for name in B_col]

# B / (N + 1e-3), computed on the raw arrays so no index alignment is involved.
# The small constant is presumably there to avoid dividing by an exact zero.
for frame in (train, test):
    frame[NB_col] = frame[B_col].to_numpy() / (frame[N_col].to_numpy() + 1e-3)

In [8]:
# 1. Per the data description, the north and south biochemical tanks do not influence
#    each other during production, so start by modelling the two halves separately.
# 2. Only rows that actually have a label are used for training.

# Column selections: the B frames exclude every N_* column and vice versa; each
# training frame additionally drops the other model's label column.
cols_B_train = [c for c in train.columns if c != 'Label2' and not c.startswith('N_')]
cols_N_train = [c for c in train.columns if c != 'Label1' and not c.startswith('B_')]
cols_B_test = [c for c in test.columns if not c.startswith('N_')]
cols_N_test = [c for c in test.columns if not c.startswith('B_')]

# Keep labelled rows only and renumber them from 0.
train_B = train.loc[train['Label1'].notna(), cols_B_train].reset_index(drop=True)
train_N = train.loc[train['Label2'].notna(), cols_N_train].reset_index(drop=True)

test_B = test.loc[:, cols_B_test].copy()
test_N = test.loc[:, cols_N_test].copy()

In [9]:
# Datetime features
def add_datetime_feats(df):
    """Add calendar features derived from the ``time`` column.

    The frame is modified in place: ``time`` is converted to datetime and the
    columns ``day`` (day of month), ``hour`` and ``dayofweek`` are added.

    Returns the same frame object that was passed in.
    """
    df['time'] = pd.to_datetime(df['time'])
    stamp = df['time'].dt
    for part in ('day', 'hour', 'dayofweek'):
        df[part] = getattr(stamp, part)

    return df

# Add the calendar features to all four model frames (each is updated in place and rebound).
train_B, train_N, test_B, test_N = map(
    add_datetime_feats, (train_B, train_N, test_B, test_N)
)

In [10]:
# Ratio features
def add_ratio_feats(df, type_='B'):
    """Add ratio columns of the form numerator / (denominator + 1e-3).

    Parameters
    ----------
    df : DataFrame
        Modified in place; must contain every numerator/denominator column listed below.
    type_ : str
        Column prefix ('B' or 'N' in this notebook) used to pick the
        ``{type_}_HYC_JS_DO``, ``{type_}_HYC_DO`` and ``{type_}_CS_MQ_SSLL`` columns.

    Returns the same frame object that was passed in.
    """
    eps = 1e-3
    # (output column, numerator column, denominator column)
    ratio_specs = [
        ('JS_CS_NH3_ratio', 'JS_NH3', 'CS_NH3'),
        ('JS_CS_TN_ratio', 'JS_TN', 'CS_TN'),
        ('JS_CS_LL_ratio', 'JS_LL', 'CS_LL'),
        ('MCCS_NH4_NH3_ratio', 'MCCS_NH4', 'CS_NH3'),
        ('MCCS_NO3_NH3_ratio', 'MCCS_NO3', 'CS_NH3'),
        ('JS_CS_COD_ratio', 'JS_COD', 'CS_COD'),
        ('JS_CS_SW_ratio', 'JS_SW', 'CS_SW'),
        ('HYC_DO_ratio', f'{type_}_HYC_JS_DO', f'{type_}_HYC_DO'),
        ('CS_MQ_LL_ratio', f'{type_}_CS_MQ_SSLL', 'CS_LL'),
    ]
    for out_col, num_col, den_col in ratio_specs:
        df[out_col] = df[num_col] / (df[den_col] + eps)

    return df

# Add the ratio features, picking the B_* or N_* sensor columns to match each frame.
train_B, test_B = (add_ratio_feats(part, type_='B') for part in (train_B, test_B))
train_N, test_N = (add_ratio_feats(part, type_='N') for part in (train_N, test_N))

In [11]:
# Target transform: log1p.
# NOTE: the label columns are overwritten in place, so running this cell twice
# applies log1p twice.

# Range of each target before the transform.
B_max = train_B['Label1'].max()
B_min = train_B['Label1'].min()
N_max = train_N['Label2'].max()
N_min = train_N['Label2'].min()

for frame, label in ((train_B, 'Label1'), (train_N, 'Label2')):
    frame[label] = np.log1p(frame[label])

In [12]:
def run_lgb(df_train, df_test, ycol, n_splits=5, seed=2022):
    """Cross-validate a LightGBM regressor and average its test predictions.

    Parameters
    ----------
    df_train : DataFrame
        Training rows. Must contain ``time``, ``ycol`` and every feature column.
        ``ycol`` is expected to be log1p-transformed already, because both the
        out-of-fold values and the predictions are mapped back with ``expm1``.
    df_test : DataFrame
        Rows to predict. Its columns, minus ``time`` / ``Label1`` / ``Label2`` /
        ``label``, define the feature set.
    ycol : str
        Name of the target column.
    n_splits : int
        Number of CV folds; the test prediction is the mean over the fold models.
    seed : int
        ``random_state`` passed to the regressor.

    Returns
    -------
    df_oof : DataFrame
        Columns ``time``, ``ycol`` and ``pred`` for every validation row, with
        ``ycol`` and ``pred`` transformed back through ``expm1``.
    prediction : DataFrame
        Columns ``time`` and ``ycol`` for the test rows; ``ycol`` holds the
        fold-averaged prediction transformed back through ``expm1``.

    Side effects
    ------------
    Shows the 50 features with the highest mean importance via ``display``.
    """
    # Local import, as in the original cell: tscv is only used for the fold splitter.
    from tscv import GapKFold

    non_feature_cols = {'time', 'Label1', 'Label2', 'label'}
    use_feats = [col for col in df_test.columns if col not in non_feature_cols]

    model = lgb.LGBMRegressor(num_leaves=32,
                              objective='mape',
                              max_depth=16,
                              learning_rate=0.1,
                              n_estimators=10000,
                              subsample=0.8,
                              feature_fraction=0.6,
                              reg_alpha=0.5,
                              reg_lambda=0.25,
                              random_state=seed,
                              metric=None)

    oof = []
    # Explicit copy: the accumulator column is written to an independent frame
    # instead of to a column selection of df_test (avoids chained assignment).
    prediction = df_test[['time']].copy()
    prediction[ycol] = 0.0
    df_importance_list = []

    cv = GapKFold(n_splits=n_splits, gap_before=0, gap_after=0)
    for trn_idx, val_idx in cv.split(df_train[use_feats]):
        X_train = df_train.iloc[trn_idx][use_feats]
        Y_train = df_train.iloc[trn_idx][ycol]
        X_val = df_train.iloc[val_idx][use_feats]
        Y_val = df_train.iloc[val_idx][ycol]

        # NOTE(review): `verbose` and `early_stopping_rounds` as fit() keywords were
        # removed in LightGBM 4.0; if the environment is upgraded, pass
        # lgb.early_stopping / lgb.log_evaluation through `callbacks` instead.
        lgb_model = model.fit(X_train,
                              Y_train,
                              eval_names=['train', 'valid'],
                              eval_set=[(X_train, Y_train), (X_val, Y_val)],
                              verbose=100,
                              eval_metric='rmse',
                              early_stopping_rounds=100)

        # Out-of-fold predictions at the early-stopped iteration.
        pred_val = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration_)
        df_oof = df_train.iloc[val_idx][['time', ycol]].copy()
        df_oof['pred'] = pred_val
        oof.append(df_oof)

        # Each fold contributes 1/n_splits of the final (still log-scale) test prediction.
        pred_test = lgb_model.predict(df_test[use_feats], num_iteration=lgb_model.best_iteration_)
        prediction[ycol] += pred_test / n_splits

        df_importance_list.append(pd.DataFrame({
            'column': use_feats,
            'importance': lgb_model.feature_importances_,
        }))

        # Release the per-fold objects before the next fold.
        del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
        gc.collect()

    # Mean importance per feature across folds, highest first.
    df_importance = pd.concat(df_importance_list)
    df_importance = df_importance.groupby(['column'])['importance'].agg(
        'mean').sort_values(ascending=False).reset_index()
    display(df_importance.head(50))

    # Undo the log1p target transform on truth, OOF predictions and test predictions.
    df_oof = pd.concat(oof).reset_index(drop=True)
    df_oof[ycol] = np.expm1(df_oof[ycol])
    df_oof['pred'] = np.expm1(df_oof['pred'])
    prediction[ycol] = np.expm1(prediction[ycol])

    return df_oof, prediction

In [13]:
# One 10-fold model per target: Label1 on the B frames, Label2 on the N frames.
df_oof_B, pred_B = run_lgb(df_train=train_B, df_test=test_B, ycol='Label1', n_splits=10)
df_oof_N, pred_N = run_lgb(df_train=train_N, df_test=test_N, ycol='Label2', n_splits=10)

[100]	train's rmse: 0.0682995	train's mape: 0.00477136	valid's rmse: 0.381781	valid's mape: 0.02531
[100]	train's rmse: 0.0997627	train's mape: 0.00536544	valid's rmse: 0.111005	valid's mape: 0.00902414
[200]	train's rmse: 0.0936389	train's mape: 0.0046146	valid's rmse: 0.108918	valid's mape: 0.00883392
[300]	train's rmse: 0.0911426	train's mape: 0.00426411	valid's rmse: 0.108513	valid's mape: 0.00881604
[100]	train's rmse: 0.100048	train's mape: 0.00532624	valid's rmse: 0.11552	valid's mape: 0.00944425
[200]	train's rmse: 0.0931428	train's mape: 0.00457714	valid's rmse: 0.113447	valid's mape: 0.00923072
[100]	train's rmse: 0.0991928	train's mape: 0.00531579	valid's rmse: 0.1323	valid's mape: 0.0108636
[200]	train's rmse: 0.0921373	train's mape: 0.00455098	valid's rmse: 0.130984	valid's mape: 0.010802
[300]	train's rmse: 0.0895399	train's mape: 0.00418137	valid's rmse: 0.1299	valid's mape: 0.0107338
[400]	train's rmse: 0.0878624	train's mape: 0.00392812	valid's rmse: 0.129883	valid's m

Unnamed: 0,column,importance
0,A_HYC_NH4,181.3
1,B_HYC_DO_roll_16_mean_diff,151.0
2,B_HYC_DO_roll_8_mean_diff,126.2
3,B_HYC_DO_roll_16_mean,123.1
4,day,121.9
5,B_HYC_DO,121.3
6,A_HYC_DO,117.1
7,B_HYC_DO_roll_8_mean,111.6
8,B_HYC_JS_DO_roll_16_mean,109.5
9,B_QY_ORP_roll_16_mean,106.0


[100]	train's rmse: 0.0649399	train's mape: 0.00447779	valid's rmse: 0.326266	valid's mape: 0.0202435
[200]	train's rmse: 0.0589523	train's mape: 0.0038157	valid's rmse: 0.324164	valid's mape: 0.0199591
[300]	train's rmse: 0.056316	train's mape: 0.00350081	valid's rmse: 0.323525	valid's mape: 0.0198733
[400]	train's rmse: 0.0546665	train's mape: 0.0032988	valid's rmse: 0.323402	valid's mape: 0.0198767
[100]	train's rmse: 0.0966226	train's mape: 0.0050093	valid's rmse: 0.0947593	valid's mape: 0.00804917
[200]	train's rmse: 0.0908854	train's mape: 0.00427998	valid's rmse: 0.0952295	valid's mape: 0.00802535
[100]	train's rmse: 0.0961766	train's mape: 0.00496793	valid's rmse: 0.105961	valid's mape: 0.00867458
[200]	train's rmse: 0.0909109	train's mape: 0.00428306	valid's rmse: 0.104525	valid's mape: 0.00856856
[300]	train's rmse: 0.0884061	train's mape: 0.00394001	valid's rmse: 0.103213	valid's mape: 0.00845724
[400]	train's rmse: 0.0871361	train's mape: 0.00372665	valid's rmse: 0.102839	v

Unnamed: 0,column,importance
0,N_HYC_DO_roll_16_mean_diff,206.6
1,N_HYC_NH4_roll_8_mean,200.4
2,A_HYC_DO,185.7
3,N_HYC_NH4_roll_16_mean,184.6
4,N_HYC_DO_roll_8_mean_diff,183.0
5,N_HYC_DO,176.2
6,day,170.5
7,N_HYC_DO_roll_8_mean,160.6
8,N_HYC_DO_roll_16_mean,159.7
9,N_QY_ORP_roll_16_mean,149.1


In [17]:
def calc_score(df1, df2):
    """Score two out-of-fold frames as 1000 / (1 + mean of their RMSEs).

    df1 must hold ``pred`` and ``Label1``; df2 must hold ``pred`` and ``Label2``.
    Both RMSE values are printed before the score is returned.
    """
    def _rmse(frame, label):
        return np.sqrt(mean_squared_error(frame['pred'], (frame[label])))

    rmse_1 = _rmse(df1, 'Label1')
    rmse_2 = _rmse(df2, 'Label2')
    print(rmse_1,rmse_2)

    loss = (rmse_1+rmse_2)/2
    return (1 / (1 + loss)) * 1000

calc_score(df_oof_B, df_oof_N)

3091.5013527627148 2248.255071349608


0.37440868531034793

In [18]:
# Submission: put the fold-averaged test predictions into the sample submission frame.
sub = pd.read_csv('data/sample_submission.csv').assign(
    Label1=pred_B['Label1'].values,
    Label2=pred_N['Label2'].values,
)
sub

Unnamed: 0,time,Label1,Label2
0,2022/7/18 2:40,10277.715094,9309.105213
1,2022/7/18 2:42,10297.708783,9423.129296
2,2022/7/18 2:44,10305.087200,9483.911192
3,2022/7/18 2:46,10392.180776,9332.600185
4,2022/7/18 2:48,10324.405182,9341.154754
...,...,...,...
9995,2022/7/31 23:50,13868.010120,14701.357535
9996,2022/7/31 23:52,13993.966089,14665.620693
9997,2022/7/31 23:54,14279.151838,14728.293917
9998,2022/7/31 23:56,14398.115205,14508.477282


In [19]:
# Scores noted for earlier runs (presumably leaderboard results — TODO confirm):
# 0.7814041946461452
# 0.7864133808960185
# Write the submission frame to CSV without the index column.
sub.to_csv('t_base092001.csv', index=False)