In [56]:
import pandas as pd
import numpy as np

In [57]:
# Read the training file and the public evaluation file, tagging each frame
# with an `is_test` flag so the rows can be told apart once they are stacked.
data = pd.read_csv("./data/train_dataset.csv").assign(is_test=False)
evaluation = pd.read_csv("./data/evaluation_public.csv").assign(is_test=True)
sample = pd.read_csv("./data/sample_submission.csv")

# Stack training rows on top of evaluation rows under a fresh 0..n-1 index.
all_data = pd.concat([data, evaluation], ignore_index=True)

# 特征处理

In [58]:
# Parse the raw 'time' column once, then derive every calendar / time-of-day
# feature from the parsed column in a single chained assignment.
all_data['date'] = pd.to_datetime(all_data['time'])
all_data = all_data.assign(
    hour=lambda frame: frame['date'].dt.hour,
    year=lambda frame: frame['date'].dt.year,
    month=lambda frame: frame['date'].dt.month,
    minute=lambda frame: frame['date'].dt.minute,
    weekday=lambda frame: frame['date'].dt.weekday,
    day=lambda frame: frame['date'].dt.day,
    # Seconds elapsed since midnight, built from the columns derived above.
    ts=lambda frame: (
        frame['hour'] * 3600 + frame['minute'] * 60 + frame['date'].dt.second
    ),
)

In [59]:
import gc

# Candidate model inputs: the raw measurement columns plus three derived time features.
features = [
    'JS_NH3', 'CS_NH3', 'JS_TN', 'CS_TN', 'JS_LL', 'CS_LL',
    'MCCS_NH4', 'MCCS_NO3', 'JS_COD', 'CS_COD', 'JS_SW', 'CS_SW',
    'B_HYC_NH4', 'B_HYC_XD', 'B_HYC_MLSS', 'B_HYC_JS_DO', 'B_HYC_DO',
    'B_CS_MQ_SSLL', 'B_QY_ORP',
    'N_HYC_NH4', 'N_HYC_XD', 'N_HYC_MLSS', 'N_HYC_JS_DO', 'N_HYC_DO',
    'N_CS_MQ_SSLL', 'N_QY_ORP',
    'weekday', 'hour', 'ts',
]

# N_HYC_JS_DO  (author's note; presumably a candidate for exclusion -- TODO confirm)
# Columns that must never be used as inputs. 'time', 'Label1' and 'Label2' are
# not in `features` to begin with, so listing them here is only a safeguard.
filter_set = {'time', 'Label1', 'Label2', 'CS_LL', 'CS_NH3', 'JS_SW', 'B_QY_ORP'}
features = [f for f in features if f not in filter_set]

# Forward-fill gaps in the selected columns. `DataFrame.ffill()` replaces
# `fillna(method='ffill')`, which newer pandas releases deprecate.
# NOTE(review): the fill runs over the stacked train + evaluation frame, so a gap
# at the start of the evaluation rows is filled from the last training row --
# confirm that this is intended.
all_data[features] = all_data[features].ffill()

labels = ['Label1', 'Label2']
train = all_data[~all_data['is_test']].copy(deep=True)
test = all_data[all_data['is_test']].copy(deep=True)
# train = train.dropna(subset=['Label1', 'Label2']).reset_index(drop=True)
test['is_train'] = False
train['is_train'] = True

# Re-stack under the name `data`; from here on `data` is the combined frame
# (training rows first, then evaluation rows) with both flag columns present.
data = pd.concat([train, test]).reset_index(drop=True)

# Release the intermediate frames; kernel memory persists across cells.
del all_data, train, test
gc.collect()

26

# 通过训练找出在训练集和测试集中差别很大的样本

In [60]:
from sklearn.model_selection import train_test_split

# Coarsen each raw measurement column into quantile bins, using roughly one
# bin per 500 distinct values. The derived time features are left untouched.
for f in features:
    if f in ('weekday', 'hour', 'ts'):
        continue
    # Same count as len(data[f].drop_duplicates()): NaN counts as one value.
    n_bins = data[f].nunique(dropna=False) // 500
    if n_bins < 1:
        # Fewer than 500 distinct values would ask pd.qcut for zero quantiles,
        # which cannot produce usable bins; such a column is already coarse,
        # so keep it as it is.
        continue
    data[f] = pd.qcut(data[f], q=n_bins, labels=False, duplicates="drop")

# Encode the flag as 0/1 so it can serve as the classification target.
data['is_test'] = data['is_test'].astype(int)

In [61]:
# Show summary statistics for the numeric columns of the combined
# (train + evaluation) frame after the binning step.
data.describe()

Unnamed: 0,JS_NH3,CS_NH3,JS_TN,CS_TN,JS_LL,CS_LL,MCCS_NH4,MCCS_NO3,JS_COD,CS_COD,...,Label1,Label2,is_test,hour,year,month,minute,weekday,day,ts
count,150480.0,123432.0,150480.0,150480.0,150480.0,130970.0,150480.0,150480.0,150480.0,0.0,...,35068.0,35068.0,150480.0,150480.0,150480.0,150480.0,150480.0,150480.0,150480.0,150480.0
mean,3.430323,0.041165,2.998212,2.931918,114.172488,1249.096494,1.499415,4.959204,1.998664,,...,13745.474316,11816.071793,0.066454,11.5,2022.0,4.062201,29.0,3.014354,15.856459,43140.0
std,2.230066,0.065458,2.000081,2.080676,64.45957,873.108749,1.117776,3.130447,1.41468,,...,4717.251176,3664.571351,0.249075,6.92221,0.0,1.985831,17.31094,1.993965,8.674275,24941.590446
min,0.0,0.0011,0.0,0.0,0.0,-3447.0269,0.0,0.0,0.0,,...,814.49,719.71,0.0,0.0,2022.0,1.0,0.0,0.0,1.0,0.0
25%,1.0,0.0133,1.0,1.0,64.0,849.8557,0.0,2.0,1.0,,...,10498.4025,9269.4375,0.0,5.75,2022.0,2.0,14.0,1.0,8.0,21570.0
50%,3.0,0.0351,3.0,3.0,106.5,1326.3693,1.0,5.0,2.0,,...,13071.83,11159.96,0.0,11.5,2022.0,4.0,29.0,3.0,16.0,43140.0
75%,5.0,0.0565,5.0,5.0,168.0,1801.7563,2.0,7.0,3.0,,...,15359.6175,13203.8225,0.0,17.25,2022.0,6.0,44.0,5.0,23.0,64710.0
max,7.0,1.4959,6.0,6.0,232.0,3414.167,3.0,10.0,4.0,,...,66326.05,49464.13,1.0,23.0,2022.0,7.0,58.0,6.0,31.0,86280.0


In [65]:
import lightgbm as lgb

# Train a classifier to tell evaluation rows from training rows. A high
# validation AUC means the two sets are easy to separate, and the feature
# importances show which columns drive that separation.
use_features = features
filter_set = {'time', 'Label1', 'Label2', 'CS_LL', 'CS_NH3', 'JS_SW', 'B_QY_ORP', 'CS_SW',
              'CS_COD'}
use_features = [f for f in use_features if f not in filter_set]

# Seed the split so the cell is reproducible, and stratify because the
# positive class (is_test == 1) is a small minority of the rows.
X_train, X_test, y_train, y_test = train_test_split(
    data[use_features],
    data['is_test'],
    test_size=0.2,
    random_state=1212,
    stratify=data['is_test'],
)

# NOTE(review): LightGBM documents `min_child_weight` and `min_sum_hessian_in_leaf`
# as aliases of one parameter, so only one of the two values below takes effect --
# TODO keep a single one. Its docs also state that bagging (`subsample`) is only
# active when a bagging frequency is set, which this cell does not do -- TODO confirm
# whether bagging was intended.
model = lgb.LGBMClassifier(
        boosting="gbdt",
        max_depth=4,
        learning_rate=0.005,
        n_estimators=500,
        min_child_weight=1,
        min_data_in_leaf=60,
        subsample = 0.7,
        feature_fraction=0.4,
        bagging_seed=1,
        reg_alpha=0.11,
        reg_lambda=0.1,  # left unchanged here
        min_sum_hessian_in_leaf=0.01,
        random_state=1212
    )

# Early stopping and periodic logging are passed as callbacks; the
# `early_stopping_rounds=` / `verbose=` keyword arguments of fit() were
# removed in LightGBM 4.0.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_metric=['auc'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=20),
        lgb.log_evaluation(period=200),
    ],
)

# Feature-importance table, most important feature first.
df = pd.DataFrame({
    'score': model.feature_importances_,
    'features': use_features,
})
df = df.sort_values(by=['score'], ascending=False, ignore_index=True)

Training until validation scores don't improve for 20 rounds
[200]	training's auc: 0.999313	training's binary_logloss: 0.0917317	valid_1's auc: 0.999286	valid_1's binary_logloss: 0.0913485
[400]	training's auc: 0.99961	training's binary_logloss: 0.0516857	valid_1's auc: 0.999601	valid_1's binary_logloss: 0.0515993
Did not meet early stopping. Best iteration is:
[500]	training's auc: 0.999695	training's binary_logloss: 0.0411222	valid_1's auc: 0.999696	valid_1's binary_logloss: 0.0410786


In [66]:
# Display the feature-importance table (one row per feature, sorted by score, descending).
df

Unnamed: 0,score,features
0,609,N_HYC_JS_DO
1,577,B_HYC_JS_DO
2,564,MCCS_NO3
3,542,MCCS_NH4
4,518,N_HYC_XD
5,496,B_HYC_XD
6,439,N_HYC_MLSS
7,369,weekday
8,344,JS_TN
9,310,B_HYC_MLSS


In [64]:
# NOTE: disabled experiment, kept for reference only. For every pair of raw
# features it trains the same classifier on ['ts', 'weekday', feature_i, feature_j]
# and records the validation AUC per pair in `feature_score_dict`.
# It reads from `data_v2`, which is not defined in the cells shown here --
# TODO confirm where that frame comes from before re-enabling.
# import lightgbm as lgb
# features_list = [ 'JS_NH3', 'CS_NH3', 'JS_TN', 'CS_TN', 'JS_LL', 'MCCS_NH4', 'MCCS_NO3', 'JS_COD', 'CS_COD', 'JS_SW', 'CS_SW', 'B_HYC_NH4', 'B_HYC_XD', 'B_HYC_MLSS', 'B_HYC_JS_DO', 'B_HYC_DO', 'B_CS_MQ_SSLL', 'B_QY_ORP', 'N_HYC_NH4', 'N_HYC_XD', 'N_HYC_MLSS', 'N_HYC_JS_DO', 'N_HYC_DO', 'N_CS_MQ_SSLL']
#
# feature_score_dict = {}
# for i in range(len(features_list)-1):
#     for j in range(i+1, len(features_list)):
#         features_filter = ['ts', 'weekday', features_list[i], features_list[j]]
#         X_train, X_test, y_train, y_test  = train_test_split(data_v2[features_filter], data_v2['is_test'], test_size=0.2)
#         model = lgb.LGBMClassifier(
#                 boosting="gbdt",
#                 max_depth=4,
#                 learning_rate=0.005,
#                 n_estimators=500,
#                 min_child_weight=1,
#                 min_data_in_leaf=60,
#                 subsample = 0.7,
#                 feature_fraction=0.4,
#                 bagging_seed=1,
#                 reg_alpha=0.11,
#                 reg_lambda=0.1,  # left unchanged here
#                 min_sum_hessian_in_leaf=0.01,
#                 random_state=1212
#             )
#         model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], eval_metric=['auc'],
#                   early_stopping_rounds=20, verbose=200)
#         feature_score_dict[f'{features_list[i]}_{features_list[j]}_score'] = model.best_score_['valid_1']['auc']
#
