In [2]:
import os

# LightGBM's default thread count follows OpenMP's default thread count, so
# cap OpenMP at 4 threads. The environment variable OpenMP reads is
# OMP_NUM_THREADS (the previous name, NUM_OMP_THREADS, is not recognised and
# had no effect). Set it before lightgbm is imported so the OpenMP runtime
# sees the value when it initialises.
os.environ['OMP_NUM_THREADS'] = "4"

import numpy as np
import pandas as pd
from tqdm import tqdm
import time
from contextlib import contextmanager
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
import warnings
from scipy.stats import pearsonr
import seaborn as sns
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns',100)

from clf_utils import *

In [3]:
def _timed_read_csv(label, path):
    """Read the CSV at `path` with pandas inside a `timer(label)` block.

    `timer` comes from the wildcard import of clf_utils; the cell output shows
    it printing "[label] done in N seconds".
    """
    with timer(label):
        return pd.read_csv(path)


# Pre-computed feature tables for the two splits.
train_feat_df = _timed_read_csv('load train_feat_df', '../input/train_feat_df.csv')
test_feat_df = _timed_read_csv('load test_feat_df', '../input/test_feat_df.csv')

[load train_feat_df] done in 3.34 seconds
[load test_feat_df] done in 1.14 seconds


In [4]:
# One-hot encode the device type column ('设备类型') for both splits.
#
# Encoding each split on its own yields different dummy columns whenever a
# device type occurs in only one of them. Both splits are therefore reindexed
# to one shared column list (train's columns first, then any test-only ones),
# and indicators missing from a split are filled with 0.
_train_dummies = pd.get_dummies(train_feat_df['设备类型'], prefix='设备类型')
_test_dummies = pd.get_dummies(test_feat_df['设备类型'], prefix='设备类型')
_dummy_cols = list(_train_dummies.columns) + [
    c for c in _test_dummies.columns if c not in _train_dummies.columns
]

# Dropping dummy columns left behind by an earlier execution keeps this cell
# safe to re-run: the frames are overwritten in place, so without the drop a
# second run would append duplicate columns.
train_feat_df = pd.concat(
    [train_feat_df.drop(columns=_dummy_cols, errors='ignore'),
     _train_dummies.reindex(columns=_dummy_cols, fill_value=0)],
    axis=1)
test_feat_df = pd.concat(
    [test_feat_df.drop(columns=_dummy_cols, errors='ignore'),
     _test_dummies.reindex(columns=_dummy_cols, fill_value=0)],
    axis=1)

In [5]:
# Stack the train rows on top of the test rows so that row-wise features can be
# computed over both splits at once. Row labels are kept as they are (no
# re-indexing), so the combined frame is later split back by position.
feat_df = pd.concat(objs=[train_feat_df, test_feat_df], axis='index')

In [6]:
# Build a surrogate identifier ("fake_id"): the string forms of four columns
# are joined with '#', and every distinct joined string gets an integer code.
_fake_id_parts = ['活塞工作时长', '设备类型', '低压开关', '正泵']

tmp = feat_df[_fake_id_parts[0]].astype(str)
for _part in _fake_id_parts[1:]:
    # Same left-to-right concatenation as writing a + '#' + b + '#' + ...
    tmp = tmp + '#' + feat_df[_part].astype(str)

# The encoder is fitted on train and test rows together (feat_df holds both),
# so the same joined string maps to the same code in either split.
lbl = LabelEncoder()
feat_df['fake_id'] = lbl.fit_transform(tmp)

In [7]:
# Cut the combined frame back into its two splits by row position: the first
# len(train_feat_df) rows are the train rows, everything after is test.
_n_train_rows = len(train_feat_df)
train_feat_df = feat_df.iloc[:_n_train_rows]
test_feat_df = feat_df.iloc[_n_train_rows:]

In [8]:
# Signals for which per-statistic columns named '<stat>_<signal>' are used.
col = ['发动机转速', '油泵转速', '泵送压力', '液压油温', '流量档位', '分配压力', '排量电流']

# Features shared by both variants: five plain columns followed by the
# min / max / nuni / mean column of every signal (grouped statistic by
# statistic, signals in the order listed above).
_plain_features = ['活塞工作时长', 'num_samples', '低压开关', '反泵', 'fake_id']
_stat_features = [
    f'{stat}_{signal}'
    for stat in ('min', 'max', 'nuni', 'mean')
    for signal in col
]
feature_common = _plain_features + _stat_features

# Variant 1: device type as a single column.
feature_name1 = feature_common + ['设备类型']

# Variant 2: device type as seven one-hot indicator columns.
feature_name2 = feature_common + [
    '设备类型_0', '设备类型_1', '设备类型_2', '设备类型_3',
    '设备类型_4', '设备类型_5', '设备类型_6',
]

In [10]:
# Run "lgb1": train on feature_name1 through the project helper kf_lgbm
# (imported from clf_utils; the log below shows LightGBM-style early stopping).
# All settings are collected in one dict; values match the original call.
_lgb1_settings = dict(
    output_dir='gotcha_fakeid',
    name='lgb1',
    random_state=1997,
    n_folds=5,
    split_seed=8888,
    learning_rate=0.03,
    colsample_bytree=0.5,
    early_stopping_rounds=100,
    num_leaves=64,
    min_split_gain=1,
    n_estimators=5000,
    eval_metric=metric_micro_f1,
    max_depth=6,
)
model = kf_lgbm(
    x=train_feat_df[feature_name1],
    y=train_feat_df['label'].values,
    x_test=test_feat_df[feature_name1],
    categorical_feature=['设备类型', 'fake_id'],
    **_lgb1_settings,
)

# Arrays saved for this run — presumably written by kf_lgbm into output_dir
# (validation and test predictions); confirm in clf_utils.
train_prob = np.load('./gotcha_fakeid/val.lgb1.npy')
test_prob = np.load('./gotcha_fakeid/test.lgb1.npy')

# Micro-averaged F1 on the training labels with the scores cut at 0.46.
_lgb1_hard = train_prob > 0.46
print(f1_score(train_feat_df['label'].values, _lgb1_hard, average='micro'))


Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.75183	train's f1: 0.679745	test's auc: 0.697364	test's f1: 0.6411
[400]	train's auc: 0.803411	train's f1: 0.716941	test's auc: 0.712644	test's f1: 0.650266
[600]	train's auc: 0.844637	train's f1: 0.75451	test's auc: 0.723478	test's f1: 0.662253
[800]	train's auc: 0.875455	train's f1: 0.784792	test's auc: 0.731688	test's f1: 0.668442
[1000]	train's auc: 0.898761	train's f1: 0.809042	test's auc: 0.737813	test's f1: 0.67377
[1200]	train's auc: 0.916781	train's f1: 0.828081	test's auc: 0.742408	test's f1: 0.675729
[1400]	train's auc: 0.930619	train's f1: 0.844456	test's auc: 0.74522	test's f1: 0.680351
Early stopping, best iteration is:
[1358]	train's auc: 0.928045	train's f1: 0.8414	test's auc: 0.744255	test's f1: 0.681918

Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.752693	train's f1: 0.680744	test's auc: 0.699767	test's f1: 0.645096
[400]	train's auc: 0.803702	tra

In [11]:
# Run "lgb2": same configuration as run "lgb1" except for the two seeds
# (random_state / split_seed). Trained through the project helper kf_lgbm.
_lgb2_settings = dict(
    output_dir='gotcha_fakeid',
    name='lgb2',
    random_state=2019,
    n_folds=5,
    split_seed=2333,
    learning_rate=0.03,
    colsample_bytree=0.5,
    early_stopping_rounds=100,
    num_leaves=64,
    min_split_gain=1,
    n_estimators=5000,
    eval_metric=metric_micro_f1,
    max_depth=6,
)
model = kf_lgbm(
    x=train_feat_df[feature_name1],
    y=train_feat_df['label'].values,
    x_test=test_feat_df[feature_name1],
    categorical_feature=['设备类型', 'fake_id'],
    **_lgb2_settings,
)

# Arrays saved for this run — presumably written by kf_lgbm into output_dir
# (validation and test predictions); confirm in clf_utils.
train_prob = np.load('./gotcha_fakeid/val.lgb2.npy')
test_prob = np.load('./gotcha_fakeid/test.lgb2.npy')

# Micro-averaged F1 on the training labels with the scores cut at 0.46.
_lgb2_hard = train_prob > 0.46
print(f1_score(train_feat_df['label'].values, _lgb2_hard, average='micro'))


Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.75226	train's f1: 0.682408	test's auc: 0.68791	test's f1: 0.636008
[400]	train's auc: 0.803271	train's f1: 0.721074	test's auc: 0.702557	test's f1: 0.647603
[600]	train's auc: 0.843426	train's f1: 0.756723	test's auc: 0.713281	test's f1: 0.654967
[800]	train's auc: 0.875955	train's f1: 0.786575	test's auc: 0.722085	test's f1: 0.66241
[1000]	train's auc: 0.899011	train's f1: 0.811725	test's auc: 0.728197	test's f1: 0.667346
[1200]	train's auc: 0.917473	train's f1: 0.832272	test's auc: 0.733068	test's f1: 0.67189
[1400]	train's auc: 0.931068	train's f1: 0.849039	test's auc: 0.737734	test's f1: 0.674632
Early stopping, best iteration is:
[1402]	train's auc: 0.931283	train's f1: 0.849666	test's auc: 0.737927	test's f1: 0.675572

Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.748804	train's f1: 0.677845	test's auc: 0.698018	test's f1: 0.643999
[400]	train's auc: 0.801297

In [12]:
# Run "lgb3": same configuration as runs "lgb1"/"lgb2" except for the two
# seeds (random_state / split_seed). Trained through the project helper kf_lgbm.
_lgb3_settings = dict(
    output_dir='gotcha_fakeid',
    name='lgb3',
    random_state=2012,
    n_folds=5,
    split_seed=1111,
    learning_rate=0.03,
    colsample_bytree=0.5,
    early_stopping_rounds=100,
    num_leaves=64,
    min_split_gain=1,
    n_estimators=5000,
    eval_metric=metric_micro_f1,
    max_depth=6,
)
model = kf_lgbm(
    x=train_feat_df[feature_name1],
    y=train_feat_df['label'].values,
    x_test=test_feat_df[feature_name1],
    categorical_feature=['设备类型', 'fake_id'],
    **_lgb3_settings,
)

# Arrays saved for this run — presumably written by kf_lgbm into output_dir
# (validation and test predictions); confirm in clf_utils.
train_prob = np.load('./gotcha_fakeid/val.lgb3.npy')
test_prob = np.load('./gotcha_fakeid/test.lgb3.npy')

# Micro-averaged F1 on the training labels with the scores cut at 0.46.
_lgb3_hard = train_prob > 0.46
print(f1_score(train_feat_df['label'].values, _lgb3_hard, average='micro'))


Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.750928	train's f1: 0.680489	test's auc: 0.698283	test's f1: 0.642589
[400]	train's auc: 0.801752	train's f1: 0.716452	test's auc: 0.711947	test's f1: 0.654027
[600]	train's auc: 0.843447	train's f1: 0.752022	test's auc: 0.722197	test's f1: 0.661235
[800]	train's auc: 0.874091	train's f1: 0.781462	test's auc: 0.730078	test's f1: 0.669069
[1000]	train's auc: 0.897998	train's f1: 0.806221	test's auc: 0.737136	test's f1: 0.674005
[1200]	train's auc: 0.916867	train's f1: 0.828551	test's auc: 0.742358	test's f1: 0.680116
[1400]	train's auc: 0.931441	train's f1: 0.845827	test's auc: 0.746646	test's f1: 0.685208
Early stopping, best iteration is:
[1448]	train's auc: 0.934179	train's f1: 0.849568	test's auc: 0.747328	test's f1: 0.686932

Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.751451	train's f1: 0.679882	test's auc: 0.695459	test's f1: 0.644469
[400]	train's auc: 0.80

In [13]:
# Run "lgb4": compared with runs lgb1-lgb3 this one uses max_depth=4,
# min_split_gain=0.9 and adds min_child_samples=50, with its own seeds.
# NOTE(review): if kf_lgbm forwards these to LightGBM unchanged, a depth-4 tree
# has at most 2**4 = 16 leaves, so num_leaves=64 would not be the binding
# limit here — confirm against clf_utils.
_lgb4_settings = dict(
    output_dir='gotcha_fakeid',
    name='lgb4',
    random_state=5678,
    min_child_samples=50,
    n_folds=5,
    split_seed=4321,
    learning_rate=0.03,
    colsample_bytree=0.5,
    early_stopping_rounds=100,
    num_leaves=64,
    min_split_gain=0.9,
    n_estimators=5000,
    eval_metric=metric_micro_f1,
    max_depth=4,
)
model = kf_lgbm(
    x=train_feat_df[feature_name1],
    y=train_feat_df['label'].values,
    x_test=test_feat_df[feature_name1],
    categorical_feature=['设备类型', 'fake_id'],
    **_lgb4_settings,
)

# Arrays saved for this run — presumably written by kf_lgbm into output_dir
# (validation and test predictions); confirm in clf_utils.
train_prob = np.load('./gotcha_fakeid/val.lgb4.npy')
test_prob = np.load('./gotcha_fakeid/test.lgb4.npy')

# Micro-averaged F1 on the training labels with the scores cut at 0.46.
_lgb4_hard = train_prob > 0.46
print(f1_score(train_feat_df['label'].values, _lgb4_hard, average='micro'))


Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.696701	train's f1: 0.638768	test's auc: 0.672726	test's f1: 0.625823
[400]	train's auc: 0.720989	train's f1: 0.656592	test's auc: 0.681679	test's f1: 0.628408
[600]	train's auc: 0.7414	train's f1: 0.67046	test's auc: 0.689506	test's f1: 0.634989
[800]	train's auc: 0.758238	train's f1: 0.683525	test's auc: 0.694793	test's f1: 0.639533
[1000]	train's auc: 0.773282	train's f1: 0.695983	test's auc: 0.698667	test's f1: 0.64345
[1200]	train's auc: 0.787599	train's f1: 0.707167	test's auc: 0.703294	test's f1: 0.646819
[1400]	train's auc: 0.799778	train's f1: 0.718724	test's auc: 0.706811	test's f1: 0.64917
[1600]	train's auc: 0.811142	train's f1: 0.729046	test's auc: 0.709801	test's f1: 0.653479
[1800]	train's auc: 0.821237	train's f1: 0.737626	test's auc: 0.712633	test's f1: 0.655202
Early stopping, best iteration is:
[1709]	train's auc: 0.816409	train's f1: 0.733571	test's auc: 0.711333	test's f1: 0.655907


In [14]:
# Run "lgb5": same shallow configuration as run "lgb4" (max_depth=4,
# min_split_gain=0.9, min_child_samples=50) with different seeds.
_lgb5_settings = dict(
    output_dir='gotcha_fakeid',
    name='lgb5',
    random_state=987,
    min_child_samples=50,
    n_folds=5,
    split_seed=789,
    learning_rate=0.03,
    colsample_bytree=0.5,
    early_stopping_rounds=100,
    num_leaves=64,
    min_split_gain=0.9,
    n_estimators=5000,
    eval_metric=metric_micro_f1,
    max_depth=4,
)
model = kf_lgbm(
    x=train_feat_df[feature_name1],
    y=train_feat_df['label'].values,
    x_test=test_feat_df[feature_name1],
    categorical_feature=['设备类型', 'fake_id'],
    **_lgb5_settings,
)

# Arrays saved for this run — presumably written by kf_lgbm into output_dir
# (validation and test predictions); confirm in clf_utils.
train_prob = np.load('./gotcha_fakeid/val.lgb5.npy')
test_prob = np.load('./gotcha_fakeid/test.lgb5.npy')

# Micro-averaged F1 on the training labels with the scores cut at 0.46.
_lgb5_hard = train_prob > 0.46
print(f1_score(train_feat_df['label'].values, _lgb5_hard, average='micro'))


Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.695616	train's f1: 0.637592	test's auc: 0.667386	test's f1: 0.618145
[400]	train's auc: 0.718941	train's f1: 0.654535	test's auc: 0.676996	test's f1: 0.622924
[600]	train's auc: 0.740035	train's f1: 0.669167	test's auc: 0.685518	test's f1: 0.629818
[800]	train's auc: 0.757328	train's f1: 0.682506	test's auc: 0.691713	test's f1: 0.635616
[1000]	train's auc: 0.773162	train's f1: 0.694318	test's auc: 0.697493	test's f1: 0.639533
[1200]	train's auc: 0.78688	train's f1: 0.705796	test's auc: 0.701658	test's f1: 0.643685
[1400]	train's auc: 0.799037	train's f1: 0.715727	test's auc: 0.704554	test's f1: 0.645722
[1600]	train's auc: 0.810383	train's f1: 0.725658	test's auc: 0.708231	test's f1: 0.648778
[1800]	train's auc: 0.820427	train's f1: 0.734217	test's auc: 0.710638	test's f1: 0.651207
[2000]	train's auc: 0.830299	train's f1: 0.743326	test's auc: 0.713669	test's f1: 0.653322
[2200]	train's auc: 0.838903	tr

In [15]:
# Run "lgb6": same shallow configuration as runs "lgb4"/"lgb5" (max_depth=4,
# min_split_gain=0.9, min_child_samples=50) with different seeds.
_lgb6_settings = dict(
    output_dir='gotcha_fakeid',
    name='lgb6',
    random_state=2015,
    min_child_samples=50,
    n_folds=5,
    split_seed=2012,
    learning_rate=0.03,
    colsample_bytree=0.5,
    early_stopping_rounds=100,
    num_leaves=64,
    min_split_gain=0.9,
    n_estimators=5000,
    eval_metric=metric_micro_f1,
    max_depth=4,
)
model = kf_lgbm(
    x=train_feat_df[feature_name1],
    y=train_feat_df['label'].values,
    x_test=test_feat_df[feature_name1],
    categorical_feature=['设备类型', 'fake_id'],
    **_lgb6_settings,
)

# Arrays saved for this run — presumably written by kf_lgbm into output_dir
# (validation and test predictions); confirm in clf_utils.
train_prob = np.load('./gotcha_fakeid/val.lgb6.npy')
test_prob = np.load('./gotcha_fakeid/test.lgb6.npy')

# Micro-averaged F1 on the training labels with the scores cut at 0.46.
_lgb6_hard = train_prob > 0.46
print(f1_score(train_feat_df['label'].values, _lgb6_hard, average='micro'))


Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.695841	train's f1: 0.638023	test's auc: 0.670376	test's f1: 0.620887
[400]	train's auc: 0.720461	train's f1: 0.656651	test's auc: 0.680185	test's f1: 0.630132
[600]	train's auc: 0.740712	train's f1: 0.670127	test's auc: 0.687699	test's f1: 0.637496
[800]	train's auc: 0.757727	train's f1: 0.683153	test's auc: 0.69337	test's f1: 0.637888
[1000]	train's auc: 0.772758	train's f1: 0.694024	test's auc: 0.698665	test's f1: 0.644782
[1200]	train's auc: 0.786319	train's f1: 0.704542	test's auc: 0.702957	test's f1: 0.647446
Early stopping, best iteration is:
[1275]	train's auc: 0.791142	train's f1: 0.708871	test's auc: 0.704329	test's f1: 0.650266

Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.694977	train's f1: 0.63675	test's auc: 0.676891	test's f1: 0.625979
[400]	train's auc: 0.719302	train's f1: 0.653282	test's auc: 0.686343	test's f1: 0.632012
[600]	train's auc: 0.74042