In [3]:
import lightgbm as lgb
lgb.__version__

'2.1.2'

In [12]:
import os

# LightGBM's default thread count is taken from OpenMP's default thread count,
# so cap it through the standard OpenMP variable. The original code set
# 'NUM_OMP_THREADS', which is not a name OpenMP reads, so the cap had no effect.
# NOTE(review): the variable is read when the OpenMP runtime starts, so this
# must run before lightgbm is first imported in the kernel -- confirm cell order.
os.environ['OMP_NUM_THREADS'] = "4"

import numpy as np
import pandas as pd
from tqdm import tqdm
import time
from contextlib import contextmanager
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
import warnings
from scipy.stats import pearsonr
import seaborn as sns
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
warnings.filterwarnings('ignore')

from clf_utils import *

In [2]:
# Read the pre-computed train / test feature tables from CSV.
# Each read sits inside timer(...), which reports "[<label>] done in <t> seconds".
# timer is not defined in this notebook -- presumably it comes from the
# clf_utils star import; confirm.
train_csv_path = '../input/train_feat_df.csv'
test_csv_path = '../input/test_feat_df.csv'

with timer('load train_feat_df'):
    train_feat_df = pd.read_csv(train_csv_path)

with timer('load test_feat_df'):
    test_feat_df = pd.read_csv(test_csv_path)

[load train_feat_df] done in 4.59 seconds
[load test_feat_df] done in 6.83 seconds


In [3]:
# One-hot encode the '设备类型' column and append the indicator columns
# ('设备类型_<value>') to both feature tables; the raw column is kept as well.
device_col = '设备类型'
train_device_dummies = pd.get_dummies(train_feat_df[device_col], prefix=device_col)

# Encode test on its own, then force exactly the train indicator columns (same
# names, same order). A value absent from test becomes an all-zero column; a
# value seen only in test is dropped, because a model fitted on the train
# columns has no matching feature for it.
test_device_dummies = pd.get_dummies(test_feat_df[device_col], prefix=device_col).reindex(
    columns=train_device_dummies.columns, fill_value=0)

# Remove indicator columns left over from an earlier execution of this cell so
# that re-running it does not append duplicate columns.
dummy_cols = list(train_device_dummies.columns)
train_feat_df = pd.concat(
    [train_feat_df.drop(columns=dummy_cols, errors='ignore'), train_device_dummies],
    axis=1)
test_feat_df = pd.concat(
    [test_feat_df.drop(columns=dummy_cols, errors='ignore'), test_device_dummies],
    axis=1)

In [4]:
# Column selections used when slicing the feature tables.
# Both lists share one block of per-signal statistics: for every signal, the
# columns '<stat>_<signal>' in the order min, max, nuni, mean.
_signals = ['发动机转速', '油泵转速', '泵送压力', '液压油温', '流量档位', '分配压力', '排量电流']
_stats = ['min', 'max', 'nuni', 'mean']
_stat_columns = [stat + '_' + signal for signal in _signals for stat in _stats]

# Variant 1: keeps the raw '设备类型' column.
# NOTE(review): this list ends with '反泵' while feature_name2 uses '正泵' --
# confirm the difference is intentional.
feature_name1 = ['活塞工作时长', 'num_samples', '设备类型'] + _stat_columns + ['低压开关', '反泵']

# Variant 2: replaces the raw '设备类型' column with its seven indicator
# columns '设备类型_0' .. '设备类型_6'.
feature_name2 = (['活塞工作时长', 'num_samples']
                 + _stat_columns
                 + ['低压开关', '正泵']
                 + ['设备类型_%d' % i for i in range(7)])

In [5]:
# Run "lgb1": kf_lgbm on the feature_name1 columns with the settings below.
# kf_lgbm and metric_micro_f1 are not defined in this notebook -- presumably
# they come from the clf_utils star import; confirm.
lgb1_params = dict(
    output_dir='gotcha_lgb', name='lgb1',
    random_state=1997,
    n_folds=5, split_seed=8888,
    learning_rate=0.03,
    colsample_bytree=0.5,
    early_stopping_rounds=100,
    num_leaves=64,
    min_split_gain=1,
    n_estimators=5000,
    eval_metric=metric_micro_f1,
    max_depth=6)
model = kf_lgbm(
    x=train_feat_df[feature_name1],
    y=train_feat_df['label'].values,
    x_test=test_feat_df[feature_name1],
    **lgb1_params)

# The loads below rely on kf_lgbm having written these two arrays under
# output_dir (presumably validation and test predictions -- confirm).
train_prob = np.load('./gotcha_lgb/val.lgb1.npy')
test_prob = np.load('./gotcha_lgb/test.lgb1.npy')

# Micro-averaged F1 of the validation array thresholded at 0.46.
val_pred = train_prob > 0.46
print(f1_score(train_feat_df['label'].values, val_pred, average='micro'))


Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.745392	train's f1: 0.678138	test's auc: 0.694301	test's f1: 0.642197
[400]	train's auc: 0.79314	train's f1: 0.715335	test's auc: 0.704095	test's f1: 0.649248
[600]	train's auc: 0.831638	train's f1: 0.746322	test's auc: 0.711159	test's f1: 0.655829
[800]	train's auc: 0.861211	train's f1: 0.772315	test's auc: 0.716164	test's f1: 0.659746
Early stopping, best iteration is:
[860]	train's auc: 0.869011	train's f1: 0.779974	test's auc: 0.718096	test's f1: 0.663272

Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.744784	train's f1: 0.67947	test's auc: 0.691196	test's f1: 0.641727
[400]	train's auc: 0.789737	train's f1: 0.712221	test's auc: 0.702113	test's f1: 0.649248
[600]	train's auc: 0.828455	train's f1: 0.742758	test's auc: 0.710101	test's f1: 0.655437
[800]	train's auc: 0.859414	train's f1: 0.769553	test's auc: 0.715621	test's f1: 0.657866
[1000]	train's auc: 0.883681	

In [6]:
# Run "lgb2": same settings as the "lgb1" cell except random_state and
# split_seed. kf_lgbm and metric_micro_f1 are not defined in this notebook --
# presumably they come from the clf_utils star import; confirm.
lgb2_params = dict(
    output_dir='gotcha_lgb', name='lgb2',
    random_state=2019,
    n_folds=5, split_seed=2333,
    learning_rate=0.03,
    colsample_bytree=0.5,
    early_stopping_rounds=100,
    num_leaves=64,
    min_split_gain=1,
    n_estimators=5000,
    eval_metric=metric_micro_f1,
    max_depth=6)
model = kf_lgbm(
    x=train_feat_df[feature_name1],
    y=train_feat_df['label'].values,
    x_test=test_feat_df[feature_name1],
    **lgb2_params)

# The loads below rely on kf_lgbm having written these two arrays under
# output_dir (presumably validation and test predictions -- confirm).
train_prob = np.load('./gotcha_lgb/val.lgb2.npy')
test_prob = np.load('./gotcha_lgb/test.lgb2.npy')

# Micro-averaged F1 of the validation array thresholded at 0.46.
val_pred = train_prob > 0.46
print(f1_score(train_feat_df['label'].values, val_pred, average='micro'))


Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.746354	train's f1: 0.679647	test's auc: 0.684685	test's f1: 0.637496
[400]	train's auc: 0.794226	train's f1: 0.717215	test's auc: 0.696679	test's f1: 0.648778
[600]	train's auc: 0.831604	train's f1: 0.748203	test's auc: 0.704616	test's f1: 0.6534
[800]	train's auc: 0.863414	train's f1: 0.775958	test's auc: 0.710284	test's f1: 0.656769
[1000]	train's auc: 0.887146	train's f1: 0.798366	test's auc: 0.713764	test's f1: 0.658101
Early stopping, best iteration is:
[1022]	train's auc: 0.889859	train's f1: 0.800854	test's auc: 0.714197	test's f1: 0.659825

Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.742059	train's f1: 0.674671	test's auc: 0.69334	test's f1: 0.638671
[400]	train's auc: 0.790368	train's f1: 0.71085	test's auc: 0.704081	test's f1: 0.649013
[600]	train's auc: 0.829436	train's f1: 0.741602	test's auc: 0.712513	test's f1: 0.655594
[800]	train's auc: 0.860639	t

In [7]:
# Run "lgb3": same settings as the "lgb1" cell except random_state and
# split_seed. kf_lgbm and metric_micro_f1 are not defined in this notebook --
# presumably they come from the clf_utils star import; confirm.
lgb3_params = dict(
    output_dir='gotcha_lgb', name='lgb3',
    random_state=2012,
    n_folds=5, split_seed=6666,
    learning_rate=0.03,
    colsample_bytree=0.5,
    early_stopping_rounds=100,
    num_leaves=64,
    min_split_gain=1,
    n_estimators=5000,
    eval_metric=metric_micro_f1,
    max_depth=6)
model = kf_lgbm(
    x=train_feat_df[feature_name1],
    y=train_feat_df['label'].values,
    x_test=test_feat_df[feature_name1],
    **lgb3_params)

# The loads below rely on kf_lgbm having written these two arrays under
# output_dir (presumably validation and test predictions -- confirm).
train_prob = np.load('./gotcha_lgb/val.lgb3.npy')
test_prob = np.load('./gotcha_lgb/test.lgb3.npy')

# Micro-averaged F1 of the validation array thresholded at 0.46.
val_pred = train_prob > 0.46
print(f1_score(train_feat_df['label'].values, val_pred, average='micro'))


Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.744623	train's f1: 0.678178	test's auc: 0.692209	test's f1: 0.637026
[400]	train's auc: 0.791276	train's f1: 0.713984	test's auc: 0.70194	test's f1: 0.645722
[600]	train's auc: 0.830227	train's f1: 0.743756	test's auc: 0.709311	test's f1: 0.651598
[800]	train's auc: 0.859727	train's f1: 0.769573	test's auc: 0.714142	test's f1: 0.657004
[1000]	train's auc: 0.885703	train's f1: 0.79441	test's auc: 0.719602	test's f1: 0.661861
Early stopping, best iteration is:
[1030]	train's auc: 0.888515	train's f1: 0.797485	test's auc: 0.719782	test's f1: 0.662175

Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.743329	train's f1: 0.676591	test's auc: 0.703211	test's f1: 0.650893
[400]	train's auc: 0.793137	train's f1: 0.71508	test's auc: 0.71319	test's f1: 0.657004
[600]	train's auc: 0.830945	train's f1: 0.745363	test's auc: 0.720754	test's f1: 0.664917
Early stopping, best iteratio

In [8]:
# Run "lgb4": shallower variant (max_depth=4, min_child_samples=50,
# min_split_gain=0.9) on the feature_name1 columns.
# kf_lgbm and metric_micro_f1 are not defined in this notebook -- presumably
# they come from the clf_utils star import; confirm.
# NOTE(review): a depth-4 tree has at most 16 leaves, so num_leaves=64 is
# presumably not the binding limit here -- confirm this is intended.
lgb4_params = dict(
    output_dir='gotcha_lgb', name='lgb4',
    random_state=1234,
    min_child_samples=50,
    n_folds=5, split_seed=4321,
    learning_rate=0.03,
    colsample_bytree=0.5,
    early_stopping_rounds=100,
    num_leaves=64,
    min_split_gain=0.9,
    n_estimators=5000,
    eval_metric=metric_micro_f1,
    max_depth=4)
model = kf_lgbm(
    x=train_feat_df[feature_name1],
    y=train_feat_df['label'].values,
    x_test=test_feat_df[feature_name1],
    **lgb4_params)

# The loads below rely on kf_lgbm having written these two arrays under
# output_dir (presumably validation and test predictions -- confirm).
train_prob = np.load('./gotcha_lgb/val.lgb4.npy')
test_prob = np.load('./gotcha_lgb/test.lgb4.npy')

# Micro-averaged F1 of the validation array thresholded at 0.46.
val_pred = train_prob > 0.46
print(f1_score(train_feat_df['label'].values, val_pred, average='micro'))


Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.687872	train's f1: 0.636633	test's auc: 0.670254	test's f1: 0.622924
[400]	train's auc: 0.710225	train's f1: 0.654908	test's auc: 0.679718	test's f1: 0.633814
[600]	train's auc: 0.727835	train's f1: 0.66668	test's auc: 0.685206	test's f1: 0.640081
[800]	train's auc: 0.743572	train's f1: 0.678883	test's auc: 0.689342	test's f1: 0.642824
[1000]	train's auc: 0.758556	train's f1: 0.689068	test's auc: 0.692878	test's f1: 0.643764
[1200]	train's auc: 0.772003	train's f1: 0.699665	test's auc: 0.69567	test's f1: 0.647446
Early stopping, best iteration is:
[1178]	train's auc: 0.770787	train's f1: 0.698627	test's auc: 0.695372	test's f1: 0.648073

Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.689531	train's f1: 0.635888	test's auc: 0.670475	test's f1: 0.618066
[400]	train's auc: 0.711953	train's f1: 0.652929	test's auc: 0.677346	test's f1: 0.624412
[600]	train's auc: 0.73025

In [9]:
# Run "lgb5": same shallow settings as the "lgb4" cell except random_state
# and split_seed. kf_lgbm and metric_micro_f1 are not defined in this
# notebook -- presumably they come from the clf_utils star import; confirm.
# NOTE(review): a depth-4 tree has at most 16 leaves, so num_leaves=64 is
# presumably not the binding limit here -- confirm this is intended.
lgb5_params = dict(
    output_dir='gotcha_lgb', name='lgb5',
    random_state=987,
    min_child_samples=50,
    n_folds=5, split_seed=789,
    learning_rate=0.03,
    colsample_bytree=0.5,
    early_stopping_rounds=100,
    num_leaves=64,
    min_split_gain=0.9,
    n_estimators=5000,
    eval_metric=metric_micro_f1,
    max_depth=4)
model = kf_lgbm(
    x=train_feat_df[feature_name1],
    y=train_feat_df['label'].values,
    x_test=test_feat_df[feature_name1],
    **lgb5_params)

# The loads below rely on kf_lgbm having written these two arrays under
# output_dir (presumably validation and test predictions -- confirm).
train_prob = np.load('./gotcha_lgb/val.lgb5.npy')
test_prob = np.load('./gotcha_lgb/test.lgb5.npy')

# Micro-averaged F1 of the validation array thresholded at 0.46.
val_pred = train_prob > 0.46
print(f1_score(train_feat_df['label'].values, val_pred, average='micro'))


Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.688982	train's f1: 0.636006	test's auc: 0.667462	test's f1: 0.617361
[400]	train's auc: 0.710064	train's f1: 0.651597	test's auc: 0.676461	test's f1: 0.626371
[600]	train's auc: 0.727986	train's f1: 0.664662	test's auc: 0.682081	test's f1: 0.631542
[800]	train's auc: 0.743291	train's f1: 0.675239	test's auc: 0.68611	test's f1: 0.633579
[1000]	train's auc: 0.758009	train's f1: 0.686914	test's auc: 0.690574	test's f1: 0.636713
[1200]	train's auc: 0.770569	train's f1: 0.697158	test's auc: 0.692541	test's f1: 0.640865
Early stopping, best iteration is:
[1242]	train's auc: 0.77325	train's f1: 0.698784	test's auc: 0.692991	test's f1: 0.642118

Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.687015	train's f1: 0.633009	test's auc: 0.675821	test's f1: 0.62739
[400]	train's auc: 0.7097	train's f1: 0.650971	test's auc: 0.684279	test's f1: 0.634362
[600]	train's auc: 0.728535	t

In [10]:
# Run "lgb6": same shallow settings as the "lgb4" cell except random_state
# and split_seed. kf_lgbm and metric_micro_f1 are not defined in this
# notebook -- presumably they come from the clf_utils star import; confirm.
# NOTE(review): a depth-4 tree has at most 16 leaves, so num_leaves=64 is
# presumably not the binding limit here -- confirm this is intended.
lgb6_params = dict(
    output_dir='gotcha_lgb', name='lgb6',
    random_state=2015,
    min_child_samples=50,
    n_folds=5, split_seed=2012,
    learning_rate=0.03,
    colsample_bytree=0.5,
    early_stopping_rounds=100,
    num_leaves=64,
    min_split_gain=0.9,
    n_estimators=5000,
    eval_metric=metric_micro_f1,
    max_depth=4)
model = kf_lgbm(
    x=train_feat_df[feature_name1],
    y=train_feat_df['label'].values,
    x_test=test_feat_df[feature_name1],
    **lgb6_params)

# The loads below rely on kf_lgbm having written these two arrays under
# output_dir (presumably validation and test predictions -- confirm).
train_prob = np.load('./gotcha_lgb/val.lgb6.npy')
test_prob = np.load('./gotcha_lgb/test.lgb6.npy')

# Micro-averaged F1 of the validation array thresholded at 0.46.
val_pred = train_prob > 0.46
print(f1_score(train_feat_df['label'].values, val_pred, average='micro'))


Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.689101	train's f1: 0.635066	test's auc: 0.666519	test's f1: 0.619477
[400]	train's auc: 0.710989	train's f1: 0.652459	test's auc: 0.676031	test's f1: 0.62739
[600]	train's auc: 0.728115	train's f1: 0.664721	test's auc: 0.681135	test's f1: 0.63303
[800]	train's auc: 0.743569	train's f1: 0.67663	test's auc: 0.685494	test's f1: 0.636243
Early stopping, best iteration is:
[781]	train's auc: 0.74221	train's f1: 0.675533	test's auc: 0.685355	test's f1: 0.637339

Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.686506	train's f1: 0.633616	test's auc: 0.67416	test's f1: 0.625823
[400]	train's auc: 0.707825	train's f1: 0.648346	test's auc: 0.681764	test's f1: 0.631855
[600]	train's auc: 0.725883	train's f1: 0.661783	test's auc: 0.686547	test's f1: 0.637496
Early stopping, best iteration is:
[617]	train's auc: 0.727114	train's f1: 0.662723	test's auc: 0.686825	test's f1: 0.6387