In [1]:
import time
import sys
import os
import re
import gc
import datetime
import itertools
import pickle
import random
import numpy as np 
import pandas as pd 
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from tqdm import *
from utils import *
from scipy import stats
import warnings
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import KFold, RepeatedKFold,train_test_split,StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, f1_score, log_loss,roc_auc_score
import seaborn as sns
color = sns.color_palette()
sns.set(style="whitegrid")
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
# `load_variavle` is not defined in this notebook; presumably it is provided by
# the star-import of `utils` and deserialises the pickle file -- TODO confirm.
id_map = load_variavle('../data/id_map.pkl')
# Inverse lookup (value -> key) of id_map. It is used to translate the
# `sample_file_name` columns of the label / submission CSVs, while id_map
# itself translates them back when the submission is written.
new_map = dict(zip(id_map.values(), id_map.keys()))

In [3]:
# Column names of the raw per-sample records. NOTE(review): `col` is rebound
# to the model feature list in a later cell before any visible use of this
# value -- confirm whether this list is still needed.
col = [
    '活塞工作时长',
    '发动机转速',
    '油泵转速',
    '泵送压力',
    '液压油温',
    '流量档位',
    '分配压力',
    '排量电流',
    '低压开关',
    '高压开关',
    '搅拌超压信号',
    '正泵',
    '反泵',
    '设备类型',
    'sample_file_name',
]

In [4]:
# Load the combined record table (helper presumably comes from `utils`).
data_all = load_variavle('../data/data_all.pkl')

# Replace the value 2098 in the piston-working-time column with 1.
# NOTE(review): the reason 2098 is special is not visible here -- confirm.
piston_hours = data_all['活塞工作时长']
data_all['活塞工作时长'] = piston_hours.replace(2098, 1)
# The bare string below is a no-op note left by the author (presumably a score).
"""0.628"""

# Re-encode the device-type code with a fixed number per code. Codes missing
# from the mapping become NaN because Series.map is used.
# NOTE(review): the meaning of the mapped numbers is not shown -- confirm.
shebeileixing = {1: 252, 2: 4, 3: 9, 4: 63, 5: 78, 6: 44, 7: 573}
device_type = data_all['设备类型']
data_all['设备类型'] = device_type.map(shebeileixing)
# Another no-op note string kept verbatim (presumably experiment scores).
"""1,0.6288-DCIC_xgb_0.6764"""

# Shift the hydraulic-oil temperature by 273.15 (presumably Celsius -> Kelvin).
data_all['液压油温'] = data_all['液压油温'].add(273.15)

In [5]:
# Training labels and the submission template; the template defines the set of
# test samples that predictions are produced for.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train_labels.csv', '../data/submit_example.csv')
)

In [6]:
# Translate the file names in both frames through new_map so they match the
# keys used in data_all. This mutates the frames in place, so re-running the
# cell would map the already-translated values a second time.
for frame in (train_data, test_data):
    frame['sample_file_name'] = frame['sample_file_name'].map(new_map)

In [7]:
# Number of records per (device type, sample) pair, stored as `length`.
# NOTE(review): the merges below assume each sample has exactly one device
# type; otherwise rows of train/test would be duplicated -- confirm.
kk = (
    data_all
    .groupby(['设备类型', 'sample_file_name'])
    .size()
    .reset_index(name='length')
)
# Attach device type and record count to every train / test sample.
train_data = train_data.merge(kk, how='left', on=['sample_file_name'])
test_data = test_data.merge(kk, how='left', on=['sample_file_name'])

In [8]:
# Per-sample aggregation spec: column name -> list of statistics.
# Key order is kept as-is because the generated column names depend on it.
_base_stats = ['median', 'max', 'min', 'std']
agg_func = {
    '发动机转速': list(_base_stats),
    '油泵转速': list(_base_stats),
    # No standard deviation is computed for the piston working time.
    '活塞工作时长': ['median', 'max', 'min'],
    '泵送压力': list(_base_stats),
    '液压油温': list(_base_stats),
    # The last three columns additionally get a per-sample sum.
    '流量档位': _base_stats + ['sum'],
    '分配压力': _base_stats + ['sum'],
    '排量电流': _base_stats + ['sum'],
}

In [9]:
# Flat column names for the aggregated frame. `get_new_columns` is not defined
# in this notebook (presumably from `utils`); the recorded feature names look
# like 'id_<column>_<stat>' -- TODO confirm the exact naming rule.
new_columns = get_new_columns('id', agg_func)

# One row per sample with the statistics requested in agg_func.
df_group = data_all.groupby('sample_file_name').agg(agg_func)
df_group.columns = new_columns
df_group = df_group.reset_index(drop=False)

# Attach the aggregated features to the train and test samples.
# Re-running this cell would merge the same columns a second time.
train_data = train_data.merge(df_group, how='left', on='sample_file_name')
test_data = test_data.merge(df_group, how='left', on='sample_file_name')

# Release the intermediate frame; the collect() result is the cell output.
del df_group
gc.collect()

140

In [10]:
# Every column except the sample identifier and the target is a model feature.
non_feature_cols = {'sample_file_name', 'label'}
col = [name for name in train_data.columns if name not in non_feature_cols]

X_train = train_data.loc[:, col]
X_test = test_data.loc[:, col]
y_train = train_data['label'].astype(int)

In [11]:
# Cross-validation setup: 5 stratified, shuffled folds.
# Seeds noted by the author: 2021, 2019, 2025.
# NOTE(review): the training cell repeats these three assignments.
K, seed = 5, 2021
skf = StratifiedKFold(n_splits=K, random_state=seed, shuffle=True)

In [12]:
%%time
# Stratified K-fold training of an XGBoost binary classifier.
#
# Produces:
#   oof_xgb          - out-of-fold predicted probabilities for X_train
#   xgb_pred_te_all  - test-set probabilities averaged over the K folds
#   xgb_auc_mean     - mean validation ROC AUC over the folds
#   xgb_auc_mean2    - mean best validation log-loss over the folds (the name
#                      is historical and kept for compatibility)
#   f1               - list of per-fold macro F1 scores at a 0.5 threshold
K = 5
seed = 2021  # seeds noted by the author: 2021, 2019, 2025
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=seed)

# Hyper-parameters are identical for every fold, so they are built once.
# Trailing notes are the alternative values the author had recorded.
xgb_params = {
    "objective": 'binary:logistic',
    "booster": "gbtree",
    "eta": 0.01,
    "max_depth": 9,              # 9
    "subsample": 0.7,            # 0.85
    'eval_metric': 'logloss',
    "colsample_bytree": 0.6,     # 0.7
    "colsample_bylevel": 0.8,    # 0.8
    'tree_method': 'auto',
    # XGBoost's documented parameter is `nthread`; the original key "thread"
    # is not a recognised parameter, so the thread limit was never applied.
    "nthread": 6,
    "seed": 666,
}

# The test matrix does not depend on the fold, so it is also built once.
xgb_te = xgb.DMatrix(X_test)

xgb_pred_te_all = 0
xgb_auc_mean = 0
xgb_auc_mean2 = 0
f1 = []
oof_xgb = np.zeros(len(X_train))
for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
    X_tr, X_val = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[test_index]
    print("\nFold ", i)

    xgb_tr = xgb.DMatrix(X_tr, y_tr)
    xgb_val = xgb.DMatrix(X_val, y_val)

    # Early stopping monitors the last entry of the watchlist ('eval').
    watchlist = [(xgb_tr, 'train'), (xgb_val, 'eval')]
    xgb_model = xgb.train(xgb_params,
                          xgb_tr,
                          num_boost_round=1126,  # values noted: 1699, 1126
                          evals=watchlist,
                          verbose_eval=200,
                          early_stopping_rounds=100)

    # NOTE(review): `ntree_limit` / `best_ntree_limit` belong to the older
    # XGBoost API this notebook was run with; newer releases use
    # `iteration_range` instead -- adapt if the library is upgraded.
    best_limit = xgb_model.best_ntree_limit

    # Predict the validation fold once and reuse it for OOF and the metrics.
    pred = xgb_model.predict(xgb_val, ntree_limit=best_limit)
    oof_xgb[test_index] = pred

    f1_s = f1_score(y_val, np.round(pred), average='macro')
    print(" f1_model = ", f1_s)
    f1.append(f1_s)

    # best_score is the early-stopping metric, i.e. the validation log-loss
    # (eval_metric='logloss'), not an AUC as the original label claimed.
    print(" best_logloss = ", xgb_model.best_score)
    print("*" * 100)

    pred_te = xgb_model.predict(xgb_te, ntree_limit=best_limit)
    xgb_auc_mean2 = xgb_auc_mean2 + xgb_model.best_score / K
    xgb_pred_te_all = xgb_pred_te_all + pred_te / K
    # Use the explicitly imported sklearn metric instead of an `auc` name that
    # is only reachable through a star-import.
    xgb_auc_mean = roc_auc_score(y_val, pred) / K + xgb_auc_mean

print("=" * 50 + 'result' + "=" * 50)
print(" mean_logloss = ", xgb_auc_mean2)
print(" mean_auc = ", xgb_auc_mean)
print(" mean_f1 = ", np.mean(f1), np.std(f1))
# Author's recorded note: 0.7517


Fold  0
[0]	train-logloss:0.691999	eval-logloss:0.692257
Multiple eval metrics have been passed: 'eval-logloss' will be used for early stopping.

Will train until eval-logloss hasn't improved in 100 rounds.
[200]	train-logloss:0.584676	eval-logloss:0.633295
[400]	train-logloss:0.540455	eval-logloss:0.622723
[600]	train-logloss:0.50679	eval-logloss:0.618016
[800]	train-logloss:0.476969	eval-logloss:0.614041
[1000]	train-logloss:0.44835	eval-logloss:0.610751
[1125]	train-logloss:0.432265	eval-logloss:0.609155
 f1_model =  0.6676835386465393
 auc_model =  0.60913
****************************************************************************************************

Fold  1
[0]	train-logloss:0.691828	eval-logloss:0.692134
Multiple eval metrics have been passed: 'eval-logloss' will be used for early stopping.

Will train until eval-logloss hasn't improved in 100 rounds.
[200]	train-logloss:0.586833	eval-logloss:0.628363
[400]	train-logloss:0.543369	eval-logloss:0.615914
[600]	train-logloss:0

In [15]:
# Build the submission frame: averaged test probabilities, file names mapped
# back through id_map, rows ordered from the highest to the lowest score.
sub = (
    test_data[['sample_file_name']]
    .assign(label=xgb_pred_te_all)
    .assign(sample_file_name=lambda frame: frame['sample_file_name'].map(id_map))
    .sort_values('label', ascending=False)
    .reset_index(drop=True)
)

# Class balance if the probabilities were simply thresholded at 0.5
# (the recorded run shows 19130 positives).
rounded_counts = sub['label'].round().value_counts()
print(rounded_counts)

0.0    33120
1.0    19130
Name: label, dtype: int64


In [16]:
# `sub` is sorted by descending score with a fresh 0..n-1 index, so index
# labels below the cut-off are the highest-scored rows. Those 23000 rows are
# labelled 1 and everything from index 23000 onwards is labelled 0 (this is
# the net effect of the two overlapping inclusive .loc slices used before).
top_n = 23000
sub['label'] = (sub.index < top_n).astype(int)

In [18]:
# Write the submission file. The output directory is created on demand so the
# cell does not fail on a fresh checkout where '../sub' does not exist yet.
os.makedirs('../sub', exist_ok=True)
sub.to_csv('../sub/xgb.csv', index=False)

In [19]:
# Feature importance of `xgb_model`, i.e. whichever booster was trained last
# (presumably the final CV fold -- confirm no later cell rebinds it).
# Per the XGBoost docs, get_fscore() reports 'weight' importance: how many
# times each feature is used in a split.
fe_map = xgb_model.get_fscore()
feature = pd.Series(list(fe_map), name=0)
values = feature.map(fe_map)

# Two-column table (feature, value), most frequently used features first.
fe_values = (
    pd.DataFrame({'feature': feature, 'value': values})
    .sort_values(by='value', ascending=False)
    .reset_index(drop=True)
)

In [20]:
# Display the feature-importance table (sorted by descending value).
fe_values

Unnamed: 0,feature,value
0,length,9259
1,id_分配压力_sum,8416
2,id_分配压力_median,8322
3,id_分配压力_max,7832
4,id_排量电流_median,7725
5,id_排量电流_max,7603
6,id_泵送压力_min,7520
7,id_发动机转速_min,7379
8,id_液压油温_median,6983
9,id_液压油温_min,6898
