In [1]:
import time
import sys
import os
import re
import gc
import datetime
import itertools
import pickle
import random
import numpy as np 
import pandas as pd 
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
# from sklearn.neighbors import KNeighborsClassifier 
# from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,ExtraTreesClassifier
from tqdm import *
from scipy import stats
import warnings
from sklearn.model_selection import KFold, RepeatedKFold,train_test_split,StratifiedKFold
# from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler ,StandardScaler
from sklearn.metrics import f1_score, log_loss, roc_auc_score, accuracy_score
import seaborn as sns
# Default seaborn colour palette; `color` is not referenced again in the visible cells.
color = sns.color_palette()
sns.set(style="whitegrid")
# The blanket filterwarnings("ignore") call silences every warning category,
# which makes the FutureWarning-only filter before it redundant.
# NOTE(review): this also hides pandas and library deprecation warnings.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
def save_variable(v,filename):
    """Pickle ``v`` into ``filename`` and return the path.

    Parameters
    ----------
    v : object
        Any picklable object.
    filename : str or path-like
        Destination file. It is opened in binary write mode, so existing
        content is overwritten.

    Returns
    -------
    The ``filename`` argument, unchanged.
    """
    # The context manager closes the handle even when pickle.dump raises.
    with open(filename, 'wb') as f:
        pickle.dump(v, f)
    return filename
 
def load_variavle(filename):
    """Return the object stored in the pickle file ``filename``.

    Parameters
    ----------
    filename : str or path-like
        Path of a file previously written with ``pickle.dump``.

    Returns
    -------
    The unpickled object.
    """
    # SECURITY: pickle.load can execute arbitrary code; only use on trusted files.
    # The context manager closes the handle even when unpickling fails.
    with open(filename, 'rb') as f:
        return pickle.load(f)

def static_fe(data1,data2,df,column,values,cc,c):
    """Attach per-group statistics of ``df[values]`` to two frames.

    ``df`` is grouped by ``column`` and ``values`` is aggregated with every
    function name in ``cc``. The resulting columns are named
    ``c + values + '_' + <function name>`` and left-merged onto ``data1`` and
    ``data2`` using ``column`` as the key.

    Returns
    -------
    tuple
        ``(data1, data2)`` with the statistic columns appended.
    """
    stat_names = [c + values + '_' + func for func in cc]
    stats_frame = df[[column, values]].groupby(column)[values].agg(cc).reset_index()
    stats_frame.columns = [column] + stat_names
    merged_first = data1.merge(stats_frame, on=column, how='left')
    merged_second = data2.merge(stats_frame, on=column, how='left')
    return merged_first, merged_second

def cons(x):
    """Return the length of the longest run of consecutive values equal to 1.

    Parameters
    ----------
    x : iterable
        Sequence of values (e.g. a pandas Series of 0/1 flags).

    Returns
    -------
    int or float
        The longest run length, or ``np.nan`` when ``x`` contains no value
        equal to 1. An empty ``x`` also yields ``np.nan`` instead of raising.
    """
    # itertools.groupby yields one (value, run) pair per stretch of equal
    # neighbours; only the stretches whose value equals 1 are measured.
    run_lengths = [
        sum(1 for _ in run)
        for value, run in itertools.groupby(x)
        if value == 1
    ]
    return max(run_lengths) if run_lengths else np.nan

def cons_fe(data,df,column,values):
    """Add a ``'cons_' + values`` column produced by ``cons`` for each group.

    ``df`` is grouped by ``column`` and ``cons`` is applied to ``values``
    in every group. Missing results are replaced by 0 and cast to int32,
    then left-merged onto ``data`` using ``column`` as the key.

    Returns
    -------
    DataFrame
        ``data`` with the new column appended.
    """
    run_per_group = df.groupby(column)[values].apply(cons)
    feature = run_per_group.fillna(0).astype('int32').reset_index()
    feature.columns = [column, 'cons_' + values]
    return data.merge(feature, on=column, how='left')

def get_new_columns(name,aggs):
    """Build flat column names ``name_<key>_<agg>`` for an aggregation spec.

    ``aggs`` maps a key to a list of aggregation names. The names are
    produced in the dict's key order and, within a key, in list order.
    """
    flat_names = []
    for key, agg_list in aggs.items():
        for agg in agg_list:
            flat_names.append(name + '_' + key + '_' + agg)
    return flat_names

def auc(y,pred):
    """Return ``roc_auc_score(y, pred)``: ROC AUC of scores ``pred`` against labels ``y``."""
    # Disabled leftover from an earlier roc_curve-based variant:
#     fpr, tpr, thresholds = metrics.roc_curve(y, pred)
    return roc_auc_score(y, pred)

def f1(y,pred):
    """Return the macro-averaged F1 score of predicted labels ``pred`` against ``y``."""
    # Disabled leftover, identical to the commented line in auc(); not related to F1:
#     fpr, tpr, thresholds = metrics.roc_curve(y, pred)
    return f1_score(y, pred,average='macro')
def lgb_f1_score(y_hat, data):
    """Custom LightGBM evaluation function reporting macro F1.

    Parameters
    ----------
    y_hat : array-like
        Predicted scores passed in by LightGBM.
    data : object exposing ``get_label()``
        The dataset being evaluated.

    Returns
    -------
    tuple
        ``('f1', score, True)``; the final ``True`` marks the metric as
        higher-is-better.
    """
    # An earlier variant used a 0.45 cut-off; it was left disabled in favour
    # of plain rounding. f1_score needs hard labels, not probabilities.
    labels = data.get_label()
    hard_predictions = np.round(y_hat)
    score = f1_score(labels, hard_predictions, average='macro')
    return 'f1', score, True
def lgb_accuracy_score(y_hat, data):
    """Custom LightGBM evaluation function reporting accuracy.

    Predicted scores ``y_hat`` are rounded to hard labels and compared with
    ``data.get_label()``.

    Returns
    -------
    tuple
        ``('accuracy', score, True)``; the final ``True`` marks the metric
        as higher-is-better.
    """
    labels = data.get_label()
    hard_predictions = np.round(y_hat)
    score = accuracy_score(labels, hard_predictions)
    return 'accuracy', score, True

In [3]:
# id_map is loaded from a pickle; new_map is its inverse (values become keys).
# If two keys share a value, the later one wins in new_map.
id_map = load_variavle('../data/id_map.pkl')
new_map = dict(zip(id_map.values(), id_map.keys()))

In [4]:
# Column names of the raw table. `col` is reassigned in cell In [11]
# before any visible use of this list.
col = [
    '活塞工作时长',
    '发动机转速',
    '油泵转速',
    '泵送压力',
    '液压油温',
    '流量档位',
    '分配压力',
    '排量电流',
    '低压开关',
    '高压开关',
    '搅拌超压信号',
    '正泵',
    '反泵',
    '设备类型',
    'sample_file_name',
]

In [5]:
# Load the full table from a pickle (only load pickle files you trust).
data_all = load_variavle('../data/data_all.pkl')
# Replace the value 2098 in this column with 1.
# NOTE(review): presumably 2098 is an outlier or sentinel — confirm with the data owner.
data_all['活塞工作时长'] = data_all['活塞工作时长'].replace(2098,1)
# Bare string expression with no runtime effect; presumably a score noted by the author.
"""0.628"""
# Re-encode the device-type codes 1-7 with the integers below.
# NOTE(review): the values look like per-type counts — confirm.
# Series.map turns any code missing from the dict into NaN.
shebeileixing = {7: 573, 6: 44, 5: 78, 4: 63, 3: 9, 2: 4, 1: 252}
data_all['设备类型'] = data_all['设备类型'].map(shebeileixing)

In [6]:
# Read the label file and the submission template, then translate
# sample_file_name through new_map (names missing from new_map become NaN).
train_data = pd.read_csv('../data/train_labels.csv').assign(
    sample_file_name=lambda frame: frame['sample_file_name'].map(new_map)
)
test_data = pd.read_csv('../data/submit_example.csv').assign(
    sample_file_name=lambda frame: frame['sample_file_name'].map(new_map)
)

In [7]:
# Row count per (device type, file), stored as 'length', joined onto both
# frames by sample_file_name.
kk = (
    data_all
    .groupby(['设备类型', 'sample_file_name'])
    .size()
    .reset_index(name='length')
)
train_data = train_data.merge(kk, how='left', on=['sample_file_name'])
test_data = test_data.merge(kk, how='left', on=['sample_file_name'])
print(train_data.shape, test_data.shape)

(63817, 4) (52250, 4)


In [8]:
# Aggregations computed per sample_file_name for each column.
# The original cell carried the note "0.6269", presumably a recorded score.
agg_func = {
    '发动机转速': ['median', 'mean', 'max', 'min', 'var'],
    '油泵转速': ['median', 'mean', 'max', 'min', 'var'],
    '活塞工作时长': ['mean', 'max', 'min'],
    '泵送压力': ['median', 'mean', 'max', 'min', 'var'],
    '液压油温': ['median', 'mean', 'max', 'min', 'var'],
    '流量档位': ['median', 'mean', 'max', 'min', 'var', 'sum'],
    '分配压力': ['median', 'mean', 'max', 'min', 'var', 'sum'],
    '排量电流': ['median', 'mean', 'max', 'min', 'var', 'sum'],
}

In [9]:
# Aggregate every file's rows with agg_func, flatten the resulting column
# names to 'id_<column>_<agg>' and join the features onto both frames.
new_columns = get_new_columns('id', agg_func)
df_group = data_all.groupby('sample_file_name').agg(agg_func)
df_group.columns = new_columns
df_group = df_group.reset_index()
train_data = train_data.merge(df_group, how='left', on='sample_file_name')
test_data = test_data.merge(df_group, how='left', on='sample_file_name')
# Drop the temporary frame; the collector's return value is the cell output.
del df_group
gc.collect()

140

In [10]:
# Keep only rows whose re-encoded device type is 573 (code 7 in the
# shebeileixing mapping), then drop the now-constant column.
train_df = (
    train_data.loc[train_data['设备类型'] == 573]
    .reset_index(drop=True)
    .drop(columns=['设备类型'])
)
test_df = (
    test_data.loc[test_data['设备类型'] == 573]
    .reset_index(drop=True)
    .drop(columns=['设备类型'])
)

In [11]:
# Feature columns: everything except the file identifier and the target.
col = [i for i in train_df.columns if i not in ['sample_file_name', 'label']]
# Explicit copies: the next cell writes into these frames, and writing into
# a column selection of another frame is the SettingWithCopy pattern.
X_train = train_df[col].copy()
y_train = train_df['label'].astype(int)
X_test = test_df[col].copy()
print(X_train.shape, X_test.shape)

(38975, 42) (32020, 42)


In [12]:
# Min-max scale the features. The scaler is fitted on the training features
# only (NaN filled with 0) and then applied to both sets.
scaler = MinMaxScaler()
scaler.fit(X_train.fillna(0))
# Rebuild the frames instead of assigning column-wise into frames that were
# obtained by selecting columns from train_df / test_df; that pattern raises
# SettingWithCopyWarning, which the global warnings filter would hide.
X_train = pd.DataFrame(
    scaler.transform(X_train.fillna(0)),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test.fillna(0)),
    columns=X_test.columns,
    index=X_test.index,
)

In [13]:
# Cross-validation: K stratified, shuffled folds with a fixed seed.
K = 5
seed = 2019
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=seed)

# LightGBM settings. The trailing notes reproduce the values the original
# cell kept as comments, presumably alternatives or earlier settings.
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',            # note: binary_logloss
    'num_leaves': 2**7,         # note: 2**7+7
    'subsample': 0.7,           # note: 0.7
    'colsample_bytree': 0.5,    # note: 0.5
    'learning_rate': 0.01,      # note: 0.05
    'seed': 2017,               # note: 2017
    'nthread': 6,
    'silent': True,
}

In [14]:
# Stratified K-fold training.
#   oof          - out-of-fold predictions for every training row
#   predictions  - test predictions averaged over the folds
#   feature_importance_df - per-fold feature importances, stacked
oof = np.zeros(len(X_train))
predictions = np.zeros(len(X_test))
# Upper bound on boosting rounds (the original also noted 1250); it is the
# same for every fold, so it is set once outside the loop.
num_round = 3000
# Per-fold frames are collected and concatenated once after the loop
# instead of growing a DataFrame with pd.concat on every iteration.
fold_importances = []

for i, (train_index, val_index) in enumerate(skf.split(X_train,y_train)):
    print("fold {}".format(i))
    X_tr, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

    lgb_train = lgb.Dataset(X_tr,y_tr)
    lgb_val = lgb.Dataset(X_val,y_val)
    # Both the 'auc' metric from lgb_params and the custom macro-F1 feval
    # are evaluated on the training and validation sets.
    # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments
    # of older LightGBM releases — confirm the installed version accepts them.
    clf = lgb.train(lgb_params, lgb_train, num_round, valid_sets = [lgb_train, lgb_val],
                    feval=lgb_f1_score,
                    verbose_eval=250, early_stopping_rounds = 100)
    oof[val_index] = clf.predict(X_val, num_iteration=clf.best_iteration)
    print('best iteration = ',clf.best_iteration)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = clf.feature_name()
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = i + 1
    fold_importances.append(fold_importance_df)

    predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / skf.n_splits

feature_importance_df = pd.concat(fold_importances, axis=0)
# Out-of-fold macro F1 with a 0.5 cut-off (np.round). Author's note: auc 0.75.
print('macro f1 score : ', f1_score(y_train, np.round(oof), average='macro'))

fold 0
Training until validation scores don't improve for 100 rounds.
[250]	training's auc: 0.867833	training's f1: 0.791264	valid_1's auc: 0.745934	valid_1's f1: 0.6906
[500]	training's auc: 0.937951	training's f1: 0.854672	valid_1's auc: 0.755642	valid_1's f1: 0.699411
[750]	training's auc: 0.971418	training's f1: 0.901618	valid_1's auc: 0.762198	valid_1's f1: 0.70246
Early stopping, best iteration is:
[712]	training's auc: 0.967804	training's f1: 0.895366	valid_1's auc: 0.761235	valid_1's f1: 0.70357
best iteration =  712
fold 1
Training until validation scores don't improve for 100 rounds.
[250]	training's auc: 0.867803	training's f1: 0.793616	valid_1's auc: 0.742058	valid_1's f1: 0.686322
[500]	training's auc: 0.93724	training's f1: 0.857174	valid_1's auc: 0.756276	valid_1's f1: 0.699623
[750]	training's auc: 0.970481	training's f1: 0.900142	valid_1's auc: 0.764165	valid_1's f1: 0.703964
Early stopping, best iteration is:
[726]	training's auc: 0.968276	training's f1: 0.896835	vali

In [15]:
# Build the submission: averaged test predictions, file names mapped back
# through id_map, rows ordered by descending score.
sub = (
    test_df[['sample_file_name']]
    .assign(label=predictions)
    .assign(sample_file_name=lambda frame: frame['sample_file_name'].map(id_map))
    .sort_values('label', ascending=False)
    .reset_index(drop=True)
)
# Show the class balance after rounding, then store the rounded labels.
# Author's note, presumably positive counts of earlier runs:
# 11924, 11629, 11581, latest 11633.
print(sub['label'].round().value_counts())
sub['label'] = sub['label'].round().astype(int)

0.0    20387
1.0    11633
Name: label, dtype: int64


In [22]:
# Write the submission without the index column.
# NOTE(review): this cell's execution count (22) is out of sequence with its
# neighbours (15 and 16); re-run the notebook top to bottom before trusting the file.
sub.to_csv('../sub/houchuli.csv', index=False)

In [16]:
# Disabled post-processing, kept for reference. If re-enabled it would read
# ../sub/best.csv, set label to 1 for every sample_file_name that `sub`
# labels 1, show the label counts and write the result to a new CSV.
# cc = pd.read_csv('../sub/best.csv')

# dd = sub[sub['label'] == 1]['sample_file_name'].values

# cc.loc[cc['sample_file_name'].isin(dd), 'label'] = 1

# cc.label.value_counts()

# cc.to_csv('../sub/sub_fuck3_23.csv', index=False)