In [1]:
import time
import sys
import os
import re
import gc
import datetime
import itertools
import pickle
import random
import numpy as np 
import pandas as pd 
# import lightgbm as lgb
# import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,ExtraTreesClassifier
from tqdm import *
from scipy import stats
import warnings
from sklearn.model_selection import KFold, RepeatedKFold,train_test_split,StratifiedKFold
# from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler ,StandardScaler
from sklearn.metrics import f1_score, log_loss, roc_auc_score, accuracy_score
import seaborn as sns
# Plot styling: keep seaborn's current palette in `color` and switch to the whitegrid theme.
color = sns.color_palette()
sns.set(style="whitegrid")
# Silence warnings. The blanket filterwarnings("ignore") call below hides every
# warning category, so it subsumes the FutureWarning-only filter before it.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
%matplotlib inline

  from numpy.core.umath_tests import inner1d


In [2]:
def save_variable(v,filename):
    """Pickle ``v`` into the file at ``filename``.

    Args:
        v: Any picklable object.
        filename: Path of the file to create or overwrite (opened in binary mode).

    Returns:
        The ``filename`` argument, unchanged.
    """
    # The context manager closes the handle even when pickling fails part-way.
    with open(filename, 'wb') as f:
        pickle.dump(v, f)
    return filename
 
def load_variavle(filename):
    """Load and return the object pickled in the file at ``filename``.

    Args:
        filename: Path of a pickle file (opened in binary mode).

    Returns:
        The unpickled object.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    # The context manager closes the handle even when unpickling fails.
    with open(filename, 'rb') as f:
        return pickle.load(f)

def static_fe(data1,data2,df,column,values,cc,c):
    """Attach per-group statistics of ``df[values]`` to two frames.

    ``df`` is grouped by ``column`` and ``values`` is aggregated with every
    function named in ``cc``. The resulting columns are named
    ``c + values + '_' + <agg name>`` and left-merged onto ``data1`` and
    ``data2`` using ``column`` as the key.

    Args:
        data1, data2: Frames that receive the new columns (not modified in place).
        df: Frame that supplies the statistics.
        column: Grouping / merge key.
        values: Column to aggregate.
        cc: List of aggregation names passed to ``agg``.
        c: Prefix for the new column names.

    Returns:
        Tuple ``(data1, data2)`` with the statistic columns appended.
    """
    stats = (
        df[[column, values]]
        .copy()
        .groupby(column)[values]
        .agg(cc)
        .reset_index()
    )
    stats.columns = [column] + [c + values + '_' + agg_name for agg_name in cc]
    enriched_first = data1.merge(stats, on=column, how='left')
    enriched_second = data2.merge(stats, on=column, how='left')
    return enriched_first, enriched_second

def cons(x):
    """Return the length of the longest run of consecutive 1s in ``x``.

    Args:
        x: Iterable of values (e.g. a pandas Series or a list).

    Returns:
        The maximum run length of elements equal to 1, or NaN when ``x``
        contains no element equal to 1 (including when ``x`` is empty).
    """
    # groupby yields one (value, run) pair per stretch of equal neighbours;
    # each run is counted immediately, before groupby advances to the next one.
    longest = max(
        (sum(1 for _ in run) for value, run in itertools.groupby(x) if value == 1),
        default=None,
    )
    # NaN (not 0) marks "no run of 1s"; callers can fill it as they see fit.
    return np.nan if longest is None else longest

def cons_fe(data,df,column,values):
    """Add a ``'cons_' + values`` column to ``data``.

    For every group of ``df`` (grouped by ``column``) the ``cons`` helper is
    applied to ``df[values]``. Missing results are replaced by 0 and the
    result is cast to int32, then left-merged onto ``data`` by ``column``.

    Args:
        data: Frame that receives the new column (not modified in place).
        df: Frame that supplies the per-group sequences.
        column: Grouping / merge key.
        values: Column whose sequences are passed to ``cons``.

    Returns:
        ``data`` with the extra ``'cons_' + values`` column.
    """
    feature_name = 'cons_' + values
    per_group = df.groupby(column)[values].apply(cons)
    run_table = per_group.fillna(0).astype('int32').reset_index()
    run_table.columns = [column, feature_name]
    return data.merge(run_table, on=column, how='left')

def get_new_columns(name,aggs):
    """Build flat column names ``'<name>_<key>_<agg>'`` for an aggregation spec.

    Args:
        name: Prefix shared by every generated name.
        aggs: Mapping from a key to an iterable of aggregation names.

    Returns:
        List of names, ordered by key and, within a key, by aggregation.
    """
    flat_names = []
    for key in aggs.keys():
        for agg_name in aggs[key]:
            flat_names.append(name + '_' + key + '_' + agg_name)
    return flat_names

def auc(y,pred):
    """Return ``roc_auc_score(y, pred)``: the ROC AUC of scores ``pred`` for labels ``y``."""
    score = roc_auc_score(y, pred)
    return score

def f1(y,pred):
    """Return the macro-averaged F1 score of predicted labels ``pred`` against ``y``."""
    macro_f1 = f1_score(y, pred, average='macro')
    return macro_f1
def lgb_f1_score(y_hat, data):
    """Macro-F1 evaluation callback returning ``(name, value, is_higher_better)``.

    NOTE(review): presumably intended as a LightGBM ``feval`` (the lightgbm
    import is commented out in this notebook) — confirm before use.

    Args:
        y_hat: Predicted scores.
        data: Object that exposes the true labels through ``get_label()``.

    Returns:
        Tuple ``('f1', macro_f1, True)``.
    """
    labels = data.get_label()
    # f1_score needs hard class labels, so the scores are rounded first.
    hard_predictions = np.round(y_hat)
    macro_f1 = f1_score(labels, hard_predictions, average='macro')
    return 'f1', macro_f1, True
def lgb_accuracy_score(y_hat, data):
    """Accuracy evaluation callback returning ``(name, value, is_higher_better)``.

    NOTE(review): presumably intended as a LightGBM ``feval`` (the lightgbm
    import is commented out in this notebook) — confirm before use.

    Args:
        y_hat: Predicted scores.
        data: Object that exposes the true labels through ``get_label()``.

    Returns:
        Tuple ``('accuracy', accuracy, True)``.
    """
    labels = data.get_label()
    hard_predictions = np.round(y_hat)
    accuracy = accuracy_score(labels, hard_predictions)
    return 'accuracy', accuracy, True

In [3]:
# Load the pickled id mapping and build its inverse (value -> key).
# The CSV file names are later translated with new_map and translated back with
# id_map for the submission, so id_map's values are presumably the original
# sample file names — confirm against the code that wrote the pickle.
id_map = load_variavle('../data/id_map.pkl')
new_map = {v:k for k, v in id_map.items()}
# Sensor/signal column names plus the sample key.
# NOTE(review): `col` is reassigned in a later cell before any visible code reads
# this list, so this assignment looks unused — confirm before removing.
col = ['活塞工作时长', '发动机转速', '油泵转速', '泵送压力', '液压油温', '流量档位', '分配压力', '排量电流',
       '低压开关', '高压开关', '搅拌超压信号', '正泵', '反泵', '设备类型', 'sample_file_name']

In [4]:
# Load the combined raw records for all samples.
data_all = load_variavle('../data/data_all.pkl')

# Recode the value 2098 in 活塞工作时长 (piston working time) as 1.
data_all['活塞工作时长'] = data_all['活塞工作时长'].replace(to_replace=2098, value=1)

# 0.628 — bare note kept from the original cell (presumably a score; meaning not shown here).

# Replace each 设备类型 (equipment type) code by a fixed number; codes that are
# not listed become NaN.
# NOTE(review): the numbers look like per-type counts (frequency encoding) — confirm.
shebeileixing = {
    1: 252,
    2: 4,
    3: 9,
    4: 63,
    5: 78,
    6: 44,
    7: 573,
}
data_all['设备类型'] = data_all['设备类型'].map(shebeileixing)

# Shift 液压油温 (hydraulic oil temperature) by 273.15 — presumably Celsius to Kelvin; confirm units.
data_all['液压油温'] = data_all['液压油温'].add(273.15)

In [5]:
# Read the training labels and the submission template.
train_data = pd.read_csv('../data/train_labels.csv')
test_data = pd.read_csv('../data/submit_example.csv')
# Translate the file names found in the CSVs through new_map (the inverse of
# id_map); names that are missing from new_map become NaN.
train_data['sample_file_name'] = train_data['sample_file_name'].map(new_map)
test_data['sample_file_name'] = test_data['sample_file_name'].map(new_map)

In [6]:
# Count the rows of data_all per (设备类型, sample_file_name) pair; the count is
# stored in a column named 'length'.
kk = (
    data_all
    .groupby(['设备类型', 'sample_file_name'])
    .size()
    .reset_index(name='length')
)
# Attach 设备类型 and 'length' to the train and test frames by sample.
# NOTE(review): a sample_file_name that occurs with more than one 设备类型 value
# would produce several rows in kk and duplicate that sample in these left
# merges — confirm each sample has a single equipment type.
train_data = train_data.merge(kk, how='left', on=['sample_file_name'])
test_data = test_data.merge(kk, how='left', on=['sample_file_name'])

In [7]:
# Aggregations computed per sample for each signal. Key order matters: it
# determines the order of the generated feature columns.
agg_func = {
    # engine speed, oil-pump speed
    **{signal: ['median', 'max', 'min', 'std'] for signal in ('发动机转速', '油泵转速')},
    # piston working time: no standard deviation
    '活塞工作时长': ['median', 'max', 'min'],
    # pumping pressure, hydraulic oil temperature
    **{signal: ['median', 'max', 'min', 'std'] for signal in ('泵送压力', '液压油温')},
    # flow gear, distribution pressure, displacement current: also summed
    **{signal: ['median', 'max', 'min', 'std', 'sum'] for signal in ('流量档位', '分配压力', '排量电流')},
}

In [8]:
# Per-sample aggregate statistics of every signal listed in agg_func.
df_group = data_all.groupby('sample_file_name').agg(agg_func)

# Replace the (signal, statistic) column pairs produced by agg with flat
# 'id_<signal>_<statistic>' names; get_new_columns walks agg_func in the same order.
new_columns = get_new_columns('id', agg_func)
df_group.columns = new_columns
df_group = df_group.reset_index(drop=False)

# NOTE: re-running this cell merges the same columns a second time (pandas then
# adds _x/_y suffixes), so run it once per kernel session.
train_data = train_data.merge(df_group, how='left', on='sample_file_name')
test_data = test_data.merge(df_group, how='left', on='sample_file_name')

# Release the intermediate frame.
del df_group
gc.collect()

140

In [9]:
# Every column except the sample key and the target is used as a model feature.
col = [name for name in train_data.columns if name not in ('sample_file_name', 'label')]
y_train = train_data['label'].astype(int)
X_train = train_data.loc[:, col]
X_test = test_data.loc[:, col]

In [10]:
# Cross-validation setup: K stratified, shuffled folds with a fixed seed so the
# split is the same on every run. K is also the divisor used when fold
# predictions and fold scores are averaged.
K = 5
seed = 2021
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=seed)

In [11]:
%%time
# K-fold CatBoost training. Each fold trains a fresh model, scores it on the
# held-out part with macro F1 (probabilities rounded to 0/1) and predicts the
# test set; test probabilities and fold scores are averaged over the K folds.
cat_params = dict(
    iterations=1111,            # other value noted by the author: 1566
    depth=7,
    loss_function='Logloss',    # objective that is optimised
    custom_loss='Accuracy',     # extra metric shown during training (F1 noted as an alternative)
    bagging_temperature=0.7,    # other value noted by the author: 1
    od_type='Iter',
    rsm=0.67,                   # column subsampling ratio when building trees
    od_wait=100,
    l2_leaf_reg=10,             # other value noted by the author: 11
    thread_count=6,
    random_seed=5,              # other value noted by the author: 9
    metric_period=200,
)
# NOTE(review): in the recorded run every fold reports bestIteration = 1110 (the
# last iteration), so training ended at the iteration cap rather than by the
# overfitting detector.

cat_pred_te_all = 0
cat_macro_f1_mean = 0
stars = '*'*50
for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
    print(stars+'Fold_'+str(i)+stars)

    X_tr = X_train.iloc[train_index, :].copy()
    X_val = X_train.iloc[test_index, :].copy()
    y_tr = y_train.iloc[train_index].copy()
    y_val = y_train.iloc[test_index].copy()

    model = CatBoostClassifier(**cat_params)
    model.fit(X_tr, y_tr, eval_set=(X_val, y_val), use_best_model=True)

    # Column 1 of predict_proba is the probability of the positive class.
    pred = model.predict_proba(X_val)[:, 1]
    pred_te = model.predict_proba(X_test)[:, 1]

    # Score the fold once and reuse the value for both the log line and the mean.
    fold_macro_f1 = f1_score(y_val, np.round(pred), average='macro')
    print( " *****************macro_f1 = ", fold_macro_f1)

    cat_pred_te_all = cat_pred_te_all + pred_te / K
    cat_macro_f1_mean = fold_macro_f1 / K + cat_macro_f1_mean
print( " *****************mean_macro_f1 = ", cat_macro_f1_mean )

**************************************************Fold_0**************************************************




0:	learn: 0.6908719	test: 0.6910319	best: 0.6910319 (0)	total: 125ms	remaining: 2m 18s
200:	learn: 0.6241799	test: 0.6460216	best: 0.6460216 (200)	total: 7.97s	remaining: 36.1s
400:	learn: 0.6014897	test: 0.6380389	best: 0.6380389 (400)	total: 15.9s	remaining: 28.2s
600:	learn: 0.5834113	test: 0.6336288	best: 0.6336288 (600)	total: 23.9s	remaining: 20.3s
800:	learn: 0.5656748	test: 0.6298729	best: 0.6298637 (797)	total: 31.9s	remaining: 12.3s
1000:	learn: 0.5501331	test: 0.6265477	best: 0.6265477 (1000)	total: 39.8s	remaining: 4.38s
1110:	learn: 0.5412543	test: 0.6242010	best: 0.6242010 (1110)	total: 44.2s	remaining: 0us

bestTest = 0.6242009594
bestIteration = 1110

 *****************macro_f1 =  0.6493846140832116
**************************************************Fold_1**************************************************




0:	learn: 0.6908835	test: 0.6909653	best: 0.6909653 (0)	total: 39.2ms	remaining: 43.5s
200:	learn: 0.6254056	test: 0.6400339	best: 0.6400339 (200)	total: 8.18s	remaining: 37s
400:	learn: 0.6026859	test: 0.6311272	best: 0.6311272 (400)	total: 16s	remaining: 28.3s
600:	learn: 0.5835234	test: 0.6250644	best: 0.6250644 (600)	total: 23.9s	remaining: 20.3s
800:	learn: 0.5662704	test: 0.6205636	best: 0.6205636 (800)	total: 31.8s	remaining: 12.3s
1000:	learn: 0.5503903	test: 0.6164259	best: 0.6164259 (1000)	total: 39.8s	remaining: 4.37s
1110:	learn: 0.5421846	test: 0.6145988	best: 0.6145988 (1110)	total: 44.2s	remaining: 0us

bestTest = 0.6145987623
bestIteration = 1110

 *****************macro_f1 =  0.662717480961247
**************************************************Fold_2**************************************************




0:	learn: 0.6909284	test: 0.6909205	best: 0.6909205 (0)	total: 35.2ms	remaining: 39.1s
200:	learn: 0.6256526	test: 0.6403400	best: 0.6403400 (200)	total: 7.95s	remaining: 36s
400:	learn: 0.6031455	test: 0.6323371	best: 0.6323371 (400)	total: 15.7s	remaining: 27.7s
600:	learn: 0.5837347	test: 0.6267205	best: 0.6267197 (599)	total: 23.8s	remaining: 20.2s
800:	learn: 0.5665004	test: 0.6223809	best: 0.6223785 (798)	total: 31.4s	remaining: 12.2s
1000:	learn: 0.5508736	test: 0.6189849	best: 0.6189849 (1000)	total: 39s	remaining: 4.28s
1110:	learn: 0.5425596	test: 0.6173364	best: 0.6173364 (1110)	total: 43.1s	remaining: 0us

bestTest = 0.6173364044
bestIteration = 1110

 *****************macro_f1 =  0.6634985945954237
**************************************************Fold_3**************************************************




0:	learn: 0.6909740	test: 0.6908437	best: 0.6908437 (0)	total: 36.2ms	remaining: 40.2s
200:	learn: 0.6260275	test: 0.6377575	best: 0.6377575 (200)	total: 7.73s	remaining: 35s
400:	learn: 0.6031200	test: 0.6292010	best: 0.6292010 (400)	total: 15.2s	remaining: 27s
600:	learn: 0.5836867	test: 0.6230792	best: 0.6230746 (598)	total: 22.8s	remaining: 19.3s
800:	learn: 0.5666210	test: 0.6191862	best: 0.6191862 (800)	total: 30.3s	remaining: 11.7s
1000:	learn: 0.5507204	test: 0.6154405	best: 0.6154405 (1000)	total: 37.8s	remaining: 4.16s
1110:	learn: 0.5428182	test: 0.6137257	best: 0.6137257 (1110)	total: 42.4s	remaining: 0us

bestTest = 0.6137256942
bestIteration = 1110

 *****************macro_f1 =  0.6667048756233589
**************************************************Fold_4**************************************************




0:	learn: 0.6909354	test: 0.6909434	best: 0.6909434 (0)	total: 37.9ms	remaining: 42s
200:	learn: 0.6255820	test: 0.6395733	best: 0.6395733 (200)	total: 8s	remaining: 36.2s
400:	learn: 0.6026029	test: 0.6316586	best: 0.6316586 (400)	total: 16s	remaining: 28.3s
600:	learn: 0.5834257	test: 0.6259837	best: 0.6259837 (600)	total: 23.9s	remaining: 20.3s
800:	learn: 0.5660606	test: 0.6213447	best: 0.6213379 (797)	total: 31.9s	remaining: 12.3s
1000:	learn: 0.5500564	test: 0.6168945	best: 0.6168945 (1000)	total: 39.8s	remaining: 4.37s
1110:	learn: 0.5417985	test: 0.6150635	best: 0.6150635 (1110)	total: 44.2s	remaining: 0us

bestTest = 0.6150635034
bestIteration = 1110

 *****************macro_f1 =  0.661893722454235
 *****************mean_macro_f1 =  0.6608398575434953


In [15]:
# Number of top-ranked test samples that are labelled 1 in the submission.
# In the recorded run only 19946 samples round to 1 at the 0.5 threshold, so
# this deliberately labels more positives than plain rounding would.
N_POSITIVE = 23000

sub = test_data[['sample_file_name']].copy()
sub['label'] = cat_pred_te_all
# Translate the internal keys back to the original file names.
sub['sample_file_name'] = sub['sample_file_name'].map(id_map)
# Rank samples from the highest to the lowest averaged probability.
sub = sub.sort_values('label', ascending=False).reset_index(drop=True)
print(sub['label'].round().value_counts())
# Label the first N_POSITIVE ranked rows 1 and every other row 0.
# A single positional rule replaces the two overlapping inclusive .loc slices
# (":23000" and "23000:"), which produced the same labels only because the
# second assignment overwrote row 23000.
sub['label'] = (np.arange(len(sub)) < N_POSITIVE).astype(int)

0.0    32304
1.0    19946
Name: label, dtype: int64


In [18]:
# Write the submission frame to CSV without the index column.
sub.to_csv('../sub/cat.csv', index=False)