In [1]:
import time
import sys
import os
import re
import gc
import datetime
import itertools
import pickle
import random
import numpy as np 
import pandas as pd 
import lightgbm as lgb
# import xgboost as xgb
# import catboost
from tqdm import *
from scipy import stats
import warnings
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import KFold, RepeatedKFold,train_test_split,StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder,MinMaxScaler
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, f1_score, log_loss,roc_auc_score
import seaborn as sns
# Default seaborn colour palette; `color` is not referenced again in the cells shown here.
color = sns.color_palette()
# Use the white-grid seaborn theme for every figure.
sns.set(style="whitegrid")
# Silence FutureWarning specifically ...
warnings.simplefilter(action='ignore', category=FutureWarning)
# ... and then every warning category, which makes the filter above redundant.
# NOTE(review): this also hides pandas' SettingWithCopyWarning and similar diagnostics.
warnings.filterwarnings("ignore")
%matplotlib inline

In [21]:
# Show the installed LightGBM version (the saved output of this cell is '2.1.1').
# NOTE(review): the execution count of this cell is 21, i.e. it was run out of order.
lgb.__version__

'2.1.1'

In [3]:
def save_variable(v,filename):
    """Pickle ``v`` to ``filename`` and return the path that was written.

    Args:
        v: Any picklable object.
        filename: Destination path; an existing file is overwritten.

    Returns:
        ``filename`` unchanged, so the call can be chained or logged.
    """
    # The context manager closes the handle even if pickle.dump raises,
    # which a bare open()/close() pair does not guarantee.
    with open(filename,'wb') as f:
        pickle.dump(v,f)
    return filename
 
def load_variavle(filename):
    """Load and return the object pickled in ``filename``.

    The misspelled name is kept because existing callers use it.

    NOTE: ``pickle.load`` can execute arbitrary code while deserialising, so
    only point this at files you created yourself or otherwise trust.

    Args:
        filename: Path of a pickle file.

    Returns:
        The unpickled object.
    """
    # ``with`` guarantees the handle is closed even when unpickling fails.
    with open(filename,'rb') as f:
        return pickle.load(f)

# Correctly spelled alias; ``load_variavle`` stays available for existing callers.
load_variable = load_variavle

def static_fe(data1,data2,df,column,values,cc,c):
    """Attach per-group statistics of ``df[values]`` to two frames.

    ``df`` is grouped by ``column`` and ``values`` is aggregated with every
    aggregation name in ``cc``. The resulting columns are named
    ``c + values + '_' + <agg>`` and left-merged onto ``data1`` and ``data2``
    using ``column`` as the join key.

    Args:
        data1, data2: Frames that receive the new statistic columns.
        df: Source frame containing ``column`` and ``values``.
        column: Name of the grouping / join key column.
        values: Name of the column being aggregated.
        cc: List of aggregation names passed to ``agg``.
        c: Prefix for the generated column names.

    Returns:
        Tuple ``(data1, data2)`` holding the merged frames.
    """
    stat_names = [column]
    for agg_name in cc:
        stat_names.append(c + values + '_' + agg_name)

    subset = df[[column, values]].copy()
    stats_table = subset.groupby(column)[values].agg(cc).reset_index()
    stats_table.columns = stat_names

    merged = [frame.merge(stats_table, on=column, how='left') for frame in (data1, data2)]
    return merged[0], merged[1]

def cons(x):
    """Return the length of the longest run of consecutive values equal to 1.

    Args:
        x: Iterable of values (for example one group of a 0/1 flag column).

    Returns:
        The longest streak length as an int, or NaN when ``x`` is empty or
        contains no value equal to 1.
    """
    # Pure-itertools version: no per-call DataFrame, and an empty input yields
    # NaN instead of failing on a missing column.
    streaks = [sum(1 for _ in run) for value, run in itertools.groupby(list(x)) if value == 1]
    return max(streaks) if streaks else float('nan')

def cons_fe(data,df,column,values):
    """Add the per-group longest streak of 1s in ``df[values]`` to ``data``.

    ``df`` is grouped by ``column`` and ``cons`` is applied to each group's
    ``values``. Missing results are replaced by 0, the result is cast to
    int32, stored in a column named ``'cons_' + values`` and left-merged onto
    ``data`` using ``column`` as the key.

    Returns:
        The merged frame.
    """
    streaks = df.groupby(column)[values].apply(cons)
    streaks = streaks.fillna(0)
    streak_table = streaks.astype('int32').reset_index()
    streak_table.columns = [column, 'cons_' + values]
    return data.merge(streak_table, on=column, how='left')

def get_new_columns(name,aggs):
    """Build flat column names for a dict-style aggregation spec.

    Args:
        name: Prefix placed in front of every generated name.
        aggs: Mapping of column name -> list of aggregation names.

    Returns:
        List of ``name + '_' + column + '_' + agg`` strings, ordered by the
        mapping's key order and, within a key, by its aggregation list.
    """
    flat_names = []
    for key, agg_list in aggs.items():
        for agg in agg_list:
            flat_names.append(name + '_' + key + '_' + agg)
    return flat_names

def auc(y,pred):
    """Return ``roc_auc_score(y, pred)``: ROC AUC of scores ``pred`` against labels ``y``."""
    score = roc_auc_score(y, pred)
    return score

def f1(y,pred):
    """Return the macro-averaged F1 score of predicted labels ``pred`` against ``y``."""
    macro_f1 = f1_score(y, pred, average='macro')
    return macro_f1

In [4]:
# Mapping loaded from disk. Later cells use its inverse to translate the
# `sample_file_name` values of the label files, and `id_map` itself to map
# them back when the submission is built.
id_map = load_variavle('../data/id_map.pkl')
# Inverse lookup (value -> key); if two keys share a value, the later one wins.
new_map = dict((value, key) for key, value in id_map.items())

In [5]:
# Names of the raw data columns. Approximate English glosses, in order:
# piston working time, engine speed, oil-pump speed, pumping pressure,
# hydraulic-oil temperature, flow gear, distribution pressure, displacement
# current, low-pressure switch, high-pressure switch, mixing over-pressure
# signal, forward pump, reverse pump, device type, sample_file_name.
# NOTE(review): `col` is reassigned in a later cell before it is read anywhere
# in the visible code, so this list appears to be unused.
col = ['活塞工作时长', '发动机转速', '油泵转速', '泵送压力', '液压油温', '流量档位', '分配压力', '排量电流',
       '低压开关', '高压开关', '搅拌超压信号', '正泵', '反泵', '设备类型', 'sample_file_name']

In [6]:
# Full raw table, loaded from the pickle cache.
data_all = load_variavle('../data/data_all.pkl')
# Map the value 2098 in the piston-working-time column to 1.
# NOTE(review): 2098 is presumably a sentinel/outlier code; the reason for
# mapping it to 1 is not visible here -- confirm.
data_all['活塞工作时长'] = data_all['活塞工作时长'].replace({2098: 1})

In [7]:
# Read the training labels and the submission template, translating each
# `sample_file_name` through `new_map` (the inverse of `id_map`) as it is loaded.
train_data = pd.read_csv('../data/train_labels.csv').assign(
    sample_file_name=lambda frame: frame['sample_file_name'].map(new_map)
)
test_data = pd.read_csv('../data/submit_example.csv').assign(
    sample_file_name=lambda frame: frame['sample_file_name'].map(new_map)
)

In [8]:
# Row count per (device type, sample file) pair, stored as `length`.
# The merge on `sample_file_name` also brings the device-type column along.
# NOTE(review): assumes each file belongs to exactly one device type; otherwise
# the left merge would duplicate rows -- confirm.
kk = (
    data_all
    .groupby(['设备类型', 'sample_file_name'])
    .size()
    .reset_index(name='length')
)
train_data = train_data.merge(kk, on=['sample_file_name'], how='left')
test_data = test_data.merge(kk, on=['sample_file_name'], how='left')

In [9]:
# Aggregations computed per sample file: raw column name -> list of statistics.
# Approximate English glosses of the keys, in order: engine speed, oil-pump
# speed, piston working time, pumping pressure, hydraulic-oil temperature,
# flow gear, distribution pressure, displacement current.
agg_func = {
#     Author's note "0.6269" -- presumably a score recorded for this feature set; metric not stated.
            '发动机转速': ['median', 'max', 'min', 'std',],
        '油泵转速': ['median', 'max', 'min', 'std',],
        '活塞工作时长': ['mean', 'max', 'min'],
        '泵送压力': ['median',  'max', 'min', 'std',],
        '液压油温': ['median',  'max', 'min', 'std'],
        '流量档位': ['median',  'max', 'min', 'std','sum'],
        '分配压力': ['median',  'max', 'min', 'std','sum'],
        '排量电流': ['median',  'max', 'min', 'std','sum'],
}

In [10]:
# Per-file statistics: aggregate every column listed in `agg_func`, flatten the
# resulting two-level column index into 'id_<column>_<agg>' names, and merge
# the table onto both label frames.
df_group = data_all.groupby('sample_file_name').agg(agg_func)
# The flat names rely on get_new_columns iterating `agg_func` in the same
# order in which `agg` lays out its output columns.
new_columns = get_new_columns('id',agg_func)
df_group.columns = new_columns
df_group = df_group.reset_index()
train_data = train_data.merge(df_group, on='sample_file_name', how='left')
test_data = test_data.merge(df_group, on='sample_file_name', how='left')
# Free the intermediate table; the collector's return value is the cell output.
del df_group
gc.collect()

140

In [11]:
# Every column except the file key and the target is used as a model feature.
col = [i for i in train_data.columns if i not in ['sample_file_name', 'label']]
# Take explicit copies: the scaling cell assigns back into X_train / X_test, and
# doing that on a plain column selection of train_data / test_data is a chained
# assignment (SettingWithCopyWarning, hidden here because warnings are silenced).
X_train = train_data[col].copy()
y_train = train_data['label'].astype(int)
X_test = test_data[col].copy()

In [12]:
# Min-max scale all features. The scaler is fitted on the training features
# only (missing values replaced by 0 first) and then applied to both sets.
scaler = MinMaxScaler().fit(X_train.fillna(0))
X_train[X_train.columns] = scaler.transform(X_train.fillna(0))
X_test[X_test.columns] = scaler.transform(X_test.fillna(0))

In [13]:
def lgb_f1_score(y_hat, data):
    """Custom evaluation function: macro F1 on rounded predictions.

    Args:
        y_hat: Predicted scores for the rows of ``data``.
        data: Dataset object exposing ``get_label()``.

    Returns:
        Tuple ``('f1', score, True)``; the final ``True`` marks the metric as
        higher-is-better.
    """
    labels = data.get_label()
    # f1_score needs hard class labels, so round the scores first.
    hard_preds = np.round(y_hat)
    score = f1_score(labels, hard_preds, average='macro')
    return 'f1', score, True

In [14]:
# Cross-validation setup: K stratified, shuffled folds with a fixed split seed.
K = 5
seed = 2019
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=seed)
# LightGBM training parameters. Trailing comments list alternative values noted
# by the author. The model seed (2017) differs from the fold-split seed above.
# NOTE(review): LightGBM documents that bagging is only applied when
# bagging_freq / subsample_freq is > 0; no such key is set here, so `subsample`
# may have no effect -- confirm against the installed version.
lgb_params = {
                        'boosting_type': 'gbdt',
                        'objective': 'binary',
                        'metric': 'binary_logloss',  # alternative noted: auc
                        'num_leaves': 2**7,  # alternative noted: 2**7+7
                        'subsample': 0.8,  # alternatives noted: 0.7, 0.8
                        'colsample_bytree': 0.7,  # alternatives noted: 0.5, 0.7
                        'learning_rate': 0.01,  # alternative noted: 0.05
                        'seed': 2017,
                        'nthread': 6,
                        'silent': True
             }

In [15]:
%%time
oof = np.zeros(len(X_train))
predictions = np.zeros(len(X_test))
feature_importance_df = pd.DataFrame()

for i, (train_index, val_index) in enumerate(skf.split(X_train,y_train)):
    print("fold {}".format(i))
    X_tr, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
    
    lgb_train = lgb.Dataset(X_tr,y_tr)
    lgb_val = lgb.Dataset(X_val,y_val)
    num_round = 3000
    clf = lgb.train(lgb_params, lgb_train, num_round, valid_sets = [lgb_train, lgb_val],
                    feval=lgb_f1_score, verbose_eval=250, early_stopping_rounds = 100)
    oof[val_index] = clf.predict(X_val, num_iteration=clf.best_iteration)
    print('best iteration = ',clf.best_iteration)
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = clf.feature_name()
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = i + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / skf.n_splits
f1_score(y_train, np.round(oof), average='macro')

fold 0
Training until validation scores don't improve for 100 rounds.
[250]	training's binary_logloss: 0.573437	training's f1: 0.74627	valid_1's binary_logloss: 0.616356	valid_1's f1: 0.676255
[500]	training's binary_logloss: 0.522036	training's f1: 0.794122	valid_1's binary_logloss: 0.602536	valid_1's f1: 0.6824
[750]	training's binary_logloss: 0.483385	training's f1: 0.834137	valid_1's binary_logloss: 0.597312	valid_1's f1: 0.685102
[1000]	training's binary_logloss: 0.449666	training's f1: 0.867431	valid_1's binary_logloss: 0.592479	valid_1's f1: 0.687886
[1250]	training's binary_logloss: 0.419551	training's f1: 0.896375	valid_1's binary_logloss: 0.588463	valid_1's f1: 0.689196
[1500]	training's binary_logloss: 0.392175	training's f1: 0.921475	valid_1's binary_logloss: 0.585017	valid_1's f1: 0.691418
Early stopping, best iteration is:
[1408]	training's binary_logloss: 0.401957	training's f1: 0.913039	valid_1's binary_logloss: 0.58649	valid_1's f1: 0.6923
best iteration =  1408
fold 1

0.6908631209696624

In [18]:
# Number of highest-scoring test rows labelled as positive.
# NOTE(review): 25000 is a hard-coded cutoff; its rationale is not visible
# here -- presumably chosen to match an expected positive rate. Confirm.
N_POSITIVE = 25000

sub = test_data[['sample_file_name']].copy()
sub['label'] = predictions
# Translate the file keys back through `id_map` for the submission file.
sub['sample_file_name'] = sub['sample_file_name'].map(id_map)
# Class balance a plain 0.5 threshold would give, printed for reference.
print(sub['label'].round().value_counts())
sub = sub.sort_values('label', ascending=False).reset_index(drop=True)
# One positional assignment: the first N_POSITIVE rows (highest scores) become
# 1 and the rest 0. This replaces two label-inclusive .loc slices that both
# touched row 25000 and were only correct because of their order.
sub['label'] = (np.arange(len(sub)) < N_POSITIVE).astype(int)

0.0    34005
1.0    18245
Name: label, dtype: int64


In [20]:
# Report the final class balance, then write the submission without the index column.
label_counts = sub['label'].value_counts()
print(label_counts)
sub.to_csv('../sub/lgb.csv', index=False)

0    27250
1    25000
Name: label, dtype: int64
