In [4]:
import os
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score,average_precision_score
from sklearn.model_selection import StratifiedKFold

warnings.filterwarnings('ignore')

# Load the train and test CSV files.
train_x = pd.read_csv('data/creditcard_train.csv')
test_x = pd.read_csv('data/creditcard_test.csv')
# `res` becomes the submission frame and receives new columns later on.
# Take an explicit copy so those assignments do not target a column subset
# of `test_x` (the pattern pandas reports as SettingWithCopyWarning).
res = test_x[['Index']].copy()
del train_x['Index']
del test_x['Index']

# Stack train and test so the features below are computed on both at once.
# Test rows are tagged with the sentinel label -1 so they can be split out again.
test_x['Class'] = -1
data = pd.concat([train_x, test_x], axis=0, ignore_index=True)
train_index = data[data['Class'] != -1].index.tolist()
test_index = data[data['Class'] == -1].index.tolist()

# The arithmetic below treats 'Time' as a number of seconds:
# 'Day' is the whole-day count and 'Hour' the hour within that day.
data['Time'] = data['Time'].astype(int)
data['Day'] = (data['Time'] // (3600 * 24)).astype(int)
data['Hour'] = ((data['Time'] - data['Day'] * 24 * 3600) // 3600).astype(int)

# Column names 'V1' .. 'V28'.
pca_cols = ['V' + str(i) for i in range(1, 29)]
def getFeature(x):
    """Return the elements of *x* joined into one space-separated string.

    Every element is converted with ``astype(str)`` first, so two inputs map to
    the same string only when all their elements have the same string form.
    """
    as_text = x.astype(str)
    return ' '.join(as_text.tolist())
# One string key per row, built from the V1..V28 columns.
se = data[pca_cols].apply(getFeature, axis=1)
# Distinct keys in order of first appearance; the default 0..n-1 index of this
# Series serves as the numeric id of each key.
st = pd.Series(se.drop_duplicates().values)
key_to_id = pd.Series(st.index, index=st.values)

# Rows whose V1..V28 strings are identical share the same 'Id'.
data['Id'] = se.map(key_to_id)

# Frequency features: how often each 'Id' / 'Amount' value occurs in the
# combined train+test frame. The '_ount' suffix is the column name used as-is.
for co in ('Id', 'Amount'):
    occurrences = data[co].value_counts()
    data[co + '_ount'] = data[co].map(occurrences)

# 'Id' itself is not used as a model feature; only its count is kept.
features = data.drop(columns=['Id'])
train_x = features.loc[train_index].reset_index(drop=True)
test_x = features.loc[test_index].reset_index(drop=True)
# Separate the label from the training features and drop the -1 sentinel
# column from the test features.
train_y = train_x.pop('Class')
del test_x['Class']

# Parameter dictionary handed to lgb.train for every fold.
params_initial_lgb = {
    # tree shape and learning rate
    "num_leaves": 20,
    "learning_rate": 0.01,
    "boosting": "gbdt",
    "min_child_samples": 10,
    # row / column subsampling
    "bagging_fraction": 0.7,
    "bagging_freq": 1,
    "feature_fraction": 0.7,
    # regularisation
    "reg_alpha": 0,
    "reg_lambda": 1,
    # built-in metric and objective
    "metric": "binary_logloss",
    "objective": "binary",
}

NBR = 20000  # passed as num_boost_round
VBE = 100    # not referenced by the visible code
ESR = 200    # early-stopping patience, in rounds
def my_score(preds, data_vali):
    """Custom evaluation function for ``lgb.train`` reporting average precision.

    Args:
        preds: predicted scores for the rows of ``data_vali``.
        data_vali: dataset object exposing ``get_label()``.

    Returns:
        The tuple ``('AP', value, True)``: metric name, average precision of
        ``preds`` against the labels, and a flag marking higher as better.
    """
    ap_value = average_precision_score(data_vali.get_label(), preds)
    return 'AP', ap_value, True
def searchBestCut(ytrue, ypre):
    """Scan cut-offs 0.10 .. 0.80 (step 0.01) and return the one with the best F1.

    Args:
        ytrue: true binary labels.
        ypre: predicted scores, one per label.

    Returns:
        ``(cut, f1)``: the selected cut-off and the F1 score obtained when
        scores ``>= cut`` are labelled 1.
    """
    percents = range(10, 81, 1)
    scores = pd.Series(ypre)
    f1_by_percent = pd.Series(
        [f1_score(ytrue, (scores >= pct * 0.01).astype(int).values) for pct in percents],
        index=percents,
    )
    # Ascending sort: the last entry holds the highest F1.
    ranked = f1_by_percent.sort_values()
    return ranked.index[-1] * 0.01, ranked.values[-1]
print(train_x.shape)
print(test_x.shape)

# Per-fold test outputs: one column of probabilities / hard labels per fold.
prob = pd.DataFrame()
cate = pd.DataFrame()
num = 0
apscore = []  # best validation average precision of each fold
fscore = []   # best validation F1 of each fold

skf = StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
for train_part_index, evals_index in skf.split(train_x, train_y):
    EVAL_RESULT = {}
    train_part = lgb.Dataset(train_x.loc[train_part_index], label=train_y.loc[train_part_index])
    evals = lgb.Dataset(train_x.loc[evals_index], label=train_y.loc[evals_index])
    # Early stopping and metric recording are passed as callbacks: the
    # `early_stopping_rounds`, `evals_result` and `verbose_eval` keyword
    # arguments of lgb.train were removed in LightGBM 4.0.
    # NOTE(review): the params dict also sets metric='binary_logloss', so early
    # stopping may watch that metric as well as 'AP' -- confirm this is intended.
    bst = lgb.train(
        params_initial_lgb, train_part,
        num_boost_round=NBR, valid_sets=[train_part, evals], feval=my_score,
        valid_names=['train', 'evals'],
        callbacks=[
            lgb.early_stopping(ESR, verbose=False),
            lgb.record_evaluation(EVAL_RESULT),
        ],
    )
    num += 1

    # Pick the round with the highest validation AP (first one on ties);
    # rounds are 1-based for `num_iteration`.
    lst = EVAL_RESULT['evals']['AP']
    best_score = max(lst)
    best_iter = lst.index(best_score) + 1
    apscore.append(best_score)

    test_ypre = bst.predict(test_x, num_iteration=best_iter)
    prob['prob_' + str(num)] = test_ypre

    # Choose the F1-optimal cut-off on the validation fold and apply it to the
    # test predictions of this fold.
    evals_ypre = bst.predict(train_x.loc[evals_index], num_iteration=best_iter)
    cut, best_score = searchBestCut(train_y.loc[evals_index], evals_ypre)
    fscore.append(best_score)
    cate['cate_' + str(num)] = (pd.Series(test_ypre) >= cut).astype(int)

    print(pd.Series(bst.feature_importance(), index=train_x.columns).sort_values(ascending=False).head(10))
    print('\n')

# Average over the folds actually run instead of a hard-coded 5.
print(apscore)
print(sum(apscore) / len(apscore))
print(fscore)
print(sum(fscore) / len(fscore))

# Final outputs: mean probability across folds, and a hard label that is 1
# when at least 60% of the fold-level labels are 1.
res['Pred'] = prob.mean(axis=1)
res['Class'] = (cate.mean(axis=1) >= 0.6).astype(int)
# Create the output directory first so the run cannot fail at the very end.
os.makedirs('result', exist_ok=True)
res.to_csv('result/Test_Pred_Class.csv', index=False)

(256327, 34)
(28480, 34)
[0.8922402932678718, 0.8293103070084102, 0.8584163956733923, 0.8235122399170315, 0.9304132631688455]
0.8667784998071102
[0.8863636363636365, 0.8374999999999999, 0.8757396449704142, 0.8484848484848484, 0.927710843373494]
0.8751597946384786
