In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc
import warnings
warnings.filterwarnings('ignore')
pd.set_option('max_columns', None)
pd.set_option('max_rows', None)

In [2]:
train_transaction = pd.read_csv('../input/train_transaction.csv')
test_transaction = pd.read_csv('../input/test_transaction.csv')
train_identity = pd.read_csv('../input/train_identity.csv')
test_identity = pd.read_csv('../input/test_identity.csv')

In [3]:
train = train_transaction.merge(train_identity, on='TransactionID', how='left')
test = test_transaction.merge(test_identity, on='TransactionID', how='left')
data = pd.concat([train, test], axis=0, ignore_index=True)
del train, test
gc.collect()

125

In [4]:
object_cols = ['ProductCD', 'card4', 'card6', 'DeviceType', 'DeviceInfo', 'P_emaildomain', 'R_emaildomain']
M_cols = ['M{}'.format(i) for i in range(1, 10)]
id_cols = ['id_12', 'id_16', 'id_27', 'id_28', 'id_29', 'id_35', 'id_36', 'id_37', 'id_38', 'id_15',
           'id_23', 'id_34', 'id_30', 'id_31', 'id_33']
cat_cols = object_cols + M_cols + id_cols

In [5]:
for i in cat_cols:
    le = LabelEncoder()
    data[i] = le.fit_transform(data[i].astype(str))

In [6]:
train = data[data['isFraud'].notnull()]
test = data[data['isFraud'].isnull()]

y = train['isFraud']
train.drop(['isFraud', 'TransactionID'], axis=1, inplace=True)
test.drop(['isFraud', 'TransactionID'], axis=1, inplace=True)
used_cols = train.columns
test = test[used_cols]

X_train, X_valid, y_train, y_valid = train_test_split(train, y, random_state=2020)

In [7]:
def auc_select(X_train, y_train, X_valid, y_valid, cols, threshold=0.52):
    """
    基于AUC的单特征筛选
    @param X_train:
    @param y_train:
    @param X_valid:
    @param y_valid:
    @param cols:
    @param threshold:
    @return:
    """
    useful_dict = dict()
    useless_dict = dict()
    params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'metric': 'auc',
        'learning_rate': 0.1,
        'num_leaves': 31,
        'lambda_l1': 0,
        'lambda_l2': 1,
        'num_threads': 23,
        'min_data_in_leaf': 20,
        'first_metric_only': True,
        'is_unbalance': True,
        'max_depth': -1,
        'seed': 2020
    }
    for i in cols:
        print(i)
        try:
            lgb_train = lgb.Dataset(X_train[[i]].values, y_train)
            lgb_valid = lgb.Dataset(X_valid[[i]].values, y_valid, reference=lgb_train)
            lgb_model = lgb.train(
                params,
                lgb_train,
                valid_sets=[lgb_valid, lgb_train],
                num_boost_round=1000,
                early_stopping_rounds=50,
                verbose_eval=500
            )
            print('*' * 10)
            print(lgb_model.best_score['valid_0']['auc'])
            if lgb_model.best_score['valid_0']['auc'] > threshold:
                useful_dict[i] = lgb_model.best_score['valid_0']['auc']
            else:
                useless_dict[i] = lgb_model.best_score['valid_0']['auc']
        except:
            print('Error: ', i)
    useful_cols = list(useful_dict.keys())
    useless_cols = list(useless_dict.keys())
    return useful_dict, useless_dict, useful_cols, useless_cols


useful_dict, useless_dict, useful_cols, useless_cols = auc_select(X_train, y_train, X_valid, y_valid, used_cols, threshold=0.52)

TransactionDT
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[49]	training's auc: 0.595394	valid_0's auc: 0.576889
Evaluated only: auc
**********
0.5768888775675678
TransactionAmt
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[141]	training's auc: 0.718328	valid_0's auc: 0.707581
Evaluated only: auc
**********
0.7075814699342213
ProductCD
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.678102	valid_0's auc: 0.679011
Evaluated only: auc
**********
0.6790110038541265
card1
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[191]	training's auc: 0.702453	valid_0's auc: 0.691196
Evaluated only: auc
**********
0.6911962587253155
card2
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[82]	training's auc: 0.734264	valid_0's auc: 0.73187

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[51]	training's auc: 0.640092	valid_0's auc: 0.639659
Evaluated only: auc
**********
0.6396593867752118
D13
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[14]	training's auc: 0.622811	valid_0's auc: 0.625083
Evaluated only: auc
**********
0.6250832391256335
D14
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[33]	training's auc: 0.632946	valid_0's auc: 0.6327
Evaluated only: auc
**********
0.6327002949799455
D15
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[19]	training's auc: 0.648593	valid_0's auc: 0.638329
Evaluated only: auc
**********
0.6383292004767112
M1
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.621833	valid_0's auc: 0.621232
Evaluated only: auc
**********
0.62123

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[7]	training's auc: 0.639063	valid_0's auc: 0.637105
Evaluated only: auc
**********
0.637105017880011
V30
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.63958	valid_0's auc: 0.637141
Evaluated only: auc
**********
0.6371411604665358
V31
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	training's auc: 0.668922	valid_0's auc: 0.668883
Evaluated only: auc
**********
0.6688830513065875
V32
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[26]	training's auc: 0.668915	valid_0's auc: 0.668912
Evaluated only: auc
**********
0.6689119718511017
V33
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[20]	training's auc: 0.680044	valid_0's auc: 0.682395
Evaluated only: auc
**********
0.682395

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[5]	training's auc: 0.650024	valid_0's auc: 0.647008
Evaluated only: auc
**********
0.6470082683138061
V71
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[12]	training's auc: 0.66372	valid_0's auc: 0.664957
Evaluated only: auc
**********
0.6649573027967642
V72
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[12]	training's auc: 0.66536	valid_0's auc: 0.666031
Evaluated only: auc
**********
0.6660306298342316
V73
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	training's auc: 0.676029	valid_0's auc: 0.678028
Evaluated only: auc
**********
0.6780281575668468
V74
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[18]	training's auc: 0.682515	valid_0's auc: 0.68384
Evaluated only: auc
**********
0.683839

Early stopping, best iteration is:
[24]	training's auc: 0.516073	valid_0's auc: 0.519146
Evaluated only: auc
**********
0.5191463999614072
V111
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	training's auc: 0.516982	valid_0's auc: 0.519593
Evaluated only: auc
**********
0.5195934020635697
V112
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	training's auc: 0.517688	valid_0's auc: 0.520571
Evaluated only: auc
**********
0.5205712028432642
V113
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	training's auc: 0.516918	valid_0's auc: 0.519993
Evaluated only: auc
**********
0.5199931104956866
V114
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[4]	training's auc: 0.520781	valid_0's auc: 0.523119
Evaluated only: auc
**********
0.5231193867879536
V115
Training until validation scores don't i

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	training's auc: 0.534316	valid_0's auc: 0.532564
Evaluated only: auc
**********
0.5325640662751391
V152
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[87]	training's auc: 0.535044	valid_0's auc: 0.532889
Evaluated only: auc
**********
0.5328888806172225
V153
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[6]	training's auc: 0.525905	valid_0's auc: 0.523867
Evaluated only: auc
**********
0.5238672070446022
V154
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3]	training's auc: 0.52625	valid_0's auc: 0.524386
Evaluated only: auc
**********
0.5243859219873456
V155
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[48]	training's auc: 0.526419	valid_0's auc: 0.524269
Evaluated only: auc
**********
0.5

Early stopping, best iteration is:
[3]	training's auc: 0.660725	valid_0's auc: 0.660108
Evaluated only: auc
**********
0.6601076488443276
V192
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	training's auc: 0.662591	valid_0's auc: 0.66138
Evaluated only: auc
**********
0.6613804922337043
V193
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[22]	training's auc: 0.66198	valid_0's auc: 0.661267
Evaluated only: auc
**********
0.6612665740511582
V194
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[4]	training's auc: 0.661692	valid_0's auc: 0.660877
Evaluated only: auc
**********
0.6608770548080298
V195
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3]	training's auc: 0.662337	valid_0's auc: 0.661805
Evaluated only: auc
**********
0.6618051148830536
V196
Training until validation scores don't imp

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[18]	training's auc: 0.66338	valid_0's auc: 0.662613
Evaluated only: auc
**********
0.6626133830569098
V233
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[28]	training's auc: 0.662445	valid_0's auc: 0.661803
Evaluated only: auc
**********
0.6618032098932305
V234
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3]	training's auc: 0.664914	valid_0's auc: 0.663844
Evaluated only: auc
**********
0.6638437808917886
V235
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[48]	training's auc: 0.647824	valid_0's auc: 0.647294
Evaluated only: auc
**********
0.6472937765747516
V236
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[23]	training's auc: 0.650874	valid_0's auc: 0.650347
Evaluated only: auc
**********
0

Early stopping, best iteration is:
[101]	training's auc: 0.661335	valid_0's auc: 0.657103
Evaluated only: auc
**********
0.6571028835394394
V273
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[90]	training's auc: 0.661575	valid_0's auc: 0.660954
Evaluated only: auc
**********
0.6609538309110353
V274
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[4]	training's auc: 0.663872	valid_0's auc: 0.663123
Evaluated only: auc
**********
0.663123357396676
V275
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[44]	training's auc: 0.663045	valid_0's auc: 0.662282
Evaluated only: auc
**********
0.6622819921134466
V276
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[22]	training's auc: 0.648954	valid_0's auc: 0.648036
Evaluated only: auc
**********
0.6480359957017832
V277
Training until validation scores don'

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[91]	training's auc: 0.57234	valid_0's auc: 0.563655
Evaluated only: auc
**********
0.5636551493008097
V314
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[84]	training's auc: 0.567817	valid_0's auc: 0.562744
Evaluated only: auc
**********
0.5627437839967967
V315
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[114]	training's auc: 0.575289	valid_0's auc: 0.570412
Evaluated only: auc
**********
0.57041172208751
V316
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[45]	training's auc: 0.568346	valid_0's auc: 0.57239
Evaluated only: auc
**********
0.572389680122861
V317
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[17]	training's auc: 0.60789	valid_0's auc: 0.613328
Evaluated only: auc
**********
0.61

Early stopping, best iteration is:
[2]	training's auc: 0.524524	valid_0's auc: 0.524213
Evaluated only: auc
**********
0.5242127642610708
id_15
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.668781	valid_0's auc: 0.667841
Evaluated only: auc
**********
0.6678412641746413
id_16
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.651361	valid_0's auc: 0.650077
Evaluated only: auc
**********
0.6500770721910619
id_17
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[84]	training's auc: 0.673586	valid_0's auc: 0.673732
Evaluated only: auc
**********
0.6737321987239012
id_18
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[19]	training's auc: 0.555859	valid_0's auc: 0.551536
Evaluated only: auc
**********
0.5515355915128651
id_19
Training until validation scores d

In [8]:
X_train = X_train[useful_cols]
X_valid = X_valid[useful_cols]

In [9]:
dtrain = lgb.Dataset(X_train, y_train)
dvalid = lgb.Dataset(X_valid, y_valid, reference=dtrain)
params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'metric': 'auc',
    # 'metric': 'None',  # 用自定义评估函数是将metric设置为'None'
    'learning_rate': 0.1,
    'num_leaves': 31,
    'lambda_l1': 0,
    'lambda_l2': 1,
    'num_threads': 23,
    'min_data_in_leaf': 20,
    'first_metric_only': True,
    'is_unbalance': True,
    'max_depth': -1,
    'seed': 2020
}
valid_model = lgb.train(
    params,
    dtrain,
    valid_sets=[dtrain, dvalid],
    num_boost_round=1000000,
    early_stopping_rounds=200,
    verbose_eval=300
)
pred = valid_model.predict(test)
sub = pd.DataFrame({'id': range(len(test))})
sub['isFraud'] = pred
sub.to_csv('../sub/basline.csv', index=False, header=None)

Training until validation scores don't improve for 200 rounds
[300]	training's auc: 0.978573	valid_1's auc: 0.946165
[600]	training's auc: 0.991448	valid_1's auc: 0.953023
[900]	training's auc: 0.996447	valid_1's auc: 0.956686
[1200]	training's auc: 0.99837	valid_1's auc: 0.958197
[1500]	training's auc: 0.999275	valid_1's auc: 0.95968
[1800]	training's auc: 0.999699	valid_1's auc: 0.960455
[2100]	training's auc: 0.999865	valid_1's auc: 0.961002
[2400]	training's auc: 0.999947	valid_1's auc: 0.961269
[2700]	training's auc: 0.99998	valid_1's auc: 0.961845
[3000]	training's auc: 0.999993	valid_1's auc: 0.962234
Early stopping, best iteration is:
[2951]	training's auc: 0.999992	valid_1's auc: 0.962304
Evaluated only: auc


LightGBMError: The number of features in data (432) is not the same as it was in training data (383).