In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc
import warnings
# Silence every warning for the rest of the session (blanket filter).
warnings.filterwarnings('ignore')

# Lift the pandas display truncation limits so frames are shown in full.
# The fully-qualified 'display.*' keys are required: on pandas >= 1.4 the
# short names 'max_columns' / 'max_rows' also match the 'styler.render.*'
# options, and set_option raises OptionError when a pattern is ambiguous.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Load the four raw CSV tables: one transaction table and one identity table
# for each of the train and test splits.
train_transaction, train_identity = (
    pd.read_csv('../input/train_transaction.csv'),
    pd.read_csv('../input/train_identity.csv'),
)
test_transaction, test_identity = (
    pd.read_csv('../input/test_transaction.csv'),
    pd.read_csv('../input/test_identity.csv'),
)

In [3]:
# Attach identity columns to each transaction table. A left join on
# TransactionID keeps every transaction row, including those that have no
# matching identity row.
train = pd.merge(train_transaction, train_identity, how='left', on='TransactionID')
test = pd.merge(test_transaction, test_identity, how='left', on='TransactionID')

# Stack the two splits row-wise into one frame with a fresh 0..n-1 index so
# that later preprocessing is applied to both splits at once.
data = pd.concat((train, test), ignore_index=True)

# The merged per-split frames are only intermediates; drop the names and ask
# the garbage collector to reclaim them.
del train, test
gc.collect()

125

In [4]:
# Names of the columns treated as categorical, kept in three groups.
object_cols = [
    'ProductCD',
    'card4', 'card6',
    'DeviceType', 'DeviceInfo',
    'P_emaildomain', 'R_emaildomain',
]

# 'M1' through 'M9'.
M_cols = ['M' + str(n) for n in range(1, 10)]

id_cols = [
    'id_12', 'id_16', 'id_27', 'id_28', 'id_29',
    'id_35', 'id_36', 'id_37', 'id_38', 'id_15',
    'id_23', 'id_34', 'id_30', 'id_31', 'id_33',
]

# Full categorical column list: the three groups concatenated in order.
cat_cols = [*object_cols, *M_cols, *id_cols]

In [5]:
# Replace each categorical column with integer codes. Values are cast to str
# first, so missing values become the literal string 'nan' and receive a code
# of their own. A fresh encoder is fitted per column on the combined
# train+test frame.
for col in cat_cols:
    as_text = data[col].astype(str)
    data[col] = LabelEncoder().fit_transform(as_text)

In [6]:
# Split the combined frame back into its two parts: rows with a non-null
# isFraud label form the training set, rows with a null label the test set.
is_labeled = data['isFraud'].notnull()

# Target vector for the labelled rows (same index as `train`).
y = data.loc[is_labeled, 'isFraud']

# Build the feature frames without mutating in place: `drop(columns=...)`
# returns a new frame, which avoids calling an in-place drop on the result of
# a boolean-mask selection (SettingWithCopyWarning) and keeps this cell
# re-runnable.
train = data[is_labeled].drop(columns=['isFraud', 'TransactionID'])
test = data[~is_labeled].drop(columns=['isFraud', 'TransactionID'])

# Ensure the test frame exposes exactly the training columns, in the same order.
used_cols = train.columns
test = test[used_cols]

# Hold out a validation subset of the labelled rows using train_test_split's
# default split size; the fixed random_state makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    train, y, random_state=2020
)

# Wrap both subsets as LightGBM datasets. The validation set is created with
# the training set as its reference.
dtrain = lgb.Dataset(data=X_train, label=y_train)
dvalid = lgb.Dataset(data=X_valid, label=y_valid, reference=dtrain)
# LightGBM training parameters: binary objective, GBDT boosting, AUC as the
# evaluation metric.
params = dict(
    objective='binary',
    boosting='gbdt',
    metric='auc',
    # metric='None',  # when using a custom evaluation function, set metric to 'None'
    learning_rate=0.1,
    num_leaves=31,
    lambda_l1=0,
    lambda_l2=1,
    num_threads=23,  # hard-coded thread count; presumably sized for the author's machine
    min_data_in_leaf=20,
    first_metric_only=True,
    is_unbalance=True,
    max_depth=-1,
    seed=2020,
)
# Train with early stopping, logging the evaluation metric every 300 rounds.
#
# Early stopping and logging are configured through callbacks rather than the
# `early_stopping_rounds=` / `verbose_eval=` keyword arguments: those keywords
# were removed from lgb.train in LightGBM 4.0 (TypeError), whereas the
# callback form is accepted by both older and newer releases.
# The logging callback is named `log_evaluation` in current releases and
# `print_evaluation` in older ones, so pick whichever this install provides.
_log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

valid_model = lgb.train(
    params,
    dtrain,
    # Effectively "no limit": training is expected to end via early stopping.
    num_boost_round=1000000,
    # The training set is listed too, so its metric is reported beside the
    # validation metric in the log.
    valid_sets=[dtrain, dvalid],
    callbacks=[
        # Stop once the validation score has not improved for 200 rounds.
        # first_metric_only is forwarded explicitly because the callback does
        # not read it from `params` on its own.
        lgb.early_stopping(
            stopping_rounds=200,
            first_metric_only=params.get('first_metric_only', False),
        ),
        _log_evaluation(300),
    ],
)
# Score the unlabeled rows with the trained model.
pred = valid_model.predict(test)

# Submission table: a 0-based running row id next to the predicted score.
sub = pd.DataFrame({'id': range(len(test)), 'isFraud': pred})

# Written without an index column and without a header row.
sub.to_csv('../sub/basline.csv', index=False, header=None)

Training until validation scores don't improve for 200 rounds
[300]	training's auc: 0.979346	valid_1's auc: 0.947942
[600]	training's auc: 0.992149	valid_1's auc: 0.955309
[900]	training's auc: 0.996666	valid_1's auc: 0.958141
[1200]	training's auc: 0.998485	valid_1's auc: 0.959828
[1500]	training's auc: 0.999324	valid_1's auc: 0.960452
Early stopping, best iteration is:
[1523]	training's auc: 0.999353	valid_1's auc: 0.960544
Evaluated only: auc
