In [1]:
import gc
import os
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [2]:
# Read the raw train and test transaction tables from the input directory.
# Unpacking drives `map`, so the files are read in order: train first, then test.
train_transaction, test_transaction = map(
    pd.read_csv,
    ('../input/train_transaction.csv', '../input/test_transaction.csv'),
)

In [3]:
# Preview the first five training rows to sanity-check the parsed columns.
train_transaction.head(n=5)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,0,0,5069957,440.95,W,13809,583.0,150.0,visa,226.0,...,,,,,,,,,,
1,1,0,14773564,460.0,W,12695,490.0,150.0,visa,226.0,...,,,,,,,,,,
2,2,0,9207277,49.0,W,12695,490.0,150.0,visa,226.0,...,,,,,,,,,,
3,3,0,3874678,35.95,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,4,0,6225517,112.99,W,17399,111.0,150.0,mastercard,224.0,...,,,,,,,,,,


In [4]:
# Summarise the training table: number of rows and columns, dtype breakdown
# and approximate memory footprint.
train_transaction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 472432 entries, 0 to 472431
Columns: 394 entries, TransactionID to V339
dtypes: float64(376), int64(4), object(14)
memory usage: 1.4+ GB


In [5]:
# Same summary for the test table, for comparison with the training table
# (presumably it lacks only the 'isFraud' target column - verify against the output).
test_transaction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118108 entries, 0 to 118107
Columns: 393 entries, TransactionID to V339
dtypes: float64(376), int64(3), object(14)
memory usage: 354.1+ MB


In [6]:
# Keep only the non-object (numeric) columns of each table; the object-typed
# columns are dropped from this point on.
train_transaction_num, test_transaction_num = (
    frame.select_dtypes(exclude=['object'])
    for frame in (train_transaction, test_transaction)
)

# The full-width frames are no longer needed: drop the references and ask the
# garbage collector to reclaim the memory right away.
del train_transaction, test_transaction
gc.collect()

125

In [7]:
# Baseline: gradient-boosted trees (LightGBM) on the numeric transaction columns.

# --- Features / target -------------------------------------------------------
# Guarded and non-inplace so this cell can be re-run: once 'isFraud' has been
# removed, the original in-place drop raised KeyError on a second execution.
if 'isFraud' in train_transaction_num.columns:
    y = train_transaction_num['isFraud']
    train_transaction_num = train_transaction_num.drop(columns=['isFraud', 'TransactionID'])
    test_transaction_num = test_transaction_num.drop(columns=['TransactionID'])

# Give the test frame exactly the training columns, in the same order.
used_cols = train_transaction_num.columns
test_transaction_num = test_transaction_num[used_cols]

# --- Hold-out split ----------------------------------------------------------
# Random split with scikit-learn's default hold-out size; seeded for reproducibility.
X_train, X_valid, y_train, y_valid = train_test_split(train_transaction_num, y, random_state=2020)

# --- Training ----------------------------------------------------------------
dtrain = lgb.Dataset(X_train, y_train)
# `reference=dtrain` makes the validation set reuse the training set's binning.
dvalid = lgb.Dataset(X_valid, y_valid, reference=dtrain)
params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'metric': 'auc',
    # 'metric': 'None',  # set metric to 'None' when using a custom evaluation function
    'learning_rate': 0.1,
    'num_leaves': 31,
    'lambda_l1': 0,
    'lambda_l2': 1,
    'num_threads': 23,
    'min_data_in_leaf': 20,
    'first_metric_only': True,
    'is_unbalance': True,
    'max_depth': -1,
    'seed': 2020
}
# The round budget is deliberately huge: training is expected to end through
# early stopping (no validation improvement for 200 rounds), not by using it up.
# NOTE(review): the `early_stopping_rounds` / `verbose_eval` keyword arguments
# were removed in LightGBM 4.0 in favour of callbacks; this call targets the
# older API that the notebook was run with.
valid_model = lgb.train(
    params,
    dtrain,
    valid_sets=[dtrain, dvalid],
    num_boost_round=1000000,
    early_stopping_rounds=200,
    verbose_eval=300
)

# --- Prediction / submission file --------------------------------------------
pred = valid_model.predict(test_transaction_num)
# 'id' is the row position in the test frame, not the TransactionID.
sub = pd.DataFrame({'id': range(len(test_transaction_num))})
sub['isFraud'] = pred
# Create the output directory first so a missing folder cannot fail the write
# after the (long) training run has already finished.
os.makedirs('../sub', exist_ok=True)
# header=False is the documented spelling of "no header row"; the file written
# is the same as with the previous header=None.
sub.to_csv('../sub/basline.csv', index=False, header=False)

Training until validation scores don't improve for 200 rounds
[300]	training's auc: 0.97499	valid_1's auc: 0.9412
[600]	training's auc: 0.989607	valid_1's auc: 0.949198
[900]	training's auc: 0.994745	valid_1's auc: 0.951356
[1200]	training's auc: 0.997136	valid_1's auc: 0.952414
[1500]	training's auc: 0.998371	valid_1's auc: 0.952847
[1800]	training's auc: 0.999055	valid_1's auc: 0.953045
Early stopping, best iteration is:
[1750]	training's auc: 0.998984	valid_1's auc: 0.953101
Evaluated only: auc
