In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, StratifiedKFold, TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [17]:
# モデル
import lightgbm as lgb

# データ

In [38]:
# Directory holding the competition CSV files.
folder_path = './ieee-fraud-detection'

# Training split: transaction rows, left-joined with identity rows on TransactionID
# so every transaction is kept even when it has no identity record.
train_transaction = pd.read_csv(f'{folder_path}/train_transaction.csv')
train_identity = pd.read_csv(f'{folder_path}/train_identity.csv')
train = train_transaction.merge(train_identity, on='TransactionID', how='left')

# Test split, joined the same way.
test_transaction = pd.read_csv(f'{folder_path}/test_transaction.csv')
test_identity = pd.read_csv(f'{folder_path}/test_identity.csv')
test = test_transaction.merge(test_identity, on='TransactionID', how='left')

# Submission template.
sample_sub = pd.read_csv(f'{folder_path}/sample_submission.csv')

In [5]:
# Use TransactionID as the row index of both merged frames.
# NOTE(review): the previews recorded further down show TransactionID as an
# ordinary column, which suggests this cell was not run in the session that
# produced them — confirm whether the index change is still wanted.
train = train.set_index('TransactionID')
test = test.set_index('TransactionID')

## 小さいデータで

In [42]:
# Columns used for the small-scale experiment, selected by name.
# Positional slicing (iloc[:, :16] / iloc[:, :15]) silently picks a different
# column set depending on whether TransactionID is currently a column or the
# index, so the selection is spelled out explicitly instead.
SAMPLE_FEATURES = [
    'TransactionID', 'TransactionDT', 'TransactionAmt', 'ProductCD',
    'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
    'addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain',
]


def _sample_sorted(df, columns, frac=0.01, random_state=0):
    """Return a reproducible random sample of ``df`` ordered by TransactionID.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with TransactionID either as a column or as the index.
    columns : list of str
        Columns to keep, in the order they should appear in the result.
    frac : float
        Fraction of rows to draw.
    random_state : int
        Seed passed to ``DataFrame.sample``.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, sorted by TransactionID, restricted to ``columns``.
    """
    if df.index.name == 'TransactionID':
        # Bring the id back as a regular column so both notebook states
        # (index set or not) yield the same frame.
        df = df.reset_index()
    sampled = df.sample(frac=frac, random_state=random_state).sort_values('TransactionID')
    return sampled[columns]


# The training sample additionally carries the isFraud target right after the id.
train_random = _sample_sorted(train, ['TransactionID', 'isFraud'] + SAMPLE_FEATURES[1:])
test_random = _sample_sorted(test, SAMPLE_FEATURES)

In [43]:
# Split the label off the sampled frame: `target` keeps the isFraud values as an
# array and `train_random` keeps the remaining columns.
target = train_random['isFraud'].to_numpy()
train_random = train_random.drop(columns='isFraud')

In [44]:
# Preview the first rows of the sampled training frame.
train_random.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain
31,2987031,86998,363.89,W,6573,583.0,150.0,visa,226.0,credit,315.0,87.0,13.0,,yahoo.com
57,2987057,87445,50.0,H,11839,490.0,150.0,visa,226.0,debit,204.0,87.0,,,gmail.com
271,2987271,90635,50.0,R,9596,369.0,150.0,mastercard,224.0,debit,184.0,87.0,,,comcast.net
433,2987433,93676,100.0,R,17364,399.0,150.0,american express,150.0,credit,299.0,87.0,,,anonymous.com
477,2987477,94463,100.0,W,3570,512.0,150.0,visa,226.0,debit,299.0,87.0,,,yahoo.com


In [45]:
# Preview the first rows of the sampled test frame.
test_random.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain
168,3663717,18407633,88.5,W,15377,555.0,150.0,visa,226.0,debit,264.0,87.0,14.0,,anonymous.com
284,3663833,18412314,107.95,W,7919,194.0,150.0,mastercard,166.0,debit,476.0,87.0,,,yahoo.com
327,3663876,18414254,335.0,W,7585,553.0,150.0,visa,226.0,credit,191.0,87.0,,,anonymous.com
524,3664073,18443207,35.95,W,5033,269.0,150.0,mastercard,224.0,debit,264.0,87.0,1.0,,yahoo.com
634,3664183,18451810,25.95,W,18109,543.0,150.0,mastercard,224.0,debit,264.0,87.0,,,gmail.com


In [27]:
# (rows, columns) of the sampled training frame.
train_random.shape

(5905, 14)

In [28]:
# (rows, columns) of the sampled test frame.
test_random.shape

(5067, 14)

# 欠損値処理

In [46]:
def missing_table(df):
    """Summarise missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns: ``"missing"`` (count of
        null entries) and ``"%"`` (that count as a percentage of ``len(df)``).
    """
    n_missing = df.isna().sum()
    pct_missing = 100 * n_missing / len(df)
    # `keys` labels the two series directly as the output columns.
    return pd.concat([n_missing, pct_missing], axis=1, keys=["missing", "%"])

# Missing-value counts and percentages for the sampled training frame.
missing_table(train_random)

Unnamed: 0,missing,%
TransactionID,0,0.0
TransactionDT,0,0.0
TransactionAmt,0,0.0
ProductCD,0,0.0
card1,0,0.0
card2,91,1.541067
card3,18,0.304826
card4,19,0.321761
card5,47,0.795936
card6,18,0.304826


In [47]:
# Placeholder imputation for now: every missing value in both frames becomes 0
# (this also puts an integer 0 into the string-valued columns).
train_random, test_random = (
    frame.fillna(0) for frame in (train_random, test_random)
)

In [48]:
# Re-check the training frame after filling: every count should now be 0.
missing_table(train_random)

Unnamed: 0,missing,%
TransactionID,0,0.0
TransactionDT,0,0.0
TransactionAmt,0,0.0
ProductCD,0,0.0
card1,0,0.0
card2,0,0.0
card3,0,0.0
card4,0,0.0
card5,0,0.0
card6,0,0.0


# カテゴリ変数変換

In [32]:
# Columns that are label-encoded as categorical features.
categorical_list = [
    'ProductCD',
    'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
    'addr1', 'addr2',
    'P_emaildomain',
]

In [62]:
# Replace each categorical column by integer codes. The encoder is fitted on the
# string form of the train and test values together, so both frames share one
# code table and no test value is unseen at transform time.
for col in categorical_list:
    train_labels = train_random[col].astype(str).tolist()
    test_labels = test_random[col].astype(str).tolist()
    lbl = LabelEncoder().fit(train_labels + test_labels)
    train_random[col] = lbl.transform(train_labels)
    test_random[col] = lbl.transform(test_labels)

In [63]:
# Preview the training frame after label encoding.
train_random.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain
31,2987031,86998,363.89,4,2100,439,22,4,46,1,42,10,13.0,0.0,42
57,2987057,87445,50.0,1,289,355,22,4,46,2,20,10,0.0,0.0,55
271,2987271,90635,50.0,2,2560,246,22,3,45,2,15,10,0.0,0.0,52
433,2987433,93676,100.0,2,1217,273,22,1,23,1,38,10,0.0,0.0,4
477,2987477,94463,100.0,4,1644,376,22,4,46,2,38,10,0.0,0.0,42


# cv

In [50]:
# Fold setup on the sampled frames (switch to the full `train` for the real analysis).
n_train, n_test = len(train_random), len(test_random)
seed = 0
n_fold = 5
# Ordered splitter: each fold validates on rows that come after its training rows.
kf = TimeSeriesSplit(n_splits=n_fold)

In [36]:
# LightGBM hyper-parameters, unpacked into the classifier in the CV loop.
# NOTE(review): LightGBM's parameter docs state that bagging_fraction only takes
# effect when bagging_freq is non-zero; no bagging_freq is set here — confirm intent.
params = dict(
    num_leaves=41,
    min_child_weight=0.03454472573214212,
    feature_fraction=0.3797454081646243,
    bagging_fraction=0.4181193142567742,
    min_data_in_leaf=10,
    objective='binary',
    max_depth=-1,  # -1 = no depth limit
    learning_rate=0.0883242363721497,
    boosting_type='gbdt',
    bagging_seed=11,
    metric='auc',
    verbosity=-1,
    reg_alpha=0.3899927210061127,
    reg_lambda=0.6485237330340494,
    random_state=47,
)

In [67]:
# Out-of-fold (OOF) predictions for the training rows and fold-averaged
# predictions for the test rows.
#
# oof_train starts as NaN rather than 0: an ordered splitter such as
# TimeSeriesSplit never puts the earliest rows into a validation fold, so those
# rows receive no prediction. NaN keeps them distinguishable from a genuine
# 0.0 score in the saved file.
oof_train = np.full(n_train, np.nan)
oof_test_i = np.zeros((n_test, n_fold))  # one column of test predictions per fold

for i, (train_index, valid_index) in enumerate(kf.split(train_random, target)):
    X_train = train_random.iloc[train_index]
    y_train = target[train_index]
    X_valid = train_random.iloc[valid_index]
    y_valid = target[valid_index]

    # A native lgb.train variant with early stopping (10000 rounds,
    # early_stopping_rounds=500, train and validation folds as valid_sets) was
    # previously kept here as disabled code.

    # n_estimators is the scikit-learn wrapper's own name for the number of
    # boosting rounds (num_boost_round is only an alias for it).
    clf = lgb.LGBMClassifier(**params, n_estimators=1000)
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_valid)[:, 1]  # probability of the positive class

    oof_train[valid_index] = y_pred
    oof_test_i[:, i] = clf.predict_proba(test_random)[:, 1]

# Average the per-fold test predictions into a single score per test row.
oof_test = oof_test_i.mean(axis=1)
oof_train = pd.DataFrame(oof_train)
oof_test = pd.DataFrame(oof_test)

oof_train.to_csv('oof_train_random.csv')
oof_test.to_csv('oof_test_random.csv')