## Test for Feature Eng - LightGBM

In [8]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import datetime
import missingno as msno
import lightgbm as lgb
import xgboost as xgb
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import KFold, TimeSeriesSplit, train_test_split,StratifiedKFold
import gc
from statistics import mean

# Any results you write to the current directory are saved as output.

### Setup Cross Validation
1. Divide the train set into folds (for each fold: a training subset plus a held-out local validation subset).
2. Define the validation metric (in our case, ROC-AUC).
3. Stop training when the validation metric stops improving.
4. Take the average of each fold's predictions for the test set.

In [9]:
# Load the feature-engineered train / test frames.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you trust.
train_full = pd.read_pickle('data/train_feat.pkl')
test_full = pd.read_pickle('data/test_feat.pkl')

# Integer-encode every column that is string-typed (dtype 'object') in either frame.
# The encoder is fitted on the union of train and test values so that both frames
# share a single mapping; missing values get their own explicit label first.
MISSING_LABEL = 'unseen_before_label'
for col in test_full.columns:
    if not (train_full[col].dtype == 'object' or test_full[col].dtype == 'object'):
        continue  # numeric column: handled by the -999 fill below

    train_full[col] = train_full[col].fillna(MISSING_LABEL)
    test_full[col] = test_full[col].fillna(MISSING_LABEL)

    train_values = list(train_full[col].values)
    test_values = list(test_full[col].values)
    encoder = preprocessing.LabelEncoder().fit(train_values + test_values)

    train_full[col] = encoder.transform(train_values)
    test_full[col] = encoder.transform(test_values)

# Whatever is still missing lives in numeric columns; mark it with a sentinel value.
train_full = train_full.fillna(-999)
test_full = test_full.fillna(-999)

In [10]:
# Columns that must not be fed to the model: the row identifier, the raw
# transaction timestamp and the target itself.
rm_cols = ['TransactionID', 'TransactionDT', 'isFraud']

# Final features: every remaining train column, in its original order.
features_columns = [name for name in train_full.columns if name not in rm_cols]

In [11]:
# Split the training frame into the feature matrix and the target vector.
X, y = train_full[features_columns], train_full['isFraud']

# The full frame is no longer referenced below; drop the name and run the collector.
del train_full
gc.collect()

54

In [12]:
# LightGBM configuration passed to lgb.train for every fold.
params = dict(
    # --- task ---
    objective='binary',
    boosting_type='gbdt',
    metric='auc',
    n_jobs=-1,
    # --- tree shape / learning speed ---
    learning_rate=0.01,
    # 256 leaves == 2**max_depth, so this does not restrict the trees beyond the
    # depth cap below; lower it to actually regularise.
    num_leaves=256,
    max_depth=8,  # keep depth constrained (-1 would mean unconstrained)
    tree_learner='serial',
    # --- column / row subsampling ---
    colsample_bytree=0.7,
    subsample_freq=1,
    subsample=0.7,
    # --- boosting length: high ceiling, early stopping picks the actual length ---
    n_estimators=10000,
    max_bin=255,
    verbose=-1,
    seed=2019,
    early_stopping_rounds=100,
)

In [None]:
NFOLDS = 5

# Stratified, shuffled folds with a fixed seed.
# NOTE(review): shuffling ignores row order. If the rows are time-ordered
# (there is a TransactionDT column), each validation fold is interleaved with its
# training data in time, which may make the validation AUC optimistic — confirm
# against the time-based split below.
folds = StratifiedKFold(n_splits=NFOLDS, random_state=123, shuffle=True) # split by stratified folds
# folds = TimeSeriesSplit(n_splits=NFOLDS) # split by time

aucs = []   # best validation AUC of every fold
clfs = []   # optionally filled with the per-fold boosters (see the loop)

# Slice the test features once instead of once per fold: the selection is
# loop-invariant, and a missing column now fails here rather than after the
# first (long) training run.
X_test = test_full[features_columns]
pred_len = len(X_test)
prediction = np.zeros(pred_len)  # running SUM of fold predictions; averaged below

for fold, (trn_idx, test_idx) in enumerate(folds.split(X, y)):
    print('Training on fold {}'.format(fold + 1))

    trn_data = lgb.Dataset(data=X.iloc[trn_idx], label=y.iloc[trn_idx])
    val_data = lgb.Dataset(data=X.iloc[test_idx], label=y.iloc[test_idx])

    # valid_sets order matters for the names used below: the training set is
    # reported as "training" and the held-out fold as "valid_1".
    clf = lgb.train(params,
                    trn_data,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=200)

    fold_auc = clf.best_score['valid_1']['auc']
    print('AUC for fold {}: {}'.format(fold + 1, fold_auc))
    aucs.append(fold_auc)
    # clfs.append(clf)  # uncomment to keep every fold's booster for later inspection

    prediction += clf.predict(X_test)

print("Cross Validation AUC: ", sum(aucs)/NFOLDS)

# Average of the per-fold test predictions; this (not `prediction`) is the
# probability-scale output.
final_predictions = prediction/NFOLDS

Training on fold 1




Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.955017	valid_1's auc: 0.932698
[400]	training's auc: 0.982734	valid_1's auc: 0.954984
[600]	training's auc: 0.992764	valid_1's auc: 0.963048
[800]	training's auc: 0.996815	valid_1's auc: 0.967098
[1000]	training's auc: 0.99849	valid_1's auc: 0.969431
[1200]	training's auc: 0.999288	valid_1's auc: 0.971062
[1400]	training's auc: 0.99968	valid_1's auc: 0.972209
[1600]	training's auc: 0.999863	valid_1's auc: 0.973113
[1800]	training's auc: 0.999942	valid_1's auc: 0.973793
[2000]	training's auc: 0.999976	valid_1's auc: 0.974383
[2200]	training's auc: 0.999991	valid_1's auc: 0.974652
[2400]	training's auc: 0.999997	valid_1's auc: 0.974979
[2600]	training's auc: 0.999999	valid_1's auc: 0.975283
[2800]	training's auc: 1	valid_1's auc: 0.975492
[3000]	training's auc: 1	valid_1's auc: 0.975617
[3200]	training's auc: 1	valid_1's auc: 0.975861
[3400]	training's auc: 1	valid_1's auc: 0.975933
Early stopping, bes

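It looks like this model is overfitting the training set: training AUC reaches 1.0 and validation AUC is almost 0.98, whereas a realistic leaderboard score for a good model is 0.94-0.96. Is there data leakage somewhere? One candidate to check: the folds are shuffled (`StratifiedKFold(shuffle=True)`), so if the rows are time-ordered, every validation fold is interleaved in time with its training data; re-running with the time-based split (`TimeSeriesSplit`) would show whether the validation score drops toward the leaderboard range.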

In [None]:
# Top-50 feature importances. `clf` is whatever booster the cross-validation loop
# assigned last, so this shows a single fold's model, not an average over folds.
# (Per-fold plots would require the boosters to be collected in `clfs`, which the
# loop currently leaves empty.)
fig, ax = plt.subplots(figsize=(15, 20))
lgb.plot_importance(clf, ax=ax, max_num_features=50)

In [None]:
# Build the submission file from the sample template (indexed by TransactionID).
sample_submission = pd.read_csv('data/sample_submission.csv', index_col='TransactionID')

# Use the fold-AVERAGED predictions. `prediction` is the running sum over all
# folds, so writing it directly would submit scores up to NFOLDS instead of
# probabilities in [0, 1].
sample_submission['isFraud'] = final_predictions
sample_submission.to_csv('data/lightgbm_FE.csv')