## Cross Validation Test

In [14]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import datetime
import missingno as msno
import lightgbm as lgb
import xgboost as xgb
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import KFold, TimeSeriesSplit, train_test_split,StratifiedKFold
import gc
from statistics import mean

# Any results you write to the current directory are saved as output.

### Setup Cross Validation
1. Divide the train set into subsets (training set itself + local test set)
2. Define Validation Metric (in our case it is ROC-AUC)
3. Stop training when Validation metric stops improving
4. Take average of each fold's prediction for the Local Test set.

In [4]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int16/32/64 and float16/32/64 columns should be shrunk.
        Columns of any other dtype are left untouched.
    verbose : bool, default True
        When true, print the resulting memory footprint and the reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.

    Notes
    -----
    Only the column's min/max are compared against the target dtype limits, so
    a cast to float16/float32 can round values even though the range fits.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first; the first one whose range fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's min/max still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range does not fit a narrower float (or min/max are NaN/inf).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid a NaN percentage when the frame reports zero bytes up front.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [7]:
# Load the pickled train/test frames and replace every missing value with the
# sentinel -999 in one pass per frame.
# NOTE(review): pickles execute code on load; only read files you produced yourself.
# NOTE(review): the recorded output of this cell shows `reduce_mem_usage` messages,
# but the cell does not call it any more; confirm whether that call was dropped on purpose.
train_full, test_full = (
    pd.read_pickle(pickle_path).fillna(-999)
    for pickle_path in ('data/train_full.pkl', 'data/test_full.pkl')
)

Mem. usage decreased to 650.48 Mb (66.6% reduction)
Mem. usage decreased to 565.37 Mb (66.2% reduction)


In [10]:
# Label Encoding
# Every column that is object-typed in either frame is encoded with one
# LabelEncoder fitted on the union of train and test values, so both frames
# share the same integer codes.
for f in test_full.columns:
    if train_full[f].dtype == 'object' or test_full[f].dtype == 'object':
        # The frames were already filled with the integer sentinel -999 when they
        # were loaded, so an object column may hold both strings and ints.
        # LabelEncoder needs uniformly typed input, hence the cast to str
        # (fillna first, so any remaining NaN gets the explicit label).
        train_values = train_full[f].fillna('unseen_before_label').astype(str)
        test_values = test_full[f].fillna('unseen_before_label').astype(str)
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_values.values) + list(test_values.values))
        train_full[f] = lbl.transform(train_values.values)
        test_full[f] = lbl.transform(test_values.values)

In [None]:
# train_full.to_pickle('train_full_min.pkl')
# test_full.to_pickle('test_full_min.pkl')

In [11]:
# Columns excluded from the model inputs
rm_cols = [
    'TransactionID', 'TransactionDT',  # treated as pure noise for now
    'isFraud',                         # the target must not be a feature
    'DT_M',                            # column that was used to simulate a test set
]

# Final features: every train column not listed above, in original column order
features_columns = [name for name in train_full.columns if name not in rm_cols]

In [12]:
# Target and feature matrix for cross-validation
y = train_full['isFraud']
X = train_full.loc[:, features_columns]

# Test-side inputs: number of rows to predict and the DMatrix used by every fold
pred_len = test_full.shape[0]
X_test = xgb.DMatrix(test_full[features_columns])

# The full frames are no longer needed; release them before training
del train_full, test_full
gc.collect()

258

In [13]:
# Booster settings handed to xgb.train for every fold.
# NOTE(review): `n_estimators` and `missing` look like scikit-learn-wrapper
# arguments; confirm xgb.train actually consumes them. The DMatrix objects in
# this notebook are built without missing=-999, and the round count is given
# separately via num_boost_round.
params = dict(
    n_estimators=500,
    max_depth=9,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    missing=-999,
    random_state=2019,
    tree_method='gpu_hist',  # GPU tree builder ('gpu_exact' was the alternative tried)
    eval_metric='auc',
)


In [None]:
# Stratified K-fold cross-validation: train one booster per fold with early
# stopping on the held-out fold, and accumulate each booster's test predictions.
NFOLDS = 5
folds = StratifiedKFold(n_splits=NFOLDS, random_state=123, shuffle=True)

aucs = []                          # best validation AUC of every fold
clfs = []                          # trained booster of every fold (used by the importance plot)
prediction = np.zeros(pred_len)    # running SUM of test predictions over folds

for fold, (trn_idx, test_idx) in enumerate(folds.split(X, y)):
    print('Training on fold {}'.format(fold + 1))

    trn_data = xgb.DMatrix(data=X.iloc[trn_idx], label=y.iloc[trn_idx])
    val_data = xgb.DMatrix(data=X.iloc[test_idx], label=y.iloc[test_idx])
    clf = xgb.train(
        params,
        trn_data,
        num_boost_round=10000,
        evals=[(trn_data, 'train'), (val_data, 'valid')],
        verbose_eval=1000,
        early_stopping_rounds=500,
    )

    print('AUC for fold {}: {}'.format(fold + 1, clf.best_score))

    aucs.append(clf.best_score)
    # Keep the booster: the feature-importance cell reads clfs[0].
    clfs.append(clf)

    # NOTE(review): depending on the xgboost version, predict() may use all
    # boosted rounds rather than only those up to the best iteration; confirm
    # and restrict the prediction to the best iteration if needed.
    prediction += clf.predict(X_test)

print("Cross Validation AUC: ", sum(aucs)/NFOLDS)
# Average over folds; `prediction` itself stays the un-normalised sum.
final_predictions = prediction/NFOLDS


Training on fold 1


  if getattr(data, 'base', None) is not None and \


[0]	train-auc:0.810665	valid-auc:0.808429
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 500 rounds.
[1000]	train-auc:0.987365	valid-auc:0.954668
[2000]	train-auc:0.996761	valid-auc:0.962394
[3000]	train-auc:0.999238	valid-auc:0.962218
Stopping. Best iteration:
[2504]	train-auc:0.99835	valid-auc:0.963055



In [None]:
# Plot the 50 most important features of the first fold's booster.
# `clfs` is empty unless the CV loop stores each fold's booster; in that case
# fall back to the last trained booster `clf` instead of raising IndexError.
booster = clfs[0] if clfs else clf
fig, ax = plt.subplots(figsize=(15, 20))
xgb.plot_importance(booster, max_num_features=50, ax=ax)
# for i in range(NFOLDS):
#     fig, ax = plt.subplots(figsize=(15, 20))
#     xgb.plot_importance(clfs[i],max_num_features=50,ax=ax)

In [None]:
# Write the submission file, indexed by TransactionID like the sample file.
sample_submission = pd.read_csv('data/sample_submission.csv', index_col='TransactionID')
# `prediction` is the sum over folds; the fold average is `final_predictions`.
sample_submission['isFraud'] = final_predictions
sample_submission.to_csv('data/xgboost_cv.csv')

### Feature Engineering
- Build time-of-day, day-of-week, and month features (D9 is already a time-of-day feature, but because it has a lot of NAs, it is better to create these from the timedelta variable)
- Hour and TransactionPerHour

In [None]:
# # Time dependent features
# # https://www.kaggle.com/fchmiel/day-and-time-powerful-predictive-feature
# train_full['Transaction_day_of_week'] = np.floor((train_full['TransactionDT'] / (3600 * 24) - 1) % 7)
# test_full['Transaction_day_of_week'] = np.floor((test_full['TransactionDT'] / (3600 * 24) - 1) % 7)
# train_full['Transaction_hour'] = np.floor(train_full['TransactionDT'] / 3600) % 24
# test_full['Transaction_hour'] = np.floor(test_full['TransactionDT'] / 3600) % 24