In [1]:
import numpy as np
import pandas as pd
from scipy import interp
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score, auc, accuracy_score
import matplotlib.pyplot as plt
import xgboost as xgb
import pickle

In [2]:
# Read the pickled training and test frames from disk.
train_df, test_df = map(pd.read_pickle,
                        ('../../data/train.pkl', '../../data/test.pkl'))

# Split the training frame into the feature matrix and the target column.
X_train = train_df.drop('isFraud', axis=1)
y_train = train_df['isFraud'].copy()


In [3]:
def dedupe_df(df):
    """Return ``df`` without repeated column labels.

    For a DataFrame, only the first column carrying each label is kept
    (``Index.duplicated`` marks every later occurrence). A Series has a
    single column of values and no ``columns`` attribute, so it is returned
    unchanged instead of raising ``AttributeError``.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Object to de-duplicate.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        A column-filtered view/copy for a DataFrame; the same object for a
        Series.
    """
    if isinstance(df, pd.Series):
        # Nothing to de-duplicate: a Series cannot hold repeated columns.
        return df
    return df.loc[:, ~df.columns.duplicated()]
# Strip repeated column labels from the features, the target and the test set.
X_train, y_train, test_df = map(dedupe_df, (X_train, y_train, test_df))

In [4]:
# Show the feature matrix shape, keep only the first column for each label,
# then show the shape again so any dropped columns are visible.
print("X_train shape: ",X_train.shape)
_first_occurrence = ~X_train.columns.duplicated()
X_train = X_train.loc[:, _first_occurrence]
print("X_train shape: ",X_train.shape)
# The bare string below is never executed; being the cell's last expression,
# it is only echoed back as the cell output.
'''
print("test_df shape: ",test_df.shape)
test_df = test_df.loc[:,~test_df.columns.duplicated()]
print("test_df shape: ",test_df.shape)'''

X_train shape:  (590540, 449)
X_train shape:  (590540, 449)


'\nprint("test_df shape: ",test_df.shape)\ntest_df = test_df.loc[:,~test_df.columns.duplicated()]\nprint("test_df shape: ",test_df.shape)'

In [5]:
# Baseline XGBoost classifier. The grid search below overrides the
# hyper-parameters named in `param_grid` for each candidate it evaluates.
clf = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=2,
    tree_method='gpu_hist'
)

# Heavier configuration kept for reference (not used below):
#   n_estimators=2000, max_depth=9, learning_rate=0.05, subsample=0.9,
#   colsample_bytree=0.9, random_state=2019, tree_method='gpu_hist'
#   (disabled extras: gamma=1, eval_method='auc',
#    scale_pos_weight=float(1/np.mean(y_train)))

# Shared stratified splitter. The per-fold ROC loop later in the notebook
# calls `cv.split(X_train, y_train)`, so `cv` has to exist on a fresh kernel.
# A fixed random_state keeps the shuffled folds reproducible between runs.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)

# Grid Search
param_grid = {'n_estimators':[50,100],
              'max_depth':[4,8],
              'learning_rate':[.005,.05]}

clf_grid = GridSearchCV(estimator = clf,
                        param_grid = param_grid,
                        # Rank candidates by ROC AUC, the metric this notebook
                        # reports per fold, instead of the default accuracy.
                        scoring = 'roc_auc',
                        # n_jobs=-1 starts one worker per CPU core, each with
                        # its own copy of a training fold and all sharing the
                        # GPU; the recorded fit failed with MemoryError, most
                        # likely for that reason. Run candidates sequentially.
                        n_jobs = 1,
                        cv = cv)

In [6]:
# Run the grid search on the training features and target.
# NOTE(review): the output recorded for this cell is a MemoryError raised
# while allocating a float32 array, so no fitted search exists from that run.
clf_grid.fit(X_train,y_train)

MemoryError: Unable to allocate array with shape (212121519,) and data type float32

In [7]:
# X_train and y_train were built from train_df via drop()/copy(), so the
# original frame can be released.
del train_df

GridSearchCV	 StratifiedKFold	 X_train	 accuracy_score	 auc	 clf	 clf_grid	 cross_val_score	 dedupe_df	 
interp	 np	 param_grid	 pd	 pickle	 plt	 preprocessing	 roc_auc_score	 roc_curve	 
test_df	 train_df	 xgb	 y_train	 


In [None]:
%%time
# Per-fold accumulators for the held-out part of each split:
# interpolated TPR curves and the matching AUC values.
tprs, aucs = [], []
# The same accumulators for the training part of each split.
tprs_train, aucs_train = [], []
# Per-fold accuracy on the held-out and training parts.
test_accuracy_scores, train_accuracy_scores = [], []
# Common false-positive-rate grid (1000 evenly spaced points on [0, 1]) onto
# which every fold's ROC curve is interpolated.
mean_fpr = np.linspace(start=0, stop=1, num=1000)

# roccin function generates the data for the ROC curve.
def roccin(y_train,y_pred,mean_fpr,tprs,aucs):
    """Compute one ROC curve and record it in the running accumulators.

    Parameters
    ----------
    y_train : array-like
        True labels, passed to ``roc_curve`` as its first argument.
    y_pred : array-like
        Predicted scores, passed to ``roc_curve`` as its second argument.
    mean_fpr : numpy.ndarray
        False-positive-rate grid at which the TPR curve is interpolated.
    tprs : list
        Accumulator; the interpolated TPR array is appended in place.
    aucs : list
        Accumulator; the AUC of this curve is appended in place.

    Returns
    -------
    tuple
        ``(fpr, tpr, tprs, roc_auc, aucs)`` where ``fpr``/``tpr`` come from
        ``roc_curve``, ``roc_auc`` is ``auc(fpr, tpr)`` and ``tprs``/``aucs``
        are the same list objects that were passed in.
    """
    fpr, tpr, _ = roc_curve(y_train, y_pred)
    # Use numpy's interpolation directly: the `scipy.interp` name imported at
    # the top of the file is only a deprecated alias of it, and recent SciPy
    # releases no longer export it.
    interp_tpr = np.interp(mean_fpr, fpr, tpr)
    # Pin the first grid point to 0 so every stored curve starts at TPR = 0.
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    return fpr, tpr, tprs, roc_auc, aucs

# Cross-validated ROC analysis: refit `clf` on every split produced by
# `cv.split`, score both the held-out and the training part, and overlay the
# held-out ROC curve of each fold.
# NOTE(review): `cv` must already be bound to a splitter exposing
# `.split(X, y)` (e.g. a StratifiedKFold instance) before this cell runs.
for i, (train, test) in enumerate(cv.split(X_train, y_train)):
    # Slice each part of the fold once; indexing with an index array builds a
    # new object, so reusing the slices avoids repeating those copies for
    # every fit/predict call.
    X_fold_train, y_fold_train = X_train.iloc[train], y_train.iloc[train]
    X_fold_test, y_fold_test = X_train.iloc[test], y_train.iloc[test]

    clf.fit(X_fold_train, y_fold_train)
    # Column 1 of predict_proba is used as the score for the ROC curves.
    y_pred = clf.predict_proba(X_fold_test)[:,1]
    y_pred_train = clf.predict_proba(X_fold_train)[:,1]
    y_pred_binary = clf.predict(X_fold_test)
    y_pred_train_binary = clf.predict(X_fold_train)

    # Test AUC curve
    fpr, tpr, tprs, roc_auc, aucs = roccin(y_fold_test,y_pred,mean_fpr,tprs,aucs)
    test_accuracy_scores.append(accuracy_score(y_fold_test,y_pred_binary))

    #Train AUC curve
    fpr_train, tpr_train, tprs_train, roc_auc_train, aucs_train = roccin(y_fold_train,
                                                                         y_pred_train,mean_fpr,
                                                                         tprs_train,aucs_train)
    train_accuracy_scores.append(accuracy_score(y_fold_train,y_pred_train_binary))

    #Print the ROC plot
    print("Fold {} complete.".format(i))
    plt.plot(fpr, tpr, lw=1, alpha=0.3,
             label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

# Diagonal reference line.
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Chance', alpha=.8)

# Mean held-out ROC curve across folds on the common FPR grid; the last point
# is pinned to 1 so the mean curve ends at TPR = 1.
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

# Shade one standard deviation around the mean curve, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Refit the classifier on the complete training set, then score the test set.
clf.fit(X_train, y_train)
# Select the test columns by the training column labels so the model receives
# the same features in the same order it was fitted on. A training column
# missing from test_df fails here with a KeyError. test_df itself is left
# untouched, so the cell can be re-run safely.
y_submit = clf.predict_proba(test_df[X_train.columns])
print(y_submit)

In [None]:
# Fill the sample submission (indexed by TransactionID) with column 1 of
# y_submit and write it to the submissions folder.
# NOTE(review): values are assigned by position, so the rows scored in
# y_submit are assumed to be in the same order as the sample file — confirm.
sample_submission = pd.read_csv(
    '../../data/sample_submission.csv', index_col='TransactionID'
).assign(isFraud=y_submit[:, 1])
sample_submission.to_csv('../../submissions/xgboost_40s_params_08_12_19.csv')


In [None]:
# save the model to disk, together with the cross-validation AUC summary
filename = '../../runs/xgb_encodings_40s_params_08_12_19'
# Context managers make sure both files are flushed and closed, even if
# pickling or writing raises.
with open(filename+'.model', 'wb') as model_file:
    pickle.dump(clf, model_file)
auc_file = filename + '.results'
with open(auc_file,'w') as file2:
    file2.write("mean_auc_score: "+str(mean_auc))
    file2.write("\nstd_auc_score: "+str(std_auc))