In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
import warnings
import xgboost as xgb
warnings.filterwarnings('ignore')

In [2]:
# Competition inputs: labelled train set, test set and the submission template.
DATA_DIR = '../input/santander-customer-transaction-prediction'
df_train = pd.read_csv(DATA_DIR + '/train.csv')
df_test = pd.read_csv(DATA_DIR + '/test.csv')
df_sample = pd.read_csv(DATA_DIR + '/sample_submission.csv')

# Index labels stored in synthetic_samples_indexes.npy
# (presumably the generated/fake test rows -- confirm against that dataset).
synthetic = np.load('../input/list-of-fake-samples-and-public-private-lb-split/synthetic_samples_indexes.npy')

# Test rows whose index is NOT listed in `synthetic`.
# The explicit copy detaches test_real from df_test, so the later in-place
# column drop on test_real no longer operates on a slice of another frame
# (which raised a SettingWithCopyWarning hidden by the global warnings filter).
test_real = df_test.loc[~df_test.index.isin(synthetic)].copy()

In [3]:
# Sanity check: (rows, columns) of the raw train and test frames, one per line.
print(df_train.shape, df_test.shape, sep='\n')

(200000, 202)
(200000, 201)


In [4]:
# Holders for the averaged test predictions and the out-of-fold (OOF) train
# predictions. 'predict' is seeded with float 0.0 (not integer 0) because the
# column later receives probabilities; an integer column would have to be
# silently upcast on the first float write.
prediction = pd.DataFrame({'Id': df_test['ID_code'], 'predict': 0.0})
oof = pd.DataFrame({'Id': df_train['ID_code'], 'predict': 0.0})

# The identifier is not a feature: remove it from every frame used for
# modelling. Re-binding the result (instead of inplace=True) avoids mutating
# test_real in place, which is a row subset taken from df_test.
df_train = df_train.drop(columns=['ID_code'])
df_test = df_test.drop(columns=['ID_code'])
test_real = test_real.drop(columns=['ID_code'])

In [5]:
# Detach the label column from the training frame: `target` keeps the labels,
# df_train keeps only the remaining columns.
target = df_train.pop('target')

In [6]:
# Shapes after removing the ID and target columns, one per line.
print(df_train.shape, df_test.shape, sep='\n')

(200000, 200)
(200000, 200)


In [7]:
def add_fe(col, dfr, dfo):
    """Add a normalised frequency-encoding column for `col` to `dfo`.

    Value counts are taken from `dfr[col]` (NaN counted as its own value) and
    each entry of `dfo[col]` is replaced by its count divided by the largest
    count. Values of `dfo[col]` that never occur in `dfr[col]` become NaN.

    Parameters
    ----------
    col : str
        Name of the column to encode; must exist in both frames.
    dfr : pandas.DataFrame
        Reference frame the counts are computed from.
    dfo : pandas.DataFrame
        Output frame; it is modified in place (column ``col + '_fe'`` is added).

    Returns
    -------
    pandas.DataFrame
        The same `dfo` object that was passed in.
    """
    counts = dfr[col].value_counts(dropna=False)
    highest = counts.max()
    encoded = dfo[col].map(counts) / highest
    dfo[col + '_fe'] = encoded
    return dfo

In [8]:
# Base feature columns (ID/target are filtered defensively).
cs = [col for col in df_train.columns if col not in ['ID_code', 'target']]
train_index = df_train.shape[0]

# `alls` is the frame the value counts are computed on (train + test_real);
# `allo` is the frame that receives the encoded columns (train + full test).
alls = pd.concat([df_train, test_real])
allo = pd.concat([df_train, df_test])
cols = alls.columns
for col in cs:
    add_fe(col, alls, allo)

# Split back positionally into train / test. The explicit copies make both
# frames independent of `allo`: the next step adds more columns to them, and
# doing that on plain slices is chained assignment on a view (it only ran
# quietly because warnings are globally silenced).
df_train = allo[:train_index].copy()
df_test = allo[train_index:].copy()

print(df_train.shape)
print(df_test.shape)

(200000, 400)
(200000, 400)


In [9]:
def mult_with_fe(col, df):
    """Add the product of a column and its frequency encoding to `df`.

    Writes ``df[col] * df[col + '_fe']`` into a new column ``col + '_mult'``.
    `df` is modified in place and the same object is returned.
    """
    raw_values = df[col]
    encoded_values = df[col + '_fe']
    df[col + '_mult'] = raw_values * encoded_values
    return df

# Add the value * frequency-encoding product for every base feature,
# to the train and test frames alike.
for c in cs:
    df_train, df_test = mult_with_fe(c, df_train), mult_with_fe(c, df_test)

# Shapes after the product features were added, one per line.
print(df_train.shape, df_test.shape, sep='\n')

(200000, 600)
(200000, 600)


In [10]:
# These two objects were only needed while building the frequency features.
del synthetic
del test_real

In [11]:
# XGBoost training parameters passed to xgb.train.
params = dict(
    # Task definition and the metric reported on the watchlist.
    objective="binary:logistic",
    eval_metric='auc',
    # Tree booster built with the GPU histogram method.
    booster="gbtree",
    tree_method='gpu_hist',
    # Small step size combined with shallow, heavily constrained trees.
    eta=0.01,
    min_child_weight=50,
    subsample=0.9,
    max_depth=3,
    colsample_bytree=0.7,
    silent=True,
)

In [12]:
# Five shuffled, stratified folds; the fixed random_state keeps the split
# the same across runs.
folds = StratifiedKFold(
    random_state=0,
    shuffle=True,
    n_splits=5,
)

In [13]:
import gc
# Every column of the engineered training frame except ID/target is a model input.
non_features = {'ID_code', 'target'}
cols = [name for name in df_train.columns if name not in non_features]

In [14]:
# Loop-invariant pieces are built once instead of once per fold.
num_round = 100000
test_data = xgb.DMatrix(df_test[cols])

# Fresh accumulators: predictions are collected in plain arrays and written to
# the result frames once at the end. This avoids chained assignment
# (`oof['predict'][val_] = ...`), which may write to a temporary copy, and it
# makes the cell safe to re-run (the old `+=` on prediction['predict'] kept
# adding onto the values left by a previous run).
oof_pred = np.zeros(len(df_train))
test_pred = np.zeros(len(df_test))

for fold_, (trn_, val_) in enumerate(folds.split(df_train, target)):

    print('Fold {}'.format(fold_))
    train_data = xgb.DMatrix(df_train.iloc[trn_][cols], label=target.iloc[trn_])
    val_data = xgb.DMatrix(df_train.iloc[val_][cols], label=target.iloc[val_])
    watchlist = [(train_data, 'train'), (val_data, 'val')]

    # Early stopping monitors the last watchlist entry ('val').
    clfxg = xgb.train(params, train_data, num_round, watchlist, early_stopping_rounds=500, verbose_eval=200)

    # val_ holds row positions, so it indexes the accumulator array directly.
    # val_data is reused for prediction rather than building a second DMatrix
    # from the same rows.
    oof_pred[val_] = clfxg.predict(val_data, ntree_limit=clfxg.best_ntree_limit)

    # Each fold contributes an equal share to the averaged test prediction.
    test_pred += clfxg.predict(test_data, ntree_limit=clfxg.best_ntree_limit) / folds.n_splits
    gc.collect()

oof['predict'] = oof_pred
prediction['predict'] = test_pred

# AUC of the out-of-fold predictions over the whole training set.
print(roc_auc_score(target, oof['predict']))

Fold 0
[0]	train-auc:0.591493	val-auc:0.591579
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 500 rounds.
[200]	train-auc:0.750525	val-auc:0.735907
[400]	train-auc:0.812339	val-auc:0.791587
[600]	train-auc:0.842467	val-auc:0.81824
[800]	train-auc:0.861318	val-auc:0.835025
[1000]	train-auc:0.874564	val-auc:0.84745
[1200]	train-auc:0.884532	val-auc:0.8564
[1400]	train-auc:0.892292	val-auc:0.863204
[1600]	train-auc:0.898698	val-auc:0.86869
[1800]	train-auc:0.904039	val-auc:0.873288
[2000]	train-auc:0.908661	val-auc:0.877159
[2200]	train-auc:0.912525	val-auc:0.880486
[2400]	train-auc:0.915926	val-auc:0.883342
[2600]	train-auc:0.919013	val-auc:0.885803
[2800]	train-auc:0.921652	val-auc:0.887923
[3000]	train-auc:0.923973	val-auc:0.889852
[3200]	train-auc:0.926028	val-auc:0.891506
[3400]	train-auc:0.928028	val-auc:0.893223
[3600]	train-auc:0.929882	val-auc:0.894682
[3800]	train-auc:0.931535	val-auc:0.895995
[4000

[15400]	train-auc:0.966942	val-auc:0.912759
[15600]	train-auc:0.967269	val-auc:0.912748
[15800]	train-auc:0.967587	val-auc:0.912765
[16000]	train-auc:0.967893	val-auc:0.912782
[16200]	train-auc:0.96821	val-auc:0.912785
[16400]	train-auc:0.968522	val-auc:0.912821
[16600]	train-auc:0.968829	val-auc:0.912832
[16800]	train-auc:0.969125	val-auc:0.912842
[17000]	train-auc:0.969431	val-auc:0.912857
[17200]	train-auc:0.969708	val-auc:0.912873
[17400]	train-auc:0.970028	val-auc:0.912874
[17600]	train-auc:0.970327	val-auc:0.912924
[17800]	train-auc:0.970618	val-auc:0.91297
[18000]	train-auc:0.970916	val-auc:0.912985
[18200]	train-auc:0.971191	val-auc:0.913002
[18400]	train-auc:0.971478	val-auc:0.913027
[18600]	train-auc:0.971764	val-auc:0.913042
[18800]	train-auc:0.972043	val-auc:0.913079
[19000]	train-auc:0.972297	val-auc:0.913077
[19200]	train-auc:0.97257	val-auc:0.913099
[19400]	train-auc:0.972858	val-auc:0.913058
[19600]	train-auc:0.973119	val-auc:0.913061
Stopping. Best iteration:
[19209]	t

[12200]	train-auc:0.960694	val-auc:0.917605
[12400]	train-auc:0.961085	val-auc:0.917658
[12600]	train-auc:0.961438	val-auc:0.917739
[12800]	train-auc:0.961791	val-auc:0.917781
[13000]	train-auc:0.962172	val-auc:0.917851
[13200]	train-auc:0.962514	val-auc:0.917884
[13400]	train-auc:0.962866	val-auc:0.917906
[13600]	train-auc:0.963223	val-auc:0.917945
[13800]	train-auc:0.963567	val-auc:0.917967
[14000]	train-auc:0.963904	val-auc:0.918017
[14200]	train-auc:0.964244	val-auc:0.918044
[14400]	train-auc:0.964582	val-auc:0.918052
[14600]	train-auc:0.964929	val-auc:0.918103
[14800]	train-auc:0.96525	val-auc:0.918128
[15000]	train-auc:0.965569	val-auc:0.91814
[15200]	train-auc:0.965886	val-auc:0.918181
[15400]	train-auc:0.966202	val-auc:0.918224
[15600]	train-auc:0.966513	val-auc:0.91826
[15800]	train-auc:0.966825	val-auc:0.918294
[16000]	train-auc:0.967134	val-auc:0.918308
[16200]	train-auc:0.967439	val-auc:0.918338
[16400]	train-auc:0.967733	val-auc:0.918359
[16600]	train-auc:0.968038	val-auc:

In [15]:
# Persist the results: out-of-fold train predictions and the submission file
# (the submission template with its 'target' column filled in).
submission_file = 'subtryXGB10.csv'
oof_file = 'oof.csv'

df_sample['target'] = prediction['predict']
oof.to_csv(oof_file, index=False)
df_sample.to_csv(submission_file, index=False)