In [1]:
# %matplotlib inline

import pandas as pd
import numpy as np
import catboost as cb
import warnings

from tqdm import tqdm_notebook as tqdm
from matplotlib import pyplot as plt

from sklearn.preprocessing import scale
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

# from bayes_opt import BayesianOptimization
# from bayes_opt.observer import JSONLogger
# from bayes_opt.event import Events
# from bayes_opt.util import load_logs

# Relative directory prefixes: inputs are read from the first, outputs written under the second.
input_path, output_path = "../input/", "../output/"

In [2]:
# Show the installed CatBoost version so the environment is recorded alongside the results.
cb.__version__

'0.13.1'

In [None]:
# Training set: keep the label apart from the feature columns.
train_df = pd.read_csv(input_path + 'train.csv.zip')
label = train_df['target']
train = train_df.drop(columns=['ID_code', 'target'])

# Full test set without its identifier column.
test = pd.read_csv(input_path + 'test.csv.zip').drop(columns=['ID_code'])

# Separately pickled subset of test rows, restricted to the training feature columns
# (and put in the same column order as `train`).
test_filtered = pd.read_pickle(input_path + 'test_filtered.pkl').loc[:, train.columns]

# Train rows stacked on top of the filtered test rows, under a fresh 0..n-1 index.
train_test = pd.concat([train, test_filtered], ignore_index=True)

In [None]:
vcs_train = {}
vcs_test = {}

# One frequency table per feature column, computed over the combined
# train + filtered-test frame: distinct value -> number of occurrences.
vcs_train_test = {
    feature: train_test[feature].value_counts()  # /300000
    for feature in tqdm(train.columns)
}

In [None]:
def feature_generator(df, n_vars=200):
    """Add value-frequency features for ``var_0`` .. ``var_{n_vars-1}`` to ``df`` in place.

    For every base column the occurrence count of each value is looked up in
    the module-level ``vcs_train_test`` tables, and four columns are written:

    * ``<col>_train_test_sum_vcs``       - the occurrence count (0 when the
      value is absent from the table),
    * ``<col>_train_test_sum_vcs_prod``  - raw value multiplied by that count,
    * ``<col>_train_test_sum_vcs_minus`` - standardized value minus standardized count,
    * ``<col>_train_test_sum_vcs_plus``  - standardized value plus standardized count.

    Standardization is done with ``sklearn.preprocessing.scale`` over the rows
    of ``df`` itself, so the scaled features depend on which frame is passed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``var_i`` columns; it is modified in place.
    n_vars : int, default 200
        Number of ``var_i`` columns to process.

    Returns
    -------
    None
    """
    for i in tqdm(range(n_vars)):
        col = "var_" + str(i)
        vtraintest = vcs_train_test[col]
        # Series.map gives NaN for values missing from the table, which the
        # fillna(0) then turns into a zero count. Label-indexing the table
        # with unseen values (the previous approach) raises KeyError on
        # current pandas instead of producing NaN.
        t = df[col].map(vtraintest).fillna(0).values

        # Each standardization is computed once and reused for both features.
        scaled_value = scale(df[col])
        scaled_count = scale(t)

        df[col + '_train_test_sum_vcs'] = t
        df[col + '_train_test_sum_vcs_prod'] = df[col] * t
        df[col + '_train_test_sum_vcs_minus'] = scaled_value - scaled_count
        df[col + '_train_test_sum_vcs_plus'] = scaled_value + scaled_count
        # Variants that were tried and left disabled: sign, div, min, max, pow, log.
        
# Extend both frames in place with the count-based features (the function returns nothing).
for frame in (train, test):
    feature_generator(frame)

In [None]:
# CatBoost settings: GPU training, Logloss objective, AUC as the evaluation metric.
params = {
    'task_type': 'GPU',
    'iterations': 20000,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'random_seed': 4242,
    'learning_rate': 0.03,
    'l2_leaf_reg': 3.0,
    'bagging_temperature': 1,
    'random_strength': 1,
    'depth': 4,
    'border_count': 128,
}

# With shuffle=False the folds are already deterministic, so no random_state is
# passed: current scikit-learn raises ValueError when random_state is set while
# shuffle=False (older releases silently ignored it). The folds are unchanged.
folds = StratifiedKFold(n_splits=5, shuffle=False)

# Lower this (e.g. to 1) for a quick partial run; the scoring and the test
# averaging below only use the folds that were actually trained.
folds_to_run = folds.n_splits

oof = np.zeros(len(train))          # out-of-fold probabilities for the training rows
predictions = np.zeros(len(test))   # summed, then averaged, test probabilities
gains = []                          # per-fold feature importances
scored_idx = []                     # validation indices of the folds that were trained

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, label.values)):
    print("Fold {}".format(fold_))
    trn_data = cb.Pool(train.iloc[trn_idx], label=label.iloc[trn_idx])
    val_data = cb.Pool(train.iloc[val_idx], label=label.iloc[val_idx])
    clf = cb.CatBoostClassifier(**params)
    clf.fit(trn_data, eval_set=val_data, use_best_model=True, verbose=500, early_stopping_rounds=300)

    oof[val_idx] = clf.predict_proba(train.iloc[val_idx])[:, 1]
    predictions += clf.predict_proba(test)[:, 1]
    gains.append(clf.get_feature_importance(trn_data))
    scored_idx.append(val_idx)

    # Stop at the end of the iteration so that val_idx / trn_data / clf keep
    # referring to the last fold that was really trained.
    if fold_ + 1 >= folds_to_run:
        break

# Average the test predictions over the folds that were trained.
predictions /= max(len(scored_idx), 1)

# Score only rows that received an out-of-fold prediction; rows belonging to
# untrained folds would otherwise enter the AUC with a prediction of zero.
scored_idx = np.concatenate(scored_idx)
print("CV score: {:<8.5f}".format(roc_auc_score(label.iloc[scored_idx], oof[scored_idx])))

In [None]:
# oof0 = oof[val_idx]
# print("CV score: {:<8.5f}".format(roc_auc_score(label[val_idx], oof0)))
# CV score: 0.91542 

In [None]:
# oof1 = oof[val_idx]
# print("CV score: {:<8.5f}".format(roc_auc_score(label[val_idx], oof1)))
# CV score: 0.91506 

In [None]:
# oof2 = oof[val_idx]
# print("CV score: {:<8.5f}".format(roc_auc_score(label[val_idx], oof2)))
# CV score: 0.91526  

In [None]:
# oof3 = oof[val_idx]
# print("CV score: {:<8.5f}".format(roc_auc_score(label[val_idx], oof3)))
# CV score: 0.91521

In [None]:
# Score the average of four single-fold runs (oof0..oof3) on the current validation fold.
# Those arrays are only assigned by the commented-out capture cells visible above, so on
# a fresh kernel they do not exist; check for them instead of failing with a NameError.
seed_runs = [globals().get(run_name) for run_name in ('oof0', 'oof1', 'oof2', 'oof3')]
if any(run is None for run in seed_runs):
    print("Blend skipped: oof0..oof3 are not all defined (capture them from separate runs first).")
else:
    print("CV score: {:<8.5f}".format(roc_auc_score(label[val_idx], sum(seed_runs) / len(seed_runs))))

In [None]:
# Manually recorded score (presumably CatBoost's "bestTest" output - confirm);
# an earlier recorded value was 0.9145798087.
bestTest = 0.9152134657

In [None]:
# Let pandas print up to 200 rows so the per-feature tables below are not truncated.
pd.set_option('display.max_rows', 200)

In [None]:
# Mean feature importance over however many folds were collected in `gains`.
# The previous hard-coded divisor of 10 did not follow the number of folds,
# and an empty `gains` silently produced an all-zero table; NaN is shown
# instead when nothing was collected.
fold_gains = np.asarray(gains, dtype=float).reshape(len(gains), len(train.columns))
mean_gain = fold_gains.mean(axis=0) if len(gains) else np.full(len(train.columns), np.nan)
pd.Series(mean_gain, index=train.columns).sort_values(ascending=False).head(200)

In [None]:
# 'Interaction' importances of the most recently fitted model on its training pool.
# Columns 0 and 1 are used as positions into train.columns and replaced by the
# corresponding column names; column 2 is left untouched.
interaction_feature_names = train.columns.values
t = pd.DataFrame(clf.get_feature_importance(trn_data, 'Interaction'))
for side in (0, 1):
    t[side] = interaction_feature_names[t[side]]

In [None]:
# First 100 rows of the interaction table when ordered by column 2, largest first.
t.sort_values(by=2, ascending=False).iloc[:100]

In [None]:
# Print the median of every column of the combined train + filtered-test frame.
for name in train_test.columns:
    column_median = np.median(train_test[name])
    print(name, column_median)

In [None]:
# Load the sample submission as the template for the output file.
# (The path variable was misspelled `intput_path`, which raised a NameError.)
tst_sub = pd.read_csv(input_path+'sample_submission.csv.zip')

In [None]:
# `predictions` is initialised as an all-zero array and only becomes useful once the
# training loop adds test probabilities to it; warn rather than silently writing a
# submission that carries no signal.
if not np.any(predictions):
    warnings.warn("`predictions` is all zeros - the submission will carry no signal; "
                  "run the training loop with test prediction enabled first.")
tst_sub['target'] = predictions

In [None]:
# Write the submission file without the DataFrame index column.
submission_file = output_path + 'sub18.csv'
tst_sub.to_csv(submission_file, index=False)

In [None]:
# Raise the display limit to 1000 rows, then show the last fitted model's
# feature importances, largest first, labelled with the model's feature names.
pd.set_option('display.max_rows', 1000)
importance_by_feature = pd.Series(clf.feature_importances_, index=clf.feature_names_)
importance_by_feature.sort_values(ascending=False)