In [1]:
MAX_ROUNDS = 1000
OPTIMIZE_ROUNDS = True
LEARNING_RATE = 0.07
EARLY_STOPPING_ROUNDS = 50  
# Note: I set EARLY_STOPPING_ROUNDS high so that (when OPTIMIZE_ROUNDS is set)
#       I will get lots of information to make my own judgment.  You should probably
#       reduce EARLY_STOPPING_ROUNDS if you want to do actual early stopping.

In [2]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from numba import jit
import time
import gc



In [3]:
# Compute gini

# from CPMP's kernel https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
@jit
def eval_gini(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / np.float(ntrue * (n - ntrue))
    return gini

In [4]:
# Funcitons from olivier's kernel
# https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = -eval_gini(labels, preds)
    return [('gini', gini_score)]


def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))


def target_encode(trn_series=None,    # Revised to encode validation series
                  val_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index
    ft_val_series = pd.merge(
        val_series.to_frame(val_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=val_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_val_series.index = val_series.index
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_val_series, noise_level), add_noise(ft_tst_series, noise_level)

In [18]:
# Read data
train_df = pd.read_csv('Data/train.csv', na_values="-1") # .iloc[0:200,:]
test_df = pd.read_csv('Data/test.csv', na_values="-1")

In [19]:
# from olivier
train_features = [
    "ps_car_13",  #            : 1571.65 / shadow  609.23
	"ps_reg_03",  #            : 1408.42 / shadow  511.15
	"ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
	"ps_ind_03",  #            : 1219.47 / shadow  230.55
	"ps_ind_15",  #            :  922.18 / shadow  242.00
	"ps_reg_02",  #            :  920.65 / shadow  267.50
	"ps_car_14",  #            :  798.48 / shadow  549.58
	"ps_car_12",  #            :  731.93 / shadow  293.62
	"ps_car_01_cat",  #        :  698.07 / shadow  178.72
	"ps_car_07_cat",  #        :  694.53 / shadow   36.35
	"ps_ind_17_bin",  #        :  620.77 / shadow   23.15
	"ps_car_03_cat",  #        :  611.73 / shadow   50.67
	"ps_reg_01",  #            :  598.60 / shadow  178.57
	"ps_car_15",  #            :  593.35 / shadow  226.43
	"ps_ind_01",  #            :  547.32 / shadow  154.58
	"ps_ind_16_bin",  #        :  475.37 / shadow   34.17
	"ps_ind_07_bin",  #        :  435.28 / shadow   28.92
	"ps_car_06_cat",  #        :  398.02 / shadow  212.43
	"ps_car_04_cat",  #        :  376.87 / shadow   76.98
	"ps_ind_06_bin",  #        :  370.97 / shadow   36.13
	"ps_car_09_cat",  #        :  214.12 / shadow   81.38
	"ps_car_02_cat",  #        :  203.03 / shadow   26.67
	"ps_ind_02_cat",  #        :  189.47 / shadow   65.68
	"ps_car_11",  #            :  173.28 / shadow   76.45
	"ps_car_05_cat",  #        :  172.75 / shadow   62.92
	"ps_calc_09",  #           :  169.13 / shadow  129.72
	"ps_calc_05",  #           :  148.83 / shadow  120.68
	"ps_ind_08_bin",  #        :  140.73 / shadow   27.63
	"ps_car_08_cat",  #        :  120.87 / shadow   28.82
	"ps_ind_09_bin",  #        :  113.92 / shadow   27.05
	"ps_ind_04_cat",  #        :  107.27 / shadow   37.43
	"ps_ind_18_bin",  #        :   77.42 / shadow   25.97
	"ps_ind_12_bin",  #        :   39.67 / shadow   15.52
	"ps_ind_14",  #            :   37.37 / shadow   16.65
]
# add combinations
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat'),
]

In [20]:
#train_df.isnull().sum(axis=0)

In [21]:
# replace missing values with median
columns = ["ps_reg_03", "ps_car_11", "ps_car_12", "ps_car_14"]
#train_df[columns] = train_df[columns].replace(-1, np.NaN)
#test_df[columns] = test_df[columns].replace(-1, np.NaN)
for col in columns:
    med = train_df[col].dropna().median()
    train_df.loc[train_df[col].isnull(), col] = med
    test_df.loc[test_df[col].isnull(), col] = med

In [22]:
from sklearn.preprocessing import LabelEncoder
columns = [f for f in train_df.columns if "_cat" in f]

for col in columns:
    enc = LabelEncoder()
    train_df.loc[train_df[col].isnull(), col] = -1
    test_df.loc[test_df[col].isnull(), col] = -1
    data = pd.concat([train_df, test_df], axis=0, ignore_index=True)
    enc.fit(data[col])
    train_df[col] = enc.transform(train_df[col])
    test_df[col] = enc.transform(test_df[col])
del data
gc.collect()

172

In [23]:
threshol = 500

columns = [f for f in train_features if "_cat" in f]
for col in columns:
    temp = train_df[col]
    density = temp.value_counts().sort_values()
    count = density.iloc[0]
    while count < threshol:
        ix0 = density.index[0]; ix1 = density.index[1]
        train_df.loc[train_df[col] == ix0, col] = ix1
        test_df.loc[test_df[col] == ix0, col] = ix1

        temp = train_df[col]
        density = temp.value_counts().sort_values()
        count = density.iloc[0]

In [24]:
# Process data
id_test = test_df['id'].values
id_train = train_df['id'].values
y = train_df['target']

start = time.time()
for n_c, (f1, f2) in enumerate(combs):
    name1 = f1 + "_plus_" + f2
    print('current feature %60s %4d in %5.1f', (name1, n_c + 1, (time.time() - start) / 60))

    train_df[name1] = train_df[f1].apply(lambda x: str(x)) + "_" + train_df[f2].apply(lambda x: str(x))
    test_df[name1] = test_df[f1].apply(lambda x: str(x)) + "_" + test_df[f2].apply(lambda x: str(x))
    # Label Encode
    lbl = LabelEncoder()
    lbl.fit(list(train_df[name1].values) + list(test_df[name1].values))
    train_df[name1] = lbl.transform(list(train_df[name1].values))
    test_df[name1] = lbl.transform(list(test_df[name1].values))

    train_features.append(name1)
    
X = train_df[train_features]
test_df = test_df[train_features]

f_cats = [f for f in X.columns if "_cat" in f]

('current feature %60s %4d in %5.1f', ('ps_reg_01_plus_ps_car_02_cat', 1, 4.9996376037597654e-05))
('current feature %60s %4d in %5.1f', ('ps_reg_01_plus_ps_car_04_cat', 2, 0.15304999748865764))


In [25]:
data = pd.concat([X, test_df], axis=0).reset_index(drop=True)

In [26]:
for col in f_cats:
    dummy = pd.get_dummies(data[col].astype('category'))
    columns = dummy.columns.astype(str).tolist()
    columns = [col + '_' + w for w in columns]
    dummy.columns = columns
    data = pd.concat((data, dummy), axis=1)
#    data = data.drop(col, axis=1)

In [27]:
X = data.loc[:X.shape[0]-1, :]
test_df = data.loc[X.shape[0]:, :]

In [28]:
# Set up folds
K = 5
kf = KFold(n_splits = K, random_state = 1, shuffle = True)
np.random.seed(0)

In [38]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(C=0.5)

In [65]:
from sklearn.linear_model import Lasso
model = Lasso(alpha = 0.1, normalize = True)

In [66]:
for i, (train_index, test_index) in enumerate(kf.split(train_df)):
    
    # Create data for this fold
    y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
    X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[test_index,:].copy()
    X_test = test_df.copy()
    print( "\nFold ", i)
    
#    Enocode data
    for f in f_cats:
        X_train[f + "_avg"], X_valid[f + "_avg"], X_test[f + "_avg"] = target_encode(
                                                        trn_series=X_train[f],
                                                        val_series=X_valid[f],
                                                        tst_series=X_test[f],
                                                        target=y_train,
                                                        min_samples_leaf=200,
                                                        smoothing=10,
                                                        noise_level=0
                                                        )
    model.fit(X_train.drop(f_cats, axis=1), y_train)
    
    pred = model.predict(X_valid.drop(f_cats, axis=1))
    print( "  Gini = ", eval_gini(y_valid, pred) )
    
    

('\nFold ', 0)
('  Gini = ', 0.007298961510530111)


KeyboardInterrupt: 

In [44]:
pred = model.predict(X_valid.drop(f_cats, axis=1))

In [59]:
np.max(pred)

0.036457644239755213

In [60]:
np.min(pred)

0.036457644239755213

In [46]:
print( "  Gini = ", eval_gini(y_valid, pred) )

('  Gini = ', 0.007298961510530111)


In [14]:
data.shape

(1488028, 193)

In [57]:
from sklearn.decomposition import PCA, FastICA
n_comp = 2

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca_train = pca.fit_transform(data.loc[:X.shape[0]-1, :])
pca_test = pca.transform(data.loc[X.shape[0]:, :])

In [58]:
for i in range(1, n_comp + 1):
    X['pca_' + str(i)] = pca_train[:, i - 1]
    test_df['pca_' + str(i)] = pca_test[:, i - 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [59]:

[f for f in train_features if "_bin" not in f and "_cat" not in f]

['ps_car_13',
 'ps_reg_03',
 'ps_ind_03',
 'ps_ind_15',
 'ps_reg_02',
 'ps_car_14',
 'ps_car_12',
 'ps_reg_01',
 'ps_car_15',
 'ps_ind_01',
 'ps_car_11',
 'ps_calc_09',
 'ps_calc_05',
 'ps_ind_14']

In [60]:
# threshol = 00

# columns = [f for f in train_features if "_cat" in f]
# for col in columns:
#     temp = train_df[col]
#     density = temp.value_counts().sort_values()
#     count = density.iloc[0]
#     while count < threshol:
#         ix0 = density.index[0]; ix1 = density.index[1]
#         train_df.loc[train_df[col] == ix0, col] = ix1
#         test_df.loc[test_df[col] == ix0, col] = ix1

#         temp = train_df[col]
#         density = temp.value_counts().sort_values()
#         count = density.iloc[0]

In [61]:
y_valid_pred = 0*y
y_test_pred = 0

In [62]:
# Set up folds
K = 5
kf = KFold(n_splits = K, random_state = 1, shuffle = True)
np.random.seed(0)

In [63]:
# Set up classifier
model = XGBClassifier(    
                        n_estimators=MAX_ROUNDS,
                        max_depth=4,
                        objective="binary:logistic",
                        learning_rate=LEARNING_RATE, 
                        subsample=.8,
                        min_child_weight=.77,
                        colsample_bytree=.8,
                        scale_pos_weight=1.6,
                        gamma=10,
                        reg_alpha=8,
                        reg_lambda=1.3,
                     )

In [64]:
for i, (train_index, test_index) in enumerate(kf.split(train_df)):
    
    # Create data for this fold
    y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
    X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[test_index,:].copy()
    X_test = test_df.copy()
    print( "\nFold ", i)
    
#    Enocode data
    for f in f_cats:
        X_train[f + "_avg"], X_valid[f + "_avg"], X_test[f + "_avg"] = target_encode(
                                                        trn_series=X_train[f],
                                                        val_series=X_valid[f],
                                                        tst_series=X_test[f],
                                                        target=y_train,
                                                        min_samples_leaf=200,
                                                        smoothing=10,
                                                        noise_level=0
                                                        )
    # Run model for this fold
    if OPTIMIZE_ROUNDS:
        eval_set=[(X_valid,y_valid)]
        fit_model = model.fit( X_train, y_train, 
                               eval_set=eval_set,
                               eval_metric=gini_xgb,
                               early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                               verbose=False
                             )
        print( "  Best N trees = ", model.best_ntree_limit )
        print( "  Best gini = ", model.best_score )
    else:
        fit_model = model.fit( X_train, y_train )
        
    # Generate validation predictions for this fold
    pred = fit_model.predict_proba(X_valid)[:,1]
    print( "  Gini = ", eval_gini(y_valid, pred) )
    y_valid_pred.iloc[test_index] = pred
    
    # Accumulate test set predictions
    y_test_pred += fit_model.predict_proba(X_test)[:,1]
    
    del X_test, X_train, X_valid, y_train
    
y_test_pred /= K  # Average test set predictions

print( "\nGini for full training set:" )
eval_gini(y, y_valid_pred)

('\nFold ', 0)
('  Best N trees = ', 280)
('  Best gini = ', -0.285412)
('  Gini = ', 0.2849827743526754)
('\nFold ', 1)


KeyboardInterrupt: 

In [127]:
for i, (train_index, test_index) in enumerate(kf.split(train_df)):
    
    # Create data for this fold
    y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
    X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[test_index,:].copy()
X_test = test_df.copy()

In [128]:
                            #    Enocode data
                            for f in f_cats:
                                X_train[f + "_avg"], X_valid[f + "_avg"], X_test[f + "_avg"] = target_encode(
                                                        trn_series=X_train[f],
                                                        val_series=X_valid[f],
                                                        tst_series=X_test[f],
                                                        target=y_train,
                                                        min_samples_leaf=200,
                                                        smoothing=10,
                                                        noise_level=0)

In [None]:
nround = 2000
col = ["rate", "depth", "colsample", "child", "weight", "l1", "l2", "score"]
result = pd.DataFrame(columns=col)

for rate in [0.07]:
    for depth in [4]:
        for colsample in [0.6, 0.7, 0.8]:
            for child in [0.7, 0.75, 0.8]:
                for weight in [4]:
                    for L1 in [5, 6, 7, 8, 9]:
                        for L2 in [1, 1.5, 2]:
                            model = XGBClassifier(
                                            n_estimators=1000,
                                            learning_rate = rate, 
                                            max_depth = depth, 
                                            subsample = 0.8, 
                                            colsample_bytree = colsample, 
                                            min_child_weight = child,
                                            reg_lambda = L2,
                                            reg_alpha = L1,
                                            scale_pos_weight = weight,
                                            gamma=10
                                            )

                            eval_set=[(X_valid,y_valid)]
                            fit_model = model.fit( X_train, y_train, 
                                           eval_set=eval_set,
                                           eval_metric=gini_xgb,
                                           early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                                           verbose=False
                                             )
                            print( "  Best N trees = ", model.best_ntree_limit )
                            print( "  Best gini = ", model.best_score )
   
                            
                            result = result.append(pd.DataFrame([[rate, depth, colsample,
                                                                  child, weight, L1, L2, model.best_score]], columns=col))
                            print rate, depth, colsample, child, weight, L1, L2, model.best_score

In [18]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 44)

In [19]:
y_valid_pred = 0*y
y_test_pred = 0

In [21]:
for i, (train_index, test_index) in enumerate(skf.split(train_df, y)):
    
    # Create data for this fold
    y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
    X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[test_index,:].copy()
    X_test = test_df.copy()
    print( "\nFold ", i)
    
    # Enocode data
    for f in f_cats:
        X_train[f + "_avg"], X_valid[f + "_avg"], X_test[f + "_avg"] = target_encode(
                                                        trn_series=X_train[f],
                                                        val_series=X_valid[f],
                                                        tst_series=X_test[f],
                                                        target=y_train,
                                                        min_samples_leaf=200,
                                                        smoothing=10,
                                                        noise_level=0
                                                        )
    # Run model for this fold
    if OPTIMIZE_ROUNDS:
        eval_set=[(X_valid,y_valid)]
        fit_model = model.fit( X_train, y_train, 
                               eval_set=eval_set,
                               eval_metric=gini_xgb,
                               early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                               verbose=False
                             )
        print( "  Best N trees = ", model.best_ntree_limit )
        print( "  Best gini = ", model.best_score )
    else:
        fit_model = model.fit( X_train, y_train )
        
    # Generate validation predictions for this fold
    pred = fit_model.predict_proba(X_valid)[:,1]
    print( "  Gini = ", eval_gini(y_valid, pred) )
    y_valid_pred.iloc[test_index] = pred
    
    # Accumulate test set predictions
    y_test_pred += fit_model.predict_proba(X_test)[:,1]
    
    del X_test, X_train, X_valid, y_train
    
y_test_pred /= K  # Average test set predictions

print( "\nGini for full training set:" )
eval_gini(y, y_valid_pred)

('\nFold ', 0)
('  Gini = ', 0.29105970879009646)
('\nFold ', 1)
('  Gini = ', 0.2857548534153429)
('\nFold ', 2)
('  Gini = ', 0.27806441549074434)
('\nFold ', 3)
('  Gini = ', 0.27359508268171495)
('\nFold ', 4)
('  Gini = ', 0.303203124382325)

Gini for full training set:


0.2862092974058884

In [22]:
K = 10
skf = StratifiedKFold(n_splits = K, shuffle = True, random_state = 44)
y_valid_pred = 0*y
y_test_pred = 0

In [23]:
for i, (train_index, test_index) in enumerate(skf.split(train_df, y)):
    
    # Create data for this fold
    y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
    X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[test_index,:].copy()
    X_test = test_df.copy()
    print( "\nFold ", i)
    
    # Enocode data
    for f in f_cats:
        X_train[f + "_avg"], X_valid[f + "_avg"], X_test[f + "_avg"] = target_encode(
                                                        trn_series=X_train[f],
                                                        val_series=X_valid[f],
                                                        tst_series=X_test[f],
                                                        target=y_train,
                                                        min_samples_leaf=200,
                                                        smoothing=10,
                                                        noise_level=0
                                                        )
    # Run model for this fold
    if OPTIMIZE_ROUNDS:
        eval_set=[(X_valid,y_valid)]
        fit_model = model.fit( X_train, y_train, 
                               eval_set=eval_set,
                               eval_metric=gini_xgb,
                               early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                               verbose=False
                             )
        print( "  Best N trees = ", model.best_ntree_limit )
        print( "  Best gini = ", model.best_score )
    else:
        fit_model = model.fit( X_train, y_train )
        
    # Generate validation predictions for this fold
    pred = fit_model.predict_proba(X_valid)[:,1]
    print( "  Gini = ", eval_gini(y_valid, pred) )
    y_valid_pred.iloc[test_index] = pred
    
    # Accumulate test set predictions
    y_test_pred += fit_model.predict_proba(X_test)[:,1]
    
    del X_test, X_train, X_valid, y_train
    
y_test_pred /= K  # Average test set predictions

print( "\nGini for full training set:" )
eval_gini(y, y_valid_pred)

('\nFold ', 0)
('  Gini = ', 0.30065892703672303)
('\nFold ', 1)
('  Gini = ', 0.2865388484597984)
('\nFold ', 2)
('  Gini = ', 0.29557611078934964)
('\nFold ', 3)
('  Gini = ', 0.2749834316080565)
('\nFold ', 4)
('  Gini = ', 0.27815356009086045)
('\nFold ', 5)
('  Gini = ', 0.273529217320026)
('\nFold ', 6)
('  Gini = ', 0.2735075929153241)
('\nFold ', 7)
('  Gini = ', 0.27845747542326116)
('\nFold ', 8)
('  Gini = ', 0.3129000529356971)
('\nFold ', 9)
('  Gini = ', 0.29299594461383727)

Gini for full training set:


0.28664912799634756

In [27]:
for i, (train_index, test_index) in enumerate(skf.split(train_df, y)):
    
    # Create data for this fold
    y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
    X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[test_index,:].copy()
    X_test = test_df.copy()
    print( "\nFold ", i)
    
    # Enocode data
    for f in f_cats:
        X_train[f + "_avg"], X_valid[f + "_avg"], X_test[f + "_avg"] = target_encode(
                                                        trn_series=X_train[f],
                                                        val_series=X_valid[f],
                                                        tst_series=X_test[f],
                                                        target=y_train,
                                                        min_samples_leaf=200,
                                                        smoothing=10,
                                                        noise_level=0
                                                        )
    # Run model for this fold
    if OPTIMIZE_ROUNDS:
        eval_set=[(X_valid,y_valid)]
        fit_model = model.fit( X_train, y_train, 
                               eval_set=eval_set,
                               eval_metric=gini_xgb,
                               early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                               verbose=False
                             )
        print( "  Best N trees = ", model.best_ntree_limit )
        print( "  Best gini = ", model.best_score )
    else:
        fit_model = model.fit( X_train, y_train )
        
    # Generate validation predictions for this fold
    pred = fit_model.predict_proba(X_valid)[:,1]
    print( "  Gini = ", eval_gini(y_valid, pred) )
    y_valid_pred.iloc[test_index] = pred
    
    # Accumulate test set predictions
    y_test_pred += fit_model.predict_proba(X_test)[:,1]
    
    del X_test, X_train, X_valid, y_train
    
y_test_pred /= K  # Average test set predictions

print( "\nGini for full training set:" )
eval_gini(y, y_valid_pred)

('\nFold ', 0)
('  Best N trees = ', 373)
('  Best gini = ', -0.300782)
('  Gini = ', 0.3003542839658463)
('\nFold ', 1)
('  Best N trees = ', 466)
('  Best gini = ', -0.287845)
('  Gini = ', 0.28728224054798146)
('\nFold ', 2)
('  Best N trees = ', 558)
('  Best gini = ', -0.296807)
('  Gini = ', 0.2959720808936068)
('\nFold ', 3)
('  Best N trees = ', 464)
('  Best gini = ', -0.276277)
('  Gini = ', 0.27583005875913513)
('\nFold ', 4)
('  Best N trees = ', 325)
('  Best gini = ', -0.278552)
('  Gini = ', 0.2782733062367484)
('\nFold ', 5)
('  Best N trees = ', 230)
('  Best gini = ', -0.274973)
('  Gini = ', 0.273671214897964)
('\nFold ', 6)
('  Best N trees = ', 452)
('  Best gini = ', -0.274113)
('  Gini = ', 0.27385768318475356)
('\nFold ', 7)
('  Best N trees = ', 363)
('  Best gini = ', -0.278689)
('  Gini = ', 0.2784367835207695)
('\nFold ', 8)
('  Best N trees = ', 265)
('  Best gini = ', -0.313265)
('  Gini = ', 0.31237338901304645)
('\nFold ', 9)
('  Best N trees = ', 268)
(

0.2868601965716635

In [None]:
from Tkinter import *