In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from category_encoders import TargetEncoder
from xgboost import XGBRegressor

In [50]:
train = pd.read_csv('data/challenge3_train.csv', index_col='id')
test = pd.read_csv('data/challenge3_test.csv', index_col='id')
full_data = pd.concat([train, test]).drop(columns=['target'])

In [51]:
features_bin = ['f0', 'f26'] # binary
features_nom_low = ['f5'] # nominal low cardinality (<=3)
features_nom_high = ['f12', 'f28'] # >= 26
features_ord_low = ['f1', 'f2', 'f3', 'f6', 'f9', 'f11', 'f13', 'f15', 'f17', 'f18', 'f19', 'f20', 'f21', 'f23', 'f24', 'f25', 'f27', 'f29']
features_ord_num = ['f1', 'f2', 'f3', 'f6', 'f9', 'f11', 'f13', 'f17', 'f18', 'f19', 'f21', 'f23', 'f24', 'f25', 'f27', 'f29']
features_ord_high = ['f4', 'f10', 'f14']
features_real = ['f7', 'f8', 'f16', 'f22', 'f30'] # real numbers.


features_ord_alph = ['f15', 'f20']
ohe_columns = features_ord_alph + features_nom_low

In [52]:
# Cleanup. This is doing data leakage though, when using information from test set..
# Fix 0 value noise, change to most common value in column. 
impute_0_columns = ['f3', 'f18', 'f21']

for column in impute_0_columns:
    train.loc[train[column] == 0, column] = train[column].mode() # mode() finds most common value
    test.loc[test[column] == 0, column] = test[column].mode()

# remove -1 from 'month' column f11
train.loc[train['f11'] == -1, 'f11'] = train['f11'].mode()
test.loc[test['f11'] == -1, 'f11'] = test['f11'].mode()

In [53]:
# Encoding
# Alphabetic -> numbers (a=1, b=2 etc). f15, f20
for col in features_ord_alph:
    ord_order_dict = {i : j for j, i in enumerate(sorted(list(set(list(train[col].dropna().unique()) + list(test[col].dropna().unique())))))}
    train[f'{col}_en'] = train[col].fillna('NULL').map(ord_order_dict)
    test[f'{col}_en'] = test[col].fillna('NULL').map(ord_order_dict)

In [54]:
# One hot encoding
train = pd.get_dummies(train, columns=ohe_columns + features_nom_high)
test = pd.get_dummies(test, columns=ohe_columns)

In [55]:
# add cyclical feature (f11 - month)
def cyc_enc(df, col, max_vals):
    df[col + '_sin'] = np.sin(2 * np.pi * df[col]/max_vals)
    df[col + '_cos'] = np.cos(2 * np.pi * df[col]/max_vals)
    return df
train = cyc_enc(train, 'f11', 11)
test = cyc_enc(test, 'f11', 11)

In [56]:
# split train set into a train and test set
X = train.drop(['target'], axis=1)
y = train['target']

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [57]:
%%time

my_model = XGBRegressor()

# fit new df to model
my_model.fit(X_train, y_train, verbose=False)
# generate predictions
predictions = my_model.predict(X_test)
#calculate score
base_score = roc_auc_score(y_test, predictions)


CPU times: user 2min 20s, sys: 553 ms, total: 2min 20s
Wall time: 43.5 s


In [58]:
print(base_score)
# 0.9047146110315014 - label encoding: f12, f28 , ohe: f5, f15, f20
# 0.911410624661415 - ohe: f12, f28, f5, f15, f20
# 0.9109295163814883 - ohe ^ + ord alph
# 0.905340674295565 - ohe ^ + fiks 0
# 0.8991073318616627 - ohe ^ + cyclical
# 0.9119622346380591 - ohe ^ + fiks 0 + cyclical <-- best
# 0.9058041067624335 - ohe ^ + alpha + fiks 0 + cyclical

0.9058041067624335


21271

In [47]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from xgboost import plot_importance
from sklearn.metrics import make_scorer
from hyperopt import fmin, hp, tpe
import gc

import time
def objective(params):
    time1 = time.time()
    params = {
        'max_depth': int(params['max_depth']),
        'gamma': "{:.3f}".format(params['gamma']),
        'subsample': "{:.2f}".format(params['subsample']),
        'reg_alpha': "{:.3f}".format(params['reg_alpha']),
        'reg_lambda': "{:.3f}".format(params['reg_lambda']),
        'learning_rate': "{:.3f}".format(params['learning_rate']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
    }
    

    print("\n############## New Run ################")
    print(f"params = {params}")
    FOLDS = 10
    count=1
    kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

    # tss = TimeSeriesSplit(n_splits=FOLDS)
    y_preds = np.zeros(test.shape[0])
    # y_oof = np.zeros(X_train.shape[0])
    score_mean = 0
    scal_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])
    for tr_idx, val_idx in kf.split(X_train, y_train):
        clf = XGBRegressor(
            n_estimators=100, random_state=4, 
            scale_pos_weight=scal_pos_weight,
            **params
        )

        X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]
        y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]
        
        clf.fit(X_tr, y_tr)
        #y_pred_train = clf.predict_proba(X_vl)[:,1]
        #print(y_pred_train)
        predictions = clf.predict(X_vl)
        #calculate score
        score = roc_auc_score(y_vl, predictions)
        score_mean += score
        print(f'{count} CV - score: {round(score, 4)}')
        count += 1
    time2 = time.time() - time1
    print(f"Total Time Run: {round(time2 / 60,2)}")
    gc.collect()
    print(f'Mean ROC_AUC: {score_mean / FOLDS}')
    del X_tr, X_vl, y_tr, y_vl, clf, score
    
    return -(score_mean / FOLDS)

space = {
    # The maximum depth of a tree, same as GBM.
    # Used to control over-fitting as higher depth will allow model 
    # to learn relations very specific to a particular sample.
    # Should be tuned using CV.
    # Typical values: 3-10
    'max_depth': hp.quniform('max_depth', 2, 8, 1),
    
    # reg_alpha: L1 regularization term. L1 regularization encourages sparsity 
    # (meaning pulling weights to 0). It can be more useful when the objective
    # is logistic regression since you might need help with feature selection.
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
    
    # reg_lambda: L2 regularization term. L2 encourages smaller weights, this
    # approach can be more useful in tree-models where zeroing 
    # features might not make much sense.
    'reg_lambda': hp.uniform('reg_lambda', 0.01, .4),
    
    # eta: Analogous to learning rate in GBM
    # Makes the model more robust by shrinking the weights on each step
    # Typical final values to be used: 0.01-0.2
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.15),
    
    # colsample_bytree: Similar to max_features in GBM. Denotes the 
    # fraction of columns to be randomly samples for each tree.
    # Typical values: 0.5-1
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1),
    
    # A node is split only when the resulting split gives a positive
    # reduction in the loss function. Gamma specifies the 
    # minimum loss reduction required to make a split.
    # Makes the algorithm conservative. The values can vary depending on the loss function and should be tuned.
    'gamma': hp.uniform('gamma', 0.01, .7),
    
    # specifies the minimum samples per leaf node.
    # the minimum number of samples (data) to group into a leaf. 
    # The parameter can greatly assist with overfitting: larger sample
    # sizes per leaf will reduce overfitting (but may lead to under-fitting).
    'min_child_samples': hp.choice('min_child_samples', list(range(100, 250, 10))),
    
    # subsample: represents a fraction of the rows (observations) to be 
    # considered when building each subtree. Tianqi Chen and Carlos Guestrin
    # in their paper A Scalable Tree Boosting System recommend 
    'subsample': hp.choice('subsample', [.5, 0.6, 0.7, .8]),

}

In [48]:
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=30, 
            # trials=trials
           )

                                                      
############## New Run ################
params = {'max_depth': 6, 'gamma': '0.485', 'subsample': '0.80', 'reg_alpha': '0.226', 'reg_lambda': '0.284', 'learning_rate': '0.054', 'colsample_bytree': '0.508'}
1 CV - score: 0.9116                                  
2 CV - score: 0.9016                                  
3 CV - score: 0.9109                                  
4 CV - score: 0.907                                   
5 CV - score: 0.9222                                  
6 CV - score: 0.9197                                  
7 CV - score: 0.9078                                  
8 CV - score: 0.9004                                  
9 CV - score: 0.91                                    
10 CV - score: 0.9108                                 
Total Time Run: 4.14                                  
Mean ROC_AUC: 0.9101909427861974                      
                                                                                

In [49]:
best

{'colsample_bytree': 0.7048862950970731,
 'gamma': 0.6870786593228237,
 'learning_rate': 0.043037814961976235,
 'max_depth': 8.0,
 'min_child_samples': 4,
 'reg_alpha': 0.12849120484354531,
 'reg_lambda': 0.21891264474184496,
 'subsample': 2}

In [None]:
# Calculate ROC curve
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

def print_roc_curve(test_y, predictions):
    fpr, tpr, _ = roc_curve(test_y, predictions)
    roc_auc = auc(fpr, tpr)
    #xgb.plot_importance(gbm)
    #plt.show()
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([-0.02, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
print_roc_curve(y_test, predictions)

In [None]:
%%time 
# now lets try tuning xgboost by letting it stop learning when the validation score stops improving, 
# and letting it cycle through the learning process until it does so

my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)

# fit new df to model
my_model.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_test, y_test)], verbose=False)
# generate predictions
predictions = my_model.predict(X_test)
#calculate score
tuned_score = roc_auc_score(y_test, predictions)
print(tuned_score)

In [None]:
# we might have overfitted the model, and as a result we need to do cross validation
# lets plot the roc curve again
print_roc_curve(y_test, predictions)