In [2]:
# import packages

# data processing
import pandas as pd
import numpy as np
from datetime import timedelta, datetime


import re

# data visualization
import plotly.graph_objs as go
from plotly.graph_objs import Bar, Layout
from plotly import offline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (20, 10)

plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False #用来正常显示负号

# change text color
import colorama
from colorama import Fore, Style

# IPython
from IPython.display import IFrame

from sklearn.feature_selection import mutual_info_classif

%matplotlib inline

## Evaluation Index

In [3]:
def eval_gini(y_true, y_prob):
    """Return the normalized Gini coefficient of scores against binary labels.

    The value is ``1 - 2 * D / (P * N)`` where ``P`` / ``N`` are the numbers of
    positive / negative labels and ``D`` counts the (positive, negative) pairs
    in which the negative is ranked above the positive.  For 0/1 labels and
    untied scores this equals ``2 * AUC - 1``.  Tied scores are ordered however
    ``np.argsort`` orders them.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_prob : array-like of scores, same length as ``y_true``.

    Returns
    -------
    float
        The Gini coefficient, or NaN when it is undefined (empty input or only
        one class present).

    Raises
    ------
    ValueError
        If the two inputs do not have the same number of elements.
    """
    # float64 keeps the pair counts exact; float32 labels (e.g. from an
    # xgboost DMatrix) would otherwise accumulate in float32.
    labels = np.asarray(y_true, dtype=np.float64).ravel()
    scores = np.asarray(y_prob).ravel()
    if labels.shape[0] != scores.shape[0]:
        raise ValueError(
            "y_true and y_prob must have the same length, got %d and %d"
            % (labels.shape[0], scores.shape[0])
        )

    # Labels ordered from the highest score to the lowest.
    ranked = labels[np.argsort(scores)][::-1]
    negatives = 1.0 - ranked
    # Number of negatives ranked strictly above each position (exclusive cumsum).
    negatives_above = np.cumsum(negatives) - negatives
    discordant = np.dot(ranked, negatives_above)

    n_pos = ranked.sum()
    n_neg = ranked.shape[0] - n_pos
    pair_count = n_pos * n_neg
    if pair_count == 0:
        # No (positive, negative) pair exists, so the ranking quality is undefined.
        return float('nan')
    return 1.0 - 2.0 * discordant / pair_count

def gini_xgb(preds, dtrain):
    """Custom evaluation metric for XGBoost: the negated normalized Gini.

    The Gini of ``preds`` against ``dtrain.get_label()`` is negated before
    being reported under the metric name ``'gini'``.
    """
    score = eval_gini(dtrain.get_label(), preds)
    return [('gini', -score)]

In [4]:
from sklearn.model_selection import KFold

# Cross-validation setup: K shuffled folds with a fixed splitter seed.
K = 10
np.random.seed(1996)  # seed NumPy's global random number generator
kf = KFold(
    n_splits=K,
    shuffle=True,
    random_state=1,
)

In [5]:
# Load the train and test tables; the first CSV column becomes the index.
final_train, final_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('final_train.csv', 'final_test.csv')
)

<span id="5"></span>In order to get results between 0 and 1, a function, which is called **sigmoid**, is used to transform our hypothesis function. It is defined as
$$ $$
$$h_{\theta}(x) = g(\theta^{T} x)$$ 
$$ $$
where $h_{\theta}(x)$ is the hypothesis function, $x$ is a single record and 
$$ $$
$$g(z)=\dfrac{1}{1+e^{-z}}$$
$$ $$
Using $g(\theta^{T} x)$, we obtain a probability: if $h_{\theta}(x) \geq 0.5$ we predict $y=1$, and if $h_{\theta}(x) < 0.5$ we predict $y=0$. Note also that $g(z) \geq 0.5$ exactly when $z \geq 0$. Thus, if $\theta^{T} x \geq 0$, we predict $y=1$.
 
By the definition, I defined the below ***sigmoid*** function.<span id="5"></span>

We can't use the same cost function that we use for linear regression because the Logistic Function will cause the output to be wavy, causing many local optima. In other words, it will not be a convex function. That's why we need to define a different cost function for logistic regression. It is simply defined as
$$ $$
$$J(\theta) = \dfrac{1}{m} \sum^{m}_{i=1}Cost(h_{\theta}(x^{(i)}), y^{(i)})$$ 
$$ $$
where 
$$ $$
$$Cost(h_{\theta}(x^{(i)}), y^{(i)})=-y^{(i)} \; log(h_{\theta}(x^{(i)}))-(1-y^{(i)}) \; log(1-h_{\theta}(x^{(i)}))$$
$$ $$
As a sanity check, $J(\theta)$ can be plotted or printed as a function of the number of iterations to make sure that it is **decreasing on every iteration**, which shows that the optimization is converging correctly. At this point, the choice of the learning rate $\alpha$ is important: if $\alpha$ is too large, $J(\theta)$ may fail to decrease or even diverge; if it is too small, convergence will be very slow.<span id="6"></span>



In [6]:
# prepare the data

# 1.Sigmoid function

def sigmoid(z):
    """Logistic function ``g(z) = 1 / (1 + e^{-z})``, evaluated element-wise.

    The value is computed from ``exp(-|z|)``, which lies in (0, 1], so a large
    negative ``z`` no longer triggers an overflow warning from ``np.exp``.
    For ``z >= 0`` the arithmetic is the same as the textbook formula.

    Parameters
    ----------
    z : scalar, NumPy array, or other object supporting NumPy ufuncs and
        comparison with 0.

    Returns
    -------
    Values in [0, 1] with the same shape as ``z``.
    """
    decay = np.exp(-np.abs(z))
    # z >= 0 -> 1 / (1 + e^{-z});  z < 0 -> e^{z} / (1 + e^{z}).
    # Both forms are algebraically equal; NaN inputs propagate through the denominator.
    numerator = (z >= 0) + (z < 0) * decay
    return numerator / (1 + decay)
    
# 2. loss function 
def loss(h, y, eps=1e-15):
    """Mean binary cross-entropy between predicted probabilities and labels.

    Computes ``mean(-y * log(h) - (1 - y) * log(1 - h))``.

    Parameters
    ----------
    h : predicted probabilities (scalar or array-like supporting NumPy ufuncs).
    y : true labels, same shape as ``h``.
    eps : probabilities are clipped to ``[eps, 1 - eps]`` before taking logs,
        so predictions of exactly 0 or 1 give a finite loss instead of
        NaN (``0 * log(0)``) or infinity.

    Returns
    -------
    The mean loss over all elements.
    """
    h_safe = np.clip(h, eps, 1 - eps)
    return (-y * np.log(h_safe) - (1 - y) * np.log(1 - h_safe)).mean()

### Parameter adjustment

- n_estimators: The number of trees or rounds. Adding more trees will be at the risk of overfitting. The reason is in the way that the boosted tree model is constructed, sequentially where each new tree attempts to model and correct for the errors made by the sequence of previous trees. Quickly, the model reaches a point of diminishing returns.

- max_depth: The maximum depth of a tree. It is also used to control overfitting as higher depth will allow model to learn relations very specific to a particular sample. Typically, it should be chosen from 3 to 10 and tuned using CV.

- objective: The loss function to be minimized. binary:logistic is for binary classification, which will return predicted probability (NOT CLASS).

- learning_rate: The step size (shrinkage) that controls how quickly the boosting procedure converges. Intuitively, XGB will not reach its minimum if both n_estimators and learning_rate are very small.

- subsample: The fraction of observations to be randomly chosen for each tree. Lower values make the algorithm more conservative and help prevent overfitting, but values that are too small might lead to underfitting. Choose it carefully; typical values are between 0.5 and 1.

- min_child_weight: The minimum sum of the weights of all observations required in a child. It is the minimum weight (or number of samples if all samples have a weight of 1) required in order to create a new node in the tree. A smaller min_child_weight allows the algorithm to create children that correspond to fewer samples, thus allowing for more complex trees that are, again, more likely to overfit.

- colsample_bytree: The fraction of features to use for each tree. By default it is set to 1, meaning that all features are used. To keep the trees from becoming too highly correlated with each other, we train each tree on a random sample of the features, which helps to avoid overfitting.

- scale_pos_weight: The parameter that controls the balance of positive and negative weights, useful for unbalanced classes. This dataset is unbalanced as we have seen, so we should be careful to tune it. The typical value to consider: sum(negative instances) / sum(positive instances).

- gamma: The minimum loss reduction required to make a split. A node is split only when the resulting split reduces the loss function by at least gamma. The larger gamma is, the more conservative (less prone to overfitting) the algorithm will be. Suitable values depend on the loss function and should be tuned.

- reg_alpha: L1 regularization term on weights. Increasing this value will make model more conservative.

- reg_lambda: L2 regularization term on weights. Increasing this value will make model more conservative. Normalised to number of training examples.

In [6]:
# params = {
#         'min_child_weight': [1, 5, 10],
#         'gamma': [0.5, 1, 1.5, 2, 5, 10],
#         'subsample': [0.6, 0.8, 1.0],
#         'colsample_bytree': [0.6, 0.8, 1.0],
#         'max_depth': [3, 4, 5]
#         }

In [7]:
# xgb = XGBClassifier(learning_rate=0.06, n_estimators=300, objective='binary:logistic',nthread=4)

In [8]:
# from datetime import datetime
# def timer(start_time=None):
#     if not start_time:
#         start_time = datetime.now()
#         return start_time
#     elif start_time:
#         thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
#         tmin, tsec = divmod(temp_sec, 60)
#         print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))
        
        
# from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# from sklearn.model_selection import StratifiedKFold


# folds = 3
# param_comb = 5

# skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

# random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf.split(X,y), verbose=3, random_state=1001 )

# # Here we go
# start_time = timer(None)
# random_search.fit(X, y)
# timer(start_time) 

In [9]:
# print('\n All results:')
# print(random_search.cv_results_)
# print('\n Best estimator:')
# print(random_search.best_estimator_)
# print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
# print(random_search.best_score_ * 2 - 1)
# print('\n Best hyperparameters:')
# print(random_search.best_params_)
# results = pd.DataFrame(random_search.cv_results_)
# results.to_csv('xgb-random-grid-search-results-01.csv', index=False)

### 优化迭代

In [7]:
# 调参之后，较优的参数组合

from xgboost import XGBClassifier
# Training configuration for the tuned model.
MAX_ROUNDS = 400             # number of boosting rounds (n_estimators)
LEARNING_RATE = 0.07
OPTIMIZE_ROUNDS = False      # when True, XGB_gini fits with early stopping on the validation fold
EARLY_STOPPING_ROUNDS = 50   # passed to fit() only when OPTIMIZE_ROUNDS is True

model = XGBClassifier(
    objective="binary:logistic",
    n_estimators=MAX_ROUNDS,
    learning_rate=LEARNING_RATE,
    max_depth=4,
    min_child_weight=6,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1.6,
    gamma=10,
    reg_alpha=8,
    reg_lambda=1.3,
)

In [8]:
def XGB_gini(df_train,tar_enc = True,pca = False):
    '''
    Cross-validate the module-level XGBoost ``model`` and score it with the normalized Gini.

    For every fold produced by the module-level ``kf`` splitter the model is
    fitted on the training part, used to predict the held-out part
    (out-of-fold predictions) and used to predict ``final_test``.  The test
    predictions are averaged over the folds.

    Parameters
    ----------
    df_train : DataFrame with the processed training data, including a ``target`` column.
    tar_enc : whether to target-encode the categorical (``*_cat``) columns.
        Only applies when ``pca`` is false; the encoded columns replace the originals.
    pca : whether to replace the features with 20 PCA components fitted on
        each fold's training part.

    Returns
    -------
    (y_test_pred, gini) : the fold-averaged test-set probabilities and the
        normalized Gini of the out-of-fold predictions over the whole training set.
    '''
    # Keep the flags in their own names: the PCA transformer must not overwrite
    # the ``pca`` argument, otherwise every fold after the first skips both branches.
    use_pca = bool(pca)
    use_target_encoding = bool(tar_enc) and not use_pca

    y = df_train.target
    X = df_train.drop('target', axis=1)

    # Float buffer for the out-of-fold probabilities (an integer ``0 * y``
    # buffer would have the wrong dtype for the predictions written into it).
    y_valid_pred = pd.Series(0.0, index=y.index, name=y.name)
    y_test_pred = 0
    n_folds = 0

    if use_target_encoding:
        from target_encoding import target_encode
        # Categorical columns to encode; columns already marked 'tar_enc' are skipped.
        f_cat = [f for f in X.columns if '_cat' in f and 'tar_enc' not in f]
    if use_pca:
        from sklearn.decomposition import PCA

    for i, (train_index, test_index) in enumerate(kf.split(X)):

        # Split into training part, validation part and a fresh copy of the test set.
        y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
        X_train, X_valid = X.iloc[train_index, :].copy(), X.iloc[test_index, :].copy()
        X_test = final_test.copy()

        if use_pca:
            n_comp = 20
            print('\nPCA执行中...')
            # Fit on the training part only, then project validation and test data.
            pca_model = PCA(n_components=n_comp, svd_solver='full', random_state=1001)
            X_train = pd.DataFrame(pca_model.fit_transform(X_train))
            X_valid = pd.DataFrame(pca_model.transform(X_valid))
            X_test = pd.DataFrame(pca_model.transform(X_test))
        print( f"\n{i}折交叉验证： ")

        if use_target_encoding:
            for f in f_cat:
                # The encoding is computed from this fold's training target only.
                X_train[f + "_avg"], X_valid[f + "_avg"], X_test[f + "_avg"] = target_encode(
                    trn_series=X_train[f],
                    val_series=X_valid[f],
                    tst_series=X_test[f],
                    target=y_train,
                    min_samples_leaf=100,
                    smoothing=10,
                    noise_level=0
                )
            # Replace the raw categorical columns by their encoded versions.
            X_train = X_train.drop(f_cat, axis=1)
            X_valid = X_valid.drop(f_cat, axis=1)
            X_test = X_test.drop(f_cat, axis=1)

        # Fit XGBoost for the current fold.
        # NOTE(review): eval_metric / early_stopping_rounds as fit() keywords and
        # best_ntree_limit depend on the installed xgboost version -- confirm.
        if OPTIMIZE_ROUNDS:
            eval_set=[(X_valid,y_valid)]
            fit_model = model.fit( X_train, y_train,
                                   eval_set=eval_set,
                                   eval_metric=gini_xgb,
                                   early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                                   verbose=False
                                 )
            print( "  Best N trees = ", model.best_ntree_limit )
            print( "  Best gini = ", model.best_score )
        else:
            fit_model = model.fit( X_train, y_train )

        # Out-of-fold predictions for the validation part.
        pred = fit_model.predict_proba(X_valid)[:,1]
        print( "  normalized gini coefficent = ", eval_gini(y_valid, pred) )
        y_valid_pred.iloc[test_index] = pred

        # Accumulate the test-set predictions of this fold.
        y_test_pred += fit_model.predict_proba(X_test)[:,1]
        n_folds += 1

    # Average over the folds that actually ran rather than relying on the global K.
    y_test_pred /= n_folds

    final_gini = eval_gini(y, y_valid_pred)
    print( "\n整个训练集（合并）的normalized gini coefficent:" )
    print( "  final normalized gini coefficent = ", final_gini )

    return y_test_pred,final_gini

In [9]:
%%time
# Run the cross-validation with target encoding enabled; keep the averaged test predictions and the overall Gini.
y_test_pred, gini_score = XGB_gini(final_train, tar_enc=True)


0折交叉验证： 
  normalized gini coefficent =  0.2521255881298117

1折交叉验证： 
  normalized gini coefficent =  0.30787820484790185

2折交叉验证： 
  normalized gini coefficent =  0.2812042717533373

3折交叉验证： 
  normalized gini coefficent =  0.2824787109934668

4折交叉验证： 
  normalized gini coefficent =  0.2895325604792781

5折交叉验证： 
  normalized gini coefficent =  0.2914708137368027

6折交叉验证： 
  normalized gini coefficent =  0.2788781419900316

7折交叉验证： 
  normalized gini coefficent =  0.2741946407666682

8折交叉验证： 
  normalized gini coefficent =  0.27019773113149437

9折交叉验证： 
  normalized gini coefficent =  0.3091194217515304

整个训练集（合并）的normalized gini coefficent:
  final normalized gini coefficent =  0.2835429751767702
Wall time: 2min 30s


In [13]:
# Assemble the submission table (test-set ids and predicted probabilities) and write it to CSV.
submission = pd.DataFrame({
    'id': final_test.index.values,
    'target': y_test_pred,
})
submission.to_csv('xgb_submit.csv', index=False, float_format='%.6f')