In [53]:
import numpy as np
import pandas as pd

# 讀取訓練資料、測試資料

In [55]:
# 讀取訓練資料、測試資料
train = pd.read_csv('../input/train.csv', na_values=['', 'NULL'])
test = pd.read_csv('../input/test.csv', na_values=['', 'NULL'])

# 分別取出訓練資料的特徵和標籤
train_x = train.drop(['label'], axis=1)
train_y = train['label']

# 由於測試資料只有特徵，維持原樣複製一份即可
test_x = test.copy()


# 建立特徴

In [56]:
# 月份對應
month_dict = {'January': 1,
              'February': 2,
              'March':  3,
              'April':  4,
              'May': 5,
              'June':  6,
              'July':  7,
              'August':  8,
              'September':  9,
              'October': 10,
              'November':  11,
              'December':  12}

train_x["previous_connect_month"] = train_x["previous_connect_month"].map(month_dict)
test_x["previous_connect_month"] = test_x["previous_connect_month"].map(month_dict)


In [57]:
# 處理缺失值
# 全部填入眾數    
for col_name in train_x.columns:
    mode = train_x[col_name].mode()[0] 
    print(col_name, mode)
    train_x[col_name].fillna(mode, inplace=True)
    test_x[col_name].fillna(mode, inplace=True)    

    

index 0
age 31.0
euducation_level high
job employment
marital married
have_credit_card no
have_housing_loan yes
have_personal_loan no
connect_method cellular
previous_connect_month 5.0
previous_connect_weekday Thursday
campaign_connect_times 1.0
after_campaign_connect_day -1.0
before_campaign_connect_times 0.0
last_campaign_outcomes nonexistent
employment_rate 61.4
consumer_price_index 93.994
consumer_confidence_index -36.4


In [58]:
# 類別資料 one-hot encoding

ohe_cols = ['euducation_level', 'job', 'marital', 'have_credit_card', 'have_housing_loan', 'have_personal_loan',
            'connect_method', 'previous_connect_month', 'previous_connect_weekday', 'last_campaign_outcomes']

# 整合訓練與測試資料，執行 One-hot encoding
all_x = pd.concat([train_x, test_x])
all_x = pd.get_dummies(all_x, columns=ohe_cols)

# 重新分割訓練、測試資料
train_x = all_x.iloc[:train_x.shape[0], :].reset_index(drop=True)
test_x = all_x.iloc[train_x.shape[0]:, :].reset_index(drop=True)


In [59]:
# 數值資料 標準化




In [60]:
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 500)
# pd.set_option('display.height', 1000)
# pd.set_option('display.width', 1000)

train_x

Unnamed: 0,index,age,campaign_connect_times,after_campaign_connect_day,before_campaign_connect_times,employment_rate,consumer_price_index,consumer_confidence_index,euducation_level_high,euducation_level_low,euducation_level_medium,job_employment,job_retired,job_student,job_unemployed,marital_divorced,marital_married,marital_single,marital_unknown,have_credit_card_no,have_credit_card_unknown,have_credit_card_yes,have_housing_loan_no,have_housing_loan_unknown,have_housing_loan_yes,have_personal_loan_no,have_personal_loan_unknown,have_personal_loan_yes,connect_method_cellular,connect_method_telephone,previous_connect_month_3.0,previous_connect_month_4.0,previous_connect_month_5.0,previous_connect_month_6.0,previous_connect_month_7.0,previous_connect_month_8.0,previous_connect_month_9.0,previous_connect_month_10.0,previous_connect_month_11.0,previous_connect_month_12.0,previous_connect_weekday_Friday,previous_connect_weekday_Monday,previous_connect_weekday_Thursday,previous_connect_weekday_Tuesday,previous_connect_weekday_Wednesday,last_campaign_outcomes_failure,last_campaign_outcomes_nonexistent,last_campaign_outcomes_success
0,0,39.0,1.0,-1.0,0.0,59.9,93.200,-42.0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0
1,1,26.0,3.0,-1.0,0.0,58.2,93.075,-47.1,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
2,2,29.0,4.0,-1.0,0.0,61.4,94.465,-41.8,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19997,19997,45.0,2.0,-1.0,0.0,61.4,93.444,-36.1,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0
19998,19998,53.0,1.0,-1.0,0.0,57.1,92.963,-40.8,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0
19999,19999,45.0,4.0,-1.0,0.0,61.4,93.918,-42.7,0,1,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0


In [61]:
# 特徵提取 加入PCA




In [62]:
# 不平衡資料的處理

# 建立個測試模型 (試試水溫)

In [77]:
def WCatAcc(pred, labe):
    pred = np.array(pred)
    labe = np.array(labe)
    assert  (pred.shape == labe.shape)
    # print('pred',pred.dtype, pred.shape, pred)
    # print('labe',labe.dtype, labe.shape, labe)    
    positive_mask = (pred == labe)
    # print('positive_mask', positive_mask)
    n_true_positive = np.sum(pred[positive_mask])
    n_true_negative = np.sum(positive_mask)-n_true_positive
    n_positive_point = np.sum(labe)
    n_negative_point = labe.size - n_positive_point
    # print('TP: ', n_true_positive,
    #       'TN: ', n_true_negative,
    #       'P: ',  n_positive_point,
    #       'N: ', n_negative_point)
    score = (9*n_true_positive+1*n_true_negative) / (9*n_positive_point+1*n_negative_point)
    # print('score', score)
    return score

WCatAcc([0,0,1,1,1,0,1],[0,0,1,1,1,1,0])


0.7435897435897436

In [64]:
from xgboost import XGBClassifier
# pip install xgboost

# 建立模型及餵入訓練資料 (與標籤) 以進行學習
model = XGBClassifier(n_estimators=20)
model.fit(train_x, train_y)






XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=20, n_jobs=40,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [78]:
# 尋找最佳解

from scipy.optimize import minimize

in_sample_pred = model.predict_proba(train_x)[:, 1]
in_sample_pred_label = np.where(in_sample_pred > 0.5, 1, 0)
best_score = WCatAcc(in_sample_pred_label, train_y)

def WCatAcc_opt(x):
    return -WCatAcc(np.where(in_sample_pred > x, 1, 0), train_y)


result = minimize(WCatAcc_opt, x0=np.array([0.5]), method='Nelder-Mead')
best_threshold = result['x'].item()
pred_label = np.where(in_sample_pred > best_threshold, 1, 0)
best_score = WCatAcc(pred_label, train_y)
print(best_threshold, best_score)

0.09453124999999964 0.7869086460032626


In [81]:
# 餵入測試資料以輸出預測值
pred = model.predict_proba(test_x)[:, 1]

# 將大於 0.5 的預測值轉為 1、小於等於 0.5 則轉成 0
pred_label = np.where(pred > best_threshold, 1, 0)

# 建立提交用的檔案
submission = pd.DataFrame({'index': test['index'], 'label': pred_label})
submission.to_csv('../submission/submission_xgboost_01.csv', index=False)

In [17]:
pred > 0.5

array([False, False, False, ..., False, False, False])

# 建立模型

In [None]:
# -----------------------------------
# 驗證
# -----------------------------------
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import KFold

# 用 List 保存各 fold 的 accuracy 與 logloss 分數
scores_accuracy = []
scores_logloss = []

# 進行交叉驗證
# 將資料分成四組，一組用於驗證，其餘三組用於訓練，並且輪替四次
kf = KFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_x):
    # 將資料分為訓練資料和驗證資料 (標籤也是)
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

    # 建立 xgboost 模型並餵入訓練資料與標籤進行學習
    model = XGBClassifier(n_estimators=20, random_state=71)
    model.fit(tr_x, tr_y)

    # 對驗證資料進行預測，輸出預測值的準確率
    va_pred = model.predict_proba(va_x)[:, 1]

    # 計算驗證資料預測值的評價指標，用 logloss 及 accuracy 來算誤差
    logloss = log_loss(va_y, va_pred)
    accuracy = accuracy_score(va_y, va_pred > 0.5)

    # 保存該 fold 的評價指標
    scores_logloss.append(logloss)
    scores_accuracy.append(accuracy)

# 輸出各 fold 評價指標的平均值
logloss = np.mean(scores_logloss)
accuracy = np.mean(scores_accuracy)
print(f'logloss: {logloss:.4f}, accuracy: {accuracy:.4f}')
# logloss: 0.4270, accuracy: 0.8148 (可能跟書中的數值略有不同)

# -----------------------------------
# 模型調整
# -----------------------------------
import itertools

# 準備用於調整的參數
param_space = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1.0, 2.0, 4.0]
}

# 產生超參數 max_depth 與 min_child_weight 的所有組合
param_combinations = itertools.product(param_space['max_depth'], param_space['min_child_weight'])

# 用 List 保存各參數組合以及各組合的分數
params = []
scores = []

# 對各參數組合的模型進行交叉驗證
for max_depth, min_child_weight in param_combinations:

    score_folds = []
    # 進行交叉驗證
    # 將訓練資料分成4分，其中一個作為驗證資料，並不斷輪替交換
    kf = KFold(n_splits=4, shuffle=True, random_state=123456)
    for tr_idx, va_idx in kf.split(train_x):
        # 將資料分為訓練資料與驗證資料
        tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

        # 建立 xgboost 模型並進行訓練
        model = XGBClassifier(n_estimators=20, random_state=71,
                              max_depth=max_depth, min_child_weight=min_child_weight)
        model.fit(tr_x, tr_y)

        # 驗證資料的預測值與 logloss 評價指標
        va_pred = model.predict_proba(va_x)[:, 1]
        logloss = log_loss(va_y, va_pred)
        score_folds.append(logloss)

    # 將各 fold 的評價指標進行平均
    score_mean = np.mean(score_folds)

    # 保存參數的組合以及其相對應的評價指標
    params.append((max_depth, min_child_weight))
    scores.append(score_mean)

# 找出將評價指標分數最佳的參數組合
best_idx = np.argsort(scores)[0]
best_param = params[best_idx]
print(f'max_depth: {best_param[0]}, min_child_weight: {best_param[1]}')
# max_depth=7, min_child_weight=2.0 為最佳超參數組合

