In [360]:
import numpy as np
import pandas as pd
import math   


# 讀取訓練資料、測試資料

In [361]:
# 讀取訓練資料、測試資料
train = pd.read_csv('../input/train.csv', na_values=['', 'NULL'])
test = pd.read_csv('../input/test.csv', na_values=['', 'NULL'])

# 分別取出訓練資料的特徵和標籤
train_x = train.drop(['label'], axis=1)
train_y = train['label']

# 由於測試資料只有特徵，維持原樣複製一份即可
test_x = test.copy()


# 建立特徴

In [362]:
# 月份對應
month_dict = {'January': 1,
              'February': 2,
              'March':  3,
              'April':  4,
              'May': 5,
              'June':  6,
              'July':  7,
              'August':  8,
              'September':  9,
              'October': 10,
              'November':  11,
              'December':  12}

train_x["previous_connect_month"] = train_x["previous_connect_month"].map(month_dict)
test_x["previous_connect_month"] = test_x["previous_connect_month"].map(month_dict)


In [363]:
# 處理缺失值
# 全部填入眾數    
for col_name in train_x.columns:
    mode = train_x[col_name].mode()[0] 
    print(col_name, mode)
    train_x[col_name].fillna(mode, inplace=True)
    test_x[col_name].fillna(mode, inplace=True)    

    

index 0
age 31.0
euducation_level high
job employment
marital married
have_credit_card no
have_housing_loan yes
have_personal_loan no
connect_method cellular
previous_connect_month 5.0
previous_connect_weekday Thursday
campaign_connect_times 1.0
after_campaign_connect_day -1.0
before_campaign_connect_times 0.0
last_campaign_outcomes nonexistent
employment_rate 61.4
consumer_price_index 93.994
consumer_confidence_index -36.4


In [364]:
# 類別資料 one-hot encoding

ohe_cols = ['euducation_level', 'job', 'marital', 'have_credit_card', 'have_housing_loan', 'have_personal_loan',
            'connect_method', 'previous_connect_month', 'previous_connect_weekday', 'last_campaign_outcomes']

# 整合訓練與測試資料，執行 One-hot encoding
all_x = pd.concat([train_x, test_x])
all_x = pd.get_dummies(all_x, columns=ohe_cols)

# 重新分割訓練、測試資料
train_x = all_x.iloc[:train_x.shape[0], :].reset_index(drop=True)
test_x = all_x.iloc[train_x.shape[0]:, :].reset_index(drop=True)


In [365]:
# 數值資料 標準化




In [None]:
# 數值資料 常態化

In [366]:
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 500)
# pd.set_option('display.height', 1000)
# pd.set_option('display.width', 1000)

train_x

Unnamed: 0,index,age,campaign_connect_times,after_campaign_connect_day,before_campaign_connect_times,employment_rate,consumer_price_index,consumer_confidence_index,euducation_level_high,euducation_level_low,euducation_level_medium,job_employment,job_retired,job_student,job_unemployed,marital_divorced,marital_married,marital_single,marital_unknown,have_credit_card_no,have_credit_card_unknown,have_credit_card_yes,have_housing_loan_no,have_housing_loan_unknown,have_housing_loan_yes,have_personal_loan_no,have_personal_loan_unknown,have_personal_loan_yes,connect_method_cellular,connect_method_telephone,previous_connect_month_3.0,previous_connect_month_4.0,previous_connect_month_5.0,previous_connect_month_6.0,previous_connect_month_7.0,previous_connect_month_8.0,previous_connect_month_9.0,previous_connect_month_10.0,previous_connect_month_11.0,previous_connect_month_12.0,previous_connect_weekday_Friday,previous_connect_weekday_Monday,previous_connect_weekday_Thursday,previous_connect_weekday_Tuesday,previous_connect_weekday_Wednesday,last_campaign_outcomes_failure,last_campaign_outcomes_nonexistent,last_campaign_outcomes_success
0,0,39.0,1.0,-1.0,0.0,59.9,93.200,-42.0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0
1,1,26.0,3.0,-1.0,0.0,58.2,93.075,-47.1,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
2,2,29.0,4.0,-1.0,0.0,61.4,94.465,-41.8,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19997,19997,45.0,2.0,-1.0,0.0,61.4,93.444,-36.1,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0
19998,19998,53.0,1.0,-1.0,0.0,57.1,92.963,-40.8,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0
19999,19999,45.0,4.0,-1.0,0.0,61.4,93.918,-42.7,0,1,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0


In [367]:
# 特徵提取 加入PCA
from sklearn.decomposition import PCA

# 定義以訓練資料來進行PCA轉換
pca = PCA(n_components=6)
pca.fit(np.concatenate((train_x, test_x), axis=0))

# 進行轉換
train_x = np.concatenate((train_x, pca.transform(train_x)), axis=1)
test_x = np.concatenate((test_x, pca.transform(test_x)), axis=1)
# print(train_x.shape)
# print(test_x.shape)
train_x = pd.DataFrame(train_x)
test_x = pd.DataFrame(test_x)



In [368]:
# 不平衡資料的處理

In [369]:
# 特別對評價指標去最佳化

# 建立個測試模型 (試試水溫)

In [370]:
# Weighted Categroies Accuracy
def WCatAcc(pred, labe):
    pred = np.array(pred)
    labe = np.array(labe)
    assert (pred.shape == labe.shape)
    assert (pred.dtype == labe.dtype)
    # print('pred',pred.dtype, pred.shape, pred)
    # print('labe',labe.dtype, labe.shape, labe)
    positive_mask = (pred == labe)
    # print('positive_mask', positive_mask)
    n_true_positive = np.sum(pred[positive_mask])
    n_true_negative = np.sum(positive_mask)-n_true_positive
    n_positive_point = np.sum(labe)
    n_negative_point = labe.size - n_positive_point
    # print('TP: ', n_true_positive,
    #       'TN: ', n_true_negative,
    #       'P: ',  n_positive_point,
    #       'N: ', n_negative_point)
    score = (9*n_true_positive+1*n_true_negative) / \
        (9*n_positive_point+1*n_negative_point)
    # print(f'score: {score:.4f}')
    return score


WCatAcc([0, 0, 1, 1, 1, 0, 1], [0, 0, 1, 1, 1, 1, 0])


0.7435897435897436

In [371]:
from xgboost import XGBClassifier
# pip install xgboost

# 建立模型及餵入訓練資料 (與標籤) 以進行學習
model = XGBClassifier(n_estimators=20)
model.fit(train_x, train_y)






XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=20, n_jobs=40,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [372]:
# 尋找最佳解
from scipy.optimize import minimize

in_sample_pred = model.predict_proba(train_x)[:, 1]
in_sample_pred_label = np.where(in_sample_pred > 0.5, 1, 0)
best_score = WCatAcc(in_sample_pred_label, train_y)

def WCatAcc_opt(x):
    return -WCatAcc(np.where(in_sample_pred > x, 1, 0), train_y)

result = minimize(WCatAcc_opt, x0=np.array([0.5]), method='Nelder-Mead')
best_threshold = result['x'].item()
pred_label = np.where(in_sample_pred > best_threshold, 1, 0)
best_score = WCatAcc(pred_label, train_y)
print(best_threshold, best_score)


0.09287109374999966 0.8120355265542868


In [373]:
# 餵入測試資料以輸出預測值
pred = model.predict_proba(test_x)[:, 1]

# 將大於 0.5 的預測值轉為 1、小於等於 0.5 則轉成 0
pred_label = np.where(pred > best_threshold, 1, 0)

# 建立提交用的檔案
submission = pd.DataFrame({'index': test['index'], 'label': pred_label})
submission.to_csv('../submission/submission_xgboost_01.csv', index=False)

# 建立模型

In [374]:
# 設定超參數
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss', # !!!!!
    # 'eta': 0.1, # learning rate
    'subsample': 0.95,  # 抽row
    # 'colsample_bytree': 0.8, # 抽col
    'min_child_weight': 10,  # 每個葉子至少要包含幾筆資料
    'max_depth': 12,  # 決策樹深度限制
    # 'random_state': 71,
}

In [375]:
# 驗證
from sklearn.metrics import log_loss
import xgboost as xgb
from sklearn.model_selection import KFold

dtest = xgb.DMatrix(test_x)
kfold_model_list = []
best_threshold_list = []
n_tree_limit_list = []
logloss_arr = []
wc_acc_arr = []
kf = KFold(n_splits=4, shuffle=True, random_state=9912)
for tr_idx, va_idx in kf.split(train_x):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

    # -----------------------------------
    # 使用 xgboost
    # -----------------------------------

    # 將特徵和標籤轉換為 xgboost 的資料結構
    dtrain = xgb.DMatrix(tr_x, label=tr_y)
    dvalid = xgb.DMatrix(va_x, label=va_y)

    # 在 watchlist 中組合訓練資料與驗證資料
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

    # 進行訓練，將驗證資料代入模型中，一面訓練模型，一面監控分數的變化
    model = xgb.train(params, dtrain,
                      num_boost_round=1000,  # 決策樹數量限制
                      evals=watchlist,
                      early_stopping_rounds=10,  # 提前中止
                      verbose_eval=False,
                      )

    # 計算驗證資料的 logloss 分數
    va_pred = model.predict(dvalid)
    logloss_score = log_loss(va_y, va_pred)
    logloss_arr.append(logloss_score)
    print(f'logloss: {logloss_score:.4f}')

    # 尋找最佳解
    tr_pred = model.predict(dtrain)

    def WCatAcc_opt(x):
        return -WCatAcc(np.where(tr_pred > x, 1, 0), tr_y)

    result = minimize(WCatAcc_opt, x0=np.array([0.5]), method='Nelder-Mead')
    best_threshold = result['x'].item()

    pred_label = np.where(va_pred > best_threshold, 1, 0)
    best_score = WCatAcc(pred_label, va_y)
    wc_acc_arr.append(best_score)
    print(
        f'best_threshold: {best_threshold:.4f}, best_score: {best_score:.4f}, best_ntree_limit: {model.best_ntree_limit:.4f}')

    kfold_model_list.append(model)
    best_threshold_list.append(best_threshold)
    n_tree_limit_list.append(model.best_ntree_limit)

print('-------------------------------------------\n',
      'average logloss: {:.4f}, avg wc acc: {:.4f}'.format(np.mean(logloss_arr), np.mean(wc_acc_arr)))


logloss: 0.3024
best_threshold: 0.1035, best_score: 0.7500, best_ntree_limit: 14.0000
logloss: 0.3192
best_threshold: 0.0946, best_score: 0.7417, best_ntree_limit: 12.0000
logloss: 0.3301
best_threshold: 0.1078, best_score: 0.7318, best_ntree_limit: 12.0000
logloss: 0.3087
best_threshold: 0.1019, best_score: 0.7572, best_ntree_limit: 11.0000
-------------------------------------------
 average logloss: 0.3151, avg wc acc: 0.7452


In [376]:
# 集成多個驗證模型

kfold_submission = []
for i in range(len(kfold_model_list)):
    # 以最佳決策樹的數量來進行預測
    pred = kfold_model_list[i].predict(dtest)#, iteration_range=(n_tree_limit_list[i]-5, n_tree_limit_list[i]+5))
    pred_label = np.where(pred > best_threshold_list[i], 1, 0)
    kfold_submission.append(pred_label)

# 建立提交用的檔案
kfold_submission = np.array(kfold_submission)
kfold_submission = kfold_submission.mean(axis=0)
kfold_submission = np.where(kfold_submission > 0.5, 1, 0)
submission = pd.DataFrame({'index': test['index'], 'label': kfold_submission})
submission.to_csv('../submission/submission_xgboost_02.csv', index=False)


In [377]:
# 以全部資料重新訓練

ntree_limit_avg = math.floor(np.mean(n_tree_limit))
best_threshold_avg = np.mean(best_threshold_list)

# 將特徵和標籤轉換為 xgboost 的資料結構
all_train = xgb.DMatrix(train_x, label=train_y)

ensemble_submission = []
for i in range(32):
    params['random_state'] = np.random.randint(9999)+1
    model = xgb.train(params, all_train, num_boost_round=ntree_limit_avg+5)
    pred = model.predict(dtest)
    pred_label = np.where(pred > best_threshold_avg, 1, 0)
    ensemble_submission.append(pred_label)

# 建立提交用的檔案
ensemble_submission = np.array(ensemble_submission)
pd.DataFrame(ensemble_submission.transpose()).to_csv("ensemble.csv") # 看看
ensemble_submission = ensemble_submission.mean(axis=0)
ensemble_submission = np.where(ensemble_submission > 0.5, 1, 0)
submission = pd.DataFrame(
    {'index': test['index'], 'label': ensemble_submission})
submission.to_csv('../submission/submission_xgboost_03.csv', index=False)
