In [1]:
import numpy as np
import pandas as pd
import math   


# 讀取訓練資料、測試資料

In [2]:
# 讀取訓練資料、測試資料
train = pd.read_csv('../input/train.csv', na_values=['', 'NULL'])
test = pd.read_csv('../input/test.csv', na_values=['', 'NULL'])

# 分別取出訓練資料的特徵和標籤
train_x = train.drop(['label'], axis=1)
train_y = train['label']

# 由於測試資料只有特徵，維持原樣複製一份即可
test_x = test.copy()


# 建立特徴

In [3]:
# 月份對應
month_dict = {'January': 1,
              'February': 2,
              'March':  3,
              'April':  4,
              'May': 5,
              'June':  6,
              'July':  7,
              'August':  8,
              'September':  9,
              'October': 10,
              'November':  11,
              'December':  12}

train_x["previous_connect_month"] = train_x["previous_connect_month"].map(month_dict)
test_x["previous_connect_month"] = test_x["previous_connect_month"].map(month_dict)


In [4]:
# 處理缺失值
# 全部填入眾數    
for col_name in train_x.columns:
    mode = train_x[col_name].mode()[0] 
    print(f"{col_name}的眾數 -> {mode}")
    train_x[col_name].fillna(mode, inplace=True)
    test_x[col_name].fillna(mode, inplace=True)    

    

index的眾數 -> 0
age的眾數 -> 31.0
euducation_level的眾數 -> high
job的眾數 -> employment
marital的眾數 -> married
have_credit_card的眾數 -> no
have_housing_loan的眾數 -> yes
have_personal_loan的眾數 -> no
connect_method的眾數 -> cellular
previous_connect_month的眾數 -> 5.0
previous_connect_weekday的眾數 -> Thursday
campaign_connect_times的眾數 -> 1.0
after_campaign_connect_day的眾數 -> -1.0
before_campaign_connect_times的眾數 -> 0.0
last_campaign_outcomes的眾數 -> nonexistent
employment_rate的眾數 -> 61.4
consumer_price_index的眾數 -> 93.994
consumer_confidence_index的眾數 -> -36.4


In [5]:
# 類別資料 one-hot encoding

ohe_cols = ['euducation_level', 'job', 'marital', 'have_credit_card', 'have_housing_loan', 'have_personal_loan',
            'connect_method', 'previous_connect_month', 'previous_connect_weekday', 'last_campaign_outcomes']

# 整合訓練與測試資料，執行 One-hot encoding
all_x = pd.concat([train_x, test_x])
all_x = pd.get_dummies(all_x, columns=ohe_cols)

# 重新分割訓練、測試資料
train_x = all_x.iloc[:train_x.shape[0], :].reset_index(drop=True)
test_x = all_x.iloc[train_x.shape[0]:, :].reset_index(drop=True)


In [6]:
# # 數值資料 標準化
# from sklearn.preprocessing import StandardScaler

# std_cols = ['age', 'campaign_connect_times', 'after_campaign_connect_day',
#             'before_campaign_connect_times', 'employment_rate', 'consumer_price_index', 'consumer_confidence_index']

# # 結合訓練資料和測試資料並計算平均及標準差，稍後以此為基礎來進行標準化
# scaler = StandardScaler()
# scaler.fit(pd.concat([train_x[std_cols], test_x[std_cols]]))

# # 進行標準化並置換各欄位原數值

# # [取代]
# train_x[std_cols] = scaler.transform(train_x[std_cols])
# test_x[std_cols] = scaler.transform(test_x[std_cols])

# # # [附加]
# # std_train = pd.DataFrame(scaler.transform(train_x[std_cols]),
# #                          columns=[('s_'+n) for n in std_cols])
# # std_test = pd.DataFrame(scaler.transform(test_x[std_cols]),
# #                          columns=[('s_'+n) for n in std_cols])
# # # 將轉換後的 dataframe 跟其他特徵結合
# # train_x = pd.concat([train_x, std_train], axis=1)
# # test_x = pd.concat([test_x, std_test], axis=1)


In [7]:
# # 數值資料 常態化
# # Yeo-Johnson 轉換
# from sklearn.preprocessing import PowerTransformer

# pt_cols = ['age', 'campaign_connect_times', 'after_campaign_connect_day',
#             'before_campaign_connect_times', 'employment_rate', 'consumer_price_index', 'consumer_confidence_index']

# # 定義以訓練資料來進行多欄位的 Yeo-Johnson 轉換
# pt = PowerTransformer(method='yeo-johnson')
# pt.fit(pd.concat([train_x[pt_cols], test_x[pt_cols]]))

# # 進行標準化並置換各欄位原數值

# # [取代]
# train_x[pt_cols] = pt.transform(train_x[pt_cols])
# test_x[pt_cols] = pt.transform(test_x[pt_cols])

# # [附加]
# # std_train = pd.DataFrame(pt.transform(train_x[pt_cols]),
# #                          columns=[('pt_'+n) for n in pt_cols])
# # std_test = pd.DataFrame(pt.transform(test_x[pt_cols]),
# #                          columns=[('pt_'+n) for n in pt_cols])
# # # 將轉換後的 dataframe 跟其他特徵結合
# # train_x = pd.concat([train_x, std_train], axis=1)
# # test_x = pd.concat([test_x, std_test], axis=1)

In [8]:
# 特徵提取 加入PCA
from sklearn.decomposition import PCA

num_components = 3

# 定義以訓練資料來進行PCA轉換
pca = PCA(n_components=num_components)
pca.fit(np.concatenate((train_x, test_x), axis=0))

# 進行轉換 (附加)
pca_train = pd.DataFrame(pca.transform(train_x),
                         columns=[('pca_'+str(c)) for c in range(num_components)])
pca_test = pd.DataFrame(pca.transform(test_x),
                        columns=[('pca_'+str(c)) for c in range(num_components)])

# 將轉換後的 dataframe 跟其他特徵結合
train_x = pd.concat([train_x, pca_train], axis=1)
test_x = pd.concat([test_x, pca_test], axis=1)


  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [9]:
# 特徵提取 加入t-SNE (1)
# pip install bhtsne
import bhtsne

# 進行 t-sne 的轉換
tsne_x = np.concatenate((train_x, test_x), axis=0).astype(np.float64)
tsne_x = bhtsne.tsne(tsne_x)


Using current time as random seed...
Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
Computing input similarities...
Building tree...
 - point 0 of 39099
 - point 10000 of 39099
 - point 20000 of 39099
 - point 30000 of 39099
Input similarities computed in 5.43 seconds (sparsity = 0.002355)!
Learning embedding...
Iteration 50: error is 115.519580 (50 iterations in 21.78 seconds)
Iteration 100: error is 115.498715 (50 iterations in 18.98 seconds)
Iteration 150: error is 97.860316 (50 iterations in 19.27 seconds)
Iteration 200: error is 88.849945 (50 iterations in 21.27 seconds)
Iteration 250: error is 4.492849 (50 iterations in 23.14 seconds)
Iteration 300: error is 3.717700 (50 iterations in 19.28 seconds)
Iteration 350: error is 3.277674 (50 iterations in 20.61 seconds)
Iteration 400: error is 2.973949 (50 iterations in 20.39 seconds)
Iteration 450: error is 2.736623 (50 iterations in 20.70 seconds)
Iteration 500: error is 2.540316 (50 iterations in 21.96 seconds)
Iter

In [10]:
# 特徵提取 加入t-SNE (2)
tsne_train_x = tsne_x[:np.array(train_x).shape[0],:]
tsne_test_x = tsne_x[-np.array(test_x).shape[0]:,:]

print(tsne_train_x.shape)
print(tsne_test_x.shape)

# 進行轉換 (附加)
tsne_train = pd.DataFrame(tsne_train_x, columns=[('tsne_'+str(c)) for c in range(2)])
tsne_test = pd.DataFrame(tsne_test_x, columns=[('tsne_'+str(c)) for c in range(2)])

# 將轉換後的 dataframe 跟其他特徵結合
train_x = pd.concat([train_x, tsne_train], axis=1)
test_x = pd.concat([test_x, tsne_test], axis=1)


(20000, 2)
(19099, 2)


In [11]:
# 特徵提取 - UMAP 轉換 (1)
# pip install umap-learn
import umap

# 定義訓練資料的 
um = umap.UMAP()
um.fit(np.concatenate((train_x, test_x), axis=0))

# 執行轉換
um_train_x = um.transform(train_x)
um_test_x = um.transform(test_x)

  "Graph is not fully connected, spectral embedding may not work as expected."


In [12]:
# 特徵提取 - UMAP 轉換 (2)
print(um_train_x.shape)
print(um_test_x.shape)

# 進行轉換 (附加)
um_train = pd.DataFrame(um_train_x, columns=[('um_'+str(c)) for c in range(2)])
um_test = pd.DataFrame(um_test_x, columns=[('um_'+str(c)) for c in range(2)])

# 將轉換後的 dataframe 跟其他特徵結合
train_x = pd.concat([train_x, um_train], axis=1)
test_x = pd.concat([test_x, um_test], axis=1)

(20000, 2)
(19099, 2)


In [13]:
# train_x = train_x.drop('um_0', axis=1)
# train_x = train_x.drop('um_1', axis=1)
# test_x = test_x.drop('um_0', axis=1)
# test_x = test_x.drop('um_1', axis=1)

In [14]:
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 500)
# pd.set_option('display.height', 1000)
# pd.set_option('display.width', 1000)

train_x.to_csv("train_x.csv")
train_y.to_csv("train_y.csv")
test_x.to_csv("test_x.csv")
display(train_x)
display(test_x)

Unnamed: 0,index,age,campaign_connect_times,after_campaign_connect_day,before_campaign_connect_times,employment_rate,consumer_price_index,consumer_confidence_index,euducation_level_high,euducation_level_low,euducation_level_medium,job_employment,job_retired,job_student,job_unemployed,marital_divorced,marital_married,marital_single,marital_unknown,have_credit_card_no,have_credit_card_unknown,have_credit_card_yes,have_housing_loan_no,have_housing_loan_unknown,have_housing_loan_yes,have_personal_loan_no,have_personal_loan_unknown,have_personal_loan_yes,connect_method_cellular,connect_method_telephone,previous_connect_month_3.0,previous_connect_month_4.0,previous_connect_month_5.0,previous_connect_month_6.0,previous_connect_month_7.0,previous_connect_month_8.0,previous_connect_month_9.0,previous_connect_month_10.0,previous_connect_month_11.0,previous_connect_month_12.0,previous_connect_weekday_Friday,previous_connect_weekday_Monday,previous_connect_weekday_Thursday,previous_connect_weekday_Tuesday,previous_connect_weekday_Wednesday,last_campaign_outcomes_failure,last_campaign_outcomes_nonexistent,last_campaign_outcomes_success,pca_0,pca_1,pca_2,tsne_0,tsne_1,um_0,um_1
0,0,39.0,1.0,-1.0,0.0,59.9,93.200,-42.0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,-19548.998761,-7.435478,-1.290395,-31.752860,-43.149301,-7.895451,14.617671
1,1,26.0,3.0,-1.0,0.0,58.2,93.075,-47.1,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,-19547.998543,-7.562352,-14.382928,-31.730091,-43.126107,-7.825473,14.549379
2,2,29.0,4.0,-1.0,0.0,61.4,94.465,-41.8,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,-19546.998920,-6.171875,-11.282701,-31.729426,-43.124467,-7.824812,14.548975
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19997,19997,45.0,2.0,-1.0,0.0,61.4,93.444,-36.1,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,448.000513,-2.382800,5.099520,32.532953,-2.256494,10.087873,0.128053
19998,19998,53.0,1.0,-1.0,0.0,57.1,92.963,-40.8,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,449.000513,-2.861671,13.016290,32.527169,-2.250048,10.064445,0.156101
19999,19999,45.0,4.0,-1.0,0.0,61.4,93.918,-42.7,0,1,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,450.000402,-1.907878,4.988704,32.576463,-2.201995,10.101377,0.103124


Unnamed: 0,index,age,campaign_connect_times,after_campaign_connect_day,before_campaign_connect_times,employment_rate,consumer_price_index,consumer_confidence_index,euducation_level_high,euducation_level_low,euducation_level_medium,job_employment,job_retired,job_student,job_unemployed,marital_divorced,marital_married,marital_single,marital_unknown,have_credit_card_no,have_credit_card_unknown,have_credit_card_yes,have_housing_loan_no,have_housing_loan_unknown,have_housing_loan_yes,have_personal_loan_no,have_personal_loan_unknown,have_personal_loan_yes,connect_method_cellular,connect_method_telephone,previous_connect_month_3.0,previous_connect_month_4.0,previous_connect_month_5.0,previous_connect_month_6.0,previous_connect_month_7.0,previous_connect_month_8.0,previous_connect_month_9.0,previous_connect_month_10.0,previous_connect_month_11.0,previous_connect_month_12.0,previous_connect_weekday_Friday,previous_connect_weekday_Monday,previous_connect_weekday_Thursday,previous_connect_weekday_Tuesday,previous_connect_weekday_Wednesday,last_campaign_outcomes_failure,last_campaign_outcomes_nonexistent,last_campaign_outcomes_success,pca_0,pca_1,pca_2,tsne_0,tsne_1,um_0,um_1
0,20000,30.0,6.0,-1.0,0.0,61.4,94.465,-41.8,0,0,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,451.000487,-1.363526,-10.002076,32.649554,-2.145056,10.135636,0.058273
1,20001,48.0,1.0,-1.0,1.0,58.2,93.369,-34.8,1,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,452.000487,-2.456280,8.123109,32.622947,-2.135458,10.095718,0.114295
2,20002,52.0,2.0,-1.0,0.0,61.4,93.444,-36.1,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,453.000411,-2.380254,12.097614,32.639200,-2.106430,10.107089,0.084689
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19096,39096,48.0,1.0,-1.0,1.0,58.2,92.893,-46.2,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,19547.000050,1.659446,8.196269,-1.572652,-55.686969,18.104895,-6.843908
19097,39097,31.0,4.0,-1.0,0.0,61.1,93.994,-36.4,0,0,1,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,19548.000028,2.756802,-8.630240,-1.587328,-55.686677,18.103075,-6.839383
19098,39098,34.0,3.0,-1.0,0.0,61.4,93.444,-36.1,1,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,19549.000118,2.207576,-5.631211,-1.590405,-55.686940,18.096712,-6.845104


In [15]:
# 不平衡資料的處理

In [16]:
# 特別對評價指標去最佳化

# 建立個測試模型 (試試水溫)

In [17]:
# Weighted Categroies Accuracy
def WCatAcc(pred, labe):
    pred = np.array(pred)
    labe = np.array(labe)
    assert (pred.shape == labe.shape)
    assert (pred.dtype == labe.dtype)
    # print('pred',pred.dtype, pred.shape, pred)
    # print('labe',labe.dtype, labe.shape, labe)
    positive_mask = (pred == labe)
    # print('positive_mask', positive_mask)
    n_true_positive = np.sum(pred[positive_mask])
    n_true_negative = np.sum(positive_mask)-n_true_positive
    n_positive_point = np.sum(labe)
    n_negative_point = labe.size - n_positive_point
    # print('TP: ', n_true_positive,
    #       'TN: ', n_true_negative,
    #       'P: ',  n_positive_point,
    #       'N: ', n_negative_point)
    score = (9*n_true_positive+1*n_true_negative) / \
        (9*n_positive_point+1*n_negative_point)
    # print(f'score: {score:.4f}')
    return score


WCatAcc([0, 0, 1, 1, 1, 0, 1], [0, 0, 1, 1, 1, 1, 0])


0.7435897435897436

In [18]:
from xgboost import XGBClassifier
# pip install xgboost

# 建立模型及餵入訓練資料 (與標籤) 以進行學習
model = XGBClassifier(n_estimators=20)
model.fit(train_x, train_y)






XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=20, n_jobs=40,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [19]:
# 尋找最佳解
from scipy.optimize import minimize

in_sample_pred = model.predict_proba(train_x)[:, 1]
in_sample_pred_label = np.where(in_sample_pred > 0.5, 1, 0)
best_score = WCatAcc(in_sample_pred_label, train_y)

def WCatAcc_opt(x):
    return -WCatAcc(np.where(in_sample_pred > x, 1, 0), train_y)

result = minimize(WCatAcc_opt, x0=np.array([0.5]), method='Nelder-Mead')
best_threshold = result['x'].item()
pred_label = np.where(in_sample_pred > best_threshold, 1, 0)
best_score = WCatAcc(pred_label, train_y)
print(f"best_threshold: {best_threshold:.4f}, best_threshold: {best_score:.4f}")


best_threshold: 0.0913, best_threshold: 0.8038


In [20]:
# 餵入測試資料以輸出預測值
pred = model.predict_proba(test_x)[:, 1]

# 將大於 0.5 的預測值轉為 1、小於等於 0.5 則轉成 0
pred_label = np.where(pred > best_threshold, 1, 0)

# 建立提交用的檔案
submission = pd.DataFrame({'index': test['index'], 'label': pred_label})
submission.to_csv('../submission/submission_xgboost_01.csv', index=False)

# 建立模型

In [21]:
# 設定超參數
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss', # !!!!!
    # 'eta': 0.1, # learning rate
    'subsample': 0.95,  # 抽row
    # 'colsample_bytree': 0.8, # 抽col
    'min_child_weight': 10,  # 每個葉子至少要包含幾筆資料
    'max_depth': 12,  # 決策樹深度限制
    # 'random_state': 71,
}

In [22]:
# 驗證
from sklearn.metrics import log_loss
import xgboost as xgb
from sklearn.model_selection import KFold

dtest = xgb.DMatrix(test_x)
kfold_model_list = []
best_threshold_list = []
n_tree_limit_list = []
logloss_arr = []
wc_acc_arr = []
kf = KFold(n_splits=4, shuffle=True, random_state=9912)
for tr_idx, va_idx in kf.split(train_x):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

    # -----------------------------------
    # 使用 xgboost
    # -----------------------------------

    # 將特徵和標籤轉換為 xgboost 的資料結構
    dtrain = xgb.DMatrix(tr_x, label=tr_y)
    dvalid = xgb.DMatrix(va_x, label=va_y)
    # dtrain = xgb.DMatrix(tr_x, label=tr_y, weight=tr_y*0.8+0.1)
    # dvalid = xgb.DMatrix(va_x, label=va_y, weight=va_y*0.8+0.1)

    # 在 watchlist 中組合訓練資料與驗證資料
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

    # 進行訓練，將驗證資料代入模型中，一面訓練模型，一面監控分數的變化
    model = xgb.train(params, dtrain,
                      num_boost_round=1000,  # 決策樹數量限制
                      evals=watchlist,
                      early_stopping_rounds=10,  # 提前中止
                      verbose_eval=False,
                      )

    # 計算驗證資料的 logloss 分數
    va_pred = model.predict(dvalid)
    logloss_score = log_loss(va_y, va_pred)
    logloss_arr.append(logloss_score)
    print(f'logloss: {logloss_score:.4f}')

    # 尋找最佳解
    tr_pred = model.predict(dtrain)

    def WCatAcc_opt(x):
        return -WCatAcc(np.where(tr_pred > x, 1, 0), tr_y)

    result = minimize(WCatAcc_opt, x0=np.array([0.5]), method='Nelder-Mead')
    best_threshold = result['x'].item()

    pred_label = np.where(va_pred > best_threshold, 1, 0)
    best_score = WCatAcc(pred_label, va_y)
    wc_acc_arr.append(best_score)
    print(f'best_threshold: {best_threshold:.4f}, best_score: {best_score:.4f}, best_ntree_limit: {model.best_ntree_limit:.4f}')

    kfold_model_list.append(model)
    best_threshold_list.append(best_threshold)
    n_tree_limit_list.append(model.best_ntree_limit)

print('-------------------------------------------\n',
      'average logloss: {:.4f}, avg wc acc: {:.4f}'.format(np.mean(logloss_arr), np.mean(wc_acc_arr)))


logloss: 0.3048
best_threshold: 0.1244, best_score: 0.7495, best_ntree_limit: 14.0000
logloss: 0.3251
best_threshold: 0.1072, best_score: 0.7334, best_ntree_limit: 11.0000
logloss: 0.3312
best_threshold: 0.1051, best_score: 0.7232, best_ntree_limit: 11.0000
logloss: 0.3123
best_threshold: 0.1103, best_score: 0.7415, best_ntree_limit: 12.0000
-------------------------------------------
 average logloss: 0.3184, avg wc acc: 0.7369


In [23]:
# 集成多個驗證模型

kfold_submission = []
for i in range(len(kfold_model_list)):
    # 以最佳決策樹的數量來進行預測
    pred = kfold_model_list[i].predict(dtest)#, iteration_range=(n_tree_limit_list[i]-5, n_tree_limit_list[i]+5))
    pred_label = np.where(pred > best_threshold_list[i], 1, 0)
    kfold_submission.append(pred_label)

# 建立提交用的檔案
kfold_submission = np.array(kfold_submission)
kfold_submission = kfold_submission.mean(axis=0)
kfold_submission = np.where(kfold_submission > 0.5, 1, 0)
submission = pd.DataFrame({'index': test['index'], 'label': kfold_submission})
submission.to_csv('../submission/submission_xgboost_02.csv', index=False)


In [24]:
# 以全部資料重新訓練

ntree_limit_avg = math.floor(np.mean(n_tree_limit_list))
best_threshold_avg = np.mean(best_threshold_list)

# 將特徵和標籤轉換為 xgboost 的資料結構
all_train = xgb.DMatrix(train_x, label=train_y)
# all_train = xgb.DMatrix(train_x, label=train_y, weight=train_y*0.8+0.1)

ensemble_submission = []
for i in range(32): # 總共要組合幾個模型(32還不錯)
    params['random_state'] = np.random.randint(9999)+1
    model = xgb.train(params, all_train, num_boost_round=ntree_limit_avg+5)
    pred = model.predict(dtest)
    pred_label = np.where(pred > best_threshold_avg, 1, 0)
    ensemble_submission.append(pred_label)

# 建立提交用的檔案
ensemble_submission = np.array(ensemble_submission)
pd.DataFrame(ensemble_submission.transpose()).to_csv("ensemble.csv") # 看看
ensemble_submission = ensemble_submission.mean(axis=0)
ensemble_submission = np.where(ensemble_submission > 0.5, 1, 0)
submission = pd.DataFrame(
    {'index': test['index'], 'label': ensemble_submission})
submission.to_csv('../submission/submission_xgboost_03.csv', index=False)
