In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [82]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore')

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import f1_score

import os

In [123]:
# 데이터 불러오기
train = pd.read_csv('/content/drive/My Drive/[2020]_데이터분석캠프(캐글코리아)/data/train.csv')
test = pd.read_csv('/content/drive/My Drive/[2020]_데이터분석캠프(캐글코리아)/data/test.csv')
sample_submission = pd.read_csv('/content/drive/My Drive/[2020]_데이터분석캠프(캐글코리아)/data/sample_submission.csv')

In [124]:
# income 별도로 할당
label = train['income']
del train['income']

In [125]:
# 1) column 제거
def col_reduction(df):
    df.drop(['id','fnlwgt','education','relationship','native_country','workclass'], axis=1, inplace=True)
    
    return df

# 2) marital_status 조정
def mar_st(df):
    df['marital_status'] = (df['marital_status'] == 'Married-civ-spouse').astype(int)
    
    return df

# 3) race 조정
def race(df):
    df['race'] = ((train['race'] == 'White') | (train['race'] == 'Asian-Pac-Islander')).astype(int)

    return df

# 4) capital_gain, loss 조정
def capital(df):
    df['cap_gain_high'] = (df['capital_gain'] != 0).astype(int)
    df['cap_loss_high'] = (df['capital_loss'] >= 1700).astype(int)
    df['capital_gain'] = df['capital_gain'].map(lambda x : np.log(x) if x != 0 else 0)
    
    return df

# 5) age 조정 함수
def age(df):
    df.loc[df['age'] < 20, 'age_range'] = '~20'
    df.loc[df['age'] >= 65, 'age_range'] = '~65'

    down = 20
    for i in range(45//5):
        df.loc[(df['age'] >= down) & (df['age'] < down+5), 'age_range'] = str(down)+'~'+str(down+5)
        down += 5

    df['age'] = df['age_range']
    df.drop(['age_range'], axis=1, inplace=True)
    
    return df
    
# 6) One-hot encoding은 만들지 않았다.

# 7) edu_num 새 변수 만들기
def edu(df):
    df['edu_num_high'] = (df['education_num'] >= 13).astype(int)
    
    return df

# 8) hpw 새 변수 만들기
    
def hpw(df):
    df['hpw_high'] = (df['hours_per_week'] >= 50).astype(int)

    return df

# 9) MinMaxScaler
def mm_feature(df, feature):
    mm_scaler = MinMaxScaler()
    
    df[feature] = mm_scaler.fit_transform(df[feature].values.reshape(-1,1))
    
    return df, mm_scaler

def main(df):
    
    df1 = col_reduction(df)
    df2 = mar_st(df1)
    df3 = race(df2)
    df4 = capital(df3)
    df5 = age(df4)
    
    df6 = pd.get_dummies(df5)
    
    df7 = edu(df6)
    df_fin = hpw(df7)
    
    return df_fin

In [126]:
# 적용
## main: 1) ~ 7)
train = main(train)
X_test = main(test)

## 8) minmax scaler
train, mm_scaler1 = mm_feature(train,'education_num')
train, mm_scaler2 = mm_feature(train,'hours_per_week')

X_test['education_num'] = mm_scaler1.transform(X_test['education_num'].values.reshape(-1,1))
X_test['hours_per_week'] = mm_scaler2.transform(X_test['hours_per_week'].values.reshape(-1,1))

## XGBClassifier

OOF(Out of fold) 앙상블

In [99]:
def xgb_f1(y, t, threshold=0.5):
    t = t.get_label()
    y_bin = (y > threshold).astype(int) 
    return 'f1', f1_score(t, y_bin, average='micro')

In [100]:
from sklearn.model_selection import StratifiedKFold
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2020)

In [132]:
val_scores = list()
oof_pred = np.zeros((test.shape[0], ))

for i, (trn_idx, val_idx) in enumerate(skf.split(train, label)):
    x_train, y_train = train.iloc[trn_idx, :], label[trn_idx]
    x_valid, y_valid = train.iloc[val_idx, :], label[val_idx]

    # 모델 정의
    clf = XGBClassifier(tree_method='gpu_hist', learning_rate=0.1, max_depth=5, n_estimators=200)
    
    # 모델 학습
    clf.fit(x_train, y_train,
            eval_set = [[x_valid, y_valid]], 
            eval_metric = xgb_f1,        
            early_stopping_rounds = 100,
            verbose = 100,  )

    # 훈련, 검증 데이터 F1 Score 확인
    trn_f1_score = f1_score(y_train, clf.predict(x_train), average='micro')
    val_f1_score = f1_score(y_valid, clf.predict(x_valid), average='micro')
    print('{} Fold, train f1_score : {:.4f}4, validation f1_score : {:.4f}\n'.format(i, trn_f1_score, val_f1_score))
    
    val_scores.append(val_f1_score)
    
    oof_pred += clf.predict_proba(X_test)[: , 1] / n_splits
    

# 교차 검증 F1 Score 평균 계산하기
print('Cross Validation Score : {:.4f}'.format(np.mean(val_scores)))

[0]	validation_0-error:0.150288	validation_0-f1:0.849712
Multiple eval metrics have been passed: 'validation_0-f1' will be used for early stopping.

Will train until validation_0-f1 hasn't improved in 100 rounds.
[100]	validation_0-error:0.130518	validation_0-f1:0.869482
Stopping. Best iteration:
[1]	validation_0-error:0.150672	validation_0-f1:0.849328

0 Fold, train f1_score : 0.85174, validation f1_score : 0.8493

[0]	validation_0-error:0.144914	validation_0-f1:0.855086
Multiple eval metrics have been passed: 'validation_0-f1' will be used for early stopping.

Will train until validation_0-f1 hasn't improved in 100 rounds.
[100]	validation_0-error:0.131862	validation_0-f1:0.868138
Stopping. Best iteration:
[6]	validation_0-error:0.145681	validation_0-f1:0.854319

1 Fold, train f1_score : 0.85344, validation f1_score : 0.8543

[0]	validation_0-error:0.15048	validation_0-f1:0.84952
Multiple eval metrics have been passed: 'validation_0-f1' will be used for early stopping.

Will train un

In [131]:
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth': [1, 3, 5], 'n_estimators': [50, 100, 200], 'learning_rate':[1, 0.1, 0.01]}
grid = GridSearchCV(XGBClassifier(), param_grid, cv=skf)
grid.fit(x_train, y_train)
best_param = grid.best_params_
best_param

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}

In [135]:
param_grid = {'max_depth': [1, 3, 5], 'n_estimators': [50, 100, 200], 'learning_rate':[1, 0.1, 0.01]}
grid = GridSearchCV(LGBMClassifier(), param_grid, cv=skf)
grid.fit(x_train, y_train)
best_param = grid.best_params_
best_param

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}

In [136]:
param_grid = {'n_estimators': [50, 100], 'max_depth': [1,3,8], 'min_samples_leaf' : [3,5], 'min_samples_split' : [2,3]}
grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=skf)
grid.fit(x_train, y_train)
best_param = grid.best_params_
best_param

{'max_depth': 8,
 'min_samples_leaf': 3,
 'min_samples_split': 3,
 'n_estimators': 100}

Stacking 앙상블

1) 1stage 결과 모으기

In [137]:
val_scores = list()

new_x_train_list = [np.zeros((train.shape[0], 1)) for _ in range(4)]
new_x_test_list  = [np.zeros((test.shape[0], 1)) for _ in range(4)]

for i, (trn_idx, val_idx) in enumerate(skf.split(train, label)):
    print(f"Fold {i} Start")
    x_train, y_train = train.iloc[trn_idx, :], label[trn_idx]
    x_valid, y_valid = train.iloc[val_idx, :], label[val_idx]
    
    # 모델 정의
    clfs = [LogisticRegression(), 
            RandomForestClassifier(max_depth=8, min_samples_leaf=3, min_samples_split=3, n_estimators=100), 
            XGBClassifier(tree_method='gpu_hist', learning_rate=0.1, max_depth=5, n_estimators=200), 
            LGBMClassifier(tree_method='gpu_hist', learning_rate=0.1, max_depth=3, n_estimators=100)]
    
    for model_idx, clf in enumerate(clfs):
        clf.fit(x_train, y_train)
        
        new_x_train_list[model_idx][val_idx, :] = clf.predict_proba(x_valid)[:, 1].reshape(-1, 1)
        new_x_test_list[model_idx][:] += clf.predict_proba(X_test)[:, 1].reshape(-1, 1) / n_splits

Fold 0 Start
Fold 1 Start
Fold 2 Start
Fold 3 Start
Fold 4 Start


In [138]:
new_x_train_list

[array([[0.49376718],
        [0.00732824],
        [0.00519227],
        ...,
        [0.02395131],
        [0.01862802],
        [0.0028776 ]]), array([[0.3333572 ],
        [0.02738925],
        [0.02032642],
        ...,
        [0.10603042],
        [0.03774399],
        [0.01027037]]), array([[4.00399953e-01],
        [2.27430311e-04],
        [3.96051793e-04],
        ...,
        [2.95416694e-02],
        [1.38997380e-02],
        [1.19783112e-03]]), array([[0.34725056],
        [0.00262793],
        [0.00305666],
        ...,
        [0.056839  ],
        [0.02229105],
        [0.0035949 ]])]

In [139]:
new_x_test_list

[array([[0.01223078],
        [0.6285043 ],
        [0.00392437],
        ...,
        [0.11047058],
        [0.27313894],
        [0.00647557]]), array([[0.02499106],
        [0.40293896],
        [0.01886291],
        ...,
        [0.15012637],
        [0.24072328],
        [0.02159666]]), array([[0.00563361],
        [0.36332088],
        [0.00066725],
        ...,
        [0.01375558],
        [0.25483002],
        [0.00182497]]), array([[0.01815481],
        [0.45455368],
        [0.00255925],
        ...,
        [0.05293122],
        [0.27285783],
        [0.01126587]])]

In [140]:
new_train = pd.DataFrame(np.concatenate(new_x_train_list, axis=1), columns=None)
new_label = label
new_test = pd.DataFrame(np.concatenate(new_x_test_list, axis=1), columns=None)

new_train.shape, new_label.shape, new_test.shape

((26049, 4), (26049,), (6512, 4))

2) 2stage meta model 학습

In [142]:
val_scores = list()
oof_pred = np.zeros((test.shape[0], ))

for i, (trn_idx, val_idx) in enumerate(skf.split(new_train, new_label)):
    x_train, y_train = new_train.iloc[trn_idx, :], new_label[trn_idx]
    x_valid, y_valid = new_train.iloc[val_idx, :], new_label[val_idx]
    
    # 전처리
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_valid = scaler.transform(x_valid)
    x_test  = scaler.transform(new_test)
    
    # 모델 정의
    clf = XGBClassifier(tree_method='gpu_hist', learning_rate=0.1, max_depth=5, n_estimators=200)
    
    # 모델 학습
    clf.fit(x_train, y_train,
            eval_set = [[x_valid, y_valid]], 
            eval_metric = xgb_f1,        
            early_stopping_rounds = 100,
            verbose = 100,  )

    # 훈련, 검증 데이터 F1 Score 확인
    trn_f1_score = f1_score(y_train, clf.predict(x_train), average='micro')
    val_f1_score = f1_score(y_valid, clf.predict(x_valid), average='micro')
    print('{} Fold, train f1_score : {:.4f}4, validation f1_score : {:.4f}\n'.format(i, trn_f1_score, val_f1_score))
    
    val_scores.append(val_f1_score)
    
    oof_pred += clf.predict_proba(x_test)[:, 1] / n_splits
    

# 교차 검증 F1 Score 평균 계산하기
print('Cross Validation Score : {:.4f}'.format(np.mean(val_scores)))

[0]	validation_0-error:0.133589	validation_0-f1:0.866411
Multiple eval metrics have been passed: 'validation_0-f1' will be used for early stopping.

Will train until validation_0-f1 hasn't improved in 100 rounds.
[100]	validation_0-error:0.132054	validation_0-f1:0.867946
Stopping. Best iteration:
[0]	validation_0-error:0.133589	validation_0-f1:0.866411

0 Fold, train f1_score : 0.87104, validation f1_score : 0.8664

[0]	validation_0-error:0.141267	validation_0-f1:0.858733
Multiple eval metrics have been passed: 'validation_0-f1' will be used for early stopping.

Will train until validation_0-f1 hasn't improved in 100 rounds.
[100]	validation_0-error:0.134357	validation_0-f1:0.865643
Stopping. Best iteration:
[0]	validation_0-error:0.141267	validation_0-f1:0.858733

1 Fold, train f1_score : 0.86984, validation f1_score : 0.8587

[0]	validation_0-error:0.135701	validation_0-f1:0.864299
Multiple eval metrics have been passed: 'validation_0-f1' will be used for early stopping.

Will train 