In [None]:
# Mount Google Drive at /content/drive; the CSV files loaded below are read from
# that mount point. NOTE(review): google.colab is presumably only available on Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore')

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# import missingno as msno

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import f1_score

import os

In [None]:
# Load the training and test data from the mounted Drive folder.
train = pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/train.csv')
test = pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/test.csv')
# sample_submission = pd.read_csv('/content/drive/My Drive/[2020]_데이터분석캠프(캐글코리아)/data/sample_submission.csv')

# 함수화

In [None]:
# 1) column 제거
def col_reduction(df):
    """Drop the columns that are not used as model features.

    The frame is modified in place and the same object is returned.
    A KeyError is raised by pandas if any of the listed columns is missing.
    """
    unused_columns = ['id', 'fnlwgt', 'education', 'relationship', 'native_country', 'workclass']
    df.drop(columns=unused_columns, inplace=True)
    return df

# 2) marital_status 조정
def mar_st(df):
    """Turn `marital_status` into a 0/1 flag: 1 for 'Married-civ-spouse', 0 otherwise.

    The column is overwritten in place and the same frame is returned.
    """
    is_married = df['marital_status'].eq('Married-civ-spouse')
    df['marital_status'] = is_married.astype(int)
    return df

# 3) race 조정
def race(df):
    """Turn `race` into a 0/1 flag: 1 for 'White' or 'Asian-Pac-Islander', 0 otherwise.

    The flag is computed from the frame that was passed in (not from a global),
    so the function gives the correct result for both the train and the test frame.
    The column is overwritten in place and the same frame is returned.
    """
    flagged_groups = ['White', 'Asian-Pac-Islander']
    df['race'] = df['race'].isin(flagged_groups).astype(int)

    return df

# 4) capital_gain, loss 조정
def capital(df):
    """Derive capital gain/loss features in place and return the same frame.

    - `cap_gain_high`: 1 when `capital_gain` is non-zero, else 0.
    - `cap_loss_high`: 1 when `capital_loss` is at least 1700, else 0.
    - `capital_gain`: replaced by its natural log, with zero left as 0.
    """
    def _log_or_zero(value):
        # log(0) is undefined, so zero gains are kept as a plain 0.
        if value != 0:
            return np.log(value)
        return 0

    gain = df['capital_gain']
    df['cap_gain_high'] = gain.ne(0).astype(int)
    df['cap_loss_high'] = df['capital_loss'].ge(1700).astype(int)
    df['capital_gain'] = gain.map(_log_or_zero)

    return df

# 5) age 조정 함수
def age(df):
    """Replace the numeric `age` column with a string bucket label.

    Ages below 20 become '~20', ages of 65 and above become '~65', and ages
    from 20 through 64 fall into five-year buckets labelled 'low~high'
    (e.g. '20~25'). The frame is modified in place and returned.
    """
    years = df['age']

    # Open-ended buckets at both extremes.
    df.loc[years < 20, 'age_range'] = '~20'
    df.loc[years >= 65, 'age_range'] = '~65'

    # Five-year buckets: [20, 25), [25, 30), ..., [60, 65).
    for lower in range(20, 65, 5):
        upper = lower + 5
        in_bucket = (years >= lower) & (years < upper)
        df.loc[in_bucket, 'age_range'] = f'{lower}~{upper}'

    # Overwrite the numeric column with the labels and discard the helper column.
    df['age'] = df['age_range']
    df.drop(columns=['age_range'], inplace=True)

    return df
    
# 6) One-hot encoding은 만들지 않았다.

# 7) edu_num 새 변수 만들기
def edu(df):
    """Add `edu_num_high`: 1 when `education_num` is 13 or more, else 0.

    The frame is modified in place and returned.
    """
    meets_threshold = df['education_num'].ge(13)
    df['edu_num_high'] = meets_threshold.astype(int)
    return df

# 8) hpw 새 변수 만들기
    
def hpw(df):
    """Add `hpw_high`: 1 when `hours_per_week` is 50 or more, else 0.

    The frame is modified in place and returned.
    """
    long_hours = df['hours_per_week'].ge(50)
    df['hpw_high'] = long_hours.astype(int)
    return df

# 9) MinMaxScaler
def mm_feature(df, feature):
    """Min-max scale one column in place and return the frame with the fitted scaler.

    Parameters
    ----------
    df : DataFrame whose column `feature` is overwritten with the scaled values.
    feature : name of the column to scale.

    Returns
    -------
    (df, scaler) : the same frame and the fitted MinMaxScaler, so the identical
    transformation can later be applied to another frame.
    """
    scaler = MinMaxScaler()

    # The scaler expects a 2-D array, hence the single-column reshape.
    column_2d = df[feature].values.reshape(-1, 1)
    df[feature] = scaler.fit_transform(column_2d)

    return df, scaler

# 10) target 분리: train은 하고, test는 안하므로 따로 만들겠다.
def target_handle(df):
    """Split a one-hot encoded training frame into features and target.

    The `income_>50K` dummy becomes the `income` target column and both income
    dummies are removed from `df` in place. Returns `(X_df, y_df)`, where `X_df`
    is a new frame without `income` and `y_df` is the `income` column.
    """
    df['income'] = df['income_>50K']
    df.drop(columns=['income_>50K', 'income_<=50K'], inplace=True)

    features = df.drop(columns=['income'])
    labels = df['income']

    return features, labels

def main(df):
    """Run the shared preprocessing pipeline on a raw frame and return the result.

    Steps, in order: column removal, marital-status flag, race flag, capital
    features, age bucketing, one-hot encoding of the remaining categorical
    columns, then the education and hours-per-week threshold flags.
    """
    # Steps that operate on the raw (pre-encoding) columns.
    for step in (col_reduction, mar_st, race, capital, age):
        df = step(df)

    encoded = pd.get_dummies(df)

    # Threshold flags are added after encoding.
    return hpw(edu(encoded))

In [None]:
# Apply the preprocessing
## main: column removal, binary flags, capital/age features, one-hot encoding, threshold flags
## NOTE: `train` is overwritten with its processed version, so this cell is meant to be
## run once after the data-loading cell.
train = main(train)
X_test = main(test)

## Min-max scaling: fit the scalers on the training data only, then reuse them on the test data
train, mm_scaler1 = mm_feature(train,'education_num')
train, mm_scaler2 = mm_feature(train,'hours_per_week')

X_test['education_num'] = mm_scaler1.transform(X_test['education_num'].values.reshape(-1,1))
X_test['hours_per_week'] = mm_scaler2.transform(X_test['hours_per_week'].values.reshape(-1,1))

## X, y split
X_train, y_train = target_handle(train)

## Train and test were one-hot encoded separately, so a category missing from one of them
## would leave the two frames with different columns. Align the test frame to the training
## columns (same names, same order); dummy columns absent from the test data are filled with 0.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Data preprocessing is the same as before.
# Split the training data into train / validation subsets (8:2), stratified on the target.
# NOTE(review): y_train is rebound here to the training subset while X_train keeps every
# row, so re-running this cell without re-running the preprocessing cell first would
# presumably hand train_test_split inputs of different lengths.

from sklearn.model_selection import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(X_train, y_train,
                                                     test_size=.2,
                                                     random_state = 42,
                                                     shuffle=True,
                                                     stratify = y_train)

In [None]:
# Show the shapes of the training split, a separator line, then the validation split.
split_report = (
    x_train.shape,
    y_train.shape,
    '=' * 50,
    x_valid.shape,
    y_valid.shape,
)
print(*split_report, sep='\n')

(20839, 38)
(20839,)
(5210, 38)
(5210,)


# ML 모델 적용하기

In [None]:
# Use an XGBoost model: a baseline classifier with the library's default hyperparameters.

import xgboost as xgb

xgb_model = xgb.XGBClassifier()
# Bare expression so the notebook displays the model's parameters.
xgb_model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
# Fit the baseline model on the training split and predict the validation split.
xgb_model.fit(x_train, y_train)
y_pred = xgb_model.predict(x_valid)

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# Score the baseline predictions on the validation split.
# NOTE: with one label per sample, micro-averaged F1 is the same number as accuracy,
# which is why the two values printed by this cell are identical.
f1 = f1_score(y_valid, y_pred, average='micro')
print('-F1 Score: ', f1)
# print(f"XGBClassifier\n -F1 Score: {f1_score(y_valid, y_pred, average='micro')}")

accuracy = accuracy_score(y_valid, y_pred)
print('-Accuracy score: ', accuracy)

-F1 Score:  0.8593090211132438
-Accuracy score:  0.8593090211132438


### XGBoost 알고리즘의 개념 이해
XGBoost는 Gradient Boosting 알고리즘을 분산환경에서도 실행할 수 있도록 구현해놓은 라이브러리이다. 

즉, 앙상블 부스팅(ensemble boosting)의 특징인 가중치 부여를 경사하강법(gradient descent)으로 한다

* xgboost의 특징

    - GBM(Gradient Boosting Machine)보다 학습 속도가 빠르다.
    - 과적합(overfitting) 방지가 가능한 규제가 포함되어 있다.
    - CART(Classification And Regression Tree)를 기반으로 한다. 즉, 분류와 회귀가 둘 다 가능하다
    - 조기 종료(early stopping)을 제공한다.

#### xgboost의 하이퍼파라미터(xgboost hyperparameter)
https://xgboost.readthedocs.io/en/latest/parameter.html
    
- n_estimators(혹은 num_boost_round) : 결정 트리의 개수
- max_depth : 트리의 깊이
- colsample_bytree : 컬럼의 샘플링 비율(random forest의 max_features와 비슷)
- subsample : weak learner가 학습에 사용하는 데이터 샘플링 비율
- learning_rate : 학습률
- min_split_loss(= gamma) : 리프 노드를 추가로 분할하기 위해 필요한 최소 손실 감소량
- reg_lambda : L2 규제
- reg_alpha : L1 규제

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
# Hyperparameter tuning (GridSearchCV)


# Stratified, shuffled 5-fold splitter with a fixed seed so the search is reproducible
# and every fold keeps the class ratio (the hold-out split of this notebook is
# stratified as well). It is passed to GridSearchCV below via `cv`.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Only reg_alpha is searched here; the other hyperparameters are fixed on the base estimator.
param_grid={'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]}

# param_grid={'max_depth':range(3,10,3), 'min_child_weight':range(1,6,2), 'gamma':[i/10.0 for i in range(0,5)], 
#             'subsample':[i/10.0 for i in range(6,10)], 'colsample_bytree':[i/10.0 for i in range(6,10)],
#             'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]}

# Candidates are ranked by micro-averaged F1, the metric used for every model
# comparison in this notebook, rather than by a regression error.
grid_sv = GridSearchCV(estimator=xgb.XGBClassifier(learning_rate=0.05, n_estimators=1000,
                                                   max_depth=3, min_child_weight=5,
                                                   gamma=0.4, subsample=0.7,
                                                   colsample_bytree=0.7, reg_alpha=1e-05,
                        objective= 'binary:logistic'), 
                       param_grid=param_grid, scoring='f1_micro', cv=kf)
grid_sv.fit(x_train, y_train )
print("Best 파라미터 :", grid_sv.best_params_)

 Best 파라미터 : 
    {'max_depth': 3}
    {'min_child_weight': 5}
    {'gamma': 0.4}
    {'subsample': 0.7}
    {'colsample_bytree': 0.7}
    {'learning_rate': 0.05}
    {'reg_alpha': 1e-05}

In [None]:
params = grid_sv.best_params_

# best_params_ only holds the searched keys, so rebuilding a classifier from it alone
# would silently drop the fixed settings of the base estimator. best_estimator_ is the
# base estimator with the best parameters applied, already refit on the data passed to
# grid_sv.fit (GridSearchCV's default refit behaviour).
model = grid_sv.best_estimator_

# The test set has no labels, so the tuned model is scored on the validation split
# with the same metric as the other models in this notebook.
tuned_pred = model.predict(x_valid)
tuned_f1 = f1_score(y_valid, tuned_pred, average='micro')
print("Micro-averaged F1 on the validation set: {:.4f}".format(tuned_f1))

In [None]:
# Candidate 1: same hyperparameter values as the base estimator of the grid search
# (shallow trees, reg_alpha=1e-05).
# NOTE(review): nthread=-1 presumably means "use all cores" - confirm for the installed xgboost version.
xgb1 = xgb.XGBClassifier(
    learning_rate =0.05,
    n_estimators=1000,
    max_depth=3,
    min_child_weight=5,
    gamma=0.4,
    subsample=0.7,
    colsample_bytree=0.7,
    objective= 'binary:logistic',
    nthread=-1,
    reg_alpha=1e-05,
    booster='gbtree',     
    scale_pos_weight=1)

In [None]:
# Fit candidate 1 on the training split.
xgb1.fit(x_train, y_train)

# Score its validation predictions with micro-averaged F1.
y_pred1 = xgb1.predict(x_valid)
f1 = f1_score(y_valid, y_pred1, average='micro')
print(f1)

0.8708253358925144


In [None]:
# Candidate 2: deeper trees (max_depth=8) with min_child_weight=3, gamma=5 and
# subsample / colsample_bytree of 0.8; reg_alpha is left unset.
xgb2 = xgb.XGBClassifier(
    learning_rate =0.05,
    n_estimators=1000,
    max_depth=8,
    min_child_weight=3,
    gamma=5,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    nthread=-1,
    scale_pos_weight=1)

In [None]:
# Fit candidate 2 on the training split.
xgb2.fit(x_train, y_train)

# Score candidate 2's own validation predictions (y_pred2) with micro-averaged F1.
y_pred2 = xgb2.predict(x_valid)
f1 = f1_score(y_valid, y_pred2, average='micro')
print(f1)

0.8708253358925144


In [None]:
# Candidate 3: same settings as candidate 2 but with max_depth=5.
# Only one thread-count setting is passed: `nthread` and `n_jobs` both control the
# number of threads, so giving both with different values is contradictory.
# n_jobs=-1 keeps the "all cores" setting used by the other candidates.
xgb3 = xgb.XGBClassifier(booster='gbtree', 
    learning_rate =0.05,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=3,
    gamma=5,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    n_jobs=-1,
    scale_pos_weight=1)

In [None]:
# Fit candidate 3 on the training split.
xgb3.fit(x_train, y_train)

# Score candidate 3's own validation predictions (y_pred3) with micro-averaged F1.
y_pred3 = xgb3.predict(x_valid)
f1 = f1_score(y_valid, y_pred3, average='micro')
print(f1)

0.8717850287907869


### 제출결과 만들기

In [None]:
# Predict the test set with candidate 1 (xgb1).
# NOTE(review): assumes X_test has the same feature columns as the data xgb1 was fit on - confirm.
prediction = xgb1.predict(X_test)

In [None]:
# Load the sample submission file that is filled in with the predictions.
# NOTE(review): this is a local ~/Downloads path, unlike the mounted-Drive paths used
# for train/test - confirm it exists in the environment where the notebook runs.
submit = pd.read_csv("~/Downloads/kakr-4th-competition/sample_submission.csv")

In [None]:
# Write the model predictions into the submission frame and preview the first rows.
submit['prediction'] = prediction
submit.head()

Unnamed: 0,id,prediction
0,0,0
1,1,0
2,2,0
3,3,1
4,4,1


In [None]:
submit.to_csv('submission.csv', index=False)
# Without index=False, the index shown on screen would be written to the file as well.