In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV files read later live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore')

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import f1_score

import os

In [None]:
# Load the data
# All three CSV files live in the same Drive folder, so the directory is spelled out once.
DATA_DIR = '/content/drive/My Drive/[2020]_데이터분석캠프(캐글코리아)/data'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# 1) Remove columns
def col_reduction(df):
    """Drop the columns that are not used as model features.

    The frame is modified in place and the same object is returned.
    """
    unused_columns = ['id', 'fnlwgt', 'education', 'relationship',
                      'native_country', 'workclass']
    df.drop(columns=unused_columns, inplace=True)
    return df

# 2) Recode marital_status
def mar_st(df):
    """Replace marital_status with a 0/1 flag: 1 for 'Married-civ-spouse', 0 otherwise.

    The frame is modified in place and the same object is returned.
    """
    is_married = df['marital_status'].eq('Married-civ-spouse')
    df['marital_status'] = is_married.astype(int)
    return df

# 3) Recode race
def race(df):
    """Replace race with a 0/1 flag: 1 for 'White' or 'Asian-Pac-Islander', 0 otherwise.

    The frame is modified in place and the same object is returned.
    """
    # Read the column from the frame that was passed in. The previous version
    # read the global `train`, so calling this on any other frame (e.g. test)
    # filled its race column with train's values aligned by index.
    df['race'] = df['race'].isin(['White', 'Asian-Pac-Islander']).astype(int)

    return df

# 4) Adjust capital_gain / capital_loss
def capital(df):
    """Add capital gain/loss indicator columns and log-transform capital_gain.

    cap_gain_high is 1 where capital_gain is non-zero, cap_loss_high is 1 where
    capital_loss is at least 1700. capital_gain itself becomes its natural log,
    with zero left as zero. The frame is modified in place and returned.
    """
    def _log_unless_zero(value):
        # log(0) is undefined, so zero stays zero.
        if value == 0:
            return 0
        return np.log(value)

    gain = df['capital_gain']
    df['cap_gain_high'] = gain.ne(0).astype(int)
    df['cap_loss_high'] = df['capital_loss'].ge(1700).astype(int)
    df['capital_gain'] = gain.map(_log_unless_zero)
    return df

# 5) Age binning function
def age(df):
    """Replace the numeric age column with a string age-range label.

    Labels are '~20' for ages below 20, '~65' for ages 65 and above, and
    five-year bins '20~25' ... '60~65' (lower bound inclusive) in between.
    The frame is modified in place and the same object is returned.
    """
    df.loc[df['age'] < 20, 'age_range'] = '~20'
    df.loc[df['age'] >= 65, 'age_range'] = '~65'

    # Five-year bins starting at 20, 25, ..., 60.
    for lower in range(20, 65, 5):
        upper = lower + 5
        in_bin = (df['age'] >= lower) & (df['age'] < upper)
        df.loc[in_bin, 'age_range'] = f'{lower}~{upper}'

    # Move the labels into 'age' and remove the helper column in one step.
    df['age'] = df.pop('age_range')

    return df
    
# 6) One-hot encoding is not wrapped in its own function; pd.get_dummies is called directly in main().

# 7) Create a new variable from education_num
def edu(df):
    """Add edu_num_high: 1 where education_num is 13 or more, 0 otherwise.

    The frame is modified in place and the same object is returned.
    """
    reaches_threshold = df['education_num'].ge(13)
    df['edu_num_high'] = reaches_threshold.astype(int)
    return df

# 8) Create a new variable from hours_per_week

def hpw(df):
    """Add hpw_high: 1 where hours_per_week is 50 or more, 0 otherwise.

    The frame is modified in place and the same object is returned.
    """
    long_hours = df['hours_per_week'].ge(50)
    df['hpw_high'] = long_hours.astype(int)
    return df

# 9) MinMaxScaler
def mm_feature(df, feature):
    """Min-max scale one column of df in place.

    Parameters
    ----------
    df : DataFrame holding the column to scale.
    feature : name of the column to scale.

    Returns
    -------
    (df, scaler) : the same frame with the scaled column, and the fitted
        MinMaxScaler so the identical scaling can be applied to other data.
    """
    scaler = MinMaxScaler()
    # The scaler expects a 2-D array, so the column is reshaped to (n_rows, 1).
    column_2d = df[feature].values.reshape(-1, 1)
    df[feature] = scaler.fit_transform(column_2d)
    return df, scaler

# 10) Split off the target: this is done for train only, not for test, so it is a separate function.
def target_handle(df):
    """Collapse the one-hot income columns into a single target and split X / y.

    df must contain 'income_>50K' and 'income_<=50K'. Both are removed from df
    in place and replaced by a single 'income' column equal to 'income_>50K'.

    Returns
    -------
    (X_df, y_df) : a new frame without 'income', and the 'income' column.
    """
    df['income'] = df['income_>50K']
    df.drop(columns=['income_>50K', 'income_<=50K'], inplace=True)

    features = df.drop(columns=['income'])
    target = df['income']
    return features, target

def main(df):
    """Run the shared preprocessing pipeline on a raw frame.

    Steps, in order: drop unused columns, recode marital_status and race,
    derive the capital features, bin age, one-hot encode the remaining
    non-numeric columns, then add the education and hours-per-week flags.
    The steps before one-hot encoding modify df in place.
    """
    prepared = (
        df.pipe(col_reduction)
          .pipe(mar_st)
          .pipe(race)
          .pipe(capital)
          .pipe(age)
    )

    encoded = pd.get_dummies(prepared)

    return hpw(edu(encoded))

In [None]:
# Apply the pipeline
## main: steps 1) - 8)
train = main(train)
X_test = main(test)

## 9) min-max scaling: fit on train, then reuse the fitted scalers on test
train, mm_scaler1 = mm_feature(train,'education_num')
train, mm_scaler2 = mm_feature(train,'hours_per_week')

X_test['education_num'] = mm_scaler1.transform(X_test['education_num'].values.reshape(-1,1))
X_test['hours_per_week'] = mm_scaler2.transform(X_test['hours_per_week'].values.reshape(-1,1))

## 10) X, y split
X_train, y_train = target_handle(train)

## Align test columns with train.
# pd.get_dummies ran separately on train and test, so a category present in
# only one of them would leave the two frames with different columns. Missing
# dummy columns are filled with 0, extra ones are dropped, and the column order
# is made identical to the one the models are fitted on.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Display the training feature matrix returned by target_handle.
X_train

Unnamed: 0,education_num,marital_status,capital_gain,capital_loss,hours_per_week,cap_gain_high,cap_loss_high,age_20~25,age_25~30,age_30~35,age_35~40,age_40~45,age_45~50,age_50~55,age_55~60,age_60~65,age_~20,age_~65,occupation_?,occupation_Adm-clerical,occupation_Armed-Forces,occupation_Craft-repair,occupation_Exec-managerial,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male,edu_num_high,hpw_high
10431,0.533333,0,0.0,0,0.397959,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
10152,0.200000,0,0.0,0,0.500000,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1
5435,0.400000,0,0.0,0,0.316327,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
17294,0.600000,1,0.0,0,0.551020,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1
737,0.800000,0,0.0,0,0.346939,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22267,0.866667,0,0.0,0,0.397959,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0
23542,0.600000,0,0.0,0,0.397959,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
17890,0.533333,1,0.0,0,0.397959,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
26004,0.066667,1,0.0,0,0.397959,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0


In [None]:
# Use crosstab to count the rows for each (income, race) value combination
# NOTE(review): after the preprocessing cells, race() and target_handle() have
# recoded both columns, yet the recorded output shows the raw labels. This cell
# was presumably run on the raw data, before preprocessing; confirm cell order.
pd.crosstab(train['income'],train['race'])

race,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White
income,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<=50K,225,585,2174,186,16574
>50K,29,215,302,18,5741


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training data as a validation set, stratified on the target.
# NOTE: X_train / y_train are rebound to the training portion here, so running
# this cell a second time splits the already-reduced training set again.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  shuffle=True,
                                                  stratify=y_train)

In [None]:
# Baseline decision tree with default hyperparameters.
# random_state is fixed (same seed as the train/validation split) so that fits
# of this estimator give the same tree on every run.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_DT

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# Baseline random forest with default hyperparameters.
# random_state is fixed (same seed as the train/validation split) so that the
# bootstrap samples and feature subsets, and therefore the scores, are reproducible.
clf_RF = RandomForestClassifier(random_state=42)
clf_RF

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

**max_leaf_nodes**

: 최대 몇 개의 잎 노드가 만들어질 때까지 split(하위 (잎) 노드로 분리)할 것인지를 설정한다. 값이 작을수록 가지치기를 하고 overfitting을 방지할 수 있다.

**max_depth**

: Decision Tree의 최대 깊이 제한을 줄 수 있는 Parameter이다. 기본 값은 None이며, None일 때는 모든 잎 노드의 샘플 수가 min_samples_split보다 작아지거나 불순도가 0이 될 때까지 노드가 확장된다. max_depth가 작을수록 가지치기를 하고 overfitting을 방지할 수 있다.

**min_samples_split**

: 샘플이 최소한 몇 개 이상이어야 split할건지 제한을 줄 수 있는 Parameter이다. 값이 클수록 가지치기하고 과대적합 방지한다.

Parameter에 주어진 값의 type에 따라 다음과 같이 기능한다.
int일 경우, 주어진 값을 그대로 사용한다.
float일 경우, 0에서 1 사이의 값을 줄 수 있으며 ceil(전체 데이터 수*min_samples_split)의 값을 사용한다.

**min_samples_leaf**

: 잎 노드(leaf node)가 가지고 있어야 할 최소 sample 개수에 대한 제한을 줄 수 있는 Parameter이다. 값이 클수록 가지치기를 하고 과대적합을 방지한다.

Parameter에 주어진 값의 type에 따라 다음과 같이 기능한다.
int일 경우, 주어진 값을 그대로 사용한다.
float일 경우, 0에서 1 사이의 값을 줄 수 있으며 ceil(전체 데이터 수*min_samples_leaf)의 값을 사용한다.

**min_impurity_decrease**

: 해당 split이 이 값보다 크거나 같은 불순도의 감소를 유발한다면 노드는 split될 것이다.

가중 불순도 감소 방정식은 다음과 같다.

N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity)

여기서 N은 총 샘플 수, N_t는 현재 노드의 샘플 수, N_t_L은 왼쪽 노드에 있는 샘플 수, N_t_R은 오른쪽 노드에 있는 샘플 수이다.

sample_weight가 전달된 경우 N, N_t, N_t_R, N_t_L은 모두 가중 합계를 의미한다.

In [None]:
# Fit the decision tree on the training split and predict the validation split.
clf_DT.fit(X_train, y_train)
y_pred_DT = clf_DT.predict(X_val)

In [None]:
# Fit the random forest on the training split and predict the validation split.
clf_RF.fit(X_train, y_train)
y_pred_RF = clf_RF.predict(X_val)

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [None]:
# Score the decision tree on the validation split.
# With average='micro' on a single-label problem, F1 equals accuracy, which is
# why the two printed values agree.
f1_DT = f1_score(y_val, y_pred_DT, average='micro')
print('F1_DT Score: ', f1_DT)

accuracy_DT = accuracy_score(y_val, y_pred_DT)
print('Accuracy_DT score: ', accuracy_DT)

F1_DT Score:  0.8339731285988483
Accuracy_DT score:  0.8339731285988484


In [None]:
# Score the random forest on the validation split.
# With average='micro' on a single-label problem, F1 equals accuracy, which is
# why the two printed values agree.
f1_RF = f1_score(y_val, y_pred_RF, average='micro')
print('F1_RF Score: ', f1_RF)

accuracy_RF = accuracy_score(y_val, y_pred_RF)
print('Accuracy_RF score: ', accuracy_RF)

F1_RF Score:  0.8485604606525912
Accuracy_RF score:  0.8485604606525912


In [None]:
from sklearn.model_selection import GridSearchCV

# Grid-search max_depth and min_samples_split for the decision tree on the
# training split, using GridSearchCV's default cross-validation and scoring.
# NOTE: the names `param_grid`, `grid` and `best_param` are reused by the
# random-forest search, so they only hold the decision-tree result until then.
param_grid = {'max_depth' : [6, 8, 10, 12, 16, 20, 24], 'min_samples_split' : [16, 24]}
grid = GridSearchCV(clf_DT, param_grid)
grid.fit(X_train, y_train)
best_param = grid.best_params_
best_param

{'max_depth': 6, 'min_samples_split': 24}

In [None]:
# Grid-search the random forest over 2 * 4 * 3 * 3 = 72 parameter combinations
# on the training split, using GridSearchCV's default cross-validation and scoring.
# NOTE: this overwrites `param_grid`, `grid` and `best_param` from the
# decision-tree search.
param_grid = {'n_estimators' : [10, 100],
           'max_depth' : [6, 8, 10, 12],
           'min_samples_leaf' : [8, 12, 18],
           'min_samples_split' : [8, 16, 20]}
grid = GridSearchCV(clf_RF, param_grid)
grid.fit(X_train, y_train)
best_param = grid.best_params_
best_param

{'max_depth': 12,
 'min_samples_leaf': 8,
 'min_samples_split': 16,
 'n_estimators': 100}

In [None]:
# Refit the decision tree with the hyperparameters reported by its grid search
# and score it on the validation split.
# random_state is fixed (same seed as the train/validation split) so the
# printed score is reproducible.
clf_DT = DecisionTreeClassifier(max_depth=6, min_samples_split=24, random_state=42)
clf_DT.fit(X_train, y_train)

y_pred_DT = clf_DT.predict(X_val)
f1_DT = f1_score(y_val, y_pred_DT, average='micro')
print(f1_DT)

0.8500959692898272


In [None]:
# Refit the random forest with the hyperparameters reported by its grid search
# and score it on the validation split.
# random_state is fixed (same seed as the train/validation split) so the
# printed score is reproducible.
clf_RF = RandomForestClassifier(max_depth=12, min_samples_leaf=8, min_samples_split=16,
                                n_estimators=100, random_state=42)
clf_RF.fit(X_train, y_train)

y_pred_RF = clf_RF.predict(X_val)
f1_RF = f1_score(y_val, y_pred_RF, average='micro')
print(f1_RF)

0.8579654510556622
