In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [59]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore')

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import f1_score

import os

In [60]:
# 데이터 불러오기
train = pd.read_csv('/content/drive/My Drive/[2020]_데이터분석캠프(캐글코리아)/data/train.csv')
test = pd.read_csv('/content/drive/My Drive/[2020]_데이터분석캠프(캐글코리아)/data/test.csv')
sample_submission = pd.read_csv('/content/drive/My Drive/[2020]_데이터분석캠프(캐글코리아)/data/sample_submission.csv')

In [61]:
# 1) column 제거
def col_reduction(df):
    df.drop(['id','fnlwgt','education','relationship','native_country','workclass'], axis=1, inplace=True)
    
    return df

# 2) marital_status 조정
def mar_st(df):
    df['marital_status'] = (df['marital_status'] == 'Married-civ-spouse').astype(int)
    
    return df

# 3) capital_gain, loss 조정
def capital(df):
    df['cap_gain_high'] = (df['capital_gain'] != 0).astype(int)
    df['cap_loss_high'] = (df['capital_loss'] >= 1700).astype(int)
    df['capital_gain'] = df['capital_gain'].map(lambda x : np.log(x) if x != 0 else 0)
    
    return df

# 4) age 조정 함수
def age(df):
    df.loc[df['age'] < 20, 'age_range'] = '~20'
    df.loc[df['age'] >= 65, 'age_range'] = '~65'

    down = 20
    for i in range(45//5):
        df.loc[(df['age'] >= down) & (df['age'] < down+5), 'age_range'] = str(down)+'~'+str(down+5)
        down += 5

    df['age'] = df['age_range']
    df.drop(['age_range'], axis=1, inplace=True)
    
    return df
    
# 5) One-hot encoding은 만들지 않았다.

# 6) edu_num 새 변수 만들기
def edu(df):
    df['edu_num_high'] = (df['education_num'] >= 13).astype(int)
    
    return df

# 7) hpw 새 변수 만들기
    
def hpw(df):
    df['hpw_high'] = (df['hours_per_week'] >= 50).astype(int)

    return df

# 8) MinMaxScaler
def mm_feature(df, feature):
    mm_scaler = MinMaxScaler()
    
    df[feature] = mm_scaler.fit_transform(df[feature].values.reshape(-1,1))
    
    return df, mm_scaler

# 9) target 분리: train은 하고, test는 안하므로 따로 만들겠다.
def target_handle(df):
    df['income'] = df['income_>50K']
    df.drop(['income_>50K','income_<=50K'], axis=1, inplace=True)
    
    y_df = df.income
    X_df = df.drop(['income'], axis=1, inplace=False)
    
    return X_df, y_df

def main(df):
    
    df1 = col_reduction(df)
    df2 = mar_st(df1)
    df3 = capital(df2)
    df4 = age(df3)
    
    df5 = pd.get_dummies(df4)
    
    df6 = edu(df5)
    df_fin = hpw(df6)
    
    return df_fin

In [62]:
# 적용
## main: 1) ~ 7)
train = main(train)
X_test = main(test)

## 8) minmax scaler
train, mm_scaler1 = mm_feature(train,'education_num')
train, mm_scaler2 = mm_feature(train,'hours_per_week')

X_test['education_num'] = mm_scaler1.transform(X_test['education_num'].values.reshape(-1,1))
X_test['hours_per_week'] = mm_scaler2.transform(X_test['hours_per_week'].values.reshape(-1,1))

## 9) X, y split
X_train, y_train = target_handle(train)

In [63]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [64]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

k_fold = KFold(n_splits=5, shuffle=True)

In [65]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

In [66]:
clf_DT = DecisionTreeClassifier()
score = cross_val_score(clf_DT, X_train, y_train, scoring='f1', cv=k_fold)
print(score)
print(score.mean())

[0.6308094  0.62637363 0.61812118 0.62133891 0.62663185]
0.6246549940407375


In [67]:
clf_RF = RandomForestClassifier()
score = cross_val_score(clf_RF, X_train, y_train, scoring='f1', cv=k_fold)
print(score)
print(score.mean())

[0.65458422 0.63333333 0.64809855 0.6656426  0.64120495]
0.6485727317870272


In [68]:
from sklearn.model_selection import GridSearchCV

param_grid = {'max_depth' : [6, 8, 10, 12, 16, 20, 24], 'min_samples_split' : [16, 24]}
grid = GridSearchCV(clf_DT, param_grid, cv=k_fold)
grid.fit(X_train, y_train)
best_param = grid.best_params_
best_param

{'max_depth': 12, 'min_samples_split': 24}

In [69]:
param_grid = {'n_estimators' : [10, 100],
           'max_depth' : [6, 8, 10, 12],
           'min_samples_leaf' : [8, 12, 18],
           'min_samples_split' : [8, 16, 20]}
grid = GridSearchCV(clf_RF, param_grid, cv=k_fold)
grid.fit(X_train, y_train)
best_param = grid.best_params_
best_param

{'max_depth': 12,
 'min_samples_leaf': 8,
 'min_samples_split': 8,
 'n_estimators': 100}

In [72]:
clf_DT = DecisionTreeClassifier(max_depth=12, min_samples_split=24)
score = cross_val_score(clf_DT, X_train, y_train, scoring='f1', cv=k_fold)
print(score)
print(score.mean())

[0.62633452 0.66554054 0.63071512 0.6171143  0.61677731]
0.6312963590314764


In [73]:
clf_RF = RandomForestClassifier(max_depth=12, min_samples_leaf=8, min_samples_split=8, n_estimators=100)
score = cross_val_score(clf_RF, X_train, y_train, scoring='f1', cv=k_fold)
print(score)
print(score.mean())

[0.64430307 0.62470309 0.61189454 0.6446384  0.65318263]
0.6357443459687986
