In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore')

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import f1_score

import os

In [None]:
# Load the competition data: training set, test set and the submission template.
train, test, sample_submission = map(
    pd.read_csv,
    (
        './kakr-4th-competition/train.csv',
        './kakr-4th-competition/test.csv',
        './kakr-4th-competition/sample_submission.csv',
    ),
)

# 1. 데이터 변형
## 1-1. Categorical
1. column 제거: id, education, relationship, native_country, workclass
2. marital_status 조정: married_civ_spouse -> 1 나머지는 0
3. occupation, sex, race: one-hot encoding (여기서는 income도 함께 encoding돼서, income 변수를 1 or 0으로 변경함)

In [None]:
# 1) Drop the columns that are not used as features (same list for both frames).
for _frame in (train, test):
    _frame.drop(
        columns=['id', 'education', 'relationship', 'native_country', 'workclass'],
        inplace=True,
    )

In [None]:
# 2) Collapse marital_status to a binary flag:
#    'Married-civ-spouse' -> 1, every other value -> 0.
for _frame in (train, test):
    _frame['marital_status'] = _frame['marital_status'].eq('Married-civ-spouse').astype(int)

## 1-2. numerical
1. capital_gain, capital_loss: capital_gain은 log 처리하고(아래 코드에서 capital_loss는 log 처리하지 않음), 새 변수 생성
    - cap_gain_high: capital_gain이 0이 아니면 1, 0이면 0
    - cap_loss_high: capital_loss가 1700 이상이면 1, 아니면 0
2. age: categorical해서 onehot encoding (0\~20 / 20\~65를 5세 단위로 / 65~)
3. fnlwgt: logistic modeling할 때는 필요한데, 다른 곳에는 쓸 여지가 없다. (연기)
4. education_num: MinMax Scaler, 새 변수 생성
    - edu_num_high: 13이상이면 1, 아니면 0
5. hours_per_week: MinMax Scaler, 새 변수 생성
    - hpw_high: 50이상이면 1, 아니면 0

In [None]:
# 1) capital_gain / capital_loss: add indicator columns, then log-transform capital_gain.
#    cap_gain_high = 1 when capital_gain is non-zero,
#    cap_loss_high = 1 when capital_loss is at least 1700.
# NOTE(review): only capital_gain is log-transformed here; capital_loss keeps its raw values.
for _frame in (train, test):
    _frame['cap_gain_high'] = _frame['capital_gain'].ne(0).astype(int)
    _frame['cap_loss_high'] = _frame['capital_loss'].ge(1700).astype(int)
    # Zeros stay 0 so that np.log(0) is never evaluated.
    _frame['capital_gain'] = _frame['capital_gain'].map(lambda x: 0 if x == 0 else np.log(x))

In [None]:
# 2) age: replace the numeric age with a string bin label.
#    '~20' for age < 20, '~65' for age >= 65, and 5-year bins ('20~25', ..., '60~65') in between.
for _frame in (train, test):
    _frame.loc[_frame['age'] < 20, 'age_range'] = '~20'
    _frame.loc[_frame['age'] >= 65, 'age_range'] = '~65'

    # Lower bounds 20, 25, ..., 60: each bin is [lower, lower + 5).
    for lower in range(20, 65, 5):
        upper = lower + 5
        in_bin = (_frame['age'] >= lower) & (_frame['age'] < upper)
        _frame.loc[in_bin, 'age_range'] = f'{lower}~{upper}'

    # Overwrite 'age' with the label and discard the helper column.
    _frame['age'] = _frame['age_range']
    _frame.drop(columns=['age_range'], inplace=True)

In [None]:
# 3) Categorical: one-hot encoding
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# get_dummies only creates columns for the categories present in each frame, so
# train and test can end up with different dummy columns (or a different order).
# Align test to train's feature columns (everything except the income_* target
# dummies): categories missing from test become all-zero columns, and categories
# that appear only in test are dropped because the model never saw them.
feature_columns = [col for col in train.columns if not str(col).startswith('income_')]
test = test.reindex(columns=feature_columns, fill_value=0)

In [None]:
# 4) fnlwgt

In [None]:
# 5) education_num: add a threshold flag, then min-max scale the raw value.
mm_scaler = MinMaxScaler()

# edu_num_high = 1 when education_num is at least 13 (computed before scaling).
train['edu_num_high'] = (train['education_num'] >= 13).astype(int)
test['edu_num_high']  = (test['education_num'] >= 13).astype(int)

# Fit the scaler on train only and reuse that fit for test, so both frames are
# mapped with the same min/max. Re-fitting on test would put the two frames on
# different scales.
train['education_num'] = mm_scaler.fit_transform(train['education_num'].values.reshape(-1, 1)).ravel()
test['education_num'] = mm_scaler.transform(test['education_num'].values.reshape(-1, 1)).ravel()

In [None]:
# 6) hours_per_week: add a threshold flag, then min-max scale the raw value.
# hpw_high = 1 when hours_per_week is at least 50 (computed before scaling).
train['hpw_high'] = (train['hours_per_week'] >= 50).astype(int)
test['hpw_high']  = (test['hours_per_week'] >= 50).astype(int)

# Re-fit the shared scaler on train's hours_per_week, then apply that same fit to
# test so both frames are mapped with identical min/max.
train['hours_per_week'] = mm_scaler.fit_transform(train['hours_per_week'].values.reshape(-1, 1)).ravel()
test['hours_per_week'] = mm_scaler.transform(test['hours_per_week'].values.reshape(-1, 1)).ravel()

In [None]:
# 7) Collapse the two income dummy columns into a single target column, then
#    split the frame into features and target.
train['income'] = train.pop('income_>50K')
train.drop(columns=['income_<=50K'], inplace=True)

X_train = train.drop(columns=['income'])
y_train = train['income']

In [None]:
# X_test refers to the same object as the processed `test` frame (an alias, not a copy).
X_test = test

# 2. 모델링: 그냥 해봄

In [None]:
# Baseline decision tree with default hyper-parameters.
# random_state is fixed so that the fitted tree (and the resulting submission) is
# reproducible across runs; without it, ties between equally good splits can be
# broken differently each time the cell is executed.
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# Predict labels for the test features with the fitted tree and cast them to int.
y_test_predict = dt_clf.predict(X_test).astype(int)

In [None]:
# Fill the submission template with integer predictions for the test set and
# save it as CSV without the index column.
test_labels = dt_clf.predict(X_test).astype(int)
sample_submission['prediction'] = test_labels
sample_submission.to_csv('submission3.csv', index=False)

# 3. 함수화
- 일단은 Preprocessing 과정만 수행

In [None]:
# 1) Column removal
def col_reduction(df):
    """Drop the unused columns from ``df`` in place and return the same frame.

    Removes 'id', 'fnlwgt', 'education', 'relationship', 'native_country' and
    'workclass'. The caller's DataFrame is modified (``inplace=True``); pandas
    raises ``KeyError`` if any of these columns is missing.
    """
    unused = ['id', 'fnlwgt', 'education', 'relationship', 'native_country', 'workclass']
    df.drop(columns=unused, inplace=True)
    return df

# 2) marital_status adjustment
def mar_st(df):
    """Turn ``marital_status`` into a 0/1 flag in place and return the same frame.

    Rows whose value equals 'Married-civ-spouse' become 1; everything else becomes 0.
    """
    is_civ_spouse = df['marital_status'].eq('Married-civ-spouse')
    df['marital_status'] = is_civ_spouse.astype(int)
    return df

# 3) capital_gain / capital_loss adjustment
def capital(df):
    """Add capital indicator flags and log-transform ``capital_gain`` in place.

    New columns:
        cap_gain_high -- 1 where capital_gain is non-zero, else 0.
        cap_loss_high -- 1 where capital_loss is at least 1700, else 0.

    ``capital_gain`` is then replaced by its natural log, with zeros kept as 0.
    NOTE(review): ``capital_loss`` itself is left untransformed.

    Returns the same (mutated) DataFrame.
    """
    def _log_or_zero(amount):
        # Zero is passed through as 0 so np.log(0) is never evaluated.
        return 0 if amount == 0 else np.log(amount)

    gain = df['capital_gain']
    df['cap_gain_high'] = gain.ne(0).astype(int)
    df['cap_loss_high'] = df['capital_loss'].ge(1700).astype(int)
    df['capital_gain'] = gain.map(_log_or_zero)

    return df

# 4) age binning helper
def age(df):
    """Replace the numeric ``age`` column with a string bin label, in place.

    Labels: '~20' for age < 20, '~65' for age >= 65, and 5-year bins
    '20~25', '25~30', ..., '60~65' (lower bound inclusive, upper exclusive)
    in between. A temporary 'age_range' column is created and dropped again.

    Returns the same (mutated) DataFrame.
    """
    df.loc[df['age'] < 20, 'age_range'] = '~20'
    df.loc[df['age'] >= 65, 'age_range'] = '~65'

    # Lower bounds 20, 25, ..., 60.
    for lower in range(20, 65, 5):
        upper = lower + 5
        in_bin = (df['age'] >= lower) & (df['age'] < upper)
        df.loc[in_bin, 'age_range'] = f'{lower}~{upper}'

    # Overwrite 'age' with the label and discard the helper column.
    df['age'] = df['age_range']
    df.drop(columns=['age_range'], inplace=True)

    return df
    
# 5) One-hot encoding has no helper of its own; main() calls pd.get_dummies directly.

# 6) education_num / hours_per_week adjustment (grouped because both use MinMaxScaler)
def mm_features(df, scalers=None):
    """Add threshold flags and min-max scale ``education_num`` / ``hours_per_week``.

    New columns (computed from the raw values, before scaling):
        edu_num_high -- 1 where education_num is at least 13, else 0.
        hpw_high     -- 1 where hours_per_week is at least 50, else 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'education_num' and 'hours_per_week'. Modified in place.
    scalers : dict, optional
        Mapping from column name to MinMaxScaler. When omitted, fresh scalers are
        fitted on ``df`` (the original behaviour). When a dict is passed, a column
        that already has an entry is transformed with that fitted scaler;
        otherwise a new scaler is fitted on ``df`` and stored in the dict. Pass
        the same dict for train first and test second so test is scaled with
        train's min/max instead of its own.

    Returns
    -------
    pandas.DataFrame
        The same (mutated) frame.
    """
    if scalers is None:
        scalers = {}

    # One scaler per column: a single shared instance would forget the first
    # column's fit as soon as it is re-fitted on the second one.
    for column, flag, threshold in (
        ('education_num', 'edu_num_high', 13),
        ('hours_per_week', 'hpw_high', 50),
    ):
        df[flag] = (df[column] >= threshold).astype(int)

        values = df[column].values.reshape(-1, 1)
        scaler = scalers.get(column)
        if scaler is None:
            scaler = MinMaxScaler()
            scaled = scaler.fit_transform(values)
            scalers[column] = scaler
        else:
            scaled = scaler.transform(values)
        df[column] = scaled.ravel()

    return df

# 7) Target split: only needed for train (test has no target), hence a separate helper.
def target_handle(df):
    """Split a one-hot encoded training frame into features and target.

    The 'income_>50K' dummy becomes the single target column 'income'; both
    income dummy columns are removed from ``df`` in place (``df`` keeps the new
    'income' column afterwards).

    Returns
    -------
    (X_df, y_df)
        ``X_df`` is a new frame without 'income'; ``y_df`` is the 'income' Series.
    """
    df['income'] = df.pop('income_>50K')
    df.drop(columns=['income_<=50K'], inplace=True)

    features = df.drop(columns=['income'])
    target = df['income']

    return features, target

def main(df):
    """Run the full preprocessing pipeline on a raw frame and return the result.

    Steps, in order: col_reduction -> mar_st -> capital -> age ->
    pd.get_dummies -> mm_features.

    The helpers modify the frame they receive in place, so the pipeline runs on a
    copy. The caller's ``df`` is left untouched, which keeps the call repeatable
    (a second ``main(df)`` on the same raw frame no longer fails because columns
    were already dropped by the first call).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw train or test frame.

    Returns
    -------
    pandas.DataFrame
        A new, fully preprocessed frame.
    """
    # Work on a copy so the in-place helpers never touch the caller's frame.
    working = df.copy()

    working = col_reduction(working)
    working = mar_st(working)
    working = capital(working)
    working = age(working)

    # One-hot encode the remaining categorical columns.
    encoded = pd.get_dummies(working)

    return mm_features(encoded)

In [None]:
# Apply the function pipeline.
# The raw CSVs are re-read here because `train` / `test` were already transformed
# by the step-by-step cells above ('id' and other columns are gone), so passing
# them to main() would raise KeyError inside col_reduction().
train = main(pd.read_csv('./kakr-4th-competition/train.csv'))
X_train, y_train = target_handle(train)
X_test = main(pd.read_csv('./kakr-4th-competition/test.csv'))

# pd.get_dummies runs separately on each frame, so make X_test expose exactly the
# feature columns of X_train, in the same order; categories missing from test
# become all-zero columns.
# NOTE(review): main() min-max scales each frame with that frame's own range, so
# train and test are not guaranteed to be on an identical scale.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)