In [1]:
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.woe import WOEEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.sum_coding import SumEncoder
from category_encoders.m_estimate import MEstimateEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.helmert import HelmertEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.james_stein import JamesSteinEncoder
from category_encoders.one_hot import OneHotEncoder
from category_encoders.hashing import HashingEncoder

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# IPython magic: select the inline matplotlib backend for this notebook.
%matplotlib inline
import warnings
# Suppress every Python warning from this point on. The filter is installed
# before seaborn is imported below, so it is already active during that import.
# NOTE(review): a blanket "ignore" also hides deprecation / convergence
# warnings from the libraries used later — consider narrowing it.
warnings.filterwarnings("ignore")
import seaborn as sns

# Let pandas display up to 30 columns before truncating a DataFrame repr.
pd.set_option("display.max_columns", 30)

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
# Names of the five binary feature columns, bin_0 .. bin_4.
# NOTE(review): this name shadows Python's builtin `bin()`; it is kept because
# other cells reference the list under this name.
bin = [f'bin_{i}' for i in range(5)]

In [4]:
from sklearn.impute import SimpleImputer

# Learn, from the training frame only, the most frequent value of each column
# listed in `bin`; NaN entries are the ones treated as missing.
imputer_bin = SimpleImputer(strategy="most_frequent", missing_values=np.nan)
imputer_bin.fit(train[bin])

In [5]:
def _with_imputed_bin(frame):
    """Return a copy of `frame` whose `bin` columns are filled by `imputer_bin`."""
    filled = frame.copy()
    filled[bin] = imputer_bin.transform(frame[bin])
    return filled


# Working copies of both splits; the raw `train` / `test` frames stay untouched.
train_imp = _with_imputed_bin(train)
test_imp = _with_imputed_bin(test)

In [6]:
# nom_0 .. nom_4: replace missing values with the column's most frequent value.
for col in ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4']:
    train_imp[col] = train_imp[col].fillna(train_imp[col].mode()[0])

# nom_5 .. nom_9: replace missing values with the sentinel -1 so that
# "missing" becomes a value of its own.
# NOTE(review): -1 is an int; if these columns hold strings the result is a
# mixed-type column — confirm this is intended.
for col in ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']:
    train_imp[col] = train_imp[col].fillna(-1)

In [7]:
# Impute the nominal columns of the test split.
#
# nom_0 .. nom_4: fill with the most frequent value observed in the raw
# *training* frame rather than in the test frame itself. The fill value is then
# learned from train only and is the same for both splits, which matches how
# `imputer_bin` is fit on train and then applied to test.
# Series.mode() ignores NaN by default, so the raw `train` column can be used.
for col in ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4']:
    test_imp[col] = test_imp[col].fillna(train[col].mode()[0])

# nom_5 .. nom_9: missing values get the sentinel -1 (same sentinel as used for
# the training split), so "missing" is kept as a value of its own.
for col in ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']:
    test_imp[col] = test_imp[col].fillna(-1)

In [8]:
# ord_0 .. ord_2: replace missing values with the column's most frequent value.
for col in ['ord_0', 'ord_1', 'ord_2']:
    train_imp[col] = train_imp[col].fillna(train_imp[col].mode()[0])

# ord_3 .. ord_5: replace missing values with a placeholder label
# (the literal is Korean for "missing value").
for col in ['ord_3', 'ord_4', 'ord_5']:
    train_imp[col] = train_imp[col].fillna('결측치')

In [9]:
# Impute the ordinal columns of the test split.
#
# ord_0 .. ord_2: fill with the most frequent value of the raw *training*
# column instead of the test column's own mode, so the imputation statistic is
# learned from train only and both splits receive the same fill value (the same
# fit-on-train / apply-to-test pattern used by `imputer_bin`).
# Series.mode() ignores NaN by default.
for col in ['ord_0', 'ord_1', 'ord_2']:
    test_imp[col] = test_imp[col].fillna(train[col].mode()[0])

# ord_3 .. ord_5: missing values get a placeholder label
# (the literal is Korean for "missing value").
for col in ['ord_3', 'ord_4', 'ord_5']:
    test_imp[col] = test_imp[col].fillna('결측치')

In [10]:
# day / month: replace missing values with the column's most frequent value.
for col in ['day', 'month']:
    train_imp[col] = train_imp[col].fillna(train_imp[col].mode()[0])

In [11]:
# day / month of the test split: fill missing values with the most frequent
# value of the raw *training* column (not the test column's own mode), so the
# statistic is learned from train only and both splits get the same fill value.
# Series.mode() ignores NaN by default.
for col in ['day', 'month']:
    test_imp[col] = test_imp[col].fillna(train[col].mode()[0])

In [12]:
# Stack the imputed train rows on top of the imputed test rows under a fresh
# 0..n-1 index, then discard the 'target' and 'id' columns.
all_data = (
    pd.concat([train_imp, test_imp], ignore_index=True)
    .drop(columns=['target', 'id'])
)

# The labels are taken from the raw training frame.
train_target = train['target']
all_data

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,Bassoon,de4c57ee2,a64bc7ddf,598080a91,0256c7a4b,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0
1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,India,Theremin,2bb3c3e5c,3a3a936e8,1dddb8473,52ead350c,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0
2,0.0,1.0,0.0,F,N,Red,Triangle,Hamster,Canada,Bassoon,b574c9841,708248125,5ddc9a726,745b909d1,-1,3.0,Novice,Freezing,n,P,eN,5.0,9.0
3,0.0,0.0,0.0,F,N,Red,Circle,Hamster,Finland,Theremin,673bdf1f6,23edb8da3,3a33ef960,bdaa56dd1,f9d456e57,1.0,Novice,Lava Hot,a,C,결측치,3.0,3.0
4,0.0,0.0,0.0,T,N,Red,Triangle,Hamster,Costa Rica,Theremin,777d1ac2c,3a7975e46,bc9cc2a94,-1,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0.0,0.0,1.0,T,N,Red,Trapezoid,Axolotl,Costa Rica,Theremin,92fb4c0dc,633bdcfd0,3ef3018d3,c1a4acfaf,45a68dd2e,2.0,Expert,Warm,n,R,dp,2.0,12.0
999996,0.0,0.0,0.0,F,N,Red,Polygon,Dog,Russia,Theremin,8bd03e713,7c241cd40,f74f0b894,220190c9e,e48348d66,1.0,Contributor,Freezing,n,X,US,1.0,5.0
999997,0.0,0.0,1.0,T,N,Red,Triangle,Hamster,India,Theremin,6924d999b,18d43aee8,3afd3697d,cc5495ab3,47021df0d,1.0,Grandmaster,Boiling Hot,m,P,TL,1.0,6.0
999998,0.0,0.0,0.0,F,Y,Red,Circle,Dog,Costa Rica,Theremin,3e0230528,0c073adc7,3600c6e91,32b33a4b4,e4bf32721,3.0,Novice,Lava Hot,h,L,DI,2.0,11.0


In [14]:
import category_encoders as ce
from sklearn.model_selection import train_test_split

# Work on a copy of the combined frame and split it back by position:
# the first len(train) rows are the training rows, the remainder the test rows.
n_train = len(train)
woe_data = all_data.copy()
woe_train = woe_data.iloc[:n_train]
woe_test = woe_data.iloc[n_train:]

## Encoding
# Fit the encoder on the training rows together with the target, then apply
# the fitted encoder to the test rows.
enc = ce.WOEEncoder()
woe_encoded = enc.fit_transform(woe_train, train_target)
woe_test_encoded = enc.transform(woe_test)

# Submission template whose 'target' column the model cells overwrite.
sub = pd.read_csv('sample_submission.csv')

In [15]:
from sklearn.linear_model import LogisticRegression

# Train on the full training data (no hold-out split).
model = LogisticRegression(random_state=99).fit(woe_encoded, train_target)

# Predict on the test data; the second predict_proba column is what gets
# written out as the submission's 'target'.
y_pred = model.predict_proba(woe_test_encoded)
del sub['target']
sub['target'] = y_pred[:, 1]

sub.to_csv('LR_woe.csv', index=False)
## Result: LR_woe = 0.50062
# NOTE(review): if this score is an AUC, ~0.5 is chance level — confirm the
# metric and investigate before relying on this model.


In [17]:
from sklearn.tree import DecisionTreeClassifier

# Train on the full training data (no hold-out split).
model = DecisionTreeClassifier(max_depth=9, random_state=99).fit(woe_encoded, train_target)

# Predict on the test data; the second predict_proba column is what gets
# written out as the submission's 'target'.
y_pred = model.predict_proba(woe_test_encoded)
del sub['target']
sub['target'] = y_pred[:, 1]

sub.to_csv('DT_woe.csv', index=False)
## Result: DT_woe = 0.71569

In [18]:
# Train on the full training data (no hold-out split).
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(random_state=99).fit(woe_encoded, train_target)

# Predict on the test data; the second predict_proba column is what gets
# written out as the submission's 'target'.
y_pred = model.predict_proba(woe_test_encoded)
del sub['target']
sub['target'] = y_pred[:, 1]

sub.to_csv('MLP_woe.csv', index=False)
## Result: MLP_woe = 0.5
# NOTE(review): if this score is an AUC, exactly 0.5 is chance level — confirm
# the metric and investigate before relying on this model.