In [1]:
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.woe import WOEEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.sum_coding import SumEncoder
from category_encoders.m_estimate import MEstimateEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.helmert import HelmertEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.james_stein import JamesSteinEncoder
from category_encoders.one_hot import OneHotEncoder

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns

pd.set_option("display.max_columns", 30)

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
bin = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4']

In [4]:
from sklearn.impute import SimpleImputer 

imputer_bin = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
imputer_bin.fit(train[bin])

In [5]:
train_imp = train.copy()
train_imp[bin] = imputer_bin.transform(train[bin])

test_imp = test.copy()
test_imp[bin] = imputer_bin.transform(test[bin])

In [6]:
train_imp['nom_0'] = train_imp['nom_0'].fillna(train_imp['nom_0'].mode()[0])
train_imp['nom_1'] = train_imp['nom_1'].fillna(train_imp['nom_1'].mode()[0])
train_imp['nom_2'] = train_imp['nom_2'].fillna(train_imp['nom_2'].mode()[0])
train_imp['nom_3'] = train_imp['nom_3'].fillna(train_imp['nom_3'].mode()[0])
train_imp['nom_4'] = train_imp['nom_4'].fillna(train_imp['nom_4'].mode()[0])
train_imp['nom_5'] = train_imp['nom_5'].fillna(-1)
train_imp['nom_6'] = train_imp['nom_6'].fillna(-1)
train_imp['nom_7'] = train_imp['nom_7'].fillna(-1)
train_imp['nom_8'] = train_imp['nom_8'].fillna(-1)
train_imp['nom_9'] = train_imp['nom_9'].fillna(-1)

In [7]:
test_imp['nom_0'] = test_imp['nom_0'].fillna(test_imp['nom_0'].mode()[0])
test_imp['nom_1'] = test_imp['nom_1'].fillna(test_imp['nom_1'].mode()[0])
test_imp['nom_2'] = test_imp['nom_2'].fillna(test_imp['nom_2'].mode()[0])
test_imp['nom_3'] = test_imp['nom_3'].fillna(test_imp['nom_3'].mode()[0])
test_imp['nom_4'] = test_imp['nom_4'].fillna(test_imp['nom_4'].mode()[0])
test_imp['nom_5'] = test_imp['nom_5'].fillna(-1)
test_imp['nom_6'] = test_imp['nom_6'].fillna(-1)
test_imp['nom_7'] = test_imp['nom_7'].fillna(-1)
test_imp['nom_8'] = test_imp['nom_8'].fillna(-1)
test_imp['nom_9'] = test_imp['nom_9'].fillna(-1)

In [8]:
train_imp['ord_0'] = train_imp['ord_0'].fillna(train_imp['ord_0'].mode()[0])
train_imp['ord_1'] = train_imp['ord_1'].fillna(train_imp['ord_1'].mode()[0])
train_imp['ord_2'] = train_imp['ord_2'].fillna(train_imp['ord_2'].mode()[0])
train_imp['ord_3'] = train_imp['ord_3'].fillna('결측치')
train_imp['ord_4'] = train_imp['ord_4'].fillna('결측치')
train_imp['ord_5'] = train_imp['ord_5'].fillna('결측치')

In [9]:
test_imp['ord_0'] = test_imp['ord_0'].fillna(test_imp['ord_0'].mode()[0])
test_imp['ord_1'] = test_imp['ord_1'].fillna(test_imp['ord_1'].mode()[0])
test_imp['ord_2'] = test_imp['ord_2'].fillna(test_imp['ord_2'].mode()[0])
test_imp['ord_3'] = test_imp['ord_3'].fillna('결측치')
test_imp['ord_4'] = test_imp['ord_4'].fillna('결측치')
test_imp['ord_5'] = test_imp['ord_5'].fillna('결측치')

In [10]:
train_imp['day'] = train_imp['day'].fillna(train_imp['day'].mode()[0])
train_imp['month'] = train_imp['month'].fillna(train_imp['month'].mode()[0])

In [11]:
test_imp['day'] = test_imp['day'].fillna(test_imp['day'].mode()[0])
test_imp['month'] = test_imp['month'].fillna(test_imp['month'].mode()[0])

In [12]:
all_data = pd.concat([train_imp, test_imp], ignore_index = True)
all_data.drop(columns = ['target', 'id'], axis = 1, inplace = True)

train_target = train['target']
all_data.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,...,999985,999986,999987,999988,999989,999990,999991,999992,999993,999994,999995,999996,999997,999998,999999
bin_0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bin_1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
bin_2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
bin_3,F,F,F,F,T,T,F,T,F,F,T,F,T,F,F,...,F,F,F,F,T,F,T,T,F,F,T,F,T,F,F
bin_4,N,Y,N,N,N,N,N,N,N,Y,Y,Y,N,N,Y,...,N,Y,Y,N,Y,Y,Y,N,N,N,N,N,N,Y,N
nom_0,Red,Red,Red,Red,Red,Red,Red,Red,Blue,Red,Blue,Red,Red,Blue,Red,...,Blue,Red,Blue,Red,Red,Red,Red,Red,Red,Red,Red,Red,Red,Red,Blue
nom_1,Trapezoid,Star,Triangle,Circle,Triangle,Triangle,Triangle,Triangle,Polygon,Polygon,Triangle,Square,Circle,Trapezoid,Polygon,...,Polygon,Triangle,Trapezoid,Trapezoid,Circle,Polygon,Triangle,Polygon,Polygon,Triangle,Trapezoid,Polygon,Triangle,Circle,Circle
nom_2,Hamster,Axolotl,Hamster,Hamster,Hamster,Lion,Hamster,Axolotl,Hamster,Hamster,Hamster,Hamster,Axolotl,Lion,Hamster,...,Lion,Hamster,Hamster,Hamster,Hamster,Axolotl,Lion,Lion,Axolotl,Axolotl,Axolotl,Dog,Hamster,Dog,Lion
nom_3,Russia,India,Canada,Finland,Costa Rica,China,Costa Rica,Finland,Russia,Finland,Finland,Costa Rica,Russia,Costa Rica,Costa Rica,...,India,India,Costa Rica,Russia,Costa Rica,Costa Rica,India,China,China,Finland,Costa Rica,Russia,India,Costa Rica,Finland
nom_4,Bassoon,Theremin,Bassoon,Theremin,Theremin,Bassoon,Bassoon,Bassoon,Oboe,Theremin,Bassoon,Oboe,Theremin,Bassoon,Piano,...,Theremin,Theremin,Piano,Theremin,Bassoon,Theremin,Theremin,Theremin,Theremin,Theremin,Theremin,Theremin,Theremin,Theremin,Bassoon


In [13]:
train_new = all_data[:len(train)]
test_new  = all_data[len(train):]
print(train_new.shape, test_new.shape)

(600000, 23) (400000, 23)


In [14]:
enc = JamesSteinEncoder()
train_encoded = enc.fit_transform(train_new, train_target)
test_encoded = enc.transform(test_new)

In [15]:
train_target = train['target']

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(train_encoded, train_target, stratify = train_target, random_state = 99)

In [17]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state = 99)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:, 1]

from sklearn.metrics import roc_auc_score
roc_auc_score(y_val, y_pred)

0.7864866991837377

In [18]:
sub = pd.read_csv('sample_submission.csv')

# 전체 데이터로 학습
model = LogisticRegression(random_state = 99)
model.fit(train_encoded, train_target)

# test data에 대해 예측 
y_pred = model.predict_proba(test_encoded)
del sub['target']
sub['target'] = y_pred[:, 1]

sub.to_csv('LR_je.csv', index = False)

In [19]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(max_depth = 9, random_state = 99)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:, 1]

from sklearn.metrics import roc_auc_score
roc_auc_score(y_val, y_pred)

0.7178325251325637

In [20]:
sub = pd.read_csv('sample_submission.csv')

# 전체 데이터로 학습
model = DecisionTreeClassifier(max_depth = 9, random_state = 99)
model.fit(train_encoded, train_target)

# test data에 대해 예측 
y_pred = model.predict_proba(test_encoded)
del sub['target']
sub['target'] = y_pred[:, 1]

sub.to_csv('DT_je.csv', index = False)

In [21]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(hidden_layer_sizes=(10,10), random_state = 99)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:, 1]

from sklearn.metrics import roc_auc_score
roc_auc_score(y_val, y_pred)

0.7922116081720548

In [22]:
sub = pd.read_csv('sample_submission.csv')

# 전체 데이터로 학습
model = MLPClassifier(hidden_layer_sizes=(10,10), random_state = 99)
model.fit(train_encoded, train_target)

# test data에 대해 예측 
y_pred = model.predict_proba(test_encoded)
del sub['target']
sub['target'] = y_pred[:, 1]

sub.to_csv('MLP_je.csv', index = False)