In [1]:
!pip install category-encoders



In [2]:
!pip install scikit-learn --user --upgrade



In [3]:
# Categorical encoders from the category_encoders package.
# NOTE(review): none of these encoders is referenced in the cells visible here — confirm before pruning.
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.woe import WOEEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.sum_coding import SumEncoder
from category_encoders.m_estimate import MEstimateEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.helmert import HelmertEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.james_stein import JamesSteinEncoder
from category_encoders.one_hot import OneHotEncoder

# Core data-handling and plotting stack.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
# Silences every Python warning for the rest of the session, including
# warnings raised by pandas and scikit-learn in later cells.
warnings.filterwarnings("ignore")
import seaborn as sns

# Show up to 30 columns when a DataFrame is rendered.
pd.set_option("display.max_columns", 30)

In [4]:
# Read the training and test tables from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

## 1. 데이터 점검

In [5]:
# Preview the raw training frame (pandas renders a truncated head/tail view).
train

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,Bassoon,de4c57ee2,a64bc7ddf,598080a91,0256c7a4b,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,Theremin,2bb3c3e5c,3a3a936e8,1dddb8473,52ead350c,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,Bassoon,b574c9841,708248125,5ddc9a726,745b909d1,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,Theremin,673bdf1f6,23edb8da3,3a33ef960,bdaa56dd1,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,,777d1ac2c,3a7975e46,bc9cc2a94,,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,599995,0.0,1.0,0.0,T,N,Red,Polygon,Axolotl,India,Theremin,014770cf0,da5014b01,a7059911d,158183c63,015c63324,3.0,Novice,Freezing,a,R,GZ,5.0,,0
599996,599996,1.0,0.0,0.0,T,Y,Blue,Polygon,Dog,Costa Rica,Oboe,,2023ed4ed,83bdea3a5,e9fde8fa8,a02ae6a63,2.0,Novice,Boiling Hot,n,N,sf,,3.0,0
599997,599997,0.0,0.0,0.0,F,Y,Red,Circle,Axolotl,Russia,Theremin,c7dc5d460,5d7d341ac,114b1dbf3,cccbca824,40f9610c1,2.0,Contributor,Freezing,n,H,MV,7.0,5.0,0
599998,599998,1.0,1.0,0.0,F,Y,,Polygon,Axolotl,,Piano,4d7780407,209e1054e,fba315672,4164322bd,c1a8374a0,1.0,Master,Warm,m,X,Ey,1.0,5.0,0


In [6]:
# (rows, columns) of the training frame followed by the test frame.
tuple(frame.shape for frame in (train, test))

((600000, 25), (400000, 24))

In [7]:
# Number of training rows that are exact duplicates of an earlier row.
train.duplicated().sum()

0

In [8]:
# Missing-value count per column: training frame first, then the test frame.
train_missing = train.isna().sum()
test_missing = test.isna().sum()
print(train_missing, '\n')
print(test_missing)

id            0
bin_0     17894
bin_1     18003
bin_2     17930
bin_3     18014
bin_4     18047
nom_0     18252
nom_1     18156
nom_2     18035
nom_3     18121
nom_4     18035
nom_5     17778
nom_6     18131
nom_7     18003
nom_8     17755
nom_9     18073
ord_0     18288
ord_1     18041
ord_2     18075
ord_3     17916
ord_4     17930
ord_5     17713
day       17952
month     17988
target        0
dtype: int64 

id           0
bin_0    11901
bin_1    12038
bin_2    11972
bin_3    11951
bin_4    11951
nom_0    12062
nom_1    11947
nom_2    12179
nom_3    12176
nom_4    11993
nom_5    11912
nom_6    12012
nom_7    12003
nom_8    11956
nom_9    12060
ord_0    11893
ord_1    12167
ord_2    12105
ord_3    12053
ord_4    11933
ord_5    12047
day      12025
month    11984
dtype: int64


## 2. 결측치 처리

#### 이진변수 결측치 처리

In [9]:
# Column groups used by the imputation cells.
# NOTE: `bin` and `ord` shadow the Python built-ins of the same name; the
# names are kept because other cells in this notebook reference them.
bin = [f'bin_{idx}' for idx in range(5)]
nom1 = [f'nom_{idx}' for idx in range(5)]
nom2 = [f'nom_{idx}' for idx in range(5, 10)]
ord = [f'ord_{idx}' for idx in range(6)]

In [10]:
from sklearn.impute import SimpleImputer

# Learn the most frequent value of each binary column from the training
# frame; the fitted imputer is then applied to both train and test.
imputer_bin = SimpleImputer(strategy="most_frequent", missing_values=np.nan)
imputer_bin.fit(train[bin])

In [11]:
# Fill the binary columns on a copy so the raw `train` frame stays untouched.
bin_train_filled = imputer_bin.transform(train[bin])
train_imp = train.copy()
train_imp[bin] = bin_train_filled

In [12]:
# Same treatment for the test frame, using the imputer fitted on `train`.
bin_test_filled = imputer_bin.transform(test[bin])
test_imp = test.copy()
test_imp[bin] = bin_test_filled

#### 명목변수 결측치 처리

In [13]:
# nom_0 .. nom_4: replace missing entries with the column's most frequent value.
for col in nom1:
    train_imp[col] = train_imp[col].fillna(train_imp[col].mode()[0])

# nom_5 .. nom_9: flag missing entries with the sentinel value -1 instead.
for col in nom2:
    train_imp[col] = train_imp[col].fillna(-1)

In [14]:
# nom_0 .. nom_4: fill test-set gaps with the most frequent value observed in
# the TRAINING data. Imputation statistics are learned from train only, the
# same way `imputer_bin` is fitted on train and merely applied to test, so
# both frames are imputed with identical fill values.
for col in ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4']:
    test_imp[col] = test_imp[col].fillna(train[col].mode()[0])

# nom_5 .. nom_9: flag missing entries with the sentinel value -1
# (same sentinel as used for the training frame).
for col in ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']:
    test_imp[col] = test_imp[col].fillna(-1)

In [15]:
# Frequency of each month value in the training frame, most common first.
train_imp['month'].value_counts()

8.0     79245
3.0     70160
5.0     68906
12.0    68340
6.0     60478
7.0     53480
1.0     52154
11.0    51165
2.0     40700
9.0     20620
4.0     14614
10.0     2150
Name: month, dtype: int64

In [16]:
# Sanity check: rows where nom_6 was missing in the raw frame should now hold -1.
train_imp.loc[train['nom_6'].isna(), 'nom_6']

26        -1
27        -1
69        -1
124       -1
215       -1
          ..
599918    -1
599941    -1
599947    -1
599963    -1
599973    -1
Name: nom_6, Length: 18131, dtype: object

#### 순서형 변수 결측치 처리

In [17]:
# ord_0 .. ord_2: replace missing entries with the column's most frequent value.
for col in ('ord_0', 'ord_1', 'ord_2'):
    train_imp[col] = train_imp[col].fillna(train_imp[col].mode()[0])

# ord_3 .. ord_5: keep "missing" as its own category via a placeholder label.
for col in ('ord_3', 'ord_4', 'ord_5'):
    train_imp[col] = train_imp[col].fillna('결측치')

In [18]:
# ord_0 .. ord_2: fill test-set gaps with the most frequent value observed in
# the TRAINING data, so train and test are imputed with identical fill values
# (statistics are learned from train only, as with `imputer_bin`).
for col in ('ord_0', 'ord_1', 'ord_2'):
    test_imp[col] = test_imp[col].fillna(train[col].mode()[0])

# ord_3 .. ord_5: keep "missing" as its own category via the same placeholder
# label that is used for the training frame.
for col in ('ord_3', 'ord_4', 'ord_5'):
    test_imp[col] = test_imp[col].fillna('결측치')

#### day, month 결측치 처리

In [19]:
# day / month: replace missing entries with the column's most frequent value.
for col in ('day', 'month'):
    train_imp[col] = train_imp[col].fillna(train_imp[col].mode()[0])

In [20]:
# day / month: fill test-set gaps with the most frequent value observed in the
# TRAINING data, so train and test are imputed with identical fill values.
for col in ('day', 'month'):
    test_imp[col] = test_imp[col].fillna(train[col].mode()[0])

In [21]:
# Confirm that no missing values remain after imputation (train, then test).
train_imp_missing = train_imp.isna().sum()
test_imp_missing = test_imp.isna().sum()
print(train_imp_missing, '\n')
print(test_imp_missing)

id        0
bin_0     0
bin_1     0
bin_2     0
bin_3     0
bin_4     0
nom_0     0
nom_1     0
nom_2     0
nom_3     0
nom_4     0
nom_5     0
nom_6     0
nom_7     0
nom_8     0
nom_9     0
ord_0     0
ord_1     0
ord_2     0
ord_3     0
ord_4     0
ord_5     0
day       0
month     0
target    0
dtype: int64 

id       0
bin_0    0
bin_1    0
bin_2    0
bin_3    0
bin_4    0
nom_0    0
nom_1    0
nom_2    0
nom_3    0
nom_4    0
nom_5    0
nom_6    0
nom_7    0
nom_8    0
nom_9    0
ord_0    0
ord_1    0
ord_2    0
ord_3    0
ord_4    0
ord_5    0
day      0
month    0
dtype: int64


## 3. 인코딩

In [22]:
# Stack train on top of test (fresh 0..N-1 index) so both are encoded together,
# and drop the label and the row identifier from the feature table.
all_data = (
    pd.concat([train_imp, test_imp], ignore_index=True)
      .drop(columns=['target', 'id'])
)

# Keep the labels separately; they exist only for the training rows.
train_target = train['target']
all_data.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,...,999985,999986,999987,999988,999989,999990,999991,999992,999993,999994,999995,999996,999997,999998,999999
bin_0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bin_1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
bin_2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
bin_3,F,F,F,F,T,T,F,T,F,F,T,F,T,F,F,...,F,F,F,F,T,F,T,T,F,F,T,F,T,F,F
bin_4,N,Y,N,N,N,N,N,N,N,Y,Y,Y,N,N,Y,...,N,Y,Y,N,Y,Y,Y,N,N,N,N,N,N,Y,N
nom_0,Red,Red,Red,Red,Red,Red,Red,Red,Blue,Red,Blue,Red,Red,Blue,Red,...,Blue,Red,Blue,Red,Red,Red,Red,Red,Red,Red,Red,Red,Red,Red,Blue
nom_1,Trapezoid,Star,Triangle,Circle,Triangle,Triangle,Triangle,Triangle,Polygon,Polygon,Triangle,Square,Circle,Trapezoid,Polygon,...,Polygon,Triangle,Trapezoid,Trapezoid,Circle,Polygon,Triangle,Polygon,Polygon,Triangle,Trapezoid,Polygon,Triangle,Circle,Circle
nom_2,Hamster,Axolotl,Hamster,Hamster,Hamster,Lion,Hamster,Axolotl,Hamster,Hamster,Hamster,Hamster,Axolotl,Lion,Hamster,...,Lion,Hamster,Hamster,Hamster,Hamster,Axolotl,Lion,Lion,Axolotl,Axolotl,Axolotl,Dog,Hamster,Dog,Lion
nom_3,Russia,India,Canada,Finland,Costa Rica,China,Costa Rica,Finland,Russia,Finland,Finland,Costa Rica,Russia,Costa Rica,Costa Rica,...,India,India,Costa Rica,Russia,Costa Rica,Costa Rica,India,China,China,Finland,Costa Rica,Russia,India,Costa Rica,Finland
nom_4,Bassoon,Theremin,Bassoon,Theremin,Theremin,Bassoon,Bassoon,Bassoon,Oboe,Theremin,Bassoon,Oboe,Theremin,Bassoon,Piano,...,Theremin,Theremin,Piano,Theremin,Bassoon,Theremin,Theremin,Theremin,Theremin,Theremin,Theremin,Theremin,Theremin,Theremin,Bassoon


#### 이진변수 인코딩

In [23]:
# Recode the two string-valued flags as integers:
# bin_3 is 1 for 'T', bin_4 is 1 for 'Y'; every other value becomes 0.
all_data['bin_3'] = (all_data['bin_3'] == 'T').astype('int64')
all_data['bin_4'] = (all_data['bin_4'] == 'Y').astype('int64')

bin_cols = ['bin_' + str(i) for i in range(5)]
bin_data = all_data[bin_cols]
bin_data.head(3)

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4
0,0.0,0.0,0.0,0,0
1,1.0,1.0,0.0,0,1
2,0.0,1.0,0.0,0,0


#### 순서형 변수 인코딩

In [24]:
# ord_1 and ord_2 have a meaningful category order, so map them to
# increasing integers that follow that order.
ord_cols = [f'ord_{i}' for i in range(0, 6)]

ord1_enc = {'Novice': 0, 'Contributor': 1, 'Expert': 2,'Master': 3, 'Grandmaster': 4}
ord2_enc = {'Freezing': 0 , 'Cold': 1, 'Warm': 2, 'Hot': 3, 'Boiling Hot': 4, 'Lava Hot': 5}

# Build the ordinal frame with `assign` rather than writing through `.loc`
# into a selection of `all_data`: assigning into such a selection raises
# SettingWithCopyWarning (hidden here by the global warnings filter).
# `assign` returns a new frame, so the cell can also be re-run safely.
ord_data = all_data[ord_cols].assign(
    ord_1=all_data['ord_1'].map(ord1_enc),
    ord_2=all_data['ord_2'].map(ord2_enc),
)
ord_data

Unnamed: 0,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5
0,3.0,1,3,c,U,Pw
1,3.0,4,2,e,X,pE
2,3.0,0,0,n,P,eN
3,1.0,0,5,a,C,결측치
4,3.0,4,1,h,C,OZ
...,...,...,...,...,...,...
999995,2.0,2,2,n,R,dp
999996,1.0,1,0,n,X,US
999997,1.0,4,4,m,P,TL
999998,3.0,0,5,h,L,DI


In [25]:
from sklearn.preprocessing import LabelEncoder

# Turn the remaining string-valued ordinal columns into integer codes.
# A fresh encoder is fitted per column.
for col in ('ord_3', 'ord_4', 'ord_5'):
    enc = LabelEncoder()
    encoded_col = enc.fit_transform(ord_data[col])
    ord_data[col] = encoded_col
ord_data

Unnamed: 0,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5
0,3.0,1,3,2,20,56
1,3.0,4,2,4,23,150
2,3.0,0,0,13,15,105
3,1.0,0,5,0,2,190
4,3.0,4,1,7,2,50
...,...,...,...,...,...,...
999995,2.0,2,2,13,17,103
999996,1.0,1,0,13,23,79
999997,1.0,4,4,12,15,73
999998,3.0,0,5,7,11,11


In [26]:
from sklearn.preprocessing import StandardScaler

# Standardise every ordinal column; the scaler is fitted on the combined
# train + test rows held in `ord_data`.
scaler = StandardScaler()
scaler.fit(ord_data)
ord_data_scaled = scaler.transform(ord_data)
pd.DataFrame(ord_data_scaled, columns=ord_data.columns)

Unnamed: 0,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5
0,1.260614,-0.469273,0.576332,-1.011083,0.932590,-0.665977
1,1.260614,1.636256,-0.012119,-0.611338,1.317413,0.983216
2,1.260614,-1.171116,-1.189021,1.187513,0.291219,0.193709
3,-1.074598,-1.171116,1.753234,-1.410828,-1.376345,1.685001
4,1.260614,1.636256,-0.600570,-0.011721,-1.376345,-0.771245
...,...,...,...,...,...,...
999995,0.093008,0.232570,-0.012119,1.187513,0.547767,0.158620
999996,-1.074598,-0.469273,-1.189021,1.187513,1.317413,-0.262451
999997,-1.074598,1.636256,1.164783,0.987640,0.291219,-0.367719
999998,1.260614,-1.171116,1.753234,-0.011721,-0.221878,-1.455485


In [27]:
# Put the binary columns and the scaled ordinal columns side by side.
ord_scaled_df = pd.DataFrame(ord_data_scaled, columns=ord_data.columns)
all_data2 = pd.concat([bin_data, ord_scaled_df], axis=1)

In [28]:
# Pull out the ten nominal columns (nom_0 .. nom_9) for one-hot encoding.
nom_cols = ['nom_' + str(i) for i in range(10)]
# NOTE(review): nm_list is not referenced in the cells visible here.
nm_list = ['day', 'month']
nom_data = all_data[nom_cols]
nom_data

Unnamed: 0,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9
0,Red,Trapezoid,Hamster,Russia,Bassoon,de4c57ee2,a64bc7ddf,598080a91,0256c7a4b,02e7c8990
1,Red,Star,Axolotl,India,Theremin,2bb3c3e5c,3a3a936e8,1dddb8473,52ead350c,f37df64af
2,Red,Triangle,Hamster,Canada,Bassoon,b574c9841,708248125,5ddc9a726,745b909d1,-1
3,Red,Circle,Hamster,Finland,Theremin,673bdf1f6,23edb8da3,3a33ef960,bdaa56dd1,f9d456e57
4,Red,Triangle,Hamster,Costa Rica,Theremin,777d1ac2c,3a7975e46,bc9cc2a94,-1,c5361037c
...,...,...,...,...,...,...,...,...,...,...
999995,Red,Trapezoid,Axolotl,Costa Rica,Theremin,92fb4c0dc,633bdcfd0,3ef3018d3,c1a4acfaf,45a68dd2e
999996,Red,Polygon,Dog,Russia,Theremin,8bd03e713,7c241cd40,f74f0b894,220190c9e,e48348d66
999997,Red,Triangle,Hamster,India,Theremin,6924d999b,18d43aee8,3afd3697d,cc5495ab3,47021df0d
999998,Red,Circle,Dog,Costa Rica,Theremin,3e0230528,0c073adc7,3600c6e91,32b33a4b4,e4bf32721


In [29]:
# One-hot encode the nominal columns: one indicator column per distinct value.
# NOTE(review): nom_5 .. nom_9 contribute thousands of columns, so this frame is
# presumably very large in memory — confirm it fits before running on all rows.
nom_data_dum = pd.get_dummies(nom_data)
nom_data_dum

Unnamed: 0,nom_0_Blue,nom_0_Green,nom_0_Red,nom_1_Circle,nom_1_Polygon,nom_1_Square,nom_1_Star,nom_1_Trapezoid,nom_1_Triangle,nom_2_Axolotl,nom_2_Cat,nom_2_Dog,nom_2_Hamster,nom_2_Lion,nom_2_Snake,...,nom_9_fdd612aab,nom_9_fdf9297f6,nom_9_fe36e1545,nom_9_fe5312a7a,nom_9_fe655379a,nom_9_fe7e11d45,nom_9_fe7fa8831,nom_9_fe9bdeef3,nom_9_fecb6bcc3,nom_9_fee724acc,nom_9_ff1288133,nom_9_ff12eee03,nom_9_ff412d38f,nom_9_ff4a11902,nom_9_ff4a11ad3
0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
999996,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
999997,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
999998,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [30]:
# Standardise day and month with a fresh scaler (rebinding `scaler`).
daymonth_col = ['day', 'month']
daymonth_data = all_data[daymonth_col]

scaler = StandardScaler()
scaler.fit(daymonth_data)
daymonth_scaled = scaler.transform(daymonth_data)
pd.DataFrame(daymonth_scaled, columns=daymonth_data.columns)

Unnamed: 0,day,month
0,0.954750,-1.000814
1,1.451572,0.169895
2,0.457929,0.755249
3,-0.535714,-1.000814
4,0.457929,1.633281
...,...,...
999995,-1.032535,1.633281
999996,-1.529357,-0.415460
999997,-1.529357,-0.122782
999998,-1.032535,1.340604


In [31]:
# Final feature table: binary + scaled ordinal, one-hot nominal, scaled day/month.
daymonth_scaled_df = pd.DataFrame(daymonth_scaled, columns=daymonth_data.columns)
all_data3 = pd.concat([all_data2, nom_data_dum, daymonth_scaled_df], axis=1)
all_data3

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,nom_0_Blue,nom_0_Green,nom_0_Red,nom_1_Circle,...,nom_9_fe36e1545,nom_9_fe5312a7a,nom_9_fe655379a,nom_9_fe7e11d45,nom_9_fe7fa8831,nom_9_fe9bdeef3,nom_9_fecb6bcc3,nom_9_fee724acc,nom_9_ff1288133,nom_9_ff12eee03,nom_9_ff412d38f,nom_9_ff4a11902,nom_9_ff4a11ad3,day,month
0,0.0,0.0,0.0,0,0,1.260614,-0.469273,0.576332,-1.011083,0.932590,-0.665977,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0.954750,-1.000814
1,1.0,1.0,0.0,0,1,1.260614,1.636256,-0.012119,-0.611338,1.317413,0.983216,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1.451572,0.169895
2,0.0,1.0,0.0,0,0,1.260614,-1.171116,-1.189021,1.187513,0.291219,0.193709,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0.457929,0.755249
3,0.0,0.0,0.0,0,0,-1.074598,-1.171116,1.753234,-1.410828,-1.376345,1.685001,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.535714,-1.000814
4,0.0,0.0,0.0,1,0,1.260614,1.636256,-0.600570,-0.011721,-1.376345,-0.771245,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0.457929,1.633281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0.0,0.0,1.0,1,0,0.093008,0.232570,-0.012119,1.187513,0.547767,0.158620,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,-1.032535,1.633281
999996,0.0,0.0,0.0,0,0,-1.074598,-0.469273,-1.189021,1.187513,1.317413,-0.262451,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,-1.529357,-0.415460
999997,0.0,0.0,1.0,1,0,-1.074598,1.636256,1.164783,0.987640,0.291219,-0.367719,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,-1.529357,-0.122782
999998,0.0,0.0,0.0,0,1,1.260614,-1.171116,1.753234,-0.011721,-0.221878,-1.455485,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,-1.032535,1.340604


In [32]:
# Only the first 120000 rows of the combined table are used as training
# features; the test rows are everything after the full training block.
n_rows_kept = 120000
train_encoded = all_data3.iloc[:n_rows_kept]
test_encoded = all_data3.iloc[len(train):]
print(train_encoded.shape, test_encoded.shape)

(120000, 5445) (400000, 5445)


In [33]:
# Labels for the same leading 120000 rows that make up `train_encoded`.
train_target_new = train_target.iloc[:120000]
print(train_target_new.shape, train_encoded.shape)

(120000,) (120000, 5445)


In [34]:
from sklearn.model_selection import train_test_split

# Stratified hold-out split on the target with a fixed seed.
split_parts = train_test_split(
    train_encoded,
    train_target_new,
    random_state=99,
    stratify=train_target_new,
)
X_train, X_val, y_train, y_val = split_parts
print(X_train.shape, X_val.shape)

(90000, 5445) (30000, 5445)


In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Baseline: logistic regression with default settings, scored by ROC AUC on
# the hold-out split.
# NOTE(review): warnings are globally silenced, so a solver convergence
# warning would not be visible here — confirm the fit converges.
model = LogisticRegression(random_state=99)
model.fit(X_train, y_train)
val_proba = model.predict_proba(X_val)
y_pred = val_proba[:, 1]

roc_auc_score(y_val, y_pred)

0.7565911365480866

In [36]:
sub = pd.read_csv('sample_submission.csv')

# Refit on the whole of `train_encoded` (training and validation rows together).
model = LogisticRegression(random_state=99).fit(train_encoded, train_target_new)

# Predict class probabilities for the test rows and store the positive-class
# column as the submission target.
y_pred = model.predict_proba(test_encoded)
sub = sub.drop(columns='target').assign(target=y_pred[:, 1])

sub.to_csv('LR_mix2.csv', index=False)

In [37]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

# Depth-limited decision tree, scored by ROC AUC on the hold-out split.
model = DecisionTreeClassifier(random_state=99, max_depth=9)
model.fit(X_train, y_train)
val_proba = model.predict_proba(X_val)
y_pred = val_proba[:, 1]

roc_auc_score(y_val, y_pred)

0.6920976661745767

In [38]:
sub = pd.read_csv('sample_submission.csv')

# Refit the tree on the whole of `train_encoded` (training and validation rows together).
model = DecisionTreeClassifier(random_state=99, max_depth=9).fit(train_encoded, train_target_new)

# Predict class probabilities for the test rows and store the positive-class
# column as the submission target.
y_pred = model.predict_proba(test_encoded)
sub = sub.drop(columns='target').assign(target=y_pred[:, 1])

sub.to_csv('DT_mix2.csv', index=False)

In [39]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score

# Multi-layer perceptron with default settings, scored by ROC AUC on the
# hold-out split.
model = MLPClassifier(random_state=99)
model.fit(X_train, y_train)
val_proba = model.predict_proba(X_val)
y_pred = val_proba[:, 1]

roc_auc_score(y_val, y_pred)

0.6961376899492637

In [40]:
sub = pd.read_csv('sample_submission.csv')

# Refit the network on the whole of `train_encoded` (training and validation rows together).
model = MLPClassifier(random_state=99).fit(train_encoded, train_target_new)

# Predict class probabilities for the test rows and store the positive-class
# column as the submission target.
y_pred = model.predict_proba(test_encoded)
sub = sub.drop(columns='target').assign(target=y_pred[:, 1])

sub.to_csv('MLP_mix2.csv', index=False)