In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

np.random.seed(42)

from sklearn.model_selection import train_test_split
import lightgbm as lgbm
from sklearn.metrics import confusion_matrix, accuracy_score

# 1. 데이터 로드

In [2]:
train=pd.read_csv('input/train.csv', index_col=0) #인덱스 col=0도 check!
test=pd.read_csv('input/test_x.csv', index_col=0)
submission=pd.read_csv('input/sample_submission.csv', index_col=0)  

In [None]:
train.shape, test.shape, submission.shape
train.head(3)
test.head(3)
submission.head(3)

# 2. EDA & 전처리

In [None]:
df = pd.concat([train, test])
df.shape

In [None]:
train.info

## Missing Value

In [None]:
train.isnull().sum().sum()
test.isnull().sum().sum()

## Target 분포

In [None]:
f, ax = plt.subplots(1, 2, figsize=(10, 4))

train['voted'].value_counts().plot.pie(autopct='%1.1f%%', ax=ax[0], shadow=True)
sns.countplot('voted', data=train, ax=ax[1])

plt.show()

# 균일

## 컬럼 분류

In [3]:
data = []

for f in train.columns:
    dtl = ''
    if f == 'voted':
        gubun = 'target'
    elif 'Q' in f:
        dtl = f[1]
        if 'A' in f:
            gubun = 'Q_A'
        if 'E' in f:
            gubun = 'Q_E'
    elif 'tp' in f:           
        gubun = 'tp'            
    elif 'wf' in f:           
        gubun = 'wf'
    elif 'wr' in f:           
        gubun = 'wr'    
    else:
        gubun = 'cat'
        
    f_dict = {
        'col':f,
        'gbn':gubun,
        'dtl':dtl
    }
            
    data.append(f_dict)
    
meta = pd.DataFrame(data, columns =['col', 'gbn', 'dtl'])
meta.set_index('col', inplace=True)
meta

Unnamed: 0_level_0,gbn,dtl
col,Unnamed: 1_level_1,Unnamed: 2_level_1
QaA,Q_A,a
QaE,Q_E,a
QbA,Q_A,b
QbE,Q_E,b
QcA,Q_A,c
...,...,...
wr_09,wr,
wr_10,wr,
wr_11,wr,
wr_12,wr,


### Q_A

In [4]:
Q_A = meta[meta.gbn=='Q_A'].index
train[Q_A].describe()
train[Q_A].head(3)

Unnamed: 0,QaA,QbA,QcA,QdA,QeA,QfA,QgA,QhA,QiA,QjA,QkA,QlA,QmA,QnA,QoA,QpA,QqA,QrA,QsA,QtA
count,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0
mean,2.129535,2.904463,3.662347,1.749078,2.317952,2.168145,2.560595,2.317952,3.157691,3.575529,3.813933,4.325727,3.50962,2.632896,3.217232,3.071752,2.840464,2.823487,3.432948,3.225687
std,1.196952,1.566142,1.431494,1.043625,1.369205,1.348653,1.5106,1.460813,1.48174,1.36122,1.372551,1.146992,1.375134,1.473022,1.476414,1.489744,1.404342,1.424383,1.374938,1.521603
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,3.0,4.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
50%,2.0,3.0,4.0,1.0,2.0,2.0,2.0,2.0,3.0,4.0,4.0,5.0,4.0,2.0,4.0,3.0,3.0,3.0,4.0,4.0
75%,3.0,4.0,5.0,2.0,3.0,3.0,4.0,4.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,4.0,4.0,4.0,5.0,5.0
max,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


Unnamed: 0_level_0,QaA,QbA,QcA,QdA,QeA,QfA,QgA,QhA,QiA,QjA,QkA,QlA,QmA,QnA,QoA,QpA,QqA,QrA,QsA,QtA
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,3.0,4.0,5.0,1.0,2.0,5.0,2.0,4.0,5.0,4.0,4.0,4.0,2.0,5.0,2.0,2.0,2.0,2.0,2.0,1.0
1,5.0,5.0,3.0,5.0,1.0,3.0,1.0,1.0,5.0,3.0,5.0,4.0,1.0,5.0,1.0,5.0,5.0,1.0,4.0,1.0
2,4.0,1.0,1.0,4.0,5.0,1.0,4.0,1.0,3.0,2.0,5.0,5.0,2.0,4.0,2.0,1.0,5.0,4.0,1.0,1.0


### Q_A 1/5 count

In [5]:
# Q_A: 1~5 1/5 극단값 선택한 count 추출
train['Q1'] = 0
train['Q3'] = 0
train['Q5'] = 0

test['Q1'] = 0
test['Q3'] = 0
test['Q5'] = 0

In [9]:
%%time
for f in Q_A:
    for i, v in enumerate(train[f]):
        if v == 1:
            train.loc[i, 'Q1'] = train.loc[i, 'Q1']+1
        elif v == 3:
            train.loc[i, 'Q3'] = train.loc[i, 'Q3']+1
        elif v == 5:
            train.loc[i, 'Q5'] = train.loc[i, 'Q5']+1            

Wall time: 2min 55s


In [10]:
%%time
for f in Q_A:
    for i, v in enumerate(test[f]):
        if v == 1:
            test.loc[i, 'Q1'] = test.loc[i, 'Q1']+1
        elif v == 3:
            test.loc[i, 'Q3'] = test.loc[i, 'Q3']+1            
        elif v == 5:
            test.loc[i, 'Q5'] = test.loc[i, 'Q5']+1                        

Wall time: 39 s


In [None]:
colQ_As = Q_A.tolist()
colQ_As.extend(['Q1','Q3','Q5', 'voted'])

corr_Q_A = train[colQ_As].corr()

mask = np.zeros_like(corr_Q_A, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
plt.figure(figsize=(18, 12))
plt.title('colQ_As', fontsize=18)

sns.heatmap(corr_Q_A, mask=mask, annot=True,cmap='RdYlGn', linewidths=0.2, annot_kws={'size':10})
plt.show()

In [None]:
corr_Q_A['voted'].abs().sort_values(ascending=False)

### Q_E

In [6]:
Q_E = meta[meta.gbn == 'Q_E'].index
train[Q_E].describe()
train[Q_E].head()

Unnamed: 0,QaE,QbE,QcE,QdE,QeE,QfE,QgE,QhE,QiE,QjE,QkE,QlE,QmE,QnE,QoE,QpE,QqE,QrE,QsE,QtE
count,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0
mean,945.357,2189.589,1484.294518,1490.672,1899.292,1850.65,1284.608,4584.272,4068.434,2576.363,2038.986,3741.018,2050.287,959.880502,1435.762,1633.479,1154.352,2470.808,1164.117,2240.057
std,13075.65,33510.27,8977.664318,10922.6,16707.65,76082.36,9756.584,502026.5,114173.8,54114.32,30691.64,366367.2,75307.87,5869.964127,10849.97,26002.21,22506.89,148289.0,7433.515,67309.24
min,25.0,25.0,25.0,26.0,25.0,25.0,25.0,0.0,0.0,0.0,0.0,25.0,25.0,25.0,0.0,0.0,0.0,25.0,25.0,25.0
25%,404.0,875.0,651.0,679.0,834.0,504.0,549.0,1077.0,1259.0,902.0,859.0,806.0,650.0,428.0,638.0,706.0,461.0,794.0,527.0,733.0
50%,557.0,1218.0,899.0,931.0,1154.0,712.0,772.0,1464.0,1800.0,1260.0,1178.0,1125.0,906.0,596.0,883.0,970.0,652.0,1117.0,747.0,1020.0
75%,827.0,1838.0,1335.0,1355.0,1656.0,1078.0,1150.0,2090.0,2761.0,1899.0,1721.0,1649.0,1342.0,891.0,1300.0,1409.0,982.0,1664.0,1124.0,1526.0
max,2413960.0,5580395.0,871557.0,1552821.0,1919926.0,11763700.0,1068252.0,107086000.0,17035860.0,9910006.0,4824953.0,77543510.0,14108320.0,606694.0,1309739.0,4650742.0,4277510.0,31593280.0,1286581.0,11556500.0


Unnamed: 0_level_0,QaE,QbE,QcE,QdE,QeE,QfE,QgE,QhE,QiE,QjE,QkE,QlE,QmE,QnE,QoE,QpE,QqE,QrE,QsE,QtE
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,363,1370,997,1024,1577,539,586,1095,1142,1287,883,851,851,816,579,924,366,876,633,1115
1,647,1313,3387,2969,4320,2190,826,4082,1867,1264,2943,3927,4329,1828,1214,2414,1356,3039,4304,1346
2,1623,1480,1021,3374,1333,531,1167,1016,2653,1569,998,2547,918,2153,1304,1131,937,1327,1170,1409
3,504,2311,992,3245,357,1519,159,2275,2809,5614,3219,1296,9046,1216,1169,23868,581,8830,2392,1312
4,927,707,556,1062,1014,628,991,1259,1153,1388,740,1181,547,575,754,1140,323,1070,583,1889


In [None]:
for f in Q_E:
    g = sns.FacetGrid(train, col='voted').map(sns.distplot, f)

In [35]:
for f in Q_E:
    print(f, train[f].min(), train[f].max())

QaE 3.258096538021482 14.69677952475103
QbE 3.258096538021482 15.534770299581337
QcE 3.258096538021482 13.678037693649705
QdE 3.295836866004329 14.255584478691235
QeE 3.258096538021482 14.467797722447822
QfE 3.258096538021482 16.280529246815284
QgE 3.258096538021482 13.88153516183371
QhE 0.0 18.4891429386553
QiE 0.0 16.650831150799657
QjE 0.0 16.109055612663138
QkE 0.0 15.38931175903102
QlE 3.258096538021482 18.166349781933498
QmE 3.258096538021482 16.462275323127148
QnE 3.258096538021482 13.315781472591008
QoE 0.0 14.085339202196463
QpE 0.0 15.35253754974552
QqE 0.0 15.268881856340737
QrE 3.258096538021482 17.268455155975815
QsE 3.258096538021482 14.067499647498977
QtE 3.258096538021482 16.262759039855734


In [None]:
colQ_Es = Q_E.tolist()
colQ_Es.extend(['voted'])

corr_Q_E = train[colQ_Es].corr()

mask = np.zeros_like(corr_Q_E, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
plt.figure(figsize=(18, 12))
plt.title('colQ_ES', fontsize=18)

sns.heatmap(corr_Q_E, mask=mask, annot=True,cmap='RdYlGn', linewidths=0.2, annot_kws={'size':10})
plt.show()

#### skew

In [7]:
for f in Q_E:
    train[f] = np.log1p(train[f])
    test[f] = np.log1p(test[f])

### tp

In [8]:
# 0~7 
tp = meta[meta.gbn=='tp'].index
train[tp].describe()
train[tp].head(3)

Unnamed: 0,tp01,tp02,tp03,tp04,tp05,tp06,tp07,tp08,tp09,tp10
count,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0
mean,3.017241,2.621189,1.768712,3.041246,1.335918,2.472217,2.031099,3.507226,2.277497,4.317996
std,2.007713,1.897081,1.685069,2.037278,1.452544,2.000658,1.785824,1.971456,1.848783,1.722793
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,0.0,1.0,0.0,1.0,1.0,2.0,1.0,3.0
50%,3.0,2.0,1.0,3.0,1.0,2.0,2.0,4.0,2.0,5.0
75%,5.0,4.0,3.0,5.0,2.0,4.0,3.0,5.0,4.0,6.0
max,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0


Unnamed: 0_level_0,tp01,tp02,tp03,tp04,tp05,tp06,tp07,tp08,tp09,tp10
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2,2,2,1,2,1,7,4,4,3
1,1,1,0,0,1,2,3,4,0,4
2,2,3,1,5,3,4,2,6,1,3


In [None]:
for f in tp:
    g = sns.FacetGrid(train, col='voted').map(sns.distplot, f)

In [None]:
col_tps = tp.tolist()
col_tps.extend(['voted'])

corr_tp = train[col_tps].corr()

mask = np.zeros_like(corr_tp, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
plt.figure(figsize=(18, 12))
plt.title('col_tpS', fontsize=18)

sns.heatmap(corr_tp, mask=mask, annot=True,cmap='RdYlGn', linewidths=0.2, annot_kws={'size':10})
plt.show()

In [11]:
# tp: 1,7 극단값 선택한 count 추출, 4 중간값 선택 count 
train['tp17'] = 0
train['tp4'] = 0

test['tp17'] = 0
test['tp4'] = 0

In [12]:
%%time
for f in tp:
    for i, v in enumerate(train[f]):
        if (v == 1 | v == 7):
            train.loc[i, 'tp17'] = train.loc[i, 'tp17']+1
        if v == 4:
            train.loc[i, 'tp4'] = train.loc[i, 'tp4']+1            

Wall time: 20.3 s


In [13]:
%%time
for f in tp:
    for i, v in enumerate(test[f]):
        if (v == 1 | v == 7):
            test.loc[i, 'tp17'] = test.loc[i, 'tp17']+1
        if v == 4:
            test.loc[i, 'tp4'] = test.loc[i, 'tp4']+1            

Wall time: 3.81 s


#### 극단값 중간값

In [14]:
train.columns.values

array(['QaA', 'QaE', 'QbA', 'QbE', 'QcA', 'QcE', 'QdA', 'QdE', 'QeA',
       'QeE', 'QfA', 'QfE', 'QgA', 'QgE', 'QhA', 'QhE', 'QiA', 'QiE',
       'QjA', 'QjE', 'QkA', 'QkE', 'QlA', 'QlE', 'QmA', 'QmE', 'QnA',
       'QnE', 'QoA', 'QoE', 'QpA', 'QpE', 'QqA', 'QqE', 'QrA', 'QrE',
       'QsA', 'QsE', 'QtA', 'QtE', 'age_group', 'education', 'engnat',
       'familysize', 'gender', 'hand', 'married', 'race', 'religion',
       'tp01', 'tp02', 'tp03', 'tp04', 'tp05', 'tp06', 'tp07', 'tp08',
       'tp09', 'tp10', 'urban', 'voted', 'wf_01', 'wf_02', 'wf_03',
       'wr_01', 'wr_02', 'wr_03', 'wr_04', 'wr_05', 'wr_06', 'wr_07',
       'wr_08', 'wr_09', 'wr_10', 'wr_11', 'wr_12', 'wr_13', 'Q1', 'Q3',
       'Q5', 'tp17', 'tp4'], dtype=object)

In [15]:
train['QA15tp17'] = train['Q1'] + train['Q5'] + train['tp17']
train['QA3tp4'] = train['Q3'] + train['tp4']

In [16]:
test['QA15tp17'] = test['Q1'] + test['Q5'] + test['tp17']
test['QA3tp4'] = test['Q3'] + test['tp4']

In [None]:
meta.gbn.unique()

In [None]:
# 0/1
wf = meta[meta.gbn=='wf'].index
train[wf].describe()
train[wf].head(3)

In [None]:
# 0/1
wr = meta[meta.gbn=='wr'].index
train[wr].describe()
train[wr].head(3)

In [None]:
corr = train.corr()
corr['voted'].abs().sort_values(ascending=False)[:20]
corr['voted'].abs().sort_values(ascending=False)[60:]

## Target Encoding

### age_group

In [18]:
train['age_group_int'] = train['age_group'].str.replace('s','').astype('int')
test['age_group_int'] = test['age_group'].str.replace('s','').astype('int')

In [19]:
train['age_group_n_rows'] = train['age_group_int'].map(train.groupby('age_group_int').size())
test['age_group_n_rows'] = test['age_group_int'].map(test.groupby('age_group_int').size())

In [20]:
age_group_mean = train.groupby('age_group_int')['voted'].mean()
train['age_group_mean'] = train['age_group_int'].map(age_group_mean)
test['age_group_mean'] = test['age_group_int'].map(age_group_mean)

In [21]:
# train['voted'].mean() = 1.5468242115435298
def smoothing(n_rows, target_mean): 
    return (target_mean*n_rows + 1.5468242115435298*0.8) / (n_rows + 0.8)

In [22]:
train['age_group_mean_smoothing'] = train.apply(lambda x:smoothing(x['age_group_n_rows'], x['age_group_mean']), axis=1)
test['age_group_mean_smoothing'] = test.apply(lambda x:smoothing(x['age_group_n_rows'], x['age_group_mean']), axis=1)

In [23]:
train.drop(['age_group_int','age_group_mean', 'age_group_n_rows'], axis=1, inplace=True)
test.drop(['age_group_int','age_group_mean', 'age_group_n_rows'], axis=1, inplace=True)

## Outlier

In [None]:
test.familysize.max(), test.familysize.mean(), train[train.familysize < 30]['familysize'].mean()

In [None]:
train[train.familysize > 20][['voted', 'familysize']]

In [24]:
train.loc[train[train.familysize > 25].index, 'familysize'] = 2.6

In [None]:
train['familysize'].value_counts()

## 인코딩

In [None]:
#age_group, gender, race, religion
#train['age_group'] = train['age_group'].str.replace('s','').astype('int')
train['gender']= pd.factorize(train['gender'])[0]
train['race']=pd.factorize(train['race'])[0]
train['religion']=pd.factorize(train['religion'])[0]

#test['age_group'] = test['age_group'].str.replace('s','').astype('int')
test['gender']=pd.factorize(test['gender'])[0]
test['race']=pd.factorize(test['race'])[0]
test['religion']=pd.factorize(test['religion'])[0]

### One-hot

In [None]:
train = pd.get_dummies(train, columns=['age_group', 'education', 'engnat', 'gender', 'hand',
       'married', 'race', 'religion', 'tp01', 'tp02', 'tp03', 'tp04', 'tp05',
       'tp06', 'tp07', 'tp08', 'tp09', 'tp10', 'urban', 'wf_01',
       'wf_02', 'wf_03', 'wr_01', 'wr_02', 'wr_03', 'wr_04', 'wr_05', 'wr_06',
       'wr_07', 'wr_08', 'wr_09', 'wr_10', 'wr_11', 'wr_12', 'wr_13'])
train

In [None]:
test = pd.get_dummies(test, columns=['age_group', 'education', 'engnat', 'gender', 'hand',
       'married', 'race', 'religion', 'tp01', 'tp02', 'tp03', 'tp04', 'tp05',
       'tp06', 'tp07', 'tp08', 'tp09', 'tp10', 'urban', 'wf_01',
       'wf_02', 'wf_03', 'wr_01', 'wr_02', 'wr_03', 'wr_04', 'wr_05', 'wr_06',
       'wr_07', 'wr_08', 'wr_09', 'wr_10', 'wr_11', 'wr_12', 'wr_13'])
test

In [None]:
train.shape, test.shape
train.columns.values
test.columns.values

# 3. 모델 학습

In [25]:
from pycaret.classification import *

In [26]:
import gc
gc.collect()

32

In [27]:
col_cat = meta[(meta.gbn == 'Q_A') | (meta.gbn == 'cat')].index.values.tolist()
col_cat

['QaA',
 'QbA',
 'QcA',
 'QdA',
 'QeA',
 'QfA',
 'QgA',
 'QhA',
 'QiA',
 'QjA',
 'QkA',
 'QlA',
 'QmA',
 'QnA',
 'QoA',
 'QpA',
 'QqA',
 'QrA',
 'QsA',
 'QtA',
 'age_group',
 'education',
 'engnat',
 'familysize',
 'gender',
 'hand',
 'married',
 'race',
 'religion',
 'urban']

In [29]:
col_numeric = ['Q1','Q3','Q5', 'tp17','tp4', 'QA15tp17', 'QA3tp4']
col_numeric

['Q1', 'Q3', 'Q5', 'tp17', 'tp4', 'QA15tp17', 'QA3tp4']

In [None]:
#col_ignore = meta[(meta.gbn == 'wf') | (meta.gbn == 'wr')].index.values.tolist()
#col_ignore = meta[(meta.gbn == 'Q_E')].index.values.tolist()
col_ignore = []

In [30]:
%%time
clf = setup(session_id=42, 
            data = train, target = 'voted', polynomial_features=True
          , categorical_features=col_cat
            ,numeric_features=col_numeric
            #,ignore_features =col_ignore
           )

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,42
1,Target Type,Binary
2,Label Encoded,"1: 0, 2: 1"
3,Original Data,"(45532, 85)"
4,Missing Values,False
5,Numeric Features,28
6,Categorical Features,56
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


Wall time: 1min 29s


In [32]:
#best_3 = compare_models(sort = 'AUC', n_select = 3)
best_5 = compare_models(sort = 'AUC', n_select = 5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Gradient Boosting Classifier,0.6954,0.7645,0.6406,0.7642,0.6969,0.3955,0.4017,38.7393
1,CatBoost Classifier,0.6932,0.7644,0.6566,0.7511,0.7006,0.389,0.3927,25.4106
2,Light Gradient Boosting Machine,0.6923,0.7633,0.6443,0.7569,0.696,0.3886,0.3938,1.7709
3,Linear Discriminant Analysis,0.6906,0.7617,0.6636,0.7431,0.7011,0.3826,0.3852,1.6722
4,Logistic Regression,0.6883,0.7571,0.6518,0.7462,0.6957,0.3792,0.3829,1.6083
5,Ada Boost Classifier,0.6899,0.7557,0.6504,0.7494,0.6964,0.3828,0.3868,9.1272
6,Extra Trees Classifier,0.6862,0.7556,0.6326,0.7541,0.6879,0.3773,0.3833,5.1947
7,Extreme Gradient Boosting,0.6754,0.7442,0.6687,0.7183,0.6925,0.3496,0.3507,21.2262
8,Naive Bayes,0.6638,0.7198,0.6969,0.6912,0.6937,0.3212,0.3215,0.2311
9,Random Forest Classifier,0.6531,0.7086,0.6028,0.7177,0.6551,0.3115,0.3164,0.5429


In [33]:
blended = blend_models(estimator_list = best_5, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6962,0.7732,0.6609,0.7532,0.704,0.3946,0.3982
1,0.683,0.7562,0.6282,0.7513,0.6843,0.371,0.377
2,0.6989,0.767,0.6681,0.7535,0.7082,0.3996,0.4027
3,0.6917,0.7623,0.6462,0.7547,0.6962,0.3871,0.3919
4,0.6956,0.7731,0.6617,0.7519,0.7039,0.3934,0.3968
Mean,0.6931,0.7664,0.653,0.7529,0.6993,0.3891,0.3933
SD,0.0056,0.0065,0.0143,0.0012,0.0085,0.0099,0.0088


In [34]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.6956,0.7689,0.6608,0.7524,0.7036,0.3935,0.397


In [36]:
final_model = finalize_model(blended)

In [37]:
predictions = predict_model(final_model, data = test)

# 4. 예측

In [38]:
submission['voted'] = predictions['Score']

# 5. 제출

In [39]:
submission.to_csv('output/20200930-2-X_QA15tp17QA3tp4.csv')