In [1]:
# Numerical / plotting stack used throughout the notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silences every warning for the whole session (deprecation notices included).
warnings.filterwarnings('ignore')
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one;
# the multi-expression inspection cells below rely on this.
InteractiveShell.ast_node_interactivity = "all"

# Seed NumPy's global random generator.
np.random.seed(42)

from sklearn.model_selection import train_test_split
import lightgbm as lgbm
from sklearn.metrics import confusion_matrix, accuracy_score

# 1. 데이터 로드

In [2]:
# Read the three input CSVs; the first column of every file becomes the row index.
train, test, submission = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('input/train.csv', 'input/test_x.csv', 'input/sample_submission.csv')
)

In [None]:
# Sanity check: shapes of the three frames, then the first three rows of each.
train.shape, test.shape, submission.shape
train.head(3)
test.head(3)
submission.head(3)

# 2. EDA & 전처리

In [None]:
# Stack train and test row-wise and show the combined shape.
# NOTE(review): `df` is not referenced by any later cell visible in this notebook.
df = pd.concat([train, test])
df.shape

In [None]:
# Column dtypes and non-null counts. `info` is a method and must be called;
# the bare attribute only displays the bound-method repr.
train.info()

## Missing Value

In [None]:
# Total number of missing cells in each frame.
train.isna().sum().sum()
test.isna().sum().sum()

## Target 분포

In [None]:
# Target distribution: class shares (pie, left) next to absolute counts (bars, right).
fig, axes = plt.subplots(1, 2, figsize=(10, 4))

train['voted'].value_counts().plot.pie(autopct='%1.1f%%', ax=axes[0], shadow=True)
# The column is passed by keyword: seaborn deprecated (and later removed)
# positional data arguments for countplot.
sns.countplot(x='voted', data=train, ax=axes[1])

plt.show()

# 균일

## 컬럼 분류

In [3]:
def classify_column(name):
    """Return ``(gbn, dtl)`` — the group label and detail letter — for a column name.

    Groups:
      * 'target'      - the ``voted`` column
      * 'Q_A' / 'Q_E' - three-character names ``Q<letter>A`` / ``Q<letter>E``;
                        ``dtl`` is the middle letter
      * 'tp', 'wf', 'wr' - names containing that token
      * 'cat'         - everything else

    Every name gets a group decided from that name alone. The previous inline
    if/elif chain left the group stale (or unbound) for any name that contained
    'Q' but neither 'A' nor 'E', e.g. a derived 'Q1' column when the cell is re-run.
    """
    if name == 'voted':
        return 'target', ''
    if len(name) == 3 and name[0] == 'Q' and name[2] in ('A', 'E'):
        return 'Q_' + name[2], name[1]
    for token in ('tp', 'wf', 'wr'):
        if token in name:
            return token, ''
    return 'cat', ''


# One record per training column, then a lookup table indexed by column name.
data = []
for col_name in train.columns:
    gbn, dtl = classify_column(col_name)
    data.append({'col': col_name, 'gbn': gbn, 'dtl': dtl})

meta = pd.DataFrame(data, columns=['col', 'gbn', 'dtl']).set_index('col')
meta

Unnamed: 0_level_0,gbn,dtl
col,Unnamed: 1_level_1,Unnamed: 2_level_1
QaA,Q_A,a
QaE,Q_E,a
QbA,Q_A,b
QbE,Q_E,b
QcA,Q_A,c
...,...,...
wr_09,wr,
wr_10,wr,
wr_11,wr,
wr_12,wr,


### Q_A

In [None]:
# Names of the Q_A columns, followed by their summary statistics and first rows.
Q_A = meta.loc[meta['gbn'] == 'Q_A'].index
train[Q_A].describe()
train[Q_A].head(3)

### Q_A 1/5 count

In [None]:
# Q_A (scale 1-5): counters for how often a respondent picked 1, 3 or 5,
# initialised to zero in both frames.
for frame in (train, test):
    for counter_col in ('Q1', 'Q3', 'Q5'):
        frame[counter_col] = 0

In [None]:
%%time
for f in Q_A:
    for i, v in enumerate(train[f]):
        if v == 1:
            train.loc[i, 'Q1'] = train.loc[i, 'Q1']+1
        elif v == 3:
            train.loc[i, 'Q3'] = train.loc[i, 'Q3']+1
        elif v == 5:
            train.loc[i, 'Q5'] = train.loc[i, 'Q5']+1            

In [None]:
%%time
for f in Q_A:
    for i, v in enumerate(test[f]):
        if v == 1:
            test.loc[i, 'Q1'] = test.loc[i, 'Q1']+1
        elif v == 3:
            test.loc[i, 'Q3'] = test.loc[i, 'Q3']+1            
        elif v == 5:
            test.loc[i, 'Q5'] = test.loc[i, 'Q5']+1                        

In [None]:
# Correlation heatmap of the Q_A answers, the derived counters and the target.
colQ_As = Q_A.tolist() + ['Q1', 'Q3', 'Q5', 'voted']

corr_Q_A = train[colQ_As].corr()

# Hide the upper triangle (diagonal included). The builtin `bool` is used
# because the `np.bool` alias is deprecated and removed in newer NumPy.
mask = np.triu(np.ones(corr_Q_A.shape, dtype=bool))
plt.figure(figsize=(18, 12))
plt.title('colQ_As', fontsize=18)

sns.heatmap(corr_Q_A, mask=mask, annot=True, cmap='RdYlGn', linewidths=0.2, annot_kws={'size': 10})
plt.show()

In [None]:
# Columns ranked by absolute correlation with the target.
corr_Q_A['voted'].abs().sort_values(ascending=False)

### Q_E

In [None]:
# Names of the Q_E columns, followed by their summary statistics and first rows.
Q_E = meta.loc[meta['gbn'] == 'Q_E'].index
train[Q_E].describe()
train[Q_E].head()

In [None]:
# One grid per Q_E column, with a separate distribution panel for each value of `voted`.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm the installed version.
for f in Q_E:
    g = sns.FacetGrid(train, col='voted').map(sns.distplot, f)

In [None]:
# Print the observed minimum and maximum of every Q_E column.
for col_name in Q_E:
    col_values = train[col_name]
    print(col_name, col_values.min(), col_values.max())

In [None]:
# Correlation heatmap of the Q_E columns and the target.
colQ_Es = Q_E.tolist() + ['voted']

corr_Q_E = train[colQ_Es].corr()

# Hide the upper triangle (diagonal included). The builtin `bool` is used
# because the `np.bool` alias is deprecated and removed in newer NumPy.
mask = np.triu(np.ones(corr_Q_E.shape, dtype=bool))
plt.figure(figsize=(18, 12))
plt.title('colQ_ES', fontsize=18)

sns.heatmap(corr_Q_E, mask=mask, annot=True, cmap='RdYlGn', linewidths=0.2, annot_kws={'size': 10})
plt.show()

#### skew

In [None]:
# Replace every Q_E column with log(1 + x) in both frames.
# Not idempotent: running the cell again applies the transform a second time.
for frame in (train, test):
    for col_name in Q_E:
        frame[col_name] = np.log1p(frame[col_name])

### tp

In [None]:
# 0~7
# Names of the tp columns, followed by their summary statistics and first rows.
tp = meta.loc[meta['gbn'] == 'tp'].index
train[tp].describe()
train[tp].head(3)

In [None]:
# One grid per tp column, with a separate distribution panel for each value of `voted`.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm the installed version.
for f in tp:
    g = sns.FacetGrid(train, col='voted').map(sns.distplot, f)

In [None]:
# Correlation heatmap of the tp columns and the target.
col_tps = tp.tolist() + ['voted']

corr_tp = train[col_tps].corr()

# Hide the upper triangle (diagonal included). The builtin `bool` is used
# because the `np.bool` alias is deprecated and removed in newer NumPy.
mask = np.triu(np.ones(corr_tp.shape, dtype=bool))
plt.figure(figsize=(18, 12))
plt.title('col_tpS', fontsize=18)

sns.heatmap(corr_tp, mask=mask, annot=True, cmap='RdYlGn', linewidths=0.2, annot_kws={'size': 10})
plt.show()

In [None]:
# tp: counters for extreme answers (1 or 7) and for the middle answer (4),
# initialised to zero in both frames.
for frame in (train, test):
    for counter_col in ('tp17', 'tp4'):
        frame[counter_col] = 0

In [None]:
%%time
for f in tp:
    for i, v in enumerate(train[f]):
        if (v == 1 | v == 7):
            train.loc[i, 'tp17'] = train.loc[i, 'tp17']+1
        if v == 4:
            train.loc[i, 'tp4'] = train.loc[i, 'tp4']+1            

In [None]:
%%time
for f in tp:
    for i, v in enumerate(test[f]):
        if (v == 1 | v == 7):
            test.loc[i, 'tp17'] = test.loc[i, 'tp17']+1
        if v == 4:
            test.loc[i, 'tp4'] = test.loc[i, 'tp4']+1            

#### 극단값 중간값

In [None]:
# Current column names of the training frame (derived features included).
train.columns.values

In [None]:
# Combined counters: all extreme answers (Q_A 1/5 plus tp 1/7) and all middle
# answers (Q_A 3 plus tp 4).
train['QA15tp17'] = train['Q1'].add(train['Q5']).add(train['tp17'])
train['QA3tp4'] = train['Q3'].add(train['tp4'])

In [None]:
# Combined counters: all extreme answers (Q_A 1/5 plus tp 1/7) and all middle
# answers (Q_A 3 plus tp 4).
test['QA15tp17'] = test['Q1'].add(test['Q5']).add(test['tp17'])
test['QA3tp4'] = test['Q3'].add(test['tp4'])

#### ratio

In [None]:
# Allow up to 100 columns in DataFrame displays, then show the training frame.
pd.set_option('display.max_columns', 100)
train

In [None]:
# Scale the counters by a fixed divisor: 20 for the Q_A counters, 10 for the
# tp counters, 30 for the combined ones.
# Not idempotent: running the cell again divides a second time.
ratio_divisors = {
    'Q1': 20, 'Q3': 20, 'Q5': 20,
    'tp17': 10, 'tp4': 10,
    'QA15tp17': 30, 'QA3tp4': 30,
}
for counter_col, divisor in ratio_divisors.items():
    train[counter_col] = train[counter_col] / divisor

In [None]:
# Scale the counters by a fixed divisor: 20 for the Q_A counters, 10 for the
# tp counters, 30 for the combined ones.
# Not idempotent: running the cell again divides a second time.
ratio_divisors = {
    'Q1': 20, 'Q3': 20, 'Q5': 20,
    'tp17': 10, 'tp4': 10,
    'QA15tp17': 30, 'QA3tp4': 30,
}
for counter_col, divisor in ratio_divisors.items():
    test[counter_col] = test[counter_col] / divisor

In [None]:
# Distinct column groups recorded in the meta table.
meta.gbn.unique()

In [None]:
# 0/1
# Names of the wf columns, followed by their summary statistics and first rows.
wf = meta.loc[meta['gbn'] == 'wf'].index
train[wf].describe()
train[wf].head(3)

In [None]:
# 0/1
# Names of the wr columns, followed by their summary statistics and first rows.
wr = meta.loc[meta['gbn'] == 'wr'].index
train[wr].describe()
train[wr].head(3)

In [None]:
# Correlation of every numeric/boolean column with the target.
# Non-numeric columns are excluded explicitly: older pandas dropped them
# silently, newer pandas raises when `corr()` meets a string column.
corr = train.select_dtypes(include=['number', 'bool']).corr()
voted_corr_ranked = corr['voted'].abs().sort_values(ascending=False)
voted_corr_ranked[:20]  # strongest 20 (includes `voted` itself)
voted_corr_ranked[60:]  # weakest tail

### CAT EDA

In [None]:
# Columns that fell into the catch-all 'cat' group.
meta[meta.gbn == 'cat']

In [None]:
# Copy of the target with the value 2 recoded to 0, so that the mean of
# `voted0` is the share of rows with voted == 1.
# NOTE(review): `voted0` is derived from the target — keep it out of the
# feature set passed to any model.
train['voted0'] = train['voted'].replace(2, 0)
train['voted0'].value_counts()

#### age_group

In [None]:
def cat_summary(col):
    """Print the value counts of ``train[col]`` and return a styled crosstab.

    The crosstab tabulates ``train[col]`` against ``train['voted0']`` with
    margins and is shaded with the 'summer_r' colour map. Reads the global
    ``train`` frame.
    """
    print('colname: ', col)
    level_counts = train[col].value_counts()
    print(level_counts)
    table = pd.crosstab(train[col], train['voted0'], margins=True)
    return table.style.background_gradient(cmap='summer_r')

In [None]:
def cat_plot(col):
    """Bar-plot the mean of ``voted0`` per level of ``train[col]``, highest first.

    Prints the column name, reads the global ``train`` frame and returns the
    Axes produced by ``DataFrame.plot.bar``.
    """
    print('colname: ', col)
    mean_by_level = train[[col, 'voted0']].groupby([col], as_index=True).mean()
    ordered = mean_by_level.sort_values(by='voted0', ascending=False)
    return ordered.plot.bar()

In [None]:
# Show the value counts and the styled crosstab for every 'cat' column.
# The Styler returned by cat_summary must be displayed explicitly: values
# produced inside a loop body are not auto-displayed by the notebook.
for f in meta[meta.gbn == 'cat'].index:
    display(cat_summary(f))

In [None]:
# Draw the per-level mean of voted0 for every 'cat' column.
cat_cols = meta.loc[meta['gbn'] == 'cat'].index
for col_name in cat_cols:
    cat_plot(col_name)

## Target Encoding

### age_group

In [4]:
# Numeric version of age_group: remove every 's' from the label and cast to int.
for frame in (train, test):
    frame['age_group_int'] = frame['age_group'].str.replace('s', '').astype('int')

In [5]:
# Size of each age group, used as the sample count when smoothing the
# target mean. The counts come from train for BOTH frames: the means being
# smoothed are train statistics, so using test-set counts would give the
# same age group a different encoded value in train and in test.
age_group_counts = train.groupby('age_group_int').size()
train['age_group_n_rows'] = train['age_group_int'].map(age_group_counts)
test['age_group_n_rows'] = test['age_group_int'].map(age_group_counts)

In [6]:
# Mean of `voted` per age group, computed on train and mapped onto both frames.
age_group_mean = train['voted'].groupby(train['age_group_int']).mean()
for frame in (train, test):
    frame['age_group_mean'] = frame['age_group_int'].map(age_group_mean)

In [7]:
# train['voted'].mean() = 1.5468242115435298
def smoothing(n_rows, target_mean, prior_mean=1.5468242115435298, weight=0.8):
    """Shrink a per-category target mean towards a global prior.

    Computes ``(target_mean * n_rows + prior_mean * weight) / (n_rows + weight)``.
    Works element-wise on scalars, NumPy arrays and pandas Series.

    Parameters
    ----------
    n_rows : number of rows observed for the category.
    target_mean : mean of the target within the category.
    prior_mean : global mean the estimate is pulled towards; defaults to the
        overall mean of ``train['voted']`` noted above.
    weight : pseudo-count given to the prior; larger values shrink harder.

    Returns
    -------
    The smoothed mean, same shape as the inputs.
    """
    return (target_mean * n_rows + prior_mean * weight) / (n_rows + weight)

In [8]:
# Smoothed target encoding of the age group. `smoothing` is plain arithmetic,
# so it is applied to whole columns at once instead of through a row-wise
# `apply`, which builds one Series per row and is far slower.
train['age_group_mean_smoothing'] = smoothing(train['age_group_n_rows'], train['age_group_mean'])
test['age_group_mean_smoothing'] = smoothing(test['age_group_n_rows'], test['age_group_mean'])

In [9]:
# Remove the intermediate age-group columns; only the smoothed encoding is kept.
age_group_helper_cols = ['age_group_int', 'age_group_mean', 'age_group_n_rows']
for frame in (train, test):
    frame.drop(columns=age_group_helper_cols, inplace=True)

## Outlier

In [None]:
# Largest and mean family size in test, and the train mean over rows with familysize < 30.
test.familysize.max(), test.familysize.mean(), train[train.familysize < 30]['familysize'].mean()

In [None]:
# Training rows with a family size above 20, shown next to their target value.
train[train.familysize > 20][['voted', 'familysize']]

In [10]:
# Overwrite implausible family sizes (> 25) in train with the constant 2.6.
# NOTE(review): 2.6 is presumably the mean family size computed above — confirm.
familysize_outlier_idx = train.index[train.familysize > 25]
train.loc[familysize_outlier_idx, 'familysize'] = 2.6

In [None]:
# Frequency of each family size in train.
train['familysize'].value_counts()

## 인코딩

In [None]:
# Integer-encode gender, race and religion (age_group is handled separately).
# The code book is learned on train and reused for test. Factorizing each
# frame on its own numbers the categories by order of first appearance, so
# the same category could receive different codes in train and in test.
for col_name in ('gender', 'race', 'religion'):
    train_codes, categories = pd.factorize(train[col_name])
    train[col_name] = train_codes
    # Categories unseen in train (and missing values) are encoded as -1,
    # matching factorize's own sentinel.
    test_codes = pd.Categorical(test[col_name], categories=categories).codes
    test[col_name] = test_codes.astype(train_codes.dtype)

### One-hot

In [None]:
# One-hot encode the categorical, tp, wf and wr columns of the training frame.
onehot_cols = (
    ['age_group', 'education', 'engnat', 'gender', 'hand', 'married', 'race', 'religion']
    + ['tp%02d' % i for i in range(1, 11)]
    + ['urban']
    + ['wf_%02d' % i for i in range(1, 4)]
    + ['wr_%02d' % i for i in range(1, 14)]
)
train = pd.get_dummies(train, columns=onehot_cols)
train

In [None]:
# One-hot encode the same columns of the test frame.
test = pd.get_dummies(test, columns=['age_group', 'education', 'engnat', 'gender', 'hand',
       'married', 'race', 'religion', 'tp01', 'tp02', 'tp03', 'tp04', 'tp05',
       'tp06', 'tp07', 'tp08', 'tp09', 'tp10', 'urban', 'wf_01',
       'wf_02', 'wf_03', 'wr_01', 'wr_02', 'wr_03', 'wr_04', 'wr_05', 'wr_06',
       'wr_07', 'wr_08', 'wr_09', 'wr_10', 'wr_11', 'wr_12', 'wr_13'])

# Encoding the two frames independently yields different dummy columns when a
# level occurs in only one of them. Once train has been encoded (its raw
# 'age_group' column is gone), align test to train's feature columns: levels
# missing from test become all-zero columns, levels unknown to train are
# dropped, and the column order matches train.
if 'age_group' not in train.columns:
    feature_cols = [c for c in train.columns if c not in ('voted', 'voted0')]
    test = test.reindex(columns=feature_cols, fill_value=0)
test

In [None]:
# Shapes and column names of both frames after encoding.
train.shape, test.shape
train.columns.values
test.columns.values

# 3. 모델 학습

In [11]:
# Import only the pycaret functions used below; a star import hides where
# names come from and can shadow existing notebook variables.
from pycaret.classification import (
    setup,
    compare_models,
    blend_models,
    predict_model,
    finalize_model,
)

In [12]:
# Run a garbage-collection pass before model training.
import gc
gc.collect()

32

In [13]:
# Columns to declare as categorical for modelling: every Q_A item plus the 'cat' group.
is_categorical = meta['gbn'].isin(['Q_A', 'cat'])
col_cat = meta.loc[is_categorical].index.tolist()
col_cat

['QaA',
 'QbA',
 'QcA',
 'QdA',
 'QeA',
 'QfA',
 'QgA',
 'QhA',
 'QiA',
 'QjA',
 'QkA',
 'QlA',
 'QmA',
 'QnA',
 'QoA',
 'QpA',
 'QqA',
 'QrA',
 'QsA',
 'QtA',
 'age_group',
 'education',
 'engnat',
 'familysize',
 'gender',
 'hand',
 'married',
 'race',
 'religion',
 'urban']

In [None]:
# Derived count/ratio features; only referenced by a commented-out setup() option below.
col_numeric = ['Q1','Q3','Q5', 'tp17','tp4', 'QA15tp17', 'QA3tp4']
col_numeric

In [None]:
# Candidate feature groups to exclude (earlier experiments are kept commented out);
# currently nothing is ignored.
#col_ignore = meta[(meta.gbn == 'wf') | (meta.gbn == 'wr')].index.values.tolist()
#col_ignore = meta[(meta.gbn == 'Q_E')].index.values.tolist()
col_ignore = []

In [14]:
%%time
clf = setup(session_id=42, 
            data = train, target = 'voted', polynomial_features=True
          , categorical_features=col_cat
            #,numeric_features=col_numeric
            #,ignore_features =['voted0']
            ,feature_selection=True
            ,use_gpu=True
           )

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,42
1,Target Type,Binary
2,Label Encoded,"1: 0, 2: 1"
3,Original Data,"(45532, 78)"
4,Missing Values,False
5,Numeric Features,21
6,Categorical Features,56
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


Wall time: 1min 16s


In [15]:
# Compare the candidate models by AUC and keep the top five (top three was an earlier variant).
#best_3 = compare_models(sort = 'AUC', n_select = 3)
best_5 = compare_models(sort = 'AUC', n_select = 5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,CatBoost Classifier,0.6926,0.7634,0.6567,0.7499,0.7002,0.3875,0.3912,19.9118
1,Gradient Boosting Classifier,0.6917,0.7622,0.6394,0.7589,0.6939,0.3878,0.3937,26.2112
2,Light Gradient Boosting Machine,0.6922,0.7591,0.6483,0.7543,0.6973,0.3878,0.3925,1.2476
3,Linear Discriminant Analysis,0.6869,0.7586,0.6594,0.7399,0.6972,0.3754,0.3781,1.2126
4,Ada Boost Classifier,0.689,0.7541,0.6532,0.7464,0.6966,0.3805,0.3841,6.5495
5,Extra Trees Classifier,0.6873,0.7539,0.6304,0.7571,0.6879,0.3798,0.3863,3.4057
6,Extreme Gradient Boosting,0.6714,0.7389,0.6636,0.715,0.6883,0.3419,0.343,13.7655
7,Random Forest Classifier,0.6553,0.7108,0.5993,0.723,0.6553,0.3166,0.3222,0.3766
8,Decision Tree Classifier,0.6045,0.6004,0.6436,0.6369,0.6402,0.2011,0.2011,1.9989
9,K Neighbors Classifier,0.5128,0.5078,0.5835,0.5515,0.567,0.011,0.011,2.3693


In [16]:
# Blend the five selected models with soft voting, using 10 folds and optimising AUC.
blended = blend_models(estimator_list = best_5, fold = 10, method = 'soft', optimize='AUC')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6918,0.7642,0.6387,0.7596,0.6939,0.3881,0.3941
1,0.6976,0.7766,0.6659,0.7528,0.7067,0.3971,0.4003
2,0.6792,0.7507,0.6385,0.7392,0.6851,0.3617,0.3658
3,0.6973,0.7608,0.6406,0.7672,0.6982,0.3994,0.4059
4,0.689,0.7582,0.6485,0.749,0.6951,0.3811,0.3852
5,0.6886,0.7595,0.6449,0.7504,0.6937,0.3807,0.3853
6,0.691,0.7685,0.6463,0.7533,0.6958,0.3855,0.3902
7,0.712,0.7801,0.6822,0.7657,0.7215,0.4255,0.4285
8,0.7015,0.7662,0.6643,0.7596,0.7088,0.4053,0.4092
9,0.6944,0.7653,0.6593,0.7514,0.7023,0.3911,0.3946


In [17]:
# Score the blended model without passing data.
# NOTE(review): presumably this uses pycaret's internal hold-out split — confirm.
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.695,0.7656,0.6529,0.7561,0.7007,0.3932,0.3976


In [18]:
# Finalise the blended model.
# NOTE(review): presumably this refits on the full dataset, hold-out included — confirm.
final_model = finalize_model(blended)

In [19]:
# Predict on the test frame with the finalised model.
predictions = predict_model(final_model, data = test)

# 4. 예측

In [20]:
# Copy the predicted scores into the submission frame by position.
# Index-aligned assignment would silently write NaN wherever the index of
# `predictions` differs from the submission index, so the rows are matched
# positionally after checking that both frames have the same length.
# NOTE(review): what 'Score' means (positive-class probability or
# predicted-class probability) depends on the pycaret version — confirm.
assert len(predictions) == len(submission), 'prediction/submission row count mismatch'
submission['voted'] = predictions['Score'].to_numpy()

# 5. 제출

In [21]:
# Write the submission file; the frame's index is written as the first column.
submission_path = 'output/20201001-1-O-all.csv'
submission.to_csv(submission_path)