In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

np.random.seed(42)

from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import train_test_split
import lightgbm as lgbm
from sklearn.metrics import confusion_matrix, accuracy_score

# 1. 데이터 로드

In [2]:
train=pd.read_csv('input/train.csv', index_col=0) #인덱스 col=0도 check!
test=pd.read_csv('input/test_x.csv', index_col=0)
submission=pd.read_csv('input/sample_submission.csv', index_col=0)  

In [None]:
train.shape, test.shape, submission.shape
train.head(3)
test.head(3)
submission.head(3)

# 2. EDA & 전처리

In [None]:
df = pd.concat([train, test])
df.shape

In [None]:
train.info

## Missing Value

In [None]:
train.isnull().sum().sum()
test.isnull().sum().sum()

## 컬럼 분류

In [3]:
data = []

for f in train.columns:
    dtl = ''
    if f == 'voted':
        gubun = 'target'
    elif 'Q' in f:
        dtl = f[1]
        if 'A' in f:
            gubun = 'Q_A'
        if 'E' in f:
            gubun = 'Q_E'
    elif 'tp' in f:           
        gubun = 'tp'            
    elif 'wf' in f:           
        gubun = 'wf'
    elif 'wr' in f:           
        gubun = 'wr'    
    else:
        gubun = 'cat'
        
    f_dict = {
        'col':f,
        'gbn':gubun,
        'dtl':dtl
    }
            
    data.append(f_dict)
    
meta = pd.DataFrame(data, columns =['col', 'gbn', 'dtl'])
meta.set_index('col', inplace=True)
meta

Unnamed: 0_level_0,gbn,dtl
col,Unnamed: 1_level_1,Unnamed: 2_level_1
QaA,Q_A,a
QaE,Q_E,a
QbA,Q_A,b
QbE,Q_E,b
QcA,Q_A,c
...,...,...
wr_09,wr,
wr_10,wr,
wr_11,wr,
wr_12,wr,


### Q_A

In [4]:
Q_A = meta[meta.gbn=='Q_A'].index

train[Q_A].describe()
train[Q_A].head(3)

### Q_A 1/5 count

In [5]:
# Q_A: 1~5 1/5 극단값 선택한 count 추출
train['Q_A1'] = 0
train['Q_A3'] = 0
train['Q_A5'] = 0

test['Q_A1'] = 0
test['Q_A3'] = 0
test['Q_A5'] = 0

In [6]:
%%time
for f in Q_A:
    for i, v in enumerate(train[f]):
        if v == 1:
            train.loc[i, 'Q_A1'] = train.loc[i, 'Q_A1']+1
        elif v == 3:
            train.loc[i, 'Q_A3'] = train.loc[i, 'Q_A3']+1
        elif v == 5:
            train.loc[i, 'Q_A5'] = train.loc[i, 'Q_A5']+1            

Wall time: 2min 34s


In [7]:
%%time
for f in Q_A:
    for i, v in enumerate(test[f]):
        if v == 1:
            test.loc[i, 'Q_A1'] = test.loc[i, 'Q_A1']+1
        elif v == 3:
            test.loc[i, 'Q_A3'] = test.loc[i, 'Q_A3']+1            
        elif v == 5:
            test.loc[i, 'Q_A5'] = test.loc[i, 'Q_A5']+1                        

Wall time: 35.1 s


### Q_E

In [8]:
Q_E = meta[meta.gbn == 'Q_E'].index

train[Q_E].describe()
train[Q_E].head()

#### skew

In [9]:
for f in Q_E:
    train[f] = np.log1p(train[f])
    test[f] = np.log1p(test[f])

### tp

In [10]:
# 0~7 
tp = meta[meta.gbn=='tp'].index

train[tp].describe()
train[tp].head(3)

In [11]:
# tp: 1,7 극단값 선택한 count 추출, 4 중간값 선택 count 
train['tp17'] = 0
train['tp4'] = 0

test['tp17'] = 0
test['tp4'] = 0

In [12]:
%%time
for f in tp:
    for i, v in enumerate(train[f]):
        if (v == 1 | v == 7):
            train.loc[i, 'tp17'] = train.loc[i, 'tp17']+1
        if v == 4:
            train.loc[i, 'tp4'] = train.loc[i, 'tp4']+1            

Wall time: 15.1 s


In [13]:
%%time
for f in tp:
    for i, v in enumerate(test[f]):
        if (v == 1 | v == 7):
            test.loc[i, 'tp17'] = test.loc[i, 'tp17']+1
        if v == 4:
            test.loc[i, 'tp4'] = test.loc[i, 'tp4']+1            

Wall time: 3.41 s


#### 극단값 중간값

In [14]:
train['Q_A15tp17'] = train['Q_A1'] + train['Q_A5'] + train['tp17']
train['Q_A3tp4'] = train['Q_A3'] + train['tp4']

In [15]:
test['Q_A15tp17'] = test['Q_A1'] + test['Q_A5'] + test['tp17']
test['Q_A3tp4'] = test['Q_A3'] + test['tp4']

#### ratio

In [16]:
train['Q_A1'] = train['Q_A1'] /20
train['Q_A3'] = train['Q_A3'] /20
train['Q_A5'] = train['Q_A5'] /20

train['tp17'] = train['tp17'] /10
train['tp4'] = train['tp4'] /10

train['Q_A15tp17'] = train['Q_A15tp17'] /30
train['Q_A3tp4'] = train['Q_A3tp4'] /30

In [17]:
test['Q_A1'] = test['Q_A1'] /20
test['Q_A3'] = test['Q_A3'] /20
test['Q_A5'] = test['Q_A5'] /20

test['tp17'] = test['tp17'] /10
test['tp4'] = test['tp4'] /10

test['Q_A15tp17'] = test['Q_A15tp17'] /30
test['Q_A3tp4'] = test['Q_A3tp4'] /30

In [None]:
# 0/1
wf = meta[meta.gbn=='wf'].index
train[wf].describe()
train[wf].head(3)

In [None]:
# 0/1
wr = meta[meta.gbn=='wr'].index
train[wr].describe()
train[wr].head(3)

## Target Encoding

### age_group

In [18]:
train['age_group_int'] = train['age_group'].str.replace('s','').astype('int')
test['age_group_int'] = test['age_group'].str.replace('s','').astype('int')

In [19]:
train['age_group_n_rows'] = train['age_group_int'].map(train.groupby('age_group_int').size())
test['age_group_n_rows'] = test['age_group_int'].map(test.groupby('age_group_int').size())

In [20]:
age_group_mean = train.groupby('age_group_int')['voted'].mean()
train['age_group_mean'] = train['age_group_int'].map(age_group_mean)
test['age_group_mean'] = test['age_group_int'].map(age_group_mean)

In [21]:
# train['voted'].mean() = 1.5468242115435298
def smoothing(n_rows, target_mean): 
    return (target_mean*n_rows + 1.5468242115435298*0.8) / (n_rows + 0.8)

In [22]:
train['age_group_mean_smoothing'] = train.apply(lambda x:smoothing(x['age_group_n_rows'], x['age_group_mean']), axis=1)
test['age_group_mean_smoothing'] = test.apply(lambda x:smoothing(x['age_group_n_rows'], x['age_group_mean']), axis=1)

In [23]:
train.drop(['age_group_int','age_group_mean', 'age_group_n_rows'], axis=1, inplace=True)
test.drop(['age_group_int','age_group_mean', 'age_group_n_rows'], axis=1, inplace=True)

## Outlier

In [24]:
train.loc[train[train.familysize > 25].index, 'familysize'] = 2.6

## 인코딩

In [25]:
#age_group, gender, race, religion
#train['age_group'] = train['age_group'].str.replace('s','').astype('int')
train['gender']= pd.factorize(train['gender'])[0]
train['race']=pd.factorize(train['race'])[0]
train['religion']=pd.factorize(train['religion'])[0]

#test['age_group'] = test['age_group'].str.replace('s','').astype('int')
test['gender']=pd.factorize(test['gender'])[0]
test['race']=pd.factorize(test['race'])[0]
test['religion']=pd.factorize(test['religion'])[0]

### One-hot

In [None]:
train = pd.get_dummies(train, columns=['age_group', 'education', 'engnat', 'gender', 'hand',
       'married', 'race', 'religion', 'tp01', 'tp02', 'tp03', 'tp04', 'tp05',
       'tp06', 'tp07', 'tp08', 'tp09', 'tp10', 'urban', 'wf_01',
       'wf_02', 'wf_03', 'wr_01', 'wr_02', 'wr_03', 'wr_04', 'wr_05', 'wr_06',
       'wr_07', 'wr_08', 'wr_09', 'wr_10', 'wr_11', 'wr_12', 'wr_13'])
train

In [None]:
test = pd.get_dummies(test, columns=['age_group', 'education', 'engnat', 'gender', 'hand',
       'married', 'race', 'religion', 'tp01', 'tp02', 'tp03', 'tp04', 'tp05',
       'tp06', 'tp07', 'tp08', 'tp09', 'tp10', 'urban', 'wf_01',
       'wf_02', 'wf_03', 'wr_01', 'wr_02', 'wr_03', 'wr_04', 'wr_05', 'wr_06',
       'wr_07', 'wr_08', 'wr_09', 'wr_10', 'wr_11', 'wr_12', 'wr_13'])
test

In [31]:
train.shape, test.shape
train.columns.values
test.columns.values

((45532, 491), (11383, 490))

array(['QaA', 'QaE', 'QbA', 'QbE', 'QcA', 'QcE', 'QdA', 'QdE', 'QeA',
       'QeE', 'QfA', 'QfE', 'QgA', 'QgE', 'QhA', 'QhE', 'QiA', 'QiE',
       'QjA', 'QjE', 'QkA', 'QkE', 'QlA', 'QlE', 'QmA', 'QmE', 'QnA',
       'QnE', 'QoA', 'QoE', 'QpA', 'QpE', 'QqA', 'QqE', 'QrA', 'QrE',
       'QsA', 'QsE', 'QtA', 'QtE', 'age_group', 'education', 'engnat',
       'familysize', 'gender', 'hand', 'married', 'race', 'religion',
       'tp01', 'tp02', 'tp03', 'tp04', 'tp05', 'tp06', 'tp07', 'tp08',
       'tp09', 'tp10', 'urban', 'voted', 'wf_01', 'wf_02', 'wf_03',
       'wr_01', 'wr_02', 'wr_03', 'wr_04', 'wr_05', 'wr_06', 'wr_07',
       'wr_08', 'wr_09', 'wr_10', 'wr_11', 'wr_12', 'wr_13', 'Q_A1',
       'Q_A3', 'Q_A5', 'tp17', 'tp4', 'Q_A15tp17', 'Q_A3tp4',
       'age_group_mean_smoothing', 'QaE^2', 'QaE QbE', 'QaE QcE',
       'QaE QdE', 'QaE QeE', 'QaE QfE', 'QaE QgE', 'QaE QhE', 'QaE QiE',
       'QaE QjE', 'QaE QkE', 'QaE QlE', 'QaE QmE', 'QaE QnE', 'QaE QoE',
       'QaE QpE', 'QaE QqE'

array(['QaA', 'QaE', 'QbA', 'QbE', 'QcA', 'QcE', 'QdA', 'QdE', 'QeA',
       'QeE', 'QfA', 'QfE', 'QgA', 'QgE', 'QhA', 'QhE', 'QiA', 'QiE',
       'QjA', 'QjE', 'QkA', 'QkE', 'QlA', 'QlE', 'QmA', 'QmE', 'QnA',
       'QnE', 'QoA', 'QoE', 'QpA', 'QpE', 'QqA', 'QqE', 'QrA', 'QrE',
       'QsA', 'QsE', 'QtA', 'QtE', 'age_group', 'education', 'engnat',
       'familysize', 'gender', 'hand', 'married', 'race', 'religion',
       'tp01', 'tp02', 'tp03', 'tp04', 'tp05', 'tp06', 'tp07', 'tp08',
       'tp09', 'tp10', 'urban', 'wf_01', 'wf_02', 'wf_03', 'wr_01',
       'wr_02', 'wr_03', 'wr_04', 'wr_05', 'wr_06', 'wr_07', 'wr_08',
       'wr_09', 'wr_10', 'wr_11', 'wr_12', 'wr_13', 'Q_A1', 'Q_A3',
       'Q_A5', 'tp17', 'tp4', 'Q_A15tp17', 'Q_A3tp4',
       'age_group_mean_smoothing', 'QaE^2', 'QaE QbE', 'QaE QcE',
       'QaE QdE', 'QaE QeE', 'QaE QfE', 'QaE QgE', 'QaE QhE', 'QaE QiE',
       'QaE QjE', 'QaE QkE', 'QaE QlE', 'QaE QmE', 'QaE QnE', 'QaE QoE',
       'QaE QpE', 'QaE QqE', 'QaE Qr

In [28]:
v = meta[meta.gbn=='Q_E'].index.values.tolist()
v.extend(['Q_A1', 'Q_A3',
       'Q_A5', 'tp17', 'tp4', 'Q_A15tp17', 'Q_A3tp4',
       'age_group_mean_smoothing'])
v

['QaE',
 'QbE',
 'QcE',
 'QdE',
 'QeE',
 'QfE',
 'QgE',
 'QhE',
 'QiE',
 'QjE',
 'QkE',
 'QlE',
 'QmE',
 'QnE',
 'QoE',
 'QpE',
 'QqE',
 'QrE',
 'QsE',
 'QtE',
 'Q_A1',
 'Q_A3',
 'Q_A5',
 'tp17',
 'tp4',
 'Q_A15tp17',
 'Q_A3tp4',
 'age_group_mean_smoothing']

In [29]:
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
interactions = pd.DataFrame(data=poly.fit_transform(train[v]), columns=poly.get_feature_names(v))
interactions.drop(v, axis=1, inplace=True) 
train = pd.concat([train, interactions], axis=1)

In [30]:
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
interactions = pd.DataFrame(data=poly.fit_transform(test[v]), columns=poly.get_feature_names(v))
interactions.drop(v, axis=1, inplace=True) 
test = pd.concat([test, interactions], axis=1)

# 3. 모델 학습

In [32]:
from pycaret.classification import *

In [37]:
set_config('seed', 42)

In [40]:
import gc
gc.collect()

196

In [34]:
col_cat = meta[(meta.gbn == 'Q_A') | (meta.gbn == 'cat')].index.values.tolist()
col_cat

['QaA',
 'QbA',
 'QcA',
 'QdA',
 'QeA',
 'QfA',
 'QgA',
 'QhA',
 'QiA',
 'QjA',
 'QkA',
 'QlA',
 'QmA',
 'QnA',
 'QoA',
 'QpA',
 'QqA',
 'QrA',
 'QsA',
 'QtA',
 'age_group',
 'education',
 'engnat',
 'familysize',
 'gender',
 'hand',
 'married',
 'race',
 'religion',
 'urban']

In [38]:
%%time
clf = setup(session_id=42, 
            data = train, target = 'voted'
          , categorical_features=col_cat
           )

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,42
1,Target Type,Binary
2,Label Encoded,"1: 0, 2: 1"
3,Original Data,"(45532, 491)"
4,Missing Values,False
5,Numeric Features,434
6,Categorical Features,56
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


Wall time: 55.8 s


In [41]:
best_5 = compare_models(sort = 'AUC', n_select = 5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Gradient Boosting Classifier,0.6925,0.7631,0.6328,0.7643,0.6923,0.3904,0.3975,304.8568
1,CatBoost Classifier,0.6904,0.7613,0.6497,0.7505,0.6964,0.3838,0.388,85.4254
2,Light Gradient Boosting Machine,0.6929,0.7612,0.6425,0.7589,0.6958,0.3901,0.3957,10.6799
3,Linear Discriminant Analysis,0.688,0.757,0.665,0.7384,0.6997,0.3768,0.3791,4.4843
4,Ada Boost Classifier,0.6895,0.754,0.6511,0.7484,0.6963,0.3818,0.3857,61.718
5,Extra Trees Classifier,0.6814,0.748,0.6156,0.7565,0.6787,0.3693,0.3771,7.9966
6,Extreme Gradient Boosting,0.6705,0.7372,0.6622,0.7144,0.6872,0.3401,0.3412,76.6419
7,Logistic Regression,0.6768,0.7364,0.6101,0.7521,0.6736,0.3603,0.3682,3.1946
8,Naive Bayes,0.6667,0.7162,0.6315,0.7236,0.6743,0.3362,0.3396,0.4514
9,Random Forest Classifier,0.6489,0.6999,0.5942,0.7155,0.6491,0.3038,0.3091,1.6337


In [42]:
blended = blend_models(estimator_list = best_5, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6943,0.7706,0.6489,0.7573,0.6989,0.3921,0.397
1,0.6855,0.7542,0.6268,0.7563,0.6855,0.3764,0.3831
2,0.7016,0.7673,0.6601,0.7624,0.7076,0.4062,0.4106
3,0.6941,0.7606,0.6433,0.7603,0.6969,0.3924,0.398
4,0.6945,0.7709,0.6551,0.754,0.7011,0.3919,0.396
Mean,0.694,0.7647,0.6468,0.758,0.698,0.3918,0.3969
SD,0.0051,0.0064,0.0115,0.003,0.0072,0.0094,0.0087


In [43]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.6955,0.768,0.66,0.7527,0.7033,0.3933,0.3968


In [44]:
final_model = finalize_model(blended)

In [45]:
predictions = predict_model(final_model, data = test)

# 4. 예측

In [46]:
submission['voted'] = predictions['Score']

# 5. 제출

In [47]:
submission.to_csv('output/20201002-1-all.csv')