In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import train_test_split
import lightgbm as lgbm
from sklearn.metrics import confusion_matrix, accuracy_score
from pycaret.classification import *

In [2]:
seed = 42
np.random.seed(seed)
set_config('seed', seed)

# 1. 데이터 로드

In [3]:
train=pd.read_csv('input/train.csv', index_col=0) #인덱스 col=0도 check!
test=pd.read_csv('input/test_x.csv', index_col=0)
submission=pd.read_csv('input/sample_submission.csv', index_col=0)  

In [None]:
train.shape, test.shape, submission.shape
train.head(3)
test.head(3)
submission.head(3)

# 2. EDA & 전처리

In [None]:
df = pd.concat([train, test])
df.shape

In [None]:
train.info

## Missing Value

In [None]:
train.isnull().sum().sum()
test.isnull().sum().sum()

## 컬럼 분류

In [4]:
data = []

for f in train.columns:
    dtl = ''
    if f == 'voted':
        gubun = 'target'
    elif 'Q' in f:
        dtl = f[1]
        if 'A' in f:
            gubun = 'Q_A'
        if 'E' in f:
            gubun = 'Q_E'
    elif 'tp' in f:           
        gubun = 'tp'            
    elif 'wf' in f:           
        gubun = 'wf'
    elif 'wr' in f:           
        gubun = 'wr'    
    else:
        gubun = 'cat'
        
    f_dict = {
        'col':f,
        'gbn':gubun,
        'dtl':dtl
    }
            
    data.append(f_dict)
    
meta = pd.DataFrame(data, columns =['col', 'gbn', 'dtl'])
meta.set_index('col', inplace=True)
meta

Unnamed: 0_level_0,gbn,dtl
col,Unnamed: 1_level_1,Unnamed: 2_level_1
QaA,Q_A,a
QaE,Q_E,a
QbA,Q_A,b
QbE,Q_E,b
QcA,Q_A,c
...,...,...
wr_09,wr,
wr_10,wr,
wr_11,wr,
wr_12,wr,


### Q_A

In [5]:
Q_A = meta[meta.gbn=='Q_A'].index

train[Q_A].describe()
train[Q_A].head(3)

Unnamed: 0,QaA,QbA,QcA,QdA,QeA,QfA,QgA,QhA,QiA,QjA,QkA,QlA,QmA,QnA,QoA,QpA,QqA,QrA,QsA,QtA
count,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0
mean,2.129535,2.904463,3.662347,1.749078,2.317952,2.168145,2.560595,2.317952,3.157691,3.575529,3.813933,4.325727,3.50962,2.632896,3.217232,3.071752,2.840464,2.823487,3.432948,3.225687
std,1.196952,1.566142,1.431494,1.043625,1.369205,1.348653,1.5106,1.460813,1.48174,1.36122,1.372551,1.146992,1.375134,1.473022,1.476414,1.489744,1.404342,1.424383,1.374938,1.521603
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,3.0,4.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
50%,2.0,3.0,4.0,1.0,2.0,2.0,2.0,2.0,3.0,4.0,4.0,5.0,4.0,2.0,4.0,3.0,3.0,3.0,4.0,4.0
75%,3.0,4.0,5.0,2.0,3.0,3.0,4.0,4.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,4.0,4.0,4.0,5.0,5.0
max,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


Unnamed: 0_level_0,QaA,QbA,QcA,QdA,QeA,QfA,QgA,QhA,QiA,QjA,QkA,QlA,QmA,QnA,QoA,QpA,QqA,QrA,QsA,QtA
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,3.0,4.0,5.0,1.0,2.0,5.0,2.0,4.0,5.0,4.0,4.0,4.0,2.0,5.0,2.0,2.0,2.0,2.0,2.0,1.0
1,5.0,5.0,3.0,5.0,1.0,3.0,1.0,1.0,5.0,3.0,5.0,4.0,1.0,5.0,1.0,5.0,5.0,1.0,4.0,1.0
2,4.0,1.0,1.0,4.0,5.0,1.0,4.0,1.0,3.0,2.0,5.0,5.0,2.0,4.0,2.0,1.0,5.0,4.0,1.0,1.0


### Q_A 1/5 count

In [6]:
# Q_A: 1~5 1/5 극단값 선택한 count 추출
train['Q_A1'] = 0
train['Q_A3'] = 0
train['Q_A5'] = 0

test['Q_A1'] = 0
test['Q_A3'] = 0
test['Q_A5'] = 0

In [7]:
%%time
for f in Q_A:
    for i, v in enumerate(train[f]):
        if v == 1:
            train.loc[i, 'Q_A1'] = train.loc[i, 'Q_A1']+1
        elif v == 3:
            train.loc[i, 'Q_A3'] = train.loc[i, 'Q_A3']+1
        elif v == 5:
            train.loc[i, 'Q_A5'] = train.loc[i, 'Q_A5']+1            

Wall time: 2min 41s


In [8]:
%%time
for f in Q_A:
    for i, v in enumerate(test[f]):
        if v == 1:
            test.loc[i, 'Q_A1'] = test.loc[i, 'Q_A1']+1
        elif v == 3:
            test.loc[i, 'Q_A3'] = test.loc[i, 'Q_A3']+1            
        elif v == 5:
            test.loc[i, 'Q_A5'] = test.loc[i, 'Q_A5']+1                        

Wall time: 34.3 s


### Q_E

In [25]:
Q_E = meta[meta.gbn == 'Q_E'].index

train[Q_E].describe()
train[Q_E].head()

Unnamed: 0,QaE,QbE,QcE,QdE,QeE,QfE,QgE,QhE,QiE,QjE,QkE,QlE,QmE,QnE,QoE,QpE,QqE,QrE,QsE,QtE
count,45524.0,45524.0,45524.0,45524.0,45524.0,45524.0,45524.0,45524.0,45524.0,45524.0,45524.0,45524.0,45524.0,45524.0,45524.0,45524.0,45524.0,45524.0,45524.0,45524.0
mean,945.4113,2189.746,1484.279677,-1490.783,-1899.36,-1850.747,1284.509,4584.571,-4068.621,2576.648,-2038.927,-3741.335,2050.478,-959.87053,1435.858,1633.567,-1154.297,-2471.007,1163.993,-2240.197
std,13076.79,33513.21,8978.442356,10923.55,16709.11,76089.04,9757.418,502070.6,114183.8,54119.07,30694.33,366399.4,75314.48,5870.444064,10850.92,26004.49,22508.85,148302.0,7434.122,67315.15
min,25.0,25.0,25.0,-1552821.0,-1919926.0,-11763700.0,25.0,0.0,-17035860.0,0.0,-4824953.0,-77543510.0,25.0,-606694.0,0.0,0.0,-4277510.0,-31593280.0,25.0,-11556500.0
25%,404.0,875.0,651.0,-1355.0,-1656.0,-1078.0,549.0,1077.0,-2760.0,902.0,-1721.0,-1649.0,650.0,-891.0,638.0,706.0,-982.0,-1664.25,527.0,-1526.0
50%,557.0,1218.0,899.0,-931.0,-1154.0,-712.0,772.0,1464.0,-1800.0,1260.0,-1178.0,-1125.0,906.0,-596.0,883.0,970.0,-652.0,-1117.0,747.0,-1020.0
75%,827.0,1838.0,1335.0,-679.75,-834.0,-504.0,1149.25,2089.0,-1259.0,1899.0,-859.0,-806.0,1342.0,-428.0,1300.0,1409.0,-461.0,-794.0,1124.0,-733.0
max,2413960.0,5580395.0,871557.0,-26.0,-25.0,-25.0,1068252.0,107086000.0,0.0,9910006.0,0.0,-25.0,14108320.0,-25.0,1309739.0,4650742.0,0.0,-25.0,1286581.0,-25.0


Unnamed: 0_level_0,QaE,QbE,QcE,QdE,QeE,QfE,QgE,QhE,QiE,QjE,QkE,QlE,QmE,QnE,QoE,QpE,QqE,QrE,QsE,QtE
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,363,1370,997,-1024,-1577,-539,586,1095,-1142,1287,-883,-851,851,-816,579,924,-366,-876,633,-1115
1,647,1313,3387,-2969,-4320,-2190,826,4082,-1867,1264,-2943,-3927,4329,-1828,1214,2414,-1356,-3039,4304,-1346
2,1623,1480,1021,-3374,-1333,-531,1167,1016,-2653,1569,-998,-2547,918,-2153,1304,1131,-937,-1327,1170,-1409
3,504,2311,992,-3245,-357,-1519,159,2275,-2809,5614,-3219,-1296,9046,-1216,1169,23868,-581,-8830,2392,-1312
4,927,707,556,-1062,-1014,-628,991,1259,-1153,1388,-740,-1181,547,-575,754,1140,-323,-1070,583,-1889


In [23]:
train['QdE'] = train['QdE']*-1
train['QeE'] = train['QeE']*-1
train['QfE'] = train['QfE']*-1
train['QiE'] = train['QiE']*-1
train['QkE'] = train['QkE']*-1
train['QlE'] = train['QlE']*-1
train['QnE'] = train['QnE']*-1
train['QqE'] = train['QqE']*-1
train['QrE'] = train['QrE']*-1
train['QtE'] = train['QtE']*-1 

In [24]:
test['QdE'] = test['QdE']*-1
test['QeE'] = test['QeE']*-1
test['QfE'] = test['QfE']*-1
test['QiE'] = test['QiE']*-1
test['QkE'] = test['QkE']*-1
test['QlE'] = test['QlE']*-1
test['QnE'] = test['QnE']*-1
test['QqE'] = test['QqE']*-1
test['QrE'] = test['QrE']*-1
test['QtE'] = test['QtE']*-1 

### tp

In [10]:
# 0~7 
tp = meta[meta.gbn=='tp'].index

train[tp].describe()
train[tp].head(3)

Unnamed: 0,tp01,tp02,tp03,tp04,tp05,tp06,tp07,tp08,tp09,tp10
count,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0,45532.0
mean,3.017241,2.621189,1.768712,3.041246,1.335918,2.472217,2.031099,3.507226,2.277497,4.317996
std,2.007713,1.897081,1.685069,2.037278,1.452544,2.000658,1.785824,1.971456,1.848783,1.722793
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,0.0,1.0,0.0,1.0,1.0,2.0,1.0,3.0
50%,3.0,2.0,1.0,3.0,1.0,2.0,2.0,4.0,2.0,5.0
75%,5.0,4.0,3.0,5.0,2.0,4.0,3.0,5.0,4.0,6.0
max,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0


Unnamed: 0_level_0,tp01,tp02,tp03,tp04,tp05,tp06,tp07,tp08,tp09,tp10
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2,2,2,1,2,1,7,4,4,3
1,1,1,0,0,1,2,3,4,0,4
2,2,3,1,5,3,4,2,6,1,3


In [11]:
# tp: 1,7 극단값 선택한 count 추출, 4 중간값 선택 count 
train['tp17'] = 0
train['tp4'] = 0

test['tp17'] = 0
test['tp4'] = 0

In [12]:
%%time
for f in tp:
    for i, v in enumerate(train[f]):
        if (v == 1 | v == 7):
            train.loc[i, 'tp17'] = train.loc[i, 'tp17']+1
        if v == 4:
            train.loc[i, 'tp4'] = train.loc[i, 'tp4']+1            

Wall time: 14.8 s


In [13]:
%%time
for f in tp:
    for i, v in enumerate(test[f]):
        if (v == 1 | v == 7):
            test.loc[i, 'tp17'] = test.loc[i, 'tp17']+1
        if v == 4:
            test.loc[i, 'tp4'] = test.loc[i, 'tp4']+1            

Wall time: 3.42 s


#### 극단값 중간값

In [14]:
train['Q_A15tp17'] = train['Q_A1'] + train['Q_A5'] + train['tp17']
train['Q_A3tp4'] = train['Q_A3'] + train['tp4']

In [15]:
test['Q_A15tp17'] = test['Q_A1'] + test['Q_A5'] + test['tp17']
test['Q_A3tp4'] = test['Q_A3'] + test['tp4']

In [None]:
# 0/1
wf = meta[meta.gbn=='wf'].index
train[wf].describe()
train[wf].head(3)

In [None]:
# 0/1
wr = meta[meta.gbn=='wr'].index
train[wr].describe()
train[wr].head(3)

## Target Encoding

### age_group

In [16]:
train['age_group_int'] = train['age_group'].str.replace('s','').astype('int')
test['age_group_int'] = test['age_group'].str.replace('s','').astype('int')

In [17]:
train['age_group_n_rows'] = train['age_group_int'].map(train.groupby('age_group_int').size())
test['age_group_n_rows'] = test['age_group_int'].map(test.groupby('age_group_int').size())

In [18]:
age_group_mean = train.groupby('age_group_int')['voted'].mean()
train['age_group_mean'] = train['age_group_int'].map(age_group_mean)
test['age_group_mean'] = test['age_group_int'].map(age_group_mean)

In [19]:
# train['voted'].mean() = 1.5468242115435298
def smoothing(n_rows, target_mean): 
    return (target_mean*n_rows + 1.5468242115435298*0.8) / (n_rows + 0.8)

In [20]:
train['age_group_mean_smoothing'] = train.apply(lambda x:smoothing(x['age_group_n_rows'], x['age_group_mean']), axis=1)
test['age_group_mean_smoothing'] = test.apply(lambda x:smoothing(x['age_group_n_rows'], x['age_group_mean']), axis=1)

In [21]:
train.drop(['age_group_int','age_group_mean', 'age_group_n_rows'], axis=1, inplace=True)
test.drop(['age_group_int','age_group_mean', 'age_group_n_rows'], axis=1, inplace=True)

## Outlier

In [22]:
train = train[train.familysize < 25]

# 3. 모델 학습

In [26]:
import gc
gc.collect()

32

In [None]:
train.columns

In [27]:
col_cat = meta[(meta.gbn == 'Q_A') | (meta.gbn == 'cat')].index.values.tolist()
col_cat

['QaA',
 'QbA',
 'QcA',
 'QdA',
 'QeA',
 'QfA',
 'QgA',
 'QhA',
 'QiA',
 'QjA',
 'QkA',
 'QlA',
 'QmA',
 'QnA',
 'QoA',
 'QpA',
 'QqA',
 'QrA',
 'QsA',
 'QtA',
 'age_group',
 'education',
 'engnat',
 'familysize',
 'gender',
 'hand',
 'married',
 'race',
 'religion',
 'urban']

In [28]:
%%time
clf = setup(session_id=seed, 
            data = train, target = 'voted'
          , categorical_features=col_cat
          , numeric_features = ['Q_A1','Q_A3','Q_A5','tp17','tp4','Q_A15tp17','Q_A3tp4']
          #, ignore_features =['']
           )

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,42
1,Target Type,Binary
2,Label Encoded,"1: 0, 2: 1"
3,Original Data,"(45524, 85)"
4,Missing Values,False
5,Numeric Features,28
6,Categorical Features,56
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


Wall time: 26.4 s


In [29]:
best_5 = compare_models(sort = 'AUC', n_select = 5
                       ,exclude=['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'ridge','qda'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Gradient Boosting Classifier,0.6959,0.7677,0.6493,0.7597,0.7001,0.3955,0.4006,21.0841
1,CatBoost Classifier,0.6929,0.7643,0.6565,0.7507,0.7004,0.3883,0.392,18.7859
2,Light Gradient Boosting Machine,0.6911,0.7635,0.649,0.7521,0.6967,0.3854,0.3897,1.0837
3,Linear Discriminant Analysis,0.6903,0.7626,0.6634,0.7428,0.7008,0.382,0.3846,1.0427
4,Extra Trees Classifier,0.6907,0.7607,0.6415,0.7558,0.6939,0.3855,0.3908,2.8497
5,Ada Boost Classifier,0.6894,0.7585,0.6555,0.7457,0.6977,0.381,0.3843,5.2712
6,Extreme Gradient Boosting,0.674,0.7444,0.6665,0.7173,0.6909,0.347,0.3481,12.0034
7,Random Forest Classifier,0.661,0.7153,0.6075,0.7276,0.6621,0.3274,0.3329,0.3273


In [30]:
blended = blend_models(estimator_list = best_5, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6986,0.7682,0.657,0.7594,0.7045,0.4002,0.4045
1,0.7048,0.7731,0.6635,0.7655,0.7109,0.4124,0.4168
2,0.6927,0.7726,0.6507,0.7536,0.6983,0.3885,0.3929
3,0.6956,0.7649,0.6548,0.7558,0.7017,0.3942,0.3984
4,0.6924,0.7678,0.649,0.7542,0.6977,0.3883,0.3928
Mean,0.6968,0.7693,0.655,0.7577,0.7026,0.3967,0.4011
SD,0.0046,0.0031,0.0051,0.0044,0.0048,0.009,0.009


In [31]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.6957,0.7673,0.6509,0.7584,0.7006,0.3949,0.3997


In [32]:
final_model = finalize_model(blended)

In [33]:
predictions = predict_model(final_model, data = test)

# 4. 예측

In [34]:
submission['voted'] = predictions['Score']

# 5. 제출

In [35]:
submission.to_csv('output/20201006-1.csv')

In [None]:
gbc = create_model('gbc')

In [None]:
plot_model(estimator = gbc, plot = 'feature')

In [None]:
lightgbm = create_model('lightgbm')

In [None]:
plot_model(estimator = lightgbm, plot = 'feature')