In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every Python warning for the rest of the session. This also hides
# deprecation notices from the libraries used below; re-enable when debugging.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the cell output.
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import train_test_split
import lightgbm as lgbm
from sklearn.metrics import confusion_matrix, accuracy_score
from pycaret.classification import *

In [2]:
# Single seed shared by NumPy and the modelling library for reproducibility.
seed = 42
np.random.seed(seed)
# NOTE(review): set_config is presumably PyCaret's (it arrives through the star
# import) and is called here before setup(); setup() below is also given
# session_id=seed — confirm this call is still needed.
set_config('seed', seed)

# 1. 데이터 로드

In [3]:
# Load the three competition files; in each one the first CSV column becomes
# the row index. (na_values='?' was tried earlier and is left disabled.)
train, test, submission = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'input/train.csv',
        'input/test.csv',
        'input/sample_submission.csv',
    )
)

In [4]:
# Turn the income label into integer codes. With sort=True the codes follow
# the sorted order of the distinct labels; income_classes keeps those labels.
income_codes, income_classes = pd.factorize(train['income'], sort=True)
train['income'] = income_codes

# 2. EDA & 전처리

## Missing Value

### education

In [5]:
# Remove the 'education' column from both frames, in place.
# NOTE(review): presumably redundant with education_num — confirm.
for frame in (train, test):
    frame.drop(columns=['education'], inplace=True)

### capital_gain 99999

In [None]:
# Replace the capital_gain sentinel 99999 with the mean capital_gain of the
# row's education_num group. Group means are computed over train and test
# together, with sentinel rows counted as 0 (same statistic as before).
#
# The lookup is applied with Series.map instead of DataFrame.merge: a
# column-on-column merge ignores the left frame's index and returns a fresh
# RangeIndex, which would discard the id index loaded via index_col=0.
df = pd.concat([train, test])
mean_gain_by_edu = (
    df.assign(capital_gain=df['capital_gain'].replace(99999, 0))
    .groupby('education_num')['capital_gain']
    .mean()
)
for frame in (train, test):
    # The imputed means are fractional, so make the column float up front
    # instead of relying on an implicit upcast during the .loc assignment.
    frame['capital_gain'] = frame['capital_gain'].astype('float64')
    is_sentinel = frame['capital_gain'] == 99999
    # .to_numpy() assigns positionally (same mask, same row order), so a
    # non-unique index cannot break the assignment.
    frame.loc[is_sentinel, 'capital_gain'] = (
        frame.loc[is_sentinel, 'education_num'].map(mean_gain_by_edu).to_numpy()
    )

In [6]:
# Overwrite every capital_gain above 3000 with one fixed constant, in both frames.
# NOTE(review): the constant looks like a precomputed mean — confirm its origin.
for frame in (train, test):
    above_cap = frame['capital_gain'] > 3000
    frame.loc[above_cap, 'capital_gain'] = 596.0242247798003

In [7]:
# Interaction feature 'imsi' = capital_gain * hours_per_week, added to both
# frames (multiplying in capital_loss as well was tried and left disabled).
for frame in (train, test):
    frame['imsi'] = frame['capital_gain'] * frame['hours_per_week']

In [8]:
# Overwrite every capital_loss above 3000 with one fixed constant, in both frames.
# NOTE(review): the constant looks like a precomputed mean — confirm its origin.
for frame in (train, test):
    above_cap = frame['capital_loss'] > 3000
    frame.loc[above_cap, 'capital_loss'] = 87.303829734959

# 3. 모델 학습

In [None]:
# Run a full garbage-collection pass before the modelling section.
import gc
gc.collect()

In [None]:
# Show the current column labels of the training frame.
train.columns

In [9]:
%%time
clf = setup(session_id=seed, 
            data = train, target = 'income'
          #, categorical_features=col_cat
          , numeric_features = ['education_num']
          , ignore_features =['fnlwgt']
           )

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,42
1,Target Type,Binary
2,Label Encoded,"0: 0, 1: 1"
3,Original Data,"(26049, 15)"
4,Missing Values,False
5,Numeric Features,7
6,Categorical Features,7
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


Wall time: 8.27 s


In [10]:
# Estimator ids left out of the comparison.
# A broader exclusion list tried earlier:
#   ['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'gpc', 'mlp', 'ridge', 'rf', 'qda', 'ada', 'xgboost']
excluded_models = ['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'ridge', 'qda']

# Rank the remaining estimators by F1 and keep the three best.
best_3 = compare_models(
    exclude=excluded_models,
    sort='F1',
    n_select=3,
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Light Gradient Boosting Machine,0.8543,0.9124,0.6358,0.7287,0.6788,0.5852,0.5877,0.2566
1,CatBoost Classifier,0.8561,0.9159,0.6258,0.7403,0.6778,0.586,0.5897,12.2197
2,Extreme Gradient Boosting,0.8499,0.9092,0.6338,0.7142,0.6713,0.5745,0.5764,2.2742
3,Gradient Boosting Classifier,0.8553,0.9115,0.6105,0.7462,0.6712,0.5796,0.5847,2.5084
4,Ada Boost Classifier,0.8498,0.9089,0.6034,0.7299,0.6603,0.5651,0.5695,1.0725
5,Linear Discriminant Analysis,0.829,0.8844,0.5733,0.6724,0.6185,0.5093,0.5123,0.2291
6,Random Forest Classifier,0.8289,0.863,0.5691,0.6744,0.6168,0.5077,0.5111,0.128
7,Extra Trees Classifier,0.8207,0.8474,0.5824,0.6442,0.6113,0.4953,0.4966,0.6383


In [11]:
# Blend the three selected models with soft voting, evaluated over 5 folds.
blended = blend_models(
    estimator_list=best_3,
    method='soft',
    fold=5,
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8594,0.914,0.6472,0.7394,0.6903,0.5998,0.602
1,0.8512,0.9069,0.6359,0.7172,0.6741,0.5781,0.5799
2,0.8468,0.9118,0.6068,0.717,0.6573,0.5596,0.5628
3,0.8605,0.9197,0.637,0.7486,0.6883,0.5992,0.6024
4,0.8578,0.916,0.657,0.7289,0.6911,0.599,0.6004
Mean,0.8551,0.9137,0.6368,0.7302,0.6802,0.5871,0.5895
SD,0.0053,0.0043,0.0168,0.0124,0.013,0.016,0.0158


In [12]:
# Score the blended model without passing new data; per the PyCaret docs this
# evaluates on the hold-out split created by setup().
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8609,0.9163,0.6458,0.7454,0.692,0.6028,0.6054


In [None]:
%%time
# Finalize the blended model; per the PyCaret docs this refits it on the whole
# dataset passed to setup(), hold-out included.
final_model = finalize_model(blended)

In [None]:
# Score the test frame with the finalized model; the 'Label' column of the
# result is what gets copied into the submission.
predictions = predict_model(final_model, data = test)

# 4. 예측

In [None]:
# Copy the predicted labels into the submission frame. The assignment aligns
# on index labels, so any submission row without a matching index in
# predictions ends up NaN.
# NOTE(review): confirm predict_model keeps test's index (or that both frames
# share the same index) before trusting this alignment.
submission['prediction'] = predictions['Label']

# 5. 제출

In [None]:
# Write the submission, index included, to a fixed file name.
# NOTE(review): the 'output' directory presumably has to exist already — verify.
submission.to_csv('output/20201031-2.csv')

In [None]:
# Train a standalone XGBoost model in the current experiment, for inspection.
xgboost = create_model('xgboost')

In [None]:
# Feature-importance plot for the XGBoost model.
plot_model(estimator = xgboost, plot = 'feature')

In [None]:
# Train a standalone LightGBM model in the current experiment, for inspection.
# The lightgbm package itself is imported under the alias lgbm, so this
# variable name does not shadow it.
lightgbm = create_model('lightgbm')

In [None]:
# Feature-importance plot for the LightGBM model.
plot_model(estimator = lightgbm, plot = 'feature')