In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the whole session (library deprecation notices included).
warnings.filterwarnings('ignore')
# Render matplotlib figures directly below the cell that produces them.
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one;
# the multi-expression inspection cells further down rely on this setting.
InteractiveShell.ast_node_interactivity = "all"

from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import train_test_split
import lightgbm as lgbm
from sklearn.metrics import confusion_matrix, accuracy_score
from pycaret.classification import *

In [2]:
# Single random seed shared by NumPy and PyCaret so that runs are repeatable.
seed = 42
np.random.seed(seed)
# Also store the seed in PyCaret's configuration; setup() further down additionally
# receives session_id=seed.
# NOTE(review): this runs before setup() — confirm the installed PyCaret version allows that.
set_config('seed', seed)

# 1. 데이터 로드

In [3]:
# Load the train, test and sample-submission CSV files; the first column of each
# file is used as the DataFrame index.
train=pd.read_csv('input/train.csv', index_col=0) # NOTE: double-check that index_col=0 is the intended index column
test=pd.read_csv('input/test.csv', index_col=0)
submission=pd.read_csv('input/sample_submission.csv', index_col=0)  

## target 

In [4]:
# Replace the `income` labels with integer codes. sort=True orders the distinct
# labels before numbering them, so the code given to each class does not depend
# on the order in which the labels appear in the rows.
income_codes, _income_labels = pd.factorize(train['income'], sort=True)
train['income'] = income_codes

In [None]:
# (rows, columns) of the training and test frames, shown side by side as a tuple.
train.shape, test.shape

In [None]:
# Preview the first three rows of each frame (both are displayed because
# ast_node_interactivity is set to "all" in the first cell).
train.head(3)
test.head(3)

# 2. EDA & 전처리

In [None]:
# Column dtypes and non-null counts, then summary statistics for the numeric
# columns and for the object (string) columns.
train.info()
train.describe()
train.describe(include='O')

## Missing Value

In [None]:
# Total number of NaN cells in each frame (summed over all columns).
# NOTE(review): only real NaN values are counted; placeholder strings standing in
# for missing data would not show up here — verify against the raw files.
train.isnull().sum().sum()
test.isnull().sum().sum()

In [None]:
# Show dtypes and non-null counts of the training frame again (same call as in the EDA cell above).
train.info()

### Feature drop & outlier removal

In [5]:
# Remove the `education` column from both frames.
# The frames are rebound instead of mutated in place, and errors='ignore' makes the
# cell safe to re-run: once the column is gone a second execution is a no-op
# rather than a KeyError.
train = train.drop(columns=['education'], errors='ignore')
test = test.drop(columns=['education'], errors='ignore')

In [6]:
# Drop the three training rows with the largest `fnlwgt` values.
# NOTE: the top three are taken from the frame as it currently stands, so every
# additional execution of this cell removes three more rows.
fnlwgt_descending = train['fnlwgt'].sort_values(ascending=False)
idx = fnlwgt_descending.iloc[:3].index
train.drop(index=idx, inplace=True)

# 3. 모델 학습

In [None]:
# Force a garbage-collection pass before the model-training cells;
# gc.collect() returns the number of unreachable objects it found.
import gc
gc.collect()

In [None]:
# Column names that remain in the training frame and will be passed to setup().
train.columns

In [7]:
%%time
clf = setup(session_id=seed, 
            data = train, target = 'income'
          , categorical_features=['age']
          , numeric_features = ['education_num']
          #, ignore_features =tp.tolist() + wr.tolist() + wf.tolist()#'religion', 'race_1', 'race_2', 'race_3','race_4', 'race_5', 'race_6']
           )

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,42
1,Target Type,Binary
2,Label Encoded,"0: 0, 1: 1"
3,Original Data,"(26046, 14)"
4,Missing Values,False
5,Numeric Features,5
6,Categorical Features,8
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


Wall time: 11.5 s


In [8]:
# Estimator ids that are left out of the comparison.
excluded_models = ['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'ridge', 'qda']

# Cross-validate the remaining estimators, rank them by F1 and keep the top three.
best_3 = compare_models(
    exclude=excluded_models,
    sort='F1',
    n_select=3,
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Light Gradient Boosting Machine,0.8638,0.9205,0.6494,0.7542,0.6976,0.6104,0.6134,0.3199
1,Extreme Gradient Boosting,0.8631,0.9193,0.6436,0.7551,0.6947,0.6072,0.6106,3.674
2,CatBoost Classifier,0.8644,0.9211,0.6296,0.7693,0.6919,0.6062,0.6116,12.4143
3,Gradient Boosting Classifier,0.8579,0.9133,0.5795,0.7766,0.6635,0.5759,0.5859,3.306
4,Ada Boost Classifier,0.8536,0.9079,0.5863,0.7547,0.6597,0.5684,0.5758,1.1672
5,Linear Discriminant Analysis,0.8393,0.8947,0.594,0.6972,0.6412,0.5386,0.5416,0.3067
6,Extra Trees Classifier,0.8317,0.8735,0.5937,0.673,0.6306,0.5222,0.5241,0.8697
7,Random Forest Classifier,0.8409,0.8776,0.5597,0.7211,0.6301,0.5307,0.5378,0.1297


In [9]:
# Combine the three selected estimators into one voting ensemble, evaluated with
# 5-fold cross-validation. method='soft' requests soft voting.
# NOTE(review): soft voting presumably averages predicted probabilities — confirm in the PyCaret docs.
blended = blend_models(estimator_list = best_3, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8598,0.9133,0.6408,0.7444,0.6887,0.5989,0.6017
1,0.8645,0.9239,0.6408,0.7615,0.696,0.6096,0.6134
2,0.8679,0.9317,0.6451,0.7713,0.7026,0.6186,0.6226
3,0.8629,0.9201,0.6375,0.7577,0.6924,0.605,0.6088
4,0.8664,0.9214,0.6408,0.7689,0.699,0.6141,0.6183
Mean,0.8643,0.9221,0.641,0.7608,0.6957,0.6092,0.6129
SD,0.0028,0.0059,0.0024,0.0096,0.0049,0.0069,0.0073


In [10]:
# Score the blended model without passing `data`.
# NOTE(review): per the PyCaret docs this should evaluate on the hold-out split
# that setup() kept aside — confirm for the installed version.
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8761,0.9303,0.6858,0.7761,0.7281,0.6483,0.6503


In [11]:
%%time
# Produce the final model from the blended ensemble; this is the slowest cell of
# the notebook (see the recorded wall time below).
# NOTE(review): finalize_model is expected to refit on the full dataset, hold-out included — confirm.
final_model = finalize_model(blended)

Wall time: 4min 20s


In [12]:
# Run the finalized model on the test frame; the `Label` column of the result is
# what the submission cell reads.
predictions = predict_model(final_model, data = test)

# 4. 예측

In [13]:
# Copy the predicted labels into the submission frame.
# Assigning a Series aligns on index labels, so any difference between the index of
# `predictions` and the id index of `submission` would silently produce NaN rows.
# The values are therefore assigned by position, after checking that both frames
# have the same number of rows.
# NOTE(review): this assumes `predictions` keeps the row order of `test`, and that
# `test` and the sample submission list ids in the same order — confirm.
if len(predictions) != len(submission):
    raise ValueError(
        "predictions has %d rows but submission has %d rows"
        % (len(predictions), len(submission))
    )
submission['prediction'] = predictions['Label'].to_numpy()

# 5. 제출

In [14]:
# Write the submission file. The target directory is created first so the cell
# also works on a fresh checkout where `output/` does not exist yet.
from pathlib import Path

submission_path = Path('output/20201015-5.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path)

In [None]:
# Post-hoc inspection: train a single model with id 'gbc' (listed as "Gradient
# Boosting Classifier" in the comparison table above) inside the current experiment.
gbc = create_model('gbc')

In [None]:
# Draw the 'feature' plot for the gradient boosting model.
# NOTE(review): 'feature' is presumably PyCaret's feature-importance plot — confirm.
plot_model(estimator = gbc, plot = 'feature')

In [None]:
# Post-hoc inspection: train a single model with id 'lightgbm' inside the current experiment.
# The variable name does not clash with the library import, which is bound to `lgbm`.
lightgbm = create_model('lightgbm')

In [None]:
# Draw the 'feature' plot for the LightGBM model.
# NOTE(review): 'feature' is presumably PyCaret's feature-importance plot — confirm.
plot_model(estimator = lightgbm, plot = 'feature')