In [1]:
# Core numeric / plotting stack.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session (this also hides
# deprecation notices from pandas / PyCaret).
warnings.filterwarnings('ignore')
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# NOTE(review): PolynomialFeatures, train_test_split, lgbm, confusion_matrix and
# accuracy_score are not referenced in the cells visible below.
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import train_test_split
import lightgbm as lgbm
from sklearn.metrics import confusion_matrix, accuracy_score
# Star import: presumably the source of setup, set_config, compare_models,
# blend_models, predict_model, finalize_model, create_model and plot_model used below.
from pycaret.classification import *

In [2]:
# Single seed value reused for NumPy, PyCaret's config and setup(session_id=...).
seed = 42
np.random.seed(seed)  # seed NumPy's global random state
# NOTE(review): set_config is called here before setup() has run - confirm that
# PyCaret accepts this; setup() below also receives session_id=seed.
set_config('seed', seed)

# 1. 데이터 로드

In [3]:
# Read the three CSV files; the first column of each file becomes the row index.
train=pd.read_csv('input/train.csv', index_col=0) # also double-check index_col=0!
test=pd.read_csv('input/test.csv', index_col=0)
submission=pd.read_csv('input/sample_submission.csv', index_col=0)  

In [4]:
# Replace the income labels with integer codes; sort=True assigns the codes
# following the sorted order of the unique label values.
train['income']=pd.factorize(train['income'], sort=True)[0]

In [5]:
# (rows, columns) of both frames; train has one more column than test
# (presumably the income target - confirm against the CSVs).
train.shape, test.shape

((26049, 15), (6512, 14))

# 2. EDA & 전처리

## Missing Value

In [6]:
# Total number of missing cells in each frame (column counts summed up).
train.isna().sum().sum()
test.isna().sum().sum()

0

0

## Outlier

### education

In [7]:
# Remove the education column from both frames
# (an education_num column remains and is used in later cells).
train = train.drop(columns=['education'])
test = test.drop(columns=['education'])

### capital_gain 99999

In [8]:
def _fill_capped_capital_gain(frame, mean_by_education, cap=99999):
    """Return a copy of ``frame`` with capped ``capital_gain`` values imputed.

    Rows whose ``capital_gain`` equals ``cap`` receive the value looked up in
    ``mean_by_education`` through their ``education_num``.  The lookup uses
    ``Series.map`` rather than a column-on-column ``DataFrame.merge`` because
    such a merge ignores the existing row index and returns a fresh 0..n-1
    index, which would discard the ids read with ``index_col=0``.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame containing ``capital_gain`` and ``education_num`` columns.
    mean_by_education : pd.Series
        Replacement values indexed by ``education_num``.
    cap : int, default 99999
        The ``capital_gain`` value to replace.

    Returns
    -------
    pd.DataFrame
        Copy of ``frame`` with the same index and columns; ``capital_gain``
        is stored as float.
    """
    out = frame.copy()
    # Cast first: the replacement means are floats and would otherwise be
    # written into an integer column.
    out['capital_gain'] = out['capital_gain'].astype(float)
    is_capped = out['capital_gain'] == cap
    out.loc[is_capped, 'capital_gain'] = (
        out.loc[is_capped, 'education_num'].map(mean_by_education)
    )
    return out


# Mean capital_gain per education_num over train + test, with 99999 counted as 0.
# NOTE(review): the means are computed from test rows as well as train rows.
df = pd.concat([train, test])
df['capital_gain_imsi'] = df['capital_gain'].replace(99999, 0)
capital_gain_imsi = df.groupby('education_num')['capital_gain_imsi'].mean()

train = _fill_capped_capital_gain(train, capital_gain_imsi)
test = _fill_capped_capital_gain(test, capital_gain_imsi)

In [9]:
# Number of quantile bins for age.
num = 5
# Derive the bin edges from train only and reuse them for test, so that both
# frames share exactly the same age categories. Running qcut on each frame
# separately yields different edges (and therefore different category labels)
# for train and test.
age_bins = pd.qcut(train['age'], num, retbins=True)[1]
train['age'] = pd.cut(train['age'], bins=age_bins, include_lowest=True)
# Clip test ages into the train range so values outside it fall into the
# outermost bins instead of becoming NaN.
test['age'] = pd.cut(test['age'].clip(age_bins[0], age_bins[-1]),
                     bins=age_bins, include_lowest=True)

# 3. 모델 학습

In [10]:
%%time
# Initialise the PyCaret classification experiment on the preprocessed train
# frame with 'income' as the target; session_id reuses the notebook seed.
clf = setup(session_id=seed, 
            data = train, target = 'income'
          # Optional: categorical_features=col_cat (col_cat is not defined in this notebook).
          , numeric_features = ['education_num']  # declare education_num as a numeric feature
          # NOTE(review): a disabled ignore_features argument used to sit here; it referenced
          # names (tp, wr, wf, 'religion', 'race_1', ...) that do not exist in this notebook.
           )

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,42
1,Target Type,Binary
2,Label Encoded,"0: 0, 1: 1"
3,Original Data,"(26049, 14)"
4,Missing Values,False
5,Numeric Features,5
6,Categorical Features,8
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


Wall time: 8.14 s


In [11]:
# Compare the candidate models, ranked by F1, and keep the top three (n_select = 3).
best_3 = compare_models(sort = 'F1', n_select = 3
                       # Model ids left out of the comparison.
                       ,exclude=['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'ridge','qda']
                       # Alternative, wider exclusion list kept for reference:
                       #,exclude=['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'gpc', 'mlp', 'ridge', 'rf', 'qda', 'ada', 'xgboost']
                       )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Light Gradient Boosting Machine,0.8659,0.9203,0.6504,0.7612,0.7013,0.6156,0.6188,0.2975
1,Extreme Gradient Boosting,0.8627,0.919,0.652,0.7486,0.6968,0.6086,0.6111,2.2447
2,CatBoost Classifier,0.8651,0.9227,0.6322,0.7696,0.6939,0.6085,0.6135,10.8321
3,Ada Boost Classifier,0.8571,0.9109,0.618,0.7484,0.6766,0.586,0.5907,0.7717
4,Gradient Boosting Classifier,0.8602,0.916,0.6002,0.7719,0.675,0.5878,0.5955,2.1682
5,Linear Discriminant Analysis,0.8487,0.9027,0.5995,0.7282,0.6571,0.5613,0.566,0.1884
6,Extra Trees Classifier,0.8253,0.8656,0.5866,0.6559,0.619,0.5061,0.5077,0.5328
7,Random Forest Classifier,0.833,0.8658,0.5581,0.6932,0.6178,0.5127,0.5179,0.1236


In [12]:
# Combine the three selected models into a single soft-voting ensemble, scored over 5 folds.
blended = blend_models(estimator_list = best_3, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8602,0.9203,0.6294,0.7524,0.6855,0.5965,0.6004
1,0.8621,0.9164,0.6424,0.7519,0.6928,0.6047,0.6078
2,0.8559,0.9208,0.6214,0.7413,0.6761,0.5843,0.588
3,0.8754,0.9308,0.6467,0.7996,0.7151,0.6365,0.6423
4,0.8742,0.9232,0.6715,0.7786,0.7211,0.6405,0.6434
Mean,0.8656,0.9223,0.6423,0.7648,0.6981,0.6125,0.6164
SD,0.0078,0.0048,0.0172,0.0213,0.0173,0.0222,0.0225


In [13]:
# Score the blended model without passing data - presumably this evaluates on
# the hold-out split PyCaret set aside during setup(); confirm in the PyCaret docs.
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8699,0.9285,0.6495,0.7762,0.7072,0.6244,0.6285


In [14]:
%%time
# Finalize the blended model before predicting on test - presumably a refit on
# all of the data given to setup(); confirm in the PyCaret docs.
final_model = finalize_model(blended)

Wall time: 3min 10s


In [15]:
# Run the finalized model on the preprocessed test frame; the 'Label' column of
# the result is what goes into the submission.
predictions = predict_model(final_model, data = test)

# 4. 예측

In [16]:
# Assign by position rather than by index label: submission is indexed by the
# ids from sample_submission.csv, while the index of predictions depends on the
# preprocessing and on PyCaret, so label alignment can silently yield NaN or
# misplaced rows.
# Assumes sample_submission.csv lists rows in the same order as test.csv - TODO confirm.
assert len(predictions) == len(submission), "prediction / submission row counts differ"
submission['prediction'] = predictions['Label'].to_numpy()

# 5. 제출

In [17]:
# Write the submission file (index included). to_csv does not create missing
# directories, so output/ has to exist beforehand.
submission.to_csv('output/20201016-5.csv')

In [None]:
# Exploration (cell not executed): train a single 'gbc' model on its own.
gbc = create_model('gbc')

In [None]:
# Exploration (cell not executed): 'feature' plot for the gbc model.
plot_model(estimator = gbc, plot = 'feature')

In [None]:
# Exploration (cell not executed): train a single 'xgboost' model on its own.
xgboost = create_model('xgboost')

In [None]:
# Exploration (cell not executed): 'feature' plot for the xgboost model.
plot_model(estimator = xgboost, plot = 'feature')

In [None]:
# Exploration (cell not executed): train a single 'lightgbm' model on its own.
# NOTE(review): the variable shares its name with the lightgbm package
# (imported above under the alias lgbm).
lightgbm = create_model('lightgbm')

In [None]:
# Exploration (cell not executed): 'feature' plot for the lightgbm model.
plot_model(estimator = lightgbm, plot = 'feature')