In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every Python warning for the rest of the session. Note that this
# also hides deprecation / future-behaviour warnings raised by the libraries
# used below.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the
# last one (several cells below rely on this to show more than one result).
InteractiveShell.ast_node_interactivity = "all"

from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import train_test_split
import lightgbm as lgbm
from sklearn.metrics import confusion_matrix, accuracy_score
from pycaret.classification import *

In [2]:
# Single seed value reused throughout the notebook for repeatable runs.
seed = 42
# Seed NumPy's global random number generator.
np.random.seed(seed)
# set_config comes from the star-import of pycaret.classification.
# The same seed is passed again as `session_id` when setup() is called later.
set_config('seed', seed)

# 1. 데이터 로드

In [3]:
# Read the three competition files; in each one the first CSV column is used
# as the DataFrame index (index_col=0 -- worth double-checking against the raw files).
train, test, submission = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'input/train.csv',
        'input/test.csv',
        'input/sample_submission.csv',
    )
)

In [4]:
# Replace the target column with integer codes. sort=True makes the codes
# follow the sorted order of the distinct values rather than first appearance.
income_codes, income_labels = pd.factorize(train['income'], sort=True)
train['income'] = income_codes

In [None]:
# (rows, columns) of the training and test frames.
train.shape, test.shape

In [None]:
# Preview the first three rows of each frame.
train.head(3)
test.head(3)

# 2. EDA & 전처리

In [None]:
# Column dtypes and non-null counts.
train.info()
# Summary statistics with describe()'s default column selection.
train.describe()
# include='O' restricts describe() to object-dtype columns.
train.describe(include='O')

In [None]:
# Stack train and test row-wise into one frame and show its shape.
# NOTE(review): `df` is not referenced again in the visible cells.
df = pd.concat([train, test])
df.shape

In [None]:
# Call info() to print the column / dtype / non-null summary. The bare
# attribute `train.info` would only display the bound-method repr.
train.info()

### marital_status replace

In [5]:
# Collapse the three "Married-*" categories into a single 'Married' level,
# identically for train and test.
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: that form is chained assignment through
# an inplace method, which operates on a temporary Series and leaves the frame
# unchanged under pandas Copy-on-Write (the corresponding warning would be
# hidden here because warnings are globally ignored).
married_variants = ['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse']
train['marital_status'] = train['marital_status'].replace(married_variants, 'Married')
test['marital_status'] = test['marital_status'].replace(married_variants, 'Married')

## Missing Value

In [None]:
# Total number of missing cells in each frame (summed over columns, then
# over the per-column counts). `isna` is the same method as `isnull`.
train.isna().sum().sum()
test.isna().sum().sum()

## Covariate Shift

### outlier

In [6]:
# Remove the `education` column from both frames, in place.
# `education_num` is kept and is later passed to setup() as a numeric feature.
for frame in (train, test):
    frame.drop(columns=['education'], inplace=True)

In [7]:
# Check the training frame after the preprocessing steps above.
train.head(3)

Unnamed: 0_level_0,age,workclass,fnlwgt,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,40,Private,168538,9,Married,Sales,Husband,White,Male,0,0,60,United-States,1
1,17,Private,101626,5,Never-married,Machine-op-inspct,Own-child,White,Male,0,0,20,United-States,0
2,18,Private,353358,10,Never-married,Other-service,Own-child,White,Male,0,0,16,United-States,0


# 3. 모델 학습

In [None]:
# Run a full garbage-collection pass before the model-training cells.
import gc
gc.collect()

In [None]:
# List the columns that will be handed to setup().
train.columns

In [9]:
%%time
# Initialise the PyCaret classification experiment on the training frame
# with `income` as the target. `education_num` is declared numeric explicitly
# rather than leaving its type to PyCaret's inference.
clf = setup(
    data=train,
    target='income',
    session_id=seed,
    numeric_features=['education_num'],
)

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,42
1,Target Type,Binary
2,Label Encoded,"0: 0, 1: 1"
3,Original Data,"(26049, 14)"
4,Missing Values,False
5,Numeric Features,6
6,Categorical Features,7
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


Wall time: 11.1 s


In [10]:
# Model IDs left out of the comparison.
excluded_models = ['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'ridge', 'qda']

# Evaluate the remaining candidates, rank them by F1 and keep the top three.
best_3 = compare_models(
    exclude=excluded_models,
    sort='F1',
    n_select=3,
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Light Gradient Boosting Machine,0.8666,0.9216,0.6478,0.7653,0.7014,0.6163,0.62,0.351
1,CatBoost Classifier,0.8674,0.9248,0.6371,0.7756,0.699,0.6151,0.6203,13.7276
2,Extreme Gradient Boosting,0.8623,0.9189,0.6429,0.7528,0.6933,0.6053,0.6086,2.8518
3,Gradient Boosting Classifier,0.8626,0.9188,0.6041,0.7788,0.6801,0.5944,0.6024,2.4871
4,Ada Boost Classifier,0.8554,0.9123,0.6151,0.7437,0.6729,0.5812,0.5857,0.7641
5,Extra Trees Classifier,0.8355,0.8829,0.6015,0.6822,0.6389,0.533,0.535,0.7705
6,Random Forest Classifier,0.8435,0.8777,0.5646,0.7288,0.6359,0.5383,0.5456,0.1261
7,Linear Discriminant Analysis,0.8373,0.8923,0.5746,0.7004,0.6307,0.5278,0.5324,0.2354


In [11]:
# Combine the three selected models into one soft-voting ensemble,
# evaluated with 5 cross-validation folds.
blended = blend_models(
    estimator_list=best_3,
    method='soft',
    fold=5,
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8621,0.922,0.6327,0.7578,0.6896,0.6019,0.6059
1,0.8621,0.919,0.6505,0.7472,0.6955,0.6069,0.6094
2,0.8562,0.921,0.6246,0.7409,0.6778,0.5861,0.5897
3,0.8715,0.9321,0.6418,0.7873,0.7071,0.6259,0.6313
4,0.8711,0.9243,0.6602,0.7742,0.7127,0.6302,0.6336
Mean,0.8646,0.9237,0.642,0.7615,0.6965,0.6102,0.614
SD,0.0059,0.0045,0.0126,0.0171,0.0124,0.0162,0.0165


In [12]:
# No `data` argument is passed, so per the PyCaret docs the ensemble is scored
# on the hold-out split that setup() set aside.
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8733,0.9277,0.6594,0.783,0.7159,0.6351,0.639


In [13]:
%%time
# Produce the final model from the blended ensemble. Per the PyCaret docs,
# finalize_model refits on the full dataset, hold-out included.
final_model = finalize_model(blended)

Wall time: 3min 22s


In [14]:
# Score the unseen test frame with the finalized model; the predicted class
# is read from the 'Label' column of the returned frame in the next cell.
predictions = predict_model(final_model, data = test)

# 4. 예측

In [15]:
# Copy the predicted labels into the submission frame.
# NOTE(review): this assignment aligns on index labels, so it assumes the
# index of `predictions` matches the `id` index of `submission` -- confirm,
# otherwise unmatched rows would silently become NaN.
submission['prediction'] = predictions['Label']
submission

Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
0,0
1,1
2,0
3,1
4,1
...,...
6507,0
6508,1
6509,0
6510,0


# 5. 제출

In [16]:
# Write the submission file (the `id` index is written as the first column).
# NOTE(review): the 'output/' directory must already exist; to_csv does not create it.
submission.to_csv('output/20201015-3.csv')

In [None]:
# Train a single model by its PyCaret ID 'gbc' so it can be inspected on its own.
gbc = create_model('gbc')

In [None]:
# Feature-importance plot for the `gbc` model.
plot_model(estimator = gbc, plot = 'feature')

In [None]:
# Train a single model by its PyCaret ID 'lightgbm'.
# The variable name does not clash with the library import, which is bound as `lgbm`.
lightgbm = create_model('lightgbm')

In [None]:
# Feature-importance plot for the `lightgbm` model.
plot_model(estimator = lightgbm, plot = 'feature')