In [1]:
# Numerical, tabular and plotting libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silences every warning for the rest of the session, which also hides any
# deprecation warnings emitted by later cells; disable this line when debugging.
warnings.filterwarnings('ignore')
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one; cells below that list several expressions rely on this to show each result.
InteractiveShell.ast_node_interactivity = "all"

from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import train_test_split
import lightgbm as lgbm
from sklearn.metrics import confusion_matrix, accuracy_score
# Star import: supplies the PyCaret names that later cells call without a
# prefix (setup, set_config, compare_models, blend_models, predict_model,
# finalize_model, create_model, plot_model).
from pycaret.classification import *

In [2]:
# One shared seed: stored in PyCaret's 'seed' config entry and used for
# NumPy's global random generator. `seed` is reused by setup() further down.
seed = 42
set_config('seed', seed)
np.random.seed(seed)

# 1. 데이터 로드

In [3]:
# Read the three competition files. index_col=0 turns the first CSV column
# (shown as `id` in the previews below) into the row index of each frame.
train, test, submission = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'input/train.csv',
        'input/test.csv',
        'input/sample_submission.csv',
    )
)

In [4]:
# Replace the income labels with integer codes; sort=True numbers the codes in
# sorted label order, so the mapping does not depend on row order.
train['income'] = train['income'].factorize(sort=True)[0]

In [5]:
# (rows, columns) of each frame; train has one more column than test, the
# `income` target.
train.shape, test.shape

((26049, 15), (6512, 14))

In [6]:
# Peek at both ends of train and at the start of test. All three results are
# rendered because ast_node_interactivity is set to "all" in the first cell.
train.head(3)
train.tail(3)
test.head(3)

Unnamed: 0_level_0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,40,Private,168538,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,60,United-States,1
1,17,Private,101626,9th,5,Never-married,Machine-op-inspct,Own-child,White,Male,0,0,20,United-States,0
2,18,Private,353358,Some-college,10,Never-married,Other-service,Own-child,White,Male,0,0,16,United-States,0


Unnamed: 0_level_0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
26046,78,?,165694,Masters,14,Widowed,?,Not-in-family,White,Female,0,0,15,United-States,0
26047,26,Self-emp-not-inc,151626,HS-grad,9,Never-married,Prof-specialty,Own-child,Black,Female,0,0,40,United-States,0
26048,20,?,99891,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,0


Unnamed: 0_level_0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,28,Private,67661,Some-college,10,Never-married,Adm-clerical,Other-relative,White,Female,0,0,40,United-States
1,40,Self-emp-inc,37869,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States
2,20,Private,109952,Some-college,10,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,25,United-States


# 2. EDA & 전처리

In [7]:
# Column dtypes and non-null counts, followed by summary statistics for the
# numeric columns and then for the object (string) columns.
train.info()
train.describe()
train.describe(include='O')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26049 entries, 0 to 26048
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             26049 non-null  int64 
 1   workclass       26049 non-null  object
 2   fnlwgt          26049 non-null  int64 
 3   education       26049 non-null  object
 4   education_num   26049 non-null  int64 
 5   marital_status  26049 non-null  object
 6   occupation      26049 non-null  object
 7   relationship    26049 non-null  object
 8   race            26049 non-null  object
 9   sex             26049 non-null  object
 10  capital_gain    26049 non-null  int64 
 11  capital_loss    26049 non-null  int64 
 12  hours_per_week  26049 non-null  int64 
 13  native_country  26049 non-null  object
 14  income          26049 non-null  int64 
dtypes: int64(7), object(8)
memory usage: 3.2+ MB


Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,income
count,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0
mean,38.569235,190304.5,10.088372,1087.6897,87.732734,40.443126,0.242044
std,13.671489,105966.3,2.56761,7388.85469,403.230205,12.36185,0.428329
min,17.0,13769.0,1.0,0.0,0.0,1.0,0.0
25%,28.0,118108.0,9.0,0.0,0.0,40.0,0.0
50%,37.0,178866.0,10.0,0.0,0.0,40.0,0.0
75%,48.0,237735.0,12.0,0.0,0.0,45.0,0.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0,1.0


Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,native_country
count,26049,26049,26049,26049,26049,26049,26049,26049
unique,9,16,7,15,6,5,2,41
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States
freq,18123,8433,11970,3304,10541,22315,17482,23371


## Missing Value

In [8]:
# Total number of NaN cells in each frame. This does not count the '?'
# placeholder strings visible in the previews above (e.g. in workclass and
# occupation); native_country's '?' values are imputed in the following cells.
train.isnull().sum().sum()
test.isnull().sum().sum()

0

0

In [9]:
# Impute unknown native_country values ('?') in train with the most frequent
# known country among rows that share the same race, education and income.
# NOTE(review): `income` is the target and is unavailable for test (the test
# cell groups by race/education only), so train and test are imputed under
# different rules — confirm this is intended.
group_keys = ['race', 'education', 'income']

# Occurrences of each known country within every group.
train_drop = train[train.native_country != '?']
df_native_country = (
    train_drop.groupby(group_keys + ['native_country'])['native_country']
    .count()
    .rename('count')
    .reset_index()
)

# Highest count per group, joined back to recover which country/countries reach it.
top_count = df_native_country.groupby(group_keys, as_index=False)['count'].max()
df_native_country_max = (
    top_count.merge(df_native_country, on=group_keys + ['count'], how='inner')
    # Ties are resolved by max(), i.e. the lexicographically greatest name.
    # The selector must be a list: tuple-style ['count', 'native_country']
    # selection on a GroupBy is deprecated since pandas 1.0 and rejected by
    # pandas 2.x.
    .groupby(group_keys)[['count', 'native_country']]
    .max()
    .reset_index()
    .drop('count', axis=1)
)

# Left-merge the per-group country onto train; both sides have a
# native_country column, hence the _x (original) / _y (group mode) suffixes.
# Merging on columns rebuilds the row index as 0..n-1, so the `id` index labels
# are not retained. Groups without any known country find no match and end up
# with NaN in place of '?'.
train = train.merge(df_native_country_max, on=group_keys, how='left')
train.loc[train.native_country_x == '?', 'native_country_x'] = train['native_country_y']
train = train.rename(columns={'native_country_x': 'native_country'})
train = train.drop('native_country_y', axis=1)

In [None]:
# Impute unknown native_country values ('?') in test with the most frequent
# known country among test rows that share the same race and education.
group_keys = ['race', 'education']

# Occurrences of each known country within every group.
test_drop = test[test.native_country != '?']
df_native_country = (
    test_drop.groupby(group_keys + ['native_country'])['native_country']
    .count()
    .rename('count')
    .reset_index()
)

# Highest count per group, joined back to recover which country/countries reach it.
top_count = df_native_country.groupby(group_keys, as_index=False)['count'].max()
df_native_country_max = (
    top_count.merge(df_native_country, on=group_keys + ['count'], how='inner')
    # Ties are resolved by max(), i.e. the lexicographically greatest name.
    # The selector must be a list: tuple-style ['count', 'native_country']
    # selection on a GroupBy is deprecated since pandas 1.0 and rejected by
    # pandas 2.x.
    .groupby(group_keys)[['count', 'native_country']]
    .max()
    .reset_index()
    .drop('count', axis=1)
)

# Left-merge the per-group country onto test; both sides have a
# native_country column, hence the _x (original) / _y (group mode) suffixes.
# Merging on columns rebuilds the row index as 0..n-1, so the `id` index labels
# are not retained. Groups without any known country find no match and end up
# with NaN in place of '?'.
test = test.merge(df_native_country_max, on=group_keys, how='left')
test.loc[test.native_country_x == '?', 'native_country_x'] = test['native_country_y']
test = test.rename(columns={'native_country_x': 'native_country'})
test = test.drop('native_country_y', axis=1)
test

### outlier

### education

In [None]:
# Remove the text `education` column from both frames; the numeric
# `education_num` column stays and is what later cells use.
train = train.drop(columns=['education'])
test = test.drop(columns=['education'])

### capital_gain 99999

In [None]:
# Stack train and test row-wise so the capital_gain statistics computed in the
# next cells are based on both frames.
df = pd.concat(objs=[train, test], axis=0)
df.shape

In [None]:
# Working copy of capital_gain in which the value 99999 is counted as 0, so
# those rows contribute zero to the per-education means taken from this column.
# (`imsi` is presumably Korean for "temporary".)
df['capital_gain_imsi'] = df['capital_gain'].replace({99999: 0})

In [None]:
# Mean of the working capital-gain column for each education_num level, as a
# two-column lookup table (education_num, capital_gain_imsi).
capital_gain_imsi = (
    df.groupby('education_num', as_index=False)['capital_gain_imsi'].mean()
)

In [None]:
# Attach the per-education mean to every row of train and test as a helper
# column named capital_gain_imsi (left merge on education_num).
train, test = (
    frame.merge(capital_gain_imsi, how='left', on='education_num')
    for frame in (train, test)
)

In [None]:
# Wherever capital_gain equals 99999, overwrite it with the row's helper value
# (the mean for its education_num level).
train.loc[train['capital_gain'].eq(99999), 'capital_gain'] = train['capital_gain_imsi']
test.loc[test['capital_gain'].eq(99999), 'capital_gain'] = test['capital_gain_imsi']

In [None]:
# The helper column has served its purpose; remove it from both frames.
train = train.drop(columns=['capital_gain_imsi'])
test = test.drop(columns=['capital_gain_imsi'])

# 3. 모델 학습

In [None]:
# Run a full garbage collection before model training; the integer shown as
# the cell output is gc.collect()'s return value.
import gc
gc.collect()

In [None]:
# Columns of the preprocessed train frame that is handed to setup() next.
train.columns

In [None]:
%%time
# Initialise the PyCaret experiment on the preprocessed train frame with
# `income` as the target. education_num is declared numeric explicitly, and the
# notebook-wide seed is passed as the session id.
clf = setup(
    data=train,
    target='income',
    numeric_features=['education_num'],
    session_id=seed,
)

In [None]:
# Train and score the candidate estimators, rank them by F1 and keep the top
# three. The model ids listed in `exclude` are left out of the comparison.
best_3 = compare_models(
    exclude=['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'ridge', 'qda'],
    sort='F1',
    n_select=3,
)

In [None]:
# Blend the three selected models with the 'soft' method, using 5 folds.
blended = blend_models(
    estimator_list=best_3,
    method='soft',
    fold=5,
)

In [None]:
# No `data` argument is given, so predict_model falls back to its default
# data — presumably the hold-out split reserved by setup(); verify against the
# PyCaret documentation.
pred_holdout = predict_model(estimator=blended)

In [None]:
%%time
# Produce the final version of the blended ensemble that is used to score the
# test frame in the next cell.
final_model = finalize_model(estimator=blended)

In [None]:
# Score the preprocessed test frame with the finalized model; the predicted
# class is read from the 'Label' column of the result further down.
predictions = predict_model(estimator=final_model, data=test)

# 4. 예측

In [None]:
# The assignment below aligns on index labels. `submission` is indexed by the
# `id` column read from sample_submission.csv, whereas `test` was rebuilt by
# DataFrame.merge during preprocessing, which replaces its index with 0..n-1.
# NOTE(review): this therefore assumes the test ids are exactly 0..n-1 in file
# order — confirm against test.csv.
# Any submission id without a matching prediction label would silently become
# NaN, so fail loudly instead of writing an incomplete submission.
unmatched_ids = submission.index.difference(predictions.index)
if len(unmatched_ids) > 0:
    raise ValueError(
        f"{len(unmatched_ids)} submission ids have no matching prediction "
        f"(first few: {list(unmatched_ids[:5])})"
    )
submission['prediction'] = predictions['Label']

# 5. 제출

In [None]:
# Write the submission frame, index included, to the output/ directory.
submission.to_csv(path_or_buf='output/20201017-1.csv')

In [None]:
# Train a standalone XGBoost model inside the existing experiment; it is only
# used for the feature-importance plot in the next cell.
xgboost = create_model(estimator='xgboost')

In [None]:
# Feature-importance plot for the XGBoost model.
plot_model(xgboost, plot='feature')

In [None]:
# Train a standalone LightGBM model inside the existing experiment; it is only
# used for the feature-importance plot in the next cell.
lightgbm = create_model(estimator='lightgbm')

In [None]:
# Feature-importance plot for the LightGBM model.
plot_model(lightgbm, plot='feature')