In [1]:
# Base
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

# Scoring
from sklearn.metrics import f1_score

# Model
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

# Employee

#### Data load & target setting

In [2]:
# Load the employee train/test feature files, skipping the first CSV column.
train_emp = pd.read_csv('data/employee_feature_train.csv').iloc[:, 1:]
test_emp = pd.read_csv('data/employee_feature_test.csv').iloc[:, 1:]

# Keep the label aside, then remove it from both feature frames.
target_emp = train_emp['prime_yn']
train_emp = train_emp.drop(columns=['prime_yn'])
test_emp = test_emp.drop(columns=['prime_yn'])

#### Categorical Feature List Generation

In [3]:
# Column names grouped by dtype: object columns go to the categorical list,
# float columns followed by int columns go to the numeric list.
cat_features_emp = list(train_emp.select_dtypes(include='object').columns)
num_features_emp = [
    *train_emp.select_dtypes(include='float').columns,
    *train_emp.select_dtypes(include='int').columns,
]

#### Train & Validation Dataset Generation

In [4]:
# Hold out 30% of the labelled employee rows for validation, stratified on the target.
X_train_emp, X_val_emp, y_train_emp, y_val_emp = train_test_split(
    train_emp,
    target_emp,
    test_size=0.3,
    random_state=23,
    stratify=target_emp,
)

## CatBoost

#### Training

In [5]:
# 5-fold stratified, shuffled cross-validation (F1) of CatBoost on the employee training split.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)
catb_emp = CatBoostClassifier(
    cat_features=cat_features_emp,
    random_state=0,
    silent=True,
)
score = cross_val_score(
    estimator=catb_emp,
    X=X_train_emp,
    y=y_train_emp,
    scoring='f1',
    cv=skf,
    n_jobs=-1,
)
print(f'X_train_emp의 cross validation 점수 : {np.mean(score)}')

X_train_emp의 cross validation 점수 : 0.8784189354966443


#### Validation Score

In [6]:
# Fit on the training split and report F1 on the held-out validation split.
catb_emp_pred = (
    catb_emp
    .fit(X_train_emp, y_train_emp)
    .predict(X_val_emp)
)
print(f'X_val_emp 예측 점수 : {f1_score(y_val_emp, catb_emp_pred)}')

X_val_emp 예측 점수 : 0.8853735091023227


#### Retraining & Prediction

In [7]:
# Refit a fresh CatBoost model on all labelled employee rows, then predict the test set.
catb_emp_model = CatBoostClassifier(
    cat_features=cat_features_emp,
    random_state=0,
    silent=True,
)
catb_emp_model.fit(train_emp, target_emp)
catb_emp_pred = catb_emp_model.predict(test_emp)

# Wrap the predictions in a one-column frame and display it.
catb_emp_pred_df = pd.DataFrame({'prime_yn': catb_emp_pred})
catb_emp_pred_df

Unnamed: 0,prime_yn
0,1
1,1
2,1
3,1
4,1
...,...
8041,1
8042,0
8043,1
8044,0


## LGBM

#### Training

In [8]:
# 5-fold stratified, shuffled cross-validation (F1) of LightGBM, numeric employee features only.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)
lgbm_emp = LGBMClassifier(random_state=0)
score = cross_val_score(
    estimator=lgbm_emp,
    X=X_train_emp[num_features_emp],
    y=y_train_emp,
    scoring='f1',
    cv=skf,
    n_jobs=-1,
)
print(f'X_train_emp의 cross validation 점수 : {np.mean(score)}')

X_train_emp의 cross validation 점수 : 0.8733659787742164


#### Validation Score

In [9]:
# Fit on the numeric columns of the training split and report F1 on the validation split.
lgbm_emp_pred = (
    lgbm_emp
    .fit(X_train_emp[num_features_emp], y_train_emp)
    .predict(X_val_emp[num_features_emp])
)
print(f'X_val_emp 예측 점수 : {f1_score(y_val_emp, lgbm_emp_pred)}')

X_val_emp 예측 점수 : 0.8744929317762754


#### Retraining & Prediction

In [10]:
# Refit a fresh LightGBM model on all labelled employee rows (numeric columns), then predict the test set.
lgbm_emp_model = LGBMClassifier(random_state=0)
lgbm_emp_model.fit(train_emp[num_features_emp], target_emp)
lgbm_emp_pred = lgbm_emp_model.predict(test_emp[num_features_emp])

# Wrap the predictions in a one-column frame and display it.
lgbm_emp_pred_df = pd.DataFrame({'prime_yn': lgbm_emp_pred})
lgbm_emp_pred_df

Unnamed: 0,prime_yn
0,1
1,1
2,1
3,1
4,1
...,...
8041,1
8042,0
8043,1
8044,1


## RandomForest

#### Training

In [11]:
# 5-fold stratified, shuffled cross-validation (F1) of a random forest, numeric employee features only.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)
rf_emp = RandomForestClassifier(random_state=0)
score = cross_val_score(
    estimator=rf_emp,
    X=X_train_emp[num_features_emp],
    y=y_train_emp,
    scoring='f1',
    cv=skf,
    n_jobs=-1,
)
print(f'X_train_emp의 cross validation 점수 : {np.mean(score)}')

X_train_emp의 cross validation 점수 : 0.8445762328182577


#### Validation Score

In [12]:
# Fit on the numeric columns of the training split and report F1 on the validation split.
rf_emp_pred = (
    rf_emp
    .fit(X_train_emp[num_features_emp], y_train_emp)
    .predict(X_val_emp[num_features_emp])
)
print(f'X_val_emp 예측 점수 : {f1_score(y_val_emp, rf_emp_pred)}')

X_val_emp 예측 점수 : 0.865679264555669


#### Retraining & Prediction

In [13]:
# Refit a fresh random forest on all labelled employee rows (numeric columns), then predict the test set.
rf_emp_model = RandomForestClassifier(random_state=0)
rf_emp_model.fit(train_emp[num_features_emp], target_emp)
rf_emp_pred = rf_emp_model.predict(test_emp[num_features_emp])

# Wrap the predictions in a one-column frame and display it.
rf_emp_pred_df = pd.DataFrame({'prime_yn': rf_emp_pred})
rf_emp_pred_df

Unnamed: 0,prime_yn
0,1
1,1
2,1
3,1
4,1
...,...
8041,1
8042,0
8043,1
8044,1


# Non Employee

#### Data load & target setting

In [14]:
# Load the non-employee train/test feature files, skipping the first CSV column.
train_nemp = pd.read_csv('data/nemployee_feature_train.csv').iloc[:, 1:]
test_nemp = pd.read_csv('data/nemployee_feature_test.csv').iloc[:, 1:]

# Keep the label aside, then remove it from both feature frames.
target_nemp = train_nemp['prime_yn']
train_nemp = train_nemp.drop(columns=['prime_yn'])
test_nemp = test_nemp.drop(columns=['prime_yn'])

#### Categorical Feature List Generation

In [15]:
# Column names grouped by dtype: object columns go to the categorical list,
# float columns followed by int columns go to the numeric list.
cat_features_nemp = list(train_nemp.select_dtypes(include='object').columns)
num_features_nemp = [
    *train_nemp.select_dtypes(include='float').columns,
    *train_nemp.select_dtypes(include='int').columns,
]

#### Train & Validation Dataset Generation

In [16]:
# Hold out 30% of the labelled non-employee rows for validation, stratified on the target.
X_train_nemp, X_val_nemp, y_train_nemp, y_val_nemp = train_test_split(
    train_nemp,
    target_nemp,
    test_size=0.3,
    random_state=23,
    stratify=target_nemp,
)

## CatBoost

#### Training

In [17]:
# 5-fold stratified, shuffled cross-validation (F1) of CatBoost on the non-employee training split.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)
catb_nemp = CatBoostClassifier(
    cat_features=cat_features_nemp,
    random_state=0,
    silent=True,
)
score = cross_val_score(
    estimator=catb_nemp,
    X=X_train_nemp,
    y=y_train_nemp,
    scoring='f1',
    cv=skf,
    n_jobs=-1,
)
print(f'X_train_nemp의 cross validation 점수 : {np.mean(score)}')

X_train_nemp의 cross validation 점수 : 0.798589429935246


#### Validation Score

In [18]:
# Fit on the training split and report F1 on the held-out validation split.
catb_nemp_pred = (
    catb_nemp
    .fit(X_train_nemp, y_train_nemp)
    .predict(X_val_nemp)
)
print(f'X_val_nemp 예측 점수 : {f1_score(y_val_nemp, catb_nemp_pred)}')

X_val_nemp 예측 점수 : 0.8186528497409327


#### Retraining & Prediction

In [19]:
# Refit a fresh CatBoost model on all labelled non-employee rows, then predict the test set.
catb_nemp_model = CatBoostClassifier(
    cat_features=cat_features_nemp,
    random_state=0,
    silent=True,
)
catb_nemp_model.fit(train_nemp, target_nemp)
catb_nemp_pred = catb_nemp_model.predict(test_nemp)

# Wrap the predictions in a one-column frame and display it.
catb_nemp_pred_df = pd.DataFrame({'prime_yn': catb_nemp_pred})
catb_nemp_pred_df

Unnamed: 0,prime_yn
0,1
1,1
2,0
3,1
4,0
...,...
11609,0
11610,0
11611,1
11612,0


## LGBM

#### Training

In [20]:
# 5-fold stratified, shuffled cross-validation (F1) of LightGBM, numeric non-employee features only.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)
lgbm_nemp = LGBMClassifier(random_state=0)
score = cross_val_score(
    estimator=lgbm_nemp,
    X=X_train_nemp[num_features_nemp],
    y=y_train_nemp,
    scoring='f1',
    cv=skf,
    n_jobs=-1,
)
print(f'X_train_nemp의 cross validation 점수 : {np.mean(score)}')

X_train_nemp의 cross validation 점수 : 0.7876092053908408


#### Validation Score

In [21]:
# Fit on the numeric columns of the training split and report F1 on the validation split.
lgbm_nemp_pred = (
    lgbm_nemp
    .fit(X_train_nemp[num_features_nemp], y_train_nemp)
    .predict(X_val_nemp[num_features_nemp])
)
print(f'X_val_nemp 예측 점수 : {f1_score(y_val_nemp, lgbm_nemp_pred)}')

X_val_nemp 예측 점수 : 0.7944703061080547


#### Retraining & Prediction

In [22]:
# Refit a fresh LightGBM model on all labelled non-employee rows (numeric columns), then predict the test set.
lgbm_nemp_model = LGBMClassifier(random_state=0)
lgbm_nemp_model.fit(train_nemp[num_features_nemp], target_nemp)
lgbm_nemp_pred = lgbm_nemp_model.predict(test_nemp[num_features_nemp])

# Wrap the predictions in a one-column frame and display it.
lgbm_nemp_pred_df = pd.DataFrame({'prime_yn': lgbm_nemp_pred})
lgbm_nemp_pred_df

Unnamed: 0,prime_yn
0,0
1,0
2,0
3,1
4,0
...,...
11609,0
11610,0
11611,0
11612,0


## RandomForest

#### Training

In [23]:
# 5-fold stratified, shuffled cross-validation (F1) of a random forest, numeric non-employee features only.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)
rf_nemp = RandomForestClassifier(random_state=0)
score = cross_val_score(
    estimator=rf_nemp,
    X=X_train_nemp[num_features_nemp],
    y=y_train_nemp,
    scoring='f1',
    cv=skf,
    n_jobs=-1,
)
print(f'X_train_nemp의 cross validation 점수 : {np.mean(score)}')

X_train_nemp의 cross validation 점수 : 0.7596044574920742


#### Validation Score

In [24]:
# Fit on the numeric columns of the training split and report F1 on the validation split.
rf_nemp_pred = (
    rf_nemp
    .fit(X_train_nemp[num_features_nemp], y_train_nemp)
    .predict(X_val_nemp[num_features_nemp])
)
print(f'X_val_nemp 예측 점수 : {f1_score(y_val_nemp, rf_nemp_pred)}')

X_val_nemp 예측 점수 : 0.7910030734842135


#### Retraining & Prediction

In [25]:
# Refit a fresh random forest on all labelled non-employee rows (numeric columns), then predict the test set.
rf_nemp_model = RandomForestClassifier(random_state=0)
rf_nemp_model.fit(train_nemp[num_features_nemp], target_nemp)
rf_nemp_pred = rf_nemp_model.predict(test_nemp[num_features_nemp])

# Wrap the predictions in a one-column frame and display it.
rf_nemp_pred_df = pd.DataFrame({'prime_yn': rf_nemp_pred})
rf_nemp_pred_df

Unnamed: 0,prime_yn
0,0
1,0
2,0
3,0
4,0
...,...
11609,1
11610,0
11611,1
11612,0


# Predict Concat

In [26]:
# Stack the CatBoost predictions (employee rows first, then non-employee rows)
# into a single frame with a fresh 0..n-1 index.
prime_yn = pd.concat(
    [catb_emp_pred_df, catb_nemp_pred_df],
    ignore_index=True,
)

In [27]:
# Load the raw test file and keep every column except the last one.
test = pd.read_csv('data/tmk_bda_test.csv')
test = test.iloc[:, :-1]

In [28]:
# Attach the stacked predictions to the raw test rows as an extra column and
# write the submission file.
#
# pd.concat(axis=1) aligns on the index with an outer join, so if the two
# frames differ in row count (or index) the result is silently padded with NaN
# instead of failing. Check alignment up front so a bad submission is never written.
# NOTE(review): this only verifies the row index; it assumes the rows of
# `prime_yn` are in the same order as the rows of `test` — confirm upstream.
assert test.index.equals(prime_yn.index), (
    f'test ({len(test)} rows) and prime_yn ({len(prime_yn)} rows) are not row-aligned'
)

final_test = pd.concat([test, prime_yn], axis=1)
final_test.to_csv('submission/final_test.csv', index=False)