# Import

In [2]:
import pandas as pd
import numpy as np
import joblib
import pickle

from sklearn.ensemble import IsolationForest
from pycaret import anomaly as py_an
from pycaret import classification as py_cl
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn.functional as F

from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.pretraining import TabNetPretrainer
from torch.optim.lr_scheduler import ReduceLROnPlateau

from sklearn.metrics import f1_score

DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
RANDOM_STATE = 4158

# Def

In [3]:
class F1Score(Metric):
    """Custom F1 metric computed on hard labels thresholded at 0.5.

    Registers itself under the name ``"f1"`` and is flagged as a metric to
    maximize.
    """

    def __init__(self):
        self._maximize = True
        self._name = "f1"

    def __call__(self, y_true, y_score):
        """Return the F1 score of ``y_true`` against thresholded ``y_score``.

        ``y_score`` is indexed as ``y_score[:, 1]``, so it must be 2-D with at
        least two columns (column 1 is presumably the positive-class
        probability -- TODO confirm against the caller).
        """
        positive_score = y_score[:, 1]
        hard_labels = (positive_score >= 0.5).astype(np.uint8)
        return f1_score(y_true, hard_labels)

In [4]:
def get_pred_label(model_pred):
    """Map IsolationForest-style predictions to 0/1 labels.

    IsolationForest outputs 1 for normal samples and -1 for anomalous (fraud)
    samples; this converts them to 0 = normal and 1 = anomalous (fraud).
    Any value other than 1 or -1 is passed through unchanged.

    Args:
        model_pred: Array-like of predictions (list, ndarray, Series or
            DataFrame).

    Returns:
        numpy.ndarray with the same shape as ``np.asarray(model_pred)``.
    """
    # Coerce first: comparing a plain list with ``== 1`` yields a single
    # ``False`` rather than an element-wise mask, so 1 would never become 0.
    preds = np.asarray(model_pred)
    return np.where(preds == 1, 0, np.where(preds == -1, 1, preds))

# Data Load

In [4]:
# Read the three feature tables, then the 'is_applied' answers that are stored
# in a separate file for the private split.
public = pd.read_csv('../Data/master_public_data.csv')
private = pd.read_csv('../Data/master_private_data.csv')
test = pd.read_csv('../Data/master_test_data.csv')
private_y = pd.read_csv('../Data/master_correct_answer_data.csv')['is_applied'].values

In [5]:
print(public.shape)
print(private.shape)
print(private_y.shape)
print(test.shape)

(924301, 72)
(2156703, 71)
(2156703,)
(3255194, 72)


In [6]:
# Separate the 'is_applied' target column from the remaining feature columns
# for the public and test tables.
public_y = public['is_applied'].values
public_x = public.drop(columns=['is_applied'])

test_y = test['is_applied'].values
test_x = test.drop(columns=['is_applied'])

# Model Load

## Anomaly

In [None]:
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load model files that come from a trusted source.
an_iforest_base = joblib.load('../Model/Isolationforest_base_1012.pkl')

In [None]:
an_isolation_pycaret = py_an.load_model('../Model/pycaret_iforest')

In [None]:
# NOTE(review): this model is loaded with joblib.load, while the other pycaret
# anomaly models in this notebook use py_an.load_model; the object is later
# passed to py_an.predict_model, so confirm the two loaders are interchangeable.
# joblib.load unpickles the file -- only load model files from a trusted source.
an_knn_pycaret = joblib.load('../Model/pycaret_knn.pkl')

In [None]:
an_mcd_pycaret = py_an.load_model('../Model/pycaret_mcd')

## ML

In [None]:
ml_lgbm = joblib.load('../Model/LGBM_Optuna_model_0.44_WithLog.pkl')

In [None]:
ml_cat = joblib.load('../Model/Cat_Optuna_model_0.48_withLog.pkl')

In [None]:
ml_et = py_cl.load_model('../Model/pycaret_et')

In [None]:
ml_rf = py_cl.load_model('../Model/pycaret_rf')

# Predict

## Anomaly

In [None]:
# Predictions of the joblib-loaded base model on the public, private and test
# feature tables.
# NOTE(review): 'priate' below is a typo for 'private'. The CSV export cell
# reads this exact misspelled name, so rename both places together.
an_iforest_base_public_pred = an_iforest_base.predict(public_x)
an_iforest_base_priate_pred = an_iforest_base.predict(private)
an_iforest_base_test_pred = an_iforest_base.predict(test_x)

In [None]:
an_isolation_pycaret_public_pred = py_an.predict_model(an_isolation_pycaret, data = public_x)
an_isolation_pycaret_private_pred = py_an.predict_model(an_isolation_pycaret, data = private)
an_isolation_pycaret_test_pred = py_an.predict_model(an_isolation_pycaret, data = test_x)

In [None]:
an_knn_pycaret_public_pred = py_an.predict_model(an_knn_pycaret, data = public_x)
an_knn_pycaret_private_pred = py_an.predict_model(an_knn_pycaret, data = private)
an_knn_pycaret_test_pred = py_an.predict_model(an_knn_pycaret, data = test_x)

In [None]:
an_mcd_pycaret_public_pred = py_an.predict_model(an_mcd_pycaret, data = public_x)
an_mcd_pycaret_private_pred = py_an.predict_model(an_mcd_pycaret, data = private)
an_mcd_pycaret_test_pred = py_an.predict_model(an_mcd_pycaret, data = test_x)

## ML

In [None]:
ml_lgbm_public_pred = ml_lgbm.predict(public_x)
ml_lgbm_private_pred = ml_lgbm.predict(private)
ml_lgbm_test_pred = ml_lgbm.predict(test_x)

In [None]:
ml_cat_public_pred = ml_cat.predict(public_x)
ml_cat_private_pred = ml_cat.predict(private)
ml_cat_test_pred = ml_cat.predict(test_x)

In [None]:
ml_et_public_pred = py_cl.predict_model(ml_et, data = public_x)
ml_et_private_pred = py_cl.predict_model(ml_et, data = private)
ml_et_test_pred = py_cl.predict_model(ml_et, data = test_x)

In [None]:
ml_rf_public_pred = py_cl.predict_model(ml_rf, data = public_x)
ml_rf_private_pred = py_cl.predict_model(ml_rf, data = private)
ml_rf_test_pred = py_cl.predict_model(ml_rf, data = test_x)

## Deployment CSV

### Anomaly

In [None]:
pd.DataFrame(an_iforest_base_public_pred).to_csv('../Data/an_isolation_public_pred.csv',index=False)
pd.DataFrame(an_iforest_base_priate_pred).to_csv('../Data/an_isolation_private.csv',index=False)
pd.DataFrame(an_iforest_base_test_pred).to_csv('../Data/an_isolation_test.csv',index=False)

In [None]:
pd.DataFrame(an_isolation_pycaret_public_pred).to_csv('../Data/an_isolation_pycaret_public_pred.csv',index=False)
pd.DataFrame(an_isolation_pycaret_private_pred).to_csv('../Data/an_isolation_pycaret_private_pred.csv',index=False)
pd.DataFrame(an_isolation_pycaret_test_pred).to_csv('../Data/an_isolation_pycaret_test_pred.csv',index=False)

In [None]:
pd.DataFrame(an_knn_pycaret_public_pred).to_csv('../Data/an_knn_pycaret_public_pred.csv',index=False)
pd.DataFrame(an_knn_pycaret_private_pred).to_csv('../Data/an_knn_pycaret_private_pred.csv',index=False)
pd.DataFrame(an_knn_pycaret_test_pred).to_csv('../Data/an_knn_pycaret_test_pred.csv',index=False)

In [None]:
# Write the MCD prediction frames to CSV without the index.
# NOTE(review): the file names contain typos ('ptcaret', 'tset'). The reload
# cell reads the same misspelled names, so change both places together.
an_mcd_pycaret_public_pred.to_csv('../Data/an_mcd_ptcaret_public_pred.csv',index=False)
an_mcd_pycaret_private_pred.to_csv('../Data/an_mcd_ptcaret_private_pred.csv',index=False)
an_mcd_pycaret_test_pred.to_csv('../Data/an_mcd_ptcaret_tset_pred.csv',index=False)

### ML

In [None]:
pd.DataFrame(ml_lgbm_public_pred).to_csv('../Data/ml_lgbm_public_pred.csv',index=False)
pd.DataFrame(ml_lgbm_private_pred).to_csv('../Data/ml_lgbm_private_pred.csv',index=False)
pd.DataFrame(ml_lgbm_test_pred).to_csv('../Data/ml_lgbm_test_pred.csv',index=False)

In [None]:
# Write the CatBoost predictions to CSV without the index.
# NOTE(review): the file names end in '_csv.' instead of '.csv'. The reload
# cell reads the same names, so change both places together.
pd.DataFrame(ml_cat_public_pred).to_csv('../Data/ml_cat_public_pred_csv.',index=False)
pd.DataFrame(ml_cat_private_pred).to_csv('../Data/ml_cat_private_pred_csv.',index=False)
pd.DataFrame(ml_cat_test_pred).to_csv('../Data/ml_cat_test_pred_csv.',index=False)

In [None]:
# Write the ml_et prediction frames for each split to CSV without the index.
# The private frame is produced by the prediction cell as ml_et_private_pred;
# the previously used name 'ml_et_priate_pred' is not defined at this point in
# a top-to-bottom run and raised NameError.
ml_et_public_pred.to_csv('../Data/ml_et_public_pred.csv',index=False)
ml_et_private_pred.to_csv('../Data/ml_et_private_pred.csv',index=False)
ml_et_test_pred.to_csv('../Data/ml_et_test_pred.csv',index=False)

In [None]:
ml_rf_public_pred.to_csv('../Data/ml_rf_public_pred.csv',index=False)
ml_rf_private_pred.to_csv('../Data/ml_rf_private_pred.csv',index=False)
ml_rf_test_pred.to_csv('../Data/ml_rf_test_pred.csv',index=False)

## Data Load

### Data

In [5]:
# Reload only what the scoring cells need. public_y is extracted as a 1-D
# array of 'is_applied' values so it has the same form as private_y (and as
# the public_y built from the full table earlier) instead of a one-column
# DataFrame.
public_y = pd.read_csv('../Data/master_public_data.csv',usecols=['is_applied'])['is_applied'].values
private_y = pd.read_csv('../Data/master_correct_answer_data.csv')['is_applied'].values
test = pd.read_csv('../Data/master_test_data.csv')

### Anomaly

In [6]:
an_iforest_base_public_pred = pd.read_csv('../Data/an_isolation_public_pred.csv')
an_iforest_base_private_pred = pd.read_csv('../Data/an_isolation_private.csv')
an_iforest_base_test_pred = pd.read_csv('../Data/an_isolation_test.csv')

In [7]:
an_isolation_pycaret_public_pred = pd.read_csv('../Data/an_isolation_pycaret_public_pred.csv',usecols=['Anomaly'])
an_isolation_pycaret_private_pred = pd.read_csv('../Data/an_isolation_pycaret_private_pred.csv',usecols=['Anomaly'])
an_isolation_pycaret_test_pred = pd.read_csv('../Data/an_isolation_pycaret_test_pred.csv',usecols=['Anomaly'])

In [8]:
an_knn_pycaret_public_pred = pd.read_csv('../Data/an_knn_pycaret_public_pred.csv',usecols=['Anomaly'])
an_knn_pycaret_private_pred = pd.read_csv('../Data/an_knn_pycaret_private_pred.csv',usecols=['Anomaly'])
an_knn_pycaret_test_pred = pd.read_csv('../Data/an_knn_pycaret_test_pred.csv',usecols=['Anomaly'])

In [9]:
an_mcd_pycaret_public_pred = pd.read_csv('../Data/an_mcd_ptcaret_public_pred.csv',usecols=['Anomaly'])
an_mcd_pycaret_private_pred = pd.read_csv('../Data/an_mcd_ptcaret_private_pred.csv',usecols=['Anomaly'])
an_mcd_pycaret_test_pred = pd.read_csv('../Data/an_mcd_ptcaret_tset_pred.csv',usecols=['Anomaly'])

### ML

In [10]:
ml_lgbm_public_pred = pd.read_csv('../Data/ml_lgbm_public_pred.csv')
ml_lgbm_private_pred = pd.read_csv('../Data/ml_lgbm_private_pred.csv')
ml_lgbm_test_pred = pd.read_csv('../Data/ml_lgbm_test_pred.csv')

In [11]:
ml_cat_public_pred = pd.read_csv('../Data/ml_cat_public_pred_csv.')
ml_cat_private_pred = pd.read_csv('../Data/ml_cat_private_pred_csv.')
ml_cat_test_pred = pd.read_csv('../Data/ml_cat_test_pred_csv.')

In [12]:
ml_et_public_pred = pd.read_csv('../Data/ml_et_public_pred.csv',usecols=['Label'])
ml_et_priate_pred = pd.read_csv('../Data/ml_et_private_pred.csv',usecols=['Label'])
ml_et_test_pred = pd.read_csv('../Data/ml_et_test_pred.csv',usecols=['Label'])

In [13]:
ml_rf_public_pred = pd.read_csv('../Data/ml_rf_public_pred.csv',usecols=['Label'])
ml_rf_private_pred = pd.read_csv('../Data/ml_rf_private_pred.csv',usecols=['Label'])
ml_rf_test_pred = pd.read_csv('../Data/ml_rf_test_pred.csv',usecols=['Label'])

### Deep

In [14]:
dp_tabnet_public_pred = pd.read_csv('../Data/tabnet_public_predict.csv')
dp_tabnet_private_pred = pd.read_csv('../Data/tabnet_private_predict.csv')
dp_tabnet_test_pred = pd.read_csv('../Data/tabnet_test_predict.csv')

In [15]:
dp_sam_tabnet_public_pred = pd.read_csv('../Data/tabnet_public_predict_sam.csv')
dp_sam_tabnet_private_pred = pd.read_csv('../Data/tabnet_private_predict_sam.csv')
dp_sam_tabnet_test_pred = pd.read_csv('../Data/tabnet_test_predict_sam.csv')

# F1

## Anomaly

In [15]:
an_iforest_base_public_pred = get_pred_label(an_iforest_base_public_pred)
an_iforest_base_private_pred = get_pred_label(an_iforest_base_private_pred)

In [16]:
print(f1_score(an_iforest_base_public_pred,public_y))
print(f1_score(an_iforest_base_private_pred,private_y))

0.0
0.08205615021891398


In [17]:
print(f1_score(an_isolation_pycaret_public_pred,public_y))
print(f1_score(an_isolation_pycaret_private_pred,private_y))

0.11833081242359572
0.11731823810366178


In [18]:
print(f1_score(an_knn_pycaret_public_pred,public_y))
print(f1_score(an_knn_pycaret_private_pred,private_y))

0.0389665837073445
0.03778003347936033


In [19]:
print(f1_score(an_mcd_pycaret_public_pred,public_y))
print(f1_score(an_mcd_pycaret_private_pred,private_y))

0.08724184925044554
0.08656207641619583


## ML

In [20]:
print(f1_score(ml_lgbm_public_pred,public_y))
print(f1_score(ml_lgbm_private_pred,private_y))

0.44555071561916615
0.44385501183717563


In [21]:
print(f1_score(ml_cat_public_pred,public_y))
print(f1_score(ml_cat_private_pred,private_y))

0.47830739028089203
0.4792549631942895


In [22]:
print(f1_score(pd.to_numeric(ml_et_public_pred['Label']),public_y))
print(f1_score(pd.to_numeric(ml_et_priate_pred['Label']),private_y))

0.32455996821018673
0.32332320284055005


In [23]:
print(f1_score(pd.to_numeric(ml_rf_public_pred['Label']),public_y))
print(f1_score(pd.to_numeric(ml_rf_private_pred['Label']),private_y))

0.2997562296858072
0.2997724058045251


## Deep

In [24]:
print(f1_score(dp_sam_tabnet_public_pred,public_y))
print(f1_score(dp_sam_tabnet_private_pred,private_y))

0.3436995866844082
0.3423446197715747


In [25]:
print(f1_score(dp_tabnet_public_pred,public_y))
print(f1_score(dp_tabnet_private_pred,private_y))

0.3426375969998442
0.3430899055755012


# Ensemble

## Public

### Anomaly

In [26]:
an_public_pred = an_iforest_base_public_pred | an_isolation_pycaret_public_pred | an_knn_pycaret_public_pred | an_mcd_pycaret_public_pred

In [27]:
print(f1_score(an_public_pred,public_y))

0.12268745041951604


In [41]:
an_public_pred = an_iforest_base_public_pred | an_isolation_pycaret_public_pred  | an_mcd_pycaret_public_pred

In [42]:
print(f1_score(an_public_pred,public_y))

0.13688655263044508


### ML

In [28]:
ml_tree_public_pred = pd.to_numeric(ml_et_public_pred['Label']).astype(np.uint8) | pd.to_numeric(ml_rf_public_pred['Label']).astype(np.uint8)

In [29]:
print(f1_score(ml_tree_public_pred,public_y))

0.36087196647282416


In [30]:
ml_boost_public_pred = pd.DataFrame(ml_lgbm_public_pred).astype(np.uint8) | pd.DataFrame(ml_cat_public_pred).astype(np.uint8)

In [31]:
print(f1_score(ml_boost_public_pred,public_y))

0.47903536214963927


In [36]:
# Element-wise OR of the tree-ensemble labels (a Series) and the boosting
# labels (a one-column DataFrame, flattened to 1-D here).
# Vectorized replacement for a per-row Python loop that also assigned a
# 1-element array into a scalar slot, which NumPy 1.25+ deprecates.
# The result is cast to float64 to match the np.zeros buffer used before.
ml_public_pred = (
    ml_tree_public_pred.values | ml_boost_public_pred.values.ravel()
).astype(np.float64)

In [38]:
print(f1_score(ml_public_pred,public_y))

0.46400169480477155


### Deep

In [39]:
dp_public_pred = dp_sam_tabnet_public_pred | dp_tabnet_public_pred

In [40]:
print(f1_score(dp_public_pred,public_y))

0.3424786000758226


### ALL

In [65]:
an_isolation_pycaret_public_pred.values.flatten()

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [66]:
ml_cat_public_pred.values.flatten()

array([0., 0., 0., ..., 0., 0., 0.])

In [67]:
all_pred = an_isolation_pycaret_public_pred.values.flatten().astype(int) | ml_cat_public_pred.values.flatten().astype(int) | dp_sam_tabnet_public_pred.values.flatten().astype(int)

In [68]:
print(f1_score(all_pred,public_y))

0.3767535330805601


In [27]:
# Vote ensemble on the public split: sum the 0/1 labels of the four models per
# row and mark the row positive when at least two models agree.
# NOTE(review): with four voters, ">= 2" counts a 2-2 tie as positive.
# Vectorized instead of a per-row Python loop; unlike zip(), inputs of
# different lengths now raise instead of being silently truncated.
# pred_ens stays a plain list of Python ints.
pred_ens = (
    (
        ml_lgbm_public_pred.values.flatten().astype(int)
        + ml_cat_public_pred.values.flatten().astype(int)
        + dp_tabnet_public_pred.values.flatten().astype(int)
        + ml_et_public_pred['Label'].values.flatten().astype(int)
    ) >= 2
).astype(int).tolist()

In [28]:
np.array(pred_ens)

array([0, 0, 0, ..., 0, 0, 0])

In [29]:
print(f1_score(np.array(pred_ens),public_y))

0.4681613148887383


## Private

In [178]:
# Majority vote on the private split: sum the 0/1 labels of the three models
# per row and mark the row positive when at least two models agree.
# Vectorized instead of a per-row Python loop; unlike zip(), inputs of
# different lengths now raise instead of being silently truncated.
# pred_ens stays a plain list of Python ints.
pred_ens = (
    (
        ml_lgbm_private_pred.values.flatten().astype(int)
        + ml_cat_private_pred.values.flatten().astype(int)
        + dp_tabnet_private_pred.values.flatten().astype(int)
    ) >= 2
).astype(int).tolist()

In [179]:
np.array(pred_ens)

array([0, 1, 0, ..., 0, 0, 0])

In [180]:
print(f1_score(np.array(pred_ens),private_y))

0.47940151109059104


## Test

In [181]:
# Majority vote on the test split: sum the 0/1 labels of the three models per
# row and mark the row positive when at least two models agree.
# Vectorized instead of a per-row Python loop; unlike zip(), inputs of
# different lengths now raise instead of being silently truncated, which
# guards the row-aligned submission built from this list.
# pred_ens stays a plain list of Python ints, so len() and sum() still work.
pred_ens = (
    (
        ml_lgbm_test_pred.values.flatten().astype(int)
        + ml_cat_test_pred.values.flatten().astype(int)
        + dp_tabnet_test_pred.values.flatten().astype(int)
    ) >= 2
).astype(int).tolist()

In [182]:
np.array(pred_ens)

array([0, 1, 1, ..., 1, 1, 0])

In [185]:
len(pred_ens)

3255194

In [184]:
sum(pred_ens)

139929

## Deployment

In [188]:
submission = pd.read_csv('../Data/2022빅콘테스트_데이터분석리그_데이터분석분야_퓨처스부문_데이터셋_220908/데이터분석분야_퓨처스부문_평가데이터.csv')

In [189]:
submission

Unnamed: 0,application_id,product_id,is_applied
0,4,220,
1,4,191,
2,8,29,
3,8,159,
4,8,85,
...,...,...,...
3255189,2167778,258,
3255190,2167791,29,
3255191,2167822,149,
3255192,2167822,157,


In [190]:
submission['is_applied'] = pred_ens
submission

Unnamed: 0,application_id,product_id,is_applied
0,4,220,0
1,4,191,1
2,8,29,1
3,8,159,0
4,8,85,1
...,...,...,...
3255189,2167778,258,0
3255190,2167791,29,1
3255191,2167822,149,1
3255192,2167822,157,1


In [191]:
submission.to_csv('../../평가데이터/데이터분석분야_퓨처스부문_롱Loan팀_평가데이터.csv',index=False)