## Auto reload

In [1]:
# Load IPython's autoreload extension and select mode 2, which re-imports
# modules before each cell runs so that edits to project modules are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## LIB

In [4]:
# Re-load the autoreload extension (it is first loaded in the opening cell).
%reload_ext autoreload

# Basic lib
import pandas as pd
import numpy as np
import os
import sys
import matplotlib.pyplot as plt

# config
import yaml

# Custom lib
from src.loader import Loader
from src.loan_preprocessor import Loan_Preprocessor
from src.log_preprocessor import Log_Preprocessor
from src.user_preprocessor import User_Preprocessor
from src.cofix_preprocessor import Cofix_Preprocessor
from src.matcher import Matcher
from src.sampling import down_sampling, check_imbalanced_label, split_train_valid
from src.mice import mice
from src.bank import Bank_info
from src.clustering import Clustering, GowerDistance, KPrototype
from src.clustering_preprocessor import ClusteringPreprocessor
from src.statistic_check import TTEST,ANOVA
from sklearn.preprocessing import MinMaxScaler
from src.eda import EDA, EDAPreprocessing

# Turn off pandas' chained-assignment warning and draw plots with the ggplot style.
pd.options.mode.chained_assignment = None
plt.style.use('ggplot')

## Load Config

In [8]:
# Parse config.yaml into config_dict; later cells read its sections via .get().
# NOTE(review): FullLoader can build more than plain YAML types. If the config
# holds only plain mappings/lists/scalars, yaml.safe_load would be the safer call.
with open("config.yaml") as config_file:
    config_dict = yaml.load(config_file, yaml.FullLoader)

## Load Data

In [9]:
# Fetch the 'Loader' section of the config (None if the key is absent).
# NOTE(review): the following cell performs this same lookup again.
loader_config = config_dict.get('Loader')

In [11]:
# Build a Loader from the 'Loader' config section, then pass each raw CSV
# file name to loader.run() in order: loan, log, user, cofix.
loader_config = config_dict.get('Loader')
loader = Loader(loader_config)
loan_df, log_df, user_df, cofix_df = (
    loader.run(csv_name)
    for csv_name in ('loan_result.csv', 'log_data.csv', 'user_spec.csv', 'cofix.csv')
)

## Preprocess

In [6]:
# 'Preprocessor' config section; the same object is handed to every preprocessor below.
prep_config = config_dict.get('Preprocessor')

# Loan data: run the preprocessor with saving enabled, file name 'prep_loan.fth'.
loan_preprocessor = Loan_Preprocessor(loan_df, prep_config)
prep_loan_df = loan_preprocessor.run(save_file_name='prep_loan.fth', save_mode=True)

필요 없는 열 삭제 중...
datetime으로 바꾸는 중...
결측치를 가지는 행 삭제 중...
✅ prep dataset saved at (data/prep/prep_loan.fth)


In [7]:
# Log data: run the preprocessor with saving enabled, file name 'prep_log.fth'.
log_preprocessor = Log_Preprocessor(log_df, prep_config)
prep_log_df = log_preprocessor.run(save_file_name='prep_log.fth', save_mode=True)

필요 없는 열 삭제 중...
datetime으로 바꾸는 중...
카테고리화 시키는 중...
시간축을 기준으로 정렬 중...
✅ prep dataset saved at (data/prep/prep_log.fth)


In [8]:
# User data: run the preprocessor with saving enabled, file name 'prep_user.fth'.
user_preprocessor = User_Preprocessor(user_df, prep_config)
prep_user_df = user_preprocessor.run(save_file_name='prep_user.fth', save_mode=True)

필요 없는 열 삭제 중...
datetime으로 바꾸는 중...
파생변수 생성 중...
카테고리화 시키는 중...
원핫인코딩 중...
순서형인코딩 중...
결측치를 가지는 행 삭제 중...
✅ prep dataset saved at (data/prep/prep_user.fth)


In [9]:
# COFIX data: run the preprocessor with saving enabled, file name 'prep_cofix.fth'.
cofix_preprocessor = Cofix_Preprocessor(cofix_df, prep_config)
prep_cofix_df = cofix_preprocessor.run(save_file_name='prep_cofix.fth', save_mode=True)

대상기간 열 나누는 중...
datetime으로 바꾸는 중...
시간축을 기준으로 정렬 중...
COFIX 금리 열 이름 변경 중...
✅ prep dataset saved at (data/prep/prep_cofix.fth)


## Matcher

In [None]:
# Fetch the 'Matcher' section of the config (None if the key is absent).
matcher_config = config_dict.get('Matcher')

In [None]:
# Hand the four preprocessed frames (user, loan, log, cofix — in that order)
# and the matcher settings to Matcher.
matcher = Matcher(
    prep_user_df,
    prep_loan_df,
    prep_log_df,
    prep_cofix_df,
    matcher_config,
)

In [None]:
# Run the matcher with saving enabled; it returns the train(+valid) frame and the test frame.
train_valid_df, test_df = matcher.run(save_mode=True)

Loan과 Cofix 매칭중...
Loan_Cofix와 User를 merge중...
Train(Valid)과 Test로 나누는 중...
✅ matched dataset saved at (data/match/ml_train_valid.fth)
✅ matched dataset saved at (data/match/ml_test.fth)


## MICE

In [None]:
# Train(+valid) frame: wrap it in the project's `mice` helper and call mice_1().
# Presumably this performs MICE imputation of missing values — confirm in src/mice.py.
mice_train = mice(train_valid_df)
mice_train_df = mice_train.mice_1()

In [None]:
# Test frame: same `mice` helper and mice_1() call, on a separate instance.
# NOTE(review): the test frame is processed independently of the train frame —
# confirm this is intended rather than reusing what was fitted on train.
mice_test = mice(test_df)
mice_test_df = mice_test.mice_1()

In [None]:
# Sanity check: show (rows, columns) of the train and test frames, one per line.
print(mice_train_df.shape, mice_test_df.shape, sep='\n')

(10264386, 53)
(3255482, 53)


## Bank

In [None]:
# Pass the train frame through Bank_info and keep the result as final_train_df.
# NOTE(review): the resampling loop further down reads mice_train_df, and
# final_train_df is not referenced by the visible cells below — confirm which
# frame is meant to feed the models.
bank_info = Bank_info(mice_train_df)
final_train_df = bank_info.run()

## Resampling & Modeling

In [76]:
# Repeatedly down-sample the training frame, split it into train/valid, and
# grid-search a random-forest model on each sample.
from src.models.rf_model import RF  # imported once here instead of on every iteration

NUM_DOWN_SAMPLING = 30
# Seed passed to every RF instance; it does not change between iterations,
# so it is defined once outside the loop.
random_state = 42

for sampling_idx in range(NUM_DOWN_SAMPLING):

    print(f'✅ Sampling #{sampling_idx}')

    # Resampling & split (mice_train_df is used as the input frame)
    downsampled_df = down_sampling(mice_train_df)
    train_X, train_Y, valid_X, valid_Y = split_train_valid(downsampled_df)

    # Modeling
    rf = RF(train_X, train_Y, valid_X, valid_Y, random_state)
    best_model = rf.grid_search()

    # NOTE: best_model is overwritten on every iteration, so only the model
    # from the last sample survives the loop until the TODOs below are done.
    # TODO : Model results storing (e.g. wandb, Dataframe...?)
    # TODO : Voting (soft or hard voting)
    # If memory becomes a problem on long runs, release the split frames here:
    # del train_X, train_Y, valid_X, valid_Y

✅ Sampling0
Down Sampling 중...
Train과 Valid로 나누는 중...


# 여기 이후로 불필요한 셀 정리 부탁드립니다

## Modeling

In [88]:
# NOTE(review): this cell contains only commented-out code. It references
# names (val_X, test_X, test_Y) that no visible cell defines — remove it or
# update it before re-enabling.
# base line
# model_name : RF, XGB, LGBM, 

## Model search ##
# from src.models import XGB
# random_state = 42
# xgb = XGB(train_X, train_Y, val_X, val_Y, test_X, test_Y, random_state)
# best_model = xgb.grid_search()

## Check test results and interpret the model ##
# xgb.test_score(best_model)
# xgb.confusion_matrix(best_model)
# xgb.feature_importance(best_model)
# xgb.shap(best_model)


In [5]:
from src.models.rf_model import RF

In [1]:
from src.models.lgbm_model import LGBM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from src.models.xgb_model import XGB

In [3]:
from src.models.catboost import CB

## Evaluation