# 라이브러리 설치 및 불러오기

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# !pip install catboost

In [None]:
# !pip install optuna

In [None]:
# !pip install xgboost

In [None]:
# !pip install lightgbm

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import warnings
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import optuna
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.ensemble import VotingClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import *
from imblearn.over_sampling import *
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression

# 데이터 로드 및 분할

In [None]:
# Final dataset: lending_club_clean_v3_2_N3.feather

# NOTE(review): the directory part of this path is only a run of slashes
# (presumably redacted) - point it at the real file location before running.
data = pd.read_feather('/////lending_club_clean_v3_2_N3.feather')

In [None]:
# Columns kept from the loaded frame: the predictor columns followed by the
# target column. The list is assembled from a few sub-lists purely for
# readability; the resulting order is unchanged.
_plain_cols = [
    'loan_amnt', 'funded_amnt', 'term', 'int_rate', 'installment',
    'annual_inc', 'dti', 'delinq_2yrs', 'fico_range_high',
    'open_acc', 'total_acc', 'inq_fi', 'total_cu_tl',
    'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op', 'mort_acc',
    'num_accts_ever_120_pd', 'num_bc_sats', 'num_bc_tl', 'num_il_tl',
    'num_op_rev_tl', 'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats',
    'num_tl_90g_dpd_24m', 'num_tl_op_past_12m',
    'pct_tl_nvr_dlq', 'percent_bc_gt_75',
    'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_il_high_credit_limit',
    'pub_rec', 'pub_rec_bankruptcies', 'tax_liens',
    'collections_12_mths_ex_med', 'chargeoff_within_12_mths',
]

# Columns whose names carry the `new_` prefix.
_new_prefixed_cols = [
    'new_total_rev_hi_lim', 'new_revol_bal', 'new_bc_open_to_buy',
    'new_total_bc_limit', 'new_bc_util', 'new_revol_util', 'new_all_util',
    'new_state', 'new_emp_length', 'new_purpose',
]

# The ver_* and home_* columns.
_ver_home_cols = [
    'ver_1', 'ver_2',
    'home_1', 'home_2', 'home_3', 'home_4', 'home_5',
]

# Column that is later split off as the prediction target.
_target_col = 'loan_status_N'

columns = [*_plain_cols, *_new_prefixed_cols, *_ver_home_cols, _target_col]

In [None]:
# Work on a copy restricted to the selected columns so `data` itself is
# never modified by the encoding below.
df = data.loc[:, columns].copy()

# Names of every column stored with the pandas 'category' dtype.
cat_cols = list(df.select_dtypes(include='category').columns)

# Swap each categorical column for its integer category codes
# (pandas encodes a missing value as -1 in `.cat.codes`).
for col in cat_cols:
    encoded = df[col].cat.codes
    df[col] = encoded

In [None]:
# Separate the target column from the predictors.
df_y = df['loan_status_N']
df_X = df.drop(columns=['loan_status_N'])

# Hold out 20% of the rows for testing, stratified on the target and seeded
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    df_y,
    test_size=0.2,
    random_state=1234,
    stratify=df_y,
)

# Display the shapes of the full data and of each split.
# A previously recorded run showed:
# ((1719681, 60), (1719681,), (1375744, 60), (343937, 60), (1375744,), (343937,))
# NOTE(review): `columns` lists 53 predictors plus the target, so the 60 in
# that recorded output looks stale - re-run to confirm.
tuple(
    part.shape
    for part in (df_X, df_y, X_train, X_test, y_train, y_test)
)

((1719681, 60), (1719681,), (1375744, 60), (343937, 60), (1375744,), (343937,))

# 리샘플링 (복합 샘플링 및 언더샘플링)

In [None]:
# Resample the training split with SMOTEENN (imported from imblearn.combine),
# seeded with 42. Only X_train / y_train are passed in; the test split is not
# touched here.
# NOTE(review): the resampled outputs are not referenced anywhere else in the
# visible part of this notebook.
smote_enn = SMOTEENN(random_state=42)
X_train_smote_enn, y_train_smote_enn = smote_enn.fit_resample(X_train, y_train)

In [None]:
# Resample the training split with CondensedNearestNeighbour (made available
# by the wildcard import from imblearn.under_sampling), seeded with 42.
# NOTE(review): the recorded split output shows over a million training rows;
# a nearest-neighbour based resampler may be very slow at that size - confirm
# this cell is practical to run.
cnn = CondensedNearestNeighbour(random_state=42)
X_train_cnn, y_train_cnn = cnn.fit_resample(X_train, y_train)

In [None]:
# Resample the training split with EditedNearestNeighbours using its default
# settings (no seed is passed to this sampler).
# NOTE(review): the resampled outputs are not referenced anywhere else in the
# visible part of this notebook.
enn = EditedNearestNeighbours()
X_train_enn, y_train_enn = enn.fit_resample(X_train, y_train)

In [None]:
# Resample the training split with NeighbourhoodCleaningRule using its default
# settings (no seed is passed to this sampler).
# NOTE(review): the resampled outputs are not referenced anywhere else in the
# visible part of this notebook.
ncr = NeighbourhoodCleaningRule()
X_train_ncr, y_train_ncr = ncr.fit_resample(X_train, y_train)

# 선형 판별 분석 (LDA)

In [None]:
# Fit a default-configured LDA classifier on the training split
# (fit() hands back the fitted estimator, so it can be bound directly).
lda = LinearDiscriminantAnalysis().fit(X_train, y_train)

# Predicted labels for the training rows, then for the held-out rows.
lda_train_pred, lda_test_pred = lda.predict(X_train), lda.predict(X_test)

In [None]:
# --- LDA, training split: accuracy, F1 and the per-class report ---
lda_train_report = classification_report(
    y_train, lda_train_pred, target_names=['Class 0', 'Class 1']
)
lda_train_f1_score = f1_score(y_train, lda_train_pred)
lda_train_accuracy = accuracy_score(y_train, lda_train_pred)

print(
    f"Train Accuracy: {lda_train_accuracy * 100:.2f}%",
    f"Train f1_score: {lda_train_f1_score * 100:.2f}%",
    "Train Classification Report:",
    lda_train_report,
    sep="\n",
)

# --- LDA, held-out split: the same three metrics ---
lda_test_report = classification_report(
    y_test, lda_test_pred, target_names=['Class 0', 'Class 1']
)
lda_test_f1_score = f1_score(y_test, lda_test_pred)
lda_test_accuracy = accuracy_score(y_test, lda_test_pred)

print(
    f"Test Accuracy: {lda_test_accuracy * 100:.2f}%",
    f"Test f1_score: {lda_test_f1_score * 100:.2f}%",
    "Test Classification Report:",
    lda_test_report,
    sep="\n",
)

In [None]:
# Show floats with six fixed decimals instead of scientific notation.
pd.set_option('display.float_format', '{:.6f}'.format)

# Predictor names (the later importance cells reuse `features`) and the first
# row of the fitted LDA coefficient matrix.
features = df_X.columns
lda_coef = lda.coef_[0]

# One row per feature, ordered by coefficient magnitude, largest first.
# NOTE(review): coefficient magnitudes depend on each feature's scale and no
# scaling step is visible in this notebook - confirm the ranking is meaningful.
lda_importance = pd.DataFrame(
    {'Feature': features, 'lda_coef': lda_coef, 'lda_abs_coef': abs(lda_coef)}
).sort_values(by='lda_abs_coef', ascending=False)

# Rank 1 = largest magnitude; tied values share the lowest rank of their group.
lda_importance['lda_rank'] = (
    lda_importance['lda_abs_coef'].rank(method='min', ascending=False).astype(int)
)

# Drop the helper magnitude column and renumber the rows for display.
result_lda = lda_importance.reset_index(drop=True)[['Feature', 'lda_coef', 'lda_rank']]
result_lda

# CatBoost

In [None]:
# Train CatBoost (500 iterations, learning rate 0.01, seed 42) on the
# training split; fit() returns the fitted model, so it is bound directly.
cat_model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.01,
    random_state=42,
).fit(X_train, y_train)

# Predict the held-out rows first, then the training rows.
cat_test_pred, cat_train_pred = cat_model.predict(X_test), cat_model.predict(X_train)

In [None]:
# Evaluate CatBoost on the training split.
cat_train_accuracy = accuracy_score(y_train, cat_train_pred)
cat_train_f1 = f1_score(y_train, cat_train_pred)
# The train report gets its own name: it used to be stored in
# `cat_test_report` and was then overwritten by the test report below, so it
# could not be inspected afterwards.
cat_train_report = classification_report(y_train, cat_train_pred, target_names=['Class 0', 'Class 1'])

print(f"Train Accuracy: {cat_train_accuracy * 100:.2f}%")
print(f"Train f1_score: {cat_train_f1 * 100:.2f}%")
print("Train Classification Report:")
print(cat_train_report)

# Evaluate CatBoost on the held-out test split.
cat_test_accuracy = accuracy_score(y_test, cat_test_pred)
cat_test_f1 = f1_score(y_test, cat_test_pred)
cat_test_report = classification_report(y_test, cat_test_pred, target_names=['Class 0', 'Class 1'])

print(f"Test Accuracy: {cat_test_accuracy * 100:.2f}%")
print(f"Test f1_score: {cat_test_f1 * 100:.2f}%")
print("Test Classification Report:")
print(cat_test_report)

In [None]:
# CatBoost feature importances, one row per feature, largest first.
# `features` is the predictor-name index created in the LDA coefficient cell.
cat_importance = pd.DataFrame(
    {'Feature': features, 'cat_importance': cat_model.get_feature_importance()}
).sort_values(by='cat_importance', ascending=False)

# Rank 1 = most important; tied scores share the lowest rank of their group.
cat_importance['cat_rank'] = (
    cat_importance['cat_importance'].rank(method='min', ascending=False).astype(int)
)

# Renumber the rows for display.
result_cat = cat_importance.reset_index(drop=True)[['Feature', 'cat_importance', 'cat_rank']]
result_cat

# Logistic Regression

In [None]:
# Fit logistic regression (at most 100 solver iterations, seed 42) on the
# training split; fit() returns the fitted estimator.
# NOTE(review): no feature scaling is visible before this point; with only
# 100 iterations the solver may stop before converging - confirm.
lr_model = LogisticRegression(random_state=42, max_iter=100).fit(X_train, y_train)

# Predicted labels for the training rows, then for the held-out rows.
lr_train_pred, lr_test_pred = lr_model.predict(X_train), lr_model.predict(X_test)

In [None]:
# Evaluate logistic regression on the training split.
lr_train_accuracy = accuracy_score(y_train, lr_train_pred)
lr_train_f1 = f1_score(y_train, lr_train_pred)
# The train report gets its own name: it used to be stored in
# `lr_test_report` and was then overwritten by the test report below, so it
# could not be inspected afterwards.
lr_train_report = classification_report(y_train, lr_train_pred, target_names=['Class 0', 'Class 1'])

print(f"Train Accuracy: {lr_train_accuracy * 100:.2f}%")
print(f"Train f1_score: {lr_train_f1 * 100:.2f}%")
print("Train Classification Report:")
print(lr_train_report)

# Evaluate logistic regression on the held-out test split.
lr_test_accuracy = accuracy_score(y_test, lr_test_pred)
lr_test_f1 = f1_score(y_test, lr_test_pred)
lr_test_report = classification_report(y_test, lr_test_pred, target_names=['Class 0', 'Class 1'])

print(f"Test Accuracy: {lr_test_accuracy * 100:.2f}%")
print(f"Test f1_score: {lr_test_f1 * 100:.2f}%")
print("Test Classification Report:")
print(lr_test_report)

In [None]:
# First row of the fitted logistic-regression coefficient matrix.
lr_coef = lr_model.coef_[0]

# One row per feature, ordered by coefficient magnitude, largest first.
# `features` is the predictor-name index created in the LDA coefficient cell.
# NOTE(review): coefficient magnitudes depend on each feature's scale and no
# scaling step is visible in this notebook - confirm the ranking is meaningful.
lr_importance = pd.DataFrame(
    {'Feature': features, 'lr_coef': lr_coef, 'lr_abs_coef': abs(lr_coef)}
).sort_values(by='lr_abs_coef', ascending=False)

# Rank 1 = largest magnitude; tied values share the lowest rank of their group.
lr_importance['lr_rank'] = (
    lr_importance['lr_abs_coef'].rank(method='min', ascending=False).astype(int)
)

# Drop the helper magnitude column and renumber the rows for display.
result_lr = lr_importance.reset_index(drop=True)[['Feature', 'lr_coef', 'lr_rank']]
result_lr

## LGBM

In [None]:
# Train LightGBM (100 estimators, learning rate 0.01, all cores) on the
# training split; fit() returns the fitted estimator.
lgb_model = LGBMClassifier(
    learning_rate=0.01,
    n_estimators=100,
    n_jobs=-1,
).fit(X_train, y_train)

# Predicted labels for the training rows, then for the held-out rows.
lgb_train_pred, lgb_test_pred = lgb_model.predict(X_train), lgb_model.predict(X_test)

In [None]:
# Evaluate LightGBM on the training split.
lgb_train_accuracy = accuracy_score(y_train, lgb_train_pred)
lgb_train_f1 = f1_score(y_train, lgb_train_pred)
# The train report gets its own name: it used to be stored in
# `lgb_test_report` and was then overwritten by the test report below, so it
# could not be inspected afterwards.
lgb_train_report = classification_report(y_train, lgb_train_pred, target_names=['Class 0', 'Class 1'])

print(f"Train Accuracy: {lgb_train_accuracy * 100:.2f}%")
print(f"Train f1_score: {lgb_train_f1 * 100:.2f}%")
print("Train Classification Report:")
print(lgb_train_report)

# Evaluate LightGBM on the held-out test split.
lgb_test_accuracy = accuracy_score(y_test, lgb_test_pred)
lgb_test_f1 = f1_score(y_test, lgb_test_pred)
lgb_test_report = classification_report(y_test, lgb_test_pred, target_names=['Class 0', 'Class 1'])

print(f"Test Accuracy: {lgb_test_accuracy * 100:.2f}%")
print(f"Test f1_score: {lgb_test_f1 * 100:.2f}%")
print("Test Classification Report:")
print(lgb_test_report)

In [None]:
# LightGBM feature importances, one row per feature, largest first.
# `features` is the predictor-name index created in the LDA coefficient cell.
lgb_importance = pd.DataFrame(
    {'Feature': features, 'lgb_importance': lgb_model.feature_importances_}
).sort_values(by='lgb_importance', ascending=False)

# Rank 1 = most important; tied scores share the lowest rank of their group,
# and the ranks are stored as integers.
lgb_importance['lgb_rank'] = (
    lgb_importance['lgb_importance'].rank(method='min', ascending=False).astype(int)
)

# Renumber the rows for display.
result_lgb = lgb_importance.reset_index(drop=True)[['Feature', 'lgb_importance', 'lgb_rank']]
result_lgb

# XGBoost

In [None]:
# Train XGBoost (seed 1234, all cores, otherwise default settings) on the
# training split; fit() returns the fitted estimator.
xgb_model = XGBClassifier(random_state=1234, n_jobs=-1).fit(X_train, y_train)

# Predicted labels for the training rows, then for the held-out rows.
xgb_train_pred, xgb_test_pred = xgb_model.predict(X_train), xgb_model.predict(X_test)

In [None]:
# --- XGBoost, training split: accuracy, F1 and the per-class report ---
xgb_train_report = classification_report(
    y_train, xgb_train_pred, target_names=['Class 0', 'Class 1']
)
xgb_train_f1 = f1_score(y_train, xgb_train_pred)
xgb_train_accuracy = accuracy_score(y_train, xgb_train_pred)

print(
    f"Train Accuracy: {xgb_train_accuracy * 100:.2f}%",
    f"Train f1_score: {xgb_train_f1 * 100:.2f}%",
    "Train Classification Report:",
    xgb_train_report,
    sep="\n",
)

# --- XGBoost, held-out split: the same three metrics ---
xgb_test_report = classification_report(
    y_test, xgb_test_pred, target_names=['Class 0', 'Class 1']
)
xgb_test_f1 = f1_score(y_test, xgb_test_pred)
xgb_test_accuracy = accuracy_score(y_test, xgb_test_pred)

print(
    f"Test Accuracy: {xgb_test_accuracy * 100:.2f}%",
    f"Test f1_score: {xgb_test_f1 * 100:.2f}%",
    "Test Classification Report:",
    xgb_test_report,
    sep="\n",
)

In [None]:
# XGBoost feature importances, one row per feature, largest first.
# `features` is the predictor-name index created in the LDA coefficient cell.
xgb_importance = pd.DataFrame(
    {'Feature': features, 'xgb_importance': xgb_model.feature_importances_}
).sort_values(by='xgb_importance', ascending=False)

# Rank 1 = most important; tied scores share the lowest rank of their group,
# and the ranks are stored as integers.
xgb_importance['xgb_rank'] = (
    xgb_importance['xgb_importance'].rank(method='min', ascending=False).astype(int)
)

# Renumber the rows for display.
result_xgb = xgb_importance.reset_index(drop=True)[['Feature', 'xgb_importance', 'xgb_rank']]
result_xgb

# Random Forest

In [None]:
# Train a random forest (seed 1234, all cores, otherwise default settings) on
# the training split; fit() returns the fitted estimator.
rf_model = RandomForestClassifier(random_state=1234, n_jobs=-1).fit(X_train, y_train)

# Predicted labels for the training rows, then for the held-out rows.
rf_train_pred, rf_test_pred = rf_model.predict(X_train), rf_model.predict(X_test)

In [None]:
# --- Random forest, training split: accuracy, binary F1 and the report ---
rf_train_report = classification_report(
    y_train, rf_train_pred, target_names=['Class 0', 'Class 1']
)
rf_train_f1 = f1_score(y_train, rf_train_pred, average='binary')
rf_train_accuracy = accuracy_score(y_train, rf_train_pred)

print(
    f"Train Accuracy: {rf_train_accuracy * 100:.2f}%",
    f"Train F1 Score: {rf_train_f1 * 100:.2f}%",
    "Train Classification Report:",
    rf_train_report,
    sep="\n",
)

# --- Random forest, held-out split: the same three metrics ---
rf_test_report = classification_report(
    y_test, rf_test_pred, target_names=['Class 0', 'Class 1']
)
rf_test_f1 = f1_score(y_test, rf_test_pred, average='binary')
rf_test_accuracy = accuracy_score(y_test, rf_test_pred)

print(
    f"Test Accuracy: {rf_test_accuracy * 100:.2f}%",
    f"Test F1 Score: {rf_test_f1 * 100:.2f}%",
    "Test Classification Report:",
    rf_test_report,
    sep="\n",
)

In [None]:
# Random-forest feature importances, one row per feature, largest first.
# `features` is the predictor-name index created in the LDA coefficient cell.
rf_importance = pd.DataFrame(
    {'Feature': features, 'rf_importance': rf_model.feature_importances_}
).sort_values(by='rf_importance', ascending=False)

# Rank 1 = most important; tied scores share the lowest rank of their group,
# and the ranks are stored as integers.
rf_importance['rf_rank'] = (
    rf_importance['rf_importance'].rank(method='min', ascending=False).astype(int)
)

# Renumber the rows for display.
result_rf = rf_importance.reset_index(drop=True)[['Feature', 'rf_importance', 'rf_rank']]
result_rf

# 피쳐 중요도 데이터프레임 병합

In [None]:
# Per-model importance tables, in the column order wanted for the merged view.
models = [result_cat, result_lr, result_lda, result_xgb, result_lgb, result_rf]

# Start from the first table...
feature_importance_df = models[0]

# ...and inner-join each remaining table on the feature name.
# The loop variable is deliberately not called `df`: that name holds the
# modelling frame built earlier in the notebook, and rebinding it here would
# silently break any earlier cell that is re-run afterwards.
for model_result in models[1:]:
    feature_importance_df = pd.merge(feature_importance_df, model_result, on='Feature', how='inner')

In [None]:
# Display the merged per-model importance table.
feature_importance_df

In [None]:
# Keep the feature name plus every column whose name contains 'rank'.
rank_columns = ['Feature']
rank_columns += [
    name for name in feature_importance_df.columns if 'rank' in name
]

feature_rank_df = feature_importance_df[rank_columns]
feature_rank_df

In [None]:
# Keep the feature name plus every column holding a raw score, i.e. any
# column whose name contains 'importance' or 'coef'.
importance_columns = ['Feature']
importance_columns += [
    name
    for name in feature_importance_df.columns
    if any(tag in name for tag in ('importance', 'coef'))
]

importance_df = feature_importance_df[importance_columns]
importance_df