In [1]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 경고 뜨지 않게 설정
import warnings
warnings.filterwarnings('ignore')

# 그래프 설정
sns.set()

# 그래프 기본 설정
plt.rcParams['font.family'] = 'Malgun Gothic'
# plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
plt.rcParams['axes.unicode_minus'] = False

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 모델의 최적의 하이퍼 파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 머신러닝 알고리즘 - 분류
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 머신러닝 알고리즘 - 회귀
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

# 학습 모델 저장을 위한 라이브러리
import pickle

from sklearn.metrics import classification_report, accuracy_score

In [2]:
# 학습이 완료된 모델을 저장할 파일 이름
best_model_path = '머신러닝/channel_info_classification.dat'

# 교차검증 횟수
cv_count = 2

# 교차 검증
kfold = KFold(n_splits=cv_count, shuffle=True, random_state=1)

# 평가 결과를 담을 리스트
# 필요하다면 다른 것도 만들기
f1_score_list = []

# 학습 모델 이름
model_name_list = []

In [3]:
# 데이터 로드
df = pd.read_parquet('승인매출정보_전처리.parquet')

# 기준년월 datetime 변환
df['기준년월'] = pd.to_datetime(df['기준년월'], format='%Y%m')

# 12월 데이터만 필터
df = df[df['기준년월'].dt.month == 12].reset_index(drop=True)

# train/test 분리
train_df = df[~df['ID'].str.startswith('TEST_')].reset_index(drop=True)
test_df  = df[df['ID'].str.startswith('TEST_')].reset_index(drop=True)

In [4]:
# 날짜형 변환 및 원본 컬럼 삭제
date_cols = [c for c in df.columns if '일자' in c]
for c in date_cols:
    s = df[c].astype(str).str.replace(r'\.0$', '', regex=True)
    df[f'{c}_dt'] = pd.to_datetime(s, format='%Y%m%d', errors='coerce')
# 원본 날짜 컬럼 삭제
df.drop(columns=date_cols, inplace=True)

# 이용금액대 순서형 인코딩
if '이용금액대' in df.columns:
    levels = ["09.미사용","05.10만원-","02.50만원+",
              "04.10만원+","03.30만원+","01.100만원+"]
    df['이용금액대_ord'] = (
        df['이용금액대']
        .map({lvl: i+1 for i, lvl in enumerate(levels)})
        .fillna(0)
        .astype(int)
    )
    df.drop(columns=['이용금액대'], inplace=True)

# 금융상환방식코드 문자열 전처리
if '최종카드론_금융상환방식코드' in df.columns:
    s = df['최종카드론_금융상환방식코드'].astype(str).str.replace(r'\.0$', '', regex=True)
    df['최종카론_상환방식'] = (
        s.replace({'nan': None})
         .astype('string')
         .fillna('Unknown')
    )
    df.drop(columns=['최종카드론_금융상환방식코드'], inplace=True)

# train/test 분리 및 ID 저장
is_test = df['ID'].str.startswith('TEST_')
train_df = df.loc[~is_test].reset_index(drop=True)
test_df  = df.loc[is_test].reset_index(drop=True)
test_ids = test_df['ID'].copy()

# 불필요 컬럼 제거 및 결측값 채우기
drop_cols = ['ID', '기준년월']
train_df.drop(columns=[c for c in drop_cols if c in train_df.columns], inplace=True)
test_df.drop(columns=[c for c in drop_cols if c in test_df.columns], inplace=True)
train_df.fillna(-1, inplace=True)
test_df.fillna(-1, inplace=True)

# Segment 라벨 인코딩
train_df['Segment'] = train_df['Segment'].astype(str)
seg_le = LabelEncoder().fit(train_df['Segment'])
train_y = seg_le.transform(train_df['Segment'])
test_df.drop(columns=['Segment'], errors='ignore', inplace=True)

# feature 목록 및 타입 분리
feature_cols = [c for c in train_df.columns if c != 'Segment']
cat_cols = [c for c in feature_cols if train_df[c].dtype in ('object','string','category')]
num_cols = [c for c in feature_cols if c not in cat_cols]

# 카테고리형 컬럼 레이블 인코딩
for c in cat_cols:
    le = LabelEncoder()
    combined = pd.concat([train_df[c].astype(str), test_df[c].astype(str)], axis=0)
    le.fit(combined)
    train_df[c] = le.transform(train_df[c].astype(str))
    test_df[c]  = le.transform(test_df[c].astype(str))

# 숫자형 안전 변환
train_df[num_cols] = train_df[num_cols].apply(pd.to_numeric, errors='coerce').fillna(-1)
test_df[num_cols]  = test_df[num_cols].apply(pd.to_numeric, errors='coerce').fillna(-1)

# 표준화
scaler = StandardScaler().fit(train_df[feature_cols])
train_X = scaler.transform(train_df[feature_cols])
test_X  = scaler.transform(test_df[feature_cols])

### XGBoost

In [5]:
xgboost_basic_model = XGBClassifier(verbose=-1, silent=True, tree_method='gpu_hist', predictor = 'gpu_predictor')

# 모델 학습 수행
xgboost_basic_model.fit(train_X, train_y)

In [6]:
# 학습 데이터에 대한 예측 및 성능 확인
y_train_pred = xgboost_basic_model.predict(train_X)
print("Train Accuracy:", accuracy_score(train_y, y_train_pred))
print(classification_report(train_y, y_train_pred, target_names=seg_le.classes_))

# 테스트 데이터에 대한 예측
test_preds_num = xgboost_basic_model.predict(test_X)
test_preds = seg_le.inverse_transform(test_preds_num)

# 결과를 ID와 함께 데이터프레임으로 정리
submission = pd.DataFrame({
    'ID': test_ids,
    'Segment': test_preds
})

Train Accuracy: 0.919775
              precision    recall  f1-score   support

           A       1.00      1.00      1.00       162
           B       1.00      1.00      1.00        24
           C       0.82      0.67      0.74     21265
           D       0.77      0.71      0.74     58207
           E       0.95      0.97      0.96    320342

    accuracy                           0.92    400000
   macro avg       0.91      0.87      0.89    400000
weighted avg       0.92      0.92      0.92    400000



In [7]:
# feature importance 추출 및 저장
importance_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': xgboost_basic_model.feature_importances_
}).sort_values('importance', ascending=False)
importance_df.to_csv('승인매출정보_XG_feature_importances.csv', index=False, encoding='utf-8-sig')

In [8]:
# 절대값 importance 배열
imp = xgboost_basic_model.feature_importances_

# 퍼센트로 변환
rel_imp = imp / imp.sum()

# DataFrame 생성
df_imp = pd.DataFrame({
    'feature': feature_cols,
    'absolute': imp,
    'relative': rel_imp
}).sort_values('relative', ascending=False)

# 확인
print(df_imp.head(10))

                feature  absolute  relative
238       이용금액_오프라인_R3M  0.122499  0.122499
343           이용금액대_ord  0.119470  0.119470
321          정상청구원금_B2M  0.048421  0.048421
334  연속유실적개월수_기본_24M_카드  0.033976  0.033976
47       최대이용금액_체크_R12M  0.030861  0.030861
317          정상청구원금_B0M  0.030712  0.030712
46       최대이용금액_CA_R12M  0.015668  0.015668
75          이용금액_체크_R6M  0.015255  0.015255
242       이용금액_오프라인_B0M  0.014659  0.014659
234       이용금액_오프라인_R6M  0.014360  0.014360


In [9]:
# 상대 중요도가 0.01 이상인 특성 목록 생성
important_features = df_imp.loc[df_imp['relative'] >= 0.01, 'feature'].tolist()

# 저장할 열 목록 설정 (ID, Segment, 중요 특성)
save_columns = ['ID', 'Segment'] + important_features

# 원본 DataFrame에서 선택된 열로 구성된 DataFrame 생성
df_selected = df[save_columns].copy()
df_selected

Unnamed: 0,ID,Segment,이용금액_오프라인_R3M,이용금액대_ord,정상청구원금_B2M,연속유실적개월수_기본_24M_카드,최대이용금액_체크_R12M,정상청구원금_B0M,최대이용금액_CA_R12M,이용금액_체크_R6M,이용금액_오프라인_B0M,이용금액_오프라인_R6M,이용금액_체크_R12M,이용개월수_일시불_R6M,이용건수_신용_R12M,이용금액_체크_R3M,이용개월수_신용_R12M,이용금액_일시불_R12M
0,TRAIN_000000,D,11756,6,15251,17,998,15067,12264,0,3931,23609,7824,6,147,0,12,24782
1,TRAIN_000001,E,12128,5,2776,17,0,2222,3516,0,4033,24246,-414,6,177,0,12,53959
2,TRAIN_000002,C,24370,6,23325,8,0,26184,69186,0,10536,43371,-414,6,149,0,9,60220
3,TRAIN_000003,D,12529,6,18808,24,0,20959,9802,0,3940,24783,-414,6,107,0,12,16649
4,TRAIN_000004,E,0,2,0,0,3910,639,0,12638,0,0,12988,1,-1,6990,2,-861
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,TEST_99995,,0,1,0,0,0,0,0,0,0,0,1168,0,0,0,0,0
499996,TEST_99996,,1178,2,3736,8,0,205,0,0,407,2331,-414,6,7,0,10,2631
499997,TEST_99997,,0,1,186,0,0,0,0,0,0,0,1394,0,0,0,0,0
499998,TEST_99998,,44309,6,23261,24,41564,26308,0,62481,15388,89973,81487,6,933,33564,12,423882


In [10]:
# train과 test 모두 포함된 선택된 데이터 CSV 파일로 저장
df_selected.to_csv('승인매출정보_최종.csv', index=False, encoding='utf-8-sig')

### 보통 1%(=0.01) 이상이면 “모델에서 어느 정도 의미 있는 피처”로, 5%(=0.05) 이상이면 “꽤 중요한 피처