In [1]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 경고 뜨지 않게 설정
import warnings
warnings.filterwarnings('ignore')

# 그래프 설정
sns.set()

# 그래프 기본 설정
plt.rcParams['font.family'] = 'Malgun Gothic'
# plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
plt.rcParams['axes.unicode_minus'] = False

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 모델의 최적의 하이퍼 파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 머신러닝 알고리즘 - 분류
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 머신러닝 알고리즘 - 회귀
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

# 학습 모델 저장을 위한 라이브러리
import pickle

from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier, Pool

In [2]:
# 학습이 완료된 모델을 저장할 파일 이름
best_model_path = '머신러닝/channel_info_classification.dat'

# 교차검증 횟수
cv_count = 2

# 교차 검증
kfold = KFold(n_splits=cv_count, shuffle=True, random_state=1)

# 평가 결과를 담을 리스트
# 필요하다면 다른 것도 만들기
f1_score_list = []

# 학습 모델 이름
model_name_list = []

In [3]:
# 원본 데이터 불러기기
df = pd.read_parquet('승인매출정보_전처리.parquet')

# 날짜형 변환 & 원본 삭제
date_cols = [c for c in df.columns if '일자' in c]
for c in date_cols:
    # ".0" 제거 후 파싱
    s = df[c].astype(str).str.replace(r'\.0$', '', regex=True)
    df[f'{c}_dt'] = pd.to_datetime(s, format='%Y%m%d', errors='coerce')
df.drop(columns=date_cols, inplace=True)

# 이용금액대가 있으면 순서형 인코딩
if '이용금액대' in df.columns:
    levels = ["09.미사용","05.10만원-","02.50만원+","04.10만원+","03.30만원+","01.100만원+"]
    df['이용금액대_ord'] = df['이용금액대'].map({lvl:i+1 for i,lvl in enumerate(levels)})\
                                       .fillna(0).astype(int)
    df.drop(columns=['이용금액대'], inplace=True)

# 금융상환방식코드 전처리 (문자열 카테고리로 유지)
if '최종카드론_금융상환방식코드' in df.columns:
    s = df['최종카드론_금융상환방식코드'].astype(str).str.replace(r'\.0$', '', regex=True)
    df['최종카드론_금융상환방식코드'] = s.replace({'nan': None}).astype('string').fillna('Unknown')

# 업종 컬럼(문자열) 목록 필터링
categorical_cols = [
    '_1순위업종','_2순위업종','_3순위업종',
    '_1순위쇼핑업종','_2순위쇼핑업종','_3순위쇼핑업종',
    '_1순위교통업종','_2순위교통업종','_3순위교통업종',
    '_1순위여유업종','_2순위여유업종','_3순위여유업종',
    '_1순위납부업종','_2순위납부업종','_3순위납부업종'
]
categorical_cols = [c for c in categorical_cols if c in df.columns]

# train/test 나누기
is_test = df['ID'].str.startswith('TEST_')
train_df = df.loc[~is_test].reset_index(drop=True)
test_df  = df.loc[ is_test].reset_index(drop=True)

# submission용 ID를 먼저 저장
test_ids = df.loc[is_test, 'ID'].reset_index(drop=True)

# LabelEncoder 로 Segment 타겟 변환
train_df['Segment'] = train_df['Segment'].astype(str)
seg_le = LabelEncoder().fit(train_df['Segment'])
train_y = seg_le.transform(train_df['Segment'])
test_df.drop(columns=['Segment'], errors='ignore', inplace=True)

# 불필요 컬럼 제거 & 결측 -1
drop_cols = ['ID','기준년월']
train_df = train_df.drop(columns=[c for c in drop_cols if c in train_df.columns]).fillna(-1)
test_df  = test_df.drop(columns=[c for c in drop_cols if c in test_df.columns]).fillna(-1)

# feature 리스트
feature_cols = [c for c in train_df.columns if c != 'Segment']

# cat_features 자동 탐색: object/string 타입 컬럼만
cat_features = [
    c for c in feature_cols
    if train_df[c].dtype in ('object','string','category')
]

# 숫자형 컬럼 강제 변환 (나머지)
num_cols = [c for c in feature_cols if c not in cat_features]
train_df[num_cols] = train_df[num_cols].apply(pd.to_numeric, errors='coerce').fillna(-1)
test_df[num_cols]  = test_df[num_cols].apply(pd.to_numeric, errors='coerce').fillna(-1)

# Pool 생성
train_pool = Pool(
    data=train_df[feature_cols],
    label=train_y,
    cat_features=cat_features
)
test_pool = Pool(
    data=test_df[feature_cols],
    cat_features=cat_features
)

In [4]:
# GPU 사용 CatBoost 모델 초기화
model = CatBoostClassifier(
    task_type='GPU',
    devices='0',
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    eval_metric='MultiClass',
    random_seed=42,
    verbose=1000
)

In [5]:
# 모델 학습
model.fit(train_pool)

# 테스트 데이터 예측 (Pool 그대로 사용)
test_preds_num = model.predict(test_pool)
test_preds     = seg_le.inverse_transform(test_preds_num)

0:	learn: 1.3251182	total: 395ms	remaining: 6m 34s
999:	learn: 0.2387353	total: 48.2s	remaining: 0us


In [6]:
# submission 파일로 저장
submission = pd.DataFrame({
    'ID': test_ids,
    'Segment': test_preds
})
submission.to_csv(
    '마케팅정보_catboost_predictions.csv',
    index=False,
    encoding='utf-8-sig'
)

In [7]:
# 절대값 importance 저장
importance_df = pd.DataFrame({'feature': feature_cols, 'importance': model.get_feature_importance()}).sort_values('importance', ascending=False)

importance_df.to_csv('채널정보_catboost_feature_importances.csv', index=False, encoding='utf-8-sig')

In [8]:
# 상대값(퍼센트) importance 계산 및 출력
imp     = importance_df['importance'].values
rel_imp = imp / imp.sum()
df_imp  = pd.DataFrame({'feature':  importance_df['feature'], 'absolute': imp,'relative': rel_imp}).sort_values('relative', ascending=False)
print(df_imp.head(10))

                feature  absolute  relative
335  연속유실적개월수_기본_24M_카드  8.250640  0.082506
33        이용금액_일시불_R12M  8.114989  0.081150
326          정상청구원금_B5M  7.007788  0.070078
39         이용금액_체크_R12M  5.211414  0.052114
243       이용금액_오프라인_B0M  4.406548  0.044065
329          연체입금원금_B5M  2.869833  0.028698
47       최대이용금액_체크_R12M  2.650339  0.026503
24         이용건수_신용_R12M  2.336358  0.023364
41      최대이용금액_일시불_R12M  2.318730  0.023187
46       최대이용금액_CA_R12M  2.084017  0.020840


### 보통 1%(=0.01) 이상이면 “모델에서 어느 정도 의미 있는 피처”로, 5%(=0.05) 이상이면 “꽤 중요한 피처