In [1]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Silence all warnings so they do not clutter cell output.
# NOTE(review): this blanket filter also hides warnings that may matter
# (e.g. dtype/shape conversion notices) — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

# Apply seaborn's default plot theme
sns.set()

# Default plot settings.
# Choose a Korean-capable font for the current operating system so Hangul
# labels render correctly without hand-editing this cell per machine.
import platform
_korean_font_by_os = {
    'Windows': 'Malgun Gothic',
    'Darwin': 'AppleGothic',
}
# Any other OS (e.g. Linux) falls back to NanumGothic —
# assumes that font is installed there; TODO confirm for your environment.
plt.rcParams['font.family'] = _korean_font_by_os.get(platform.system(), 'NanumGothic')
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
# Draw minus signs with the ASCII hyphen rather than the Unicode minus glyph
plt.rcParams['axes.unicode_minus'] = False

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 모델의 최적의 하이퍼 파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 머신러닝 알고리즘 - 분류
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 머신러닝 알고리즘 - 회귀
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

# 학습 모델 저장을 위한 라이브러리
import pickle

from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier, Pool

In [2]:
# Destination file for the model once training has finished
best_model_path = '머신러닝/channel_info_classification.dat'

# Number of cross-validation folds
cv_count = 2

# Shuffled K-fold splitter; the fixed random_state keeps the folds repeatable
kfold = KFold(cv_count, shuffle=True, random_state=1)

# Accumulators for evaluation results and the matching model names
# (add further score lists here if other metrics are needed)
f1_score_list, model_name_list = [], []

In [3]:
# Load the preprocessed dataset (train and test rows live in the same file)
df = pd.read_parquet('마케팅정보_전처리.parquet')

# Split into train/test: test rows are those whose ID starts with 'TEST_'.
# na=False maps a missing ID to False so the mask is strictly boolean;
# without it a NaN in the mask makes the `~is_test` negation below fail.
is_test  = df['ID'].str.startswith('TEST_', na=False)
train_df = df[~is_test].reset_index(drop=True)
test_df  = df[ is_test].reset_index(drop=True)

# Keep the test IDs for the submission file before the ID column is dropped
test_ids = test_df['ID'].copy()

# Drop columns that are not used as features and replace every missing
# value in the remaining columns with -1
drop_cols = ['ID', '기준년월']
train_df  = train_df.drop(columns=drop_cols).fillna(-1)
test_df   = test_df.drop(columns=drop_cols).fillna(-1)

# Label-encode the target: cast Segment to str, fit the encoder on the
# training labels only, and remove Segment from the test frame if present
train_df['Segment'] = train_df['Segment'].astype(str)
seg_le = LabelEncoder().fit(train_df['Segment'])
train_y = seg_le.transform(train_df['Segment'])
test_df = test_df.drop(columns=['Segment'], errors='ignore')

# Feature list: every remaining training column except the target
feature_cols = train_df.columns.drop('Segment')

# Model inputs; test_X is selected with the same columns in the same order
train_X = train_df[feature_cols]
test_X  = test_df[feature_cols]

In [4]:
# CatBoost hyperparameters gathered in one mapping: GPU training on
# device '0', multiclass loss as the evaluation metric, a fixed seed,
# and a progress line every 1000 iterations
catboost_params = {
    'task_type': 'GPU',
    'devices': '0',
    'iterations': 50000,
    'learning_rate': 0.1,
    'depth': 6,
    'eval_metric': 'MultiClass',
    'random_seed': 42,
    'verbose': 1000,
}

# Build the (not yet fitted) classifier from those settings
model = CatBoostClassifier(**catboost_params)

In [5]:
# Train the model on the full training set (no validation set is passed)
model.fit(train_X, train_y)

# Predict encoded class ids for the test rows.
# The prediction array is flattened to 1-D explicitly: CatBoost may return
# multiclass predictions as a column vector of shape (n, 1) — see the
# CatBoost predict docs — and LabelEncoder.inverse_transform expects a 1-D
# array. Flattening here avoids relying on sklearn's implicit conversion.
test_preds_num = np.asarray(model.predict(test_X)).ravel()

# Map the encoded ids back to the original Segment labels
test_preds     = seg_le.inverse_transform(test_preds_num)

0:	learn: 1.3787414	total: 93.9ms	remaining: 1h 18m 14s
1000:	learn: 0.5329838	total: 14.9s	remaining: 12m 9s
2000:	learn: 0.5286998	total: 30.1s	remaining: 12m 2s
3000:	learn: 0.5255166	total: 44.8s	remaining: 11m 41s
4000:	learn: 0.5229099	total: 59.1s	remaining: 11m 19s
5000:	learn: 0.5206040	total: 1m 13s	remaining: 11m 3s
6000:	learn: 0.5185180	total: 1m 28s	remaining: 10m 49s
7000:	learn: 0.5166607	total: 1m 43s	remaining: 10m 34s
8000:	learn: 0.5148717	total: 1m 57s	remaining: 10m 17s
9000:	learn: 0.5132230	total: 2m 12s	remaining: 10m 5s
10000:	learn: 0.5117179	total: 2m 27s	remaining: 9m 51s
11000:	learn: 0.5102946	total: 2m 42s	remaining: 9m 37s
12000:	learn: 0.5089281	total: 2m 57s	remaining: 9m 22s
13000:	learn: 0.5076152	total: 3m 12s	remaining: 9m 7s
14000:	learn: 0.5063639	total: 3m 26s	remaining: 8m 51s
15000:	learn: 0.5051364	total: 3m 41s	remaining: 8m 36s
16000:	learn: 0.5039526	total: 3m 56s	remaining: 8m 22s
17000:	learn: 0.5027855	total: 4m 11s	remaining: 8m 7s
18

In [6]:
# Assemble the submission table: one row per test ID with its predicted Segment
submission_columns = {'ID': test_ids, 'Segment': test_preds}
submission = pd.DataFrame(submission_columns)

# Write it out as CSV without the index column
# (utf-8-sig encoding, i.e. UTF-8 with a byte-order mark)
submission.to_csv(
    '마케팅정보_catboost_predictions.csv',
    index=False,
    encoding='utf-8-sig',
)

In [7]:
# Importance score of every feature as reported by the fitted model
feature_scores = model.get_feature_importance()

# Pair each feature name with its score and order from most to least important
importance_df = pd.DataFrame({'feature': feature_cols, 'importance': feature_scores})
importance_df = importance_df.sort_values('importance', ascending=False)

# Save the ranked table as CSV without the index column
importance_df.to_csv('마케팅정보_catboost_feature_importances.csv', index=False, encoding='utf-8-sig')

In [8]:
# Raw importance scores and their share of the total (fractions summing to 1)
imp = model.get_feature_importance()
total_importance = imp.sum()
rel_imp = imp / total_importance

# Table with both the raw and the relative score per feature,
# ranked by relative importance (largest first)
importance_columns = {
    'feature':  feature_cols,
    'absolute': imp,
    'relative': rel_imp,
}
df_imp = pd.DataFrame(importance_columns)
df_imp = df_imp.sort_values('relative', ascending=False)

# Show the ten most important features
top_features = df_imp.head(10)
print(top_features)

              feature   absolute  relative
21   컨택건수_이용유도_EM_R6M  14.648492  0.146485
19  컨택건수_이용유도_LMS_R6M  13.281751  0.132818
25  컨택건수_이용유도_인터넷_R6M  12.555450  0.125555
24  컨택건수_이용유도_청구서_R6M   9.411406  0.094114
11   컨택건수_이용유도_TM_R6M   8.751671  0.087517
5    컨택건수_이용유도_EM_B0M   6.144918  0.061449
15     컨택건수_보험_TM_R6M   6.138005  0.061380
9     컨택건수_카드론_TM_R6M   4.798919  0.047989
7   컨택건수_이용유도_청구서_B0M   3.084167  0.030842
29   캠페인접촉건수_R12M_num   2.925297  0.029253


### 보통 1%(=0.01) 이상이면 “모델에서 어느 정도 의미 있는 피처”로, 5%(=0.05) 이상이면 “꽤 중요한 피처”로 볼 수 있습니다.