In [1]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and data-conversion
# warnings raised by the libraries used below — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

# Plot styling: apply seaborn's default theme first, then override selected
# matplotlib rcParams (seaborn's theme would otherwise reset them).
import platform

sns.set()

# Choose a Korean-capable font for the current OS instead of toggling a
# commented-out line by hand: AppleGothic on macOS, Malgun Gothic elsewhere
# (the previous hard-coded default).
korean_font = 'AppleGothic' if platform.system() == 'Darwin' else 'Malgun Gothic'
plt.rcParams['font.family'] = korean_font

# Default figure size (inches) and base font size.
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14

# Draw negative tick labels with an ASCII hyphen rather than the Unicode
# minus sign (presumably because the Korean fonts above lack that glyph).
plt.rcParams['axes.unicode_minus'] = False

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 모델의 최적의 하이퍼 파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 머신러닝 알고리즘 - 분류
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 머신러닝 알고리즘 - 회귀
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

# 학습 모델 저장을 위한 라이브러리
import pickle

from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier, Pool

In [2]:
# --- Experiment configuration -------------------------------------------

# Accumulators: one F1 score and one model name per evaluated model
# (add more lists here if other metrics are needed).
f1_score_list, model_name_list = [], []

# Number of cross-validation folds, and the shuffled splitter built from it.
cv_count = 2
kfold = KFold(random_state=1, shuffle=True, n_splits=cv_count)

# File name under which the trained model is to be stored.
best_model_path = '머신러닝/channel_info_classification.dat'

In [3]:
# Load the source table.
df = pd.read_parquet('채널정보_결측치_제거.parquet')

# Rows whose ID starts with 'TEST_' form the prediction set; all other rows
# are used for training.
is_test = df['ID'].str.startswith('TEST_')

# Keep the test IDs so they can be written to the submission file later.
test_ids = df.loc[is_test, 'ID'].reset_index(drop=True)

# Fit the OS-code encoder on the training rows only.
os_le = LabelEncoder().fit(df.loc[~is_test, 'OS구분코드'])

# Split into independent train / test frames.
train_df = df.loc[~is_test].reset_index(drop=True)
test_df  = df.loc[ is_test].reset_index(drop=True)

# Add the integer-encoded OS code column.
train_df['OS구분코드_enc'] = os_le.transform(train_df['OS구분코드'])

# LabelEncoder.transform raises ValueError on labels it was not fitted on.
# Because the encoder only saw training rows, encode the test rows through a
# lookup table instead and send any unseen OS code to -1. Codes for labels
# seen in training are identical to what os_le.transform would produce.
os_code_to_int = {code: idx for idx, code in enumerate(os_le.classes_)}
test_df['OS구분코드_enc'] = (
    test_df['OS구분코드'].map(os_code_to_int).fillna(-1).astype('int64')
)

# Drop columns that are not features (ID, reference month, raw OS code) and
# replace any remaining missing values with -1.
drop_cols = ['ID', '기준년월', 'OS구분코드']
train_df = train_df.drop(columns=drop_cols).fillna(-1)
test_df  = test_df.drop(columns=drop_cols).fillna(-1)

# Encode the training target: Segment is cast to str, then label-encoded.
train_df['Segment'] = train_df['Segment'].astype(str)
seg_le  = LabelEncoder().fit(train_df['Segment'])
train_y = seg_le.transform(train_df['Segment'])

# The test frame must not carry a Segment column (ignored if already absent).
test_df = test_df.drop(columns=['Segment'], errors='ignore')

# Feature list: every training column except the target.
feature_cols = train_df.columns.drop('Segment')

# CatBoost pools; the encoded OS code is declared as a categorical feature.
train_pool = Pool(data=train_df[feature_cols], label=train_y, cat_features=['OS구분코드_enc'])
test_pool  = Pool(data=test_df[feature_cols], cat_features=['OS구분코드_enc'])

In [4]:
# CatBoost classifier configured to train on GPU device 0.
# Hyperparameters are collected in one dict so they can be inspected or
# logged separately from the estimator.
catboost_params = {
    'task_type': 'GPU',
    'devices': '0',
    'iterations': 50000,
    'learning_rate': 0.1,
    'depth': 6,
    'eval_metric': 'MultiClass',
    'random_seed': 42,
    'verbose': 1000,   # print a progress line every 1000 iterations
}
model = CatBoostClassifier(**catboost_params)

In [5]:
# Train on the full training pool (no validation split is held out).
model.fit(train_pool)

# Predict encoded class ids for the test pool.
# CatBoost's multiclass predict can return a column-shaped (n_samples, 1)
# array. Flatten it and cast to integers explicitly rather than relying on
# LabelEncoder.inverse_transform to coerce the shape implicitly (its
# conversion warning is silenced by the global warnings filter).
test_preds_num = np.asarray(model.predict(test_pool)).ravel().astype('int64')

# Map the encoded ids back to the original Segment labels.
test_preds = seg_le.inverse_transform(test_preds_num)

0:	learn: 1.3779013	total: 31.2ms	remaining: 25m 58s
1000:	learn: 0.5388706	total: 22.3s	remaining: 18m 13s
2000:	learn: 0.5345767	total: 44.4s	remaining: 17m 44s
3000:	learn: 0.5312022	total: 1m 7s	remaining: 17m 42s
4000:	learn: 0.5283236	total: 1m 30s	remaining: 17m 19s
5000:	learn: 0.5257924	total: 1m 52s	remaining: 16m 54s
6000:	learn: 0.5235363	total: 2m 15s	remaining: 16m 31s
7000:	learn: 0.5213453	total: 2m 37s	remaining: 16m 9s
8000:	learn: 0.5193509	total: 3m	remaining: 15m 47s
9000:	learn: 0.5174795	total: 3m 23s	remaining: 15m 25s
10000:	learn: 0.5156623	total: 3m 45s	remaining: 15m 2s
11000:	learn: 0.5139556	total: 4m 8s	remaining: 14m 41s
12000:	learn: 0.5123312	total: 4m 33s	remaining: 14m 26s
13000:	learn: 0.5107490	total: 4m 58s	remaining: 14m 10s
14000:	learn: 0.5092328	total: 5m 22s	remaining: 13m 48s
15000:	learn: 0.5077607	total: 5m 45s	remaining: 13m 26s
16000:	learn: 0.5063426	total: 6m 8s	remaining: 13m 2s
17000:	learn: 0.5049954	total: 6m 32s	remaining: 12m 41s

In [6]:
# Build the submission table (test ID + predicted Segment) and write it as
# CSV; 'utf-8-sig' adds a BOM to the file.
submission = test_ids.to_frame(name='ID').assign(Segment=test_preds)
submission.to_csv(
    '채널정보_catboost_predictions.csv',
    encoding='utf-8-sig',
    index=False,
)

In [7]:
# Save the raw (absolute) feature importances, most important first.
importance_values = model.get_feature_importance()
importance_df = pd.DataFrame({'feature': feature_cols, 'importance': importance_values})
importance_df = importance_df.sort_values(by='importance', ascending=False)

importance_df.to_csv(
    '채널정보_catboost_feature_importances.csv',
    encoding='utf-8-sig',
    index=False,
)

In [8]:
# Relative feature importance: each feature's share of the total importance.
imp = model.get_feature_importance()

# Normalise by the sum; the result is a fraction between 0 and 1
# (0.05 means 5%), not a value on a 0-100 scale.
rel_imp = imp / imp.sum()

# Assemble absolute and relative values side by side, sorted by share.
df_imp = pd.DataFrame({'feature': feature_cols, 'absolute': imp, 'relative': rel_imp})
df_imp = df_imp.sort_values(by='relative', ascending=False)

# Show the ten features with the largest share.
print(df_imp.head(10))

           feature   absolute  relative
53      OS구분코드_enc  10.744826  0.107448
40   불만제기후경과월_R12M   6.764022  0.067640
8       방문일수_앱_R6M   6.558550  0.065585
44   홈페이지_금융건수_R6M   5.239893  0.052399
0     인입일수_ARS_R6M   4.700525  0.047005
45  홈페이지_선결제건수_R6M   4.590663  0.045907
29        상담건수_R6M   3.902486  0.039025
43  당사멤버쉽_방문월수_R6M   3.774053  0.037741
42  당사멤버쉽_방문횟수_R6M   3.679782  0.036798
6      방문월수_PC_R6M   3.494723  0.034947


### 보통 상대 중요도가 1%(=0.01) 이상이면 “모델에서 어느 정도 의미 있는 피처”로, 5%(=0.05) 이상이면 “꽤 중요한 피처”로 볼 수 있다.