In [1]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Silence every warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# Apply seaborn's default theme first; the matplotlib overrides below are
# set afterwards so they are not reset by it.
sns.set()

# Global matplotlib defaults: Korean-capable font, figure size, base font
# size, and an ASCII minus sign on tick labels.
# On macOS use 'AppleGothic' for 'font.family' instead of 'Malgun Gothic'.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'figure.figsize': (12, 6),
    'font.size': 14,
    'axes.unicode_minus': False,
})

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 모델의 최적의 하이퍼 파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 머신러닝 알고리즘 - 분류
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 머신러닝 알고리즘 - 회귀
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

# 학습 모델 저장을 위한 라이브러리
import pickle

from sklearn.metrics import classification_report, accuracy_score

In [2]:
# File the fitted model is meant to be saved to.
best_model_path = '머신러닝/channel_info_classification.dat'

# Number of cross-validation folds, and the shuffled K-fold splitter built
# from it (fixed seed so the folds are the same on every run).
cv_count = 2
kfold = KFold(random_state=1, shuffle=True, n_splits=cv_count)

# Accumulators for evaluation results and the matching model names;
# add more lists here if other metrics are needed.
f1_score_list, model_name_list = [], []

In [3]:
# Load the preprocessed table; train and test rows live in the same file.
df = pd.read_parquet('마케팅정보_전처리.parquet')

# Rows whose ID starts with 'TEST_' form the test split, the rest is train.
is_test = df['ID'].str.startswith('TEST_')
train_df = df.loc[~is_test].reset_index(drop=True)
test_df = df.loc[is_test].reset_index(drop=True)

# Keep the test IDs before the ID column is dropped below.
test_ids = test_df['ID'].copy()

# Drop columns that are not used further and fill missing values with -1.
drop_cols = ['ID', '기준년월']
train_df = train_df.drop(columns=drop_cols).fillna(-1)
test_df = test_df.drop(columns=drop_cols).fillna(-1)

# Cast the target to str so all labels share one type, then encode it as
# integer class ids.
train_df['Segment'] = train_df['Segment'].astype(str)
seg_le = LabelEncoder()
train_y = seg_le.fit_transform(train_df['Segment'])

# Remove the Segment column from the test frame if it is present.
test_df = test_df.drop(columns=['Segment'], errors='ignore')

# Every remaining train column except the target is a model feature.
feature_cols = train_df.columns.drop('Segment')

# Standardize the features: statistics are fitted on train only and then
# applied unchanged to test.
scaler = StandardScaler()
train_X = scaler.fit_transform(train_df[feature_cols])
test_X = scaler.transform(test_df[feature_cols])

### XGBoost

In [4]:
# Baseline XGBoost classifier with default hyper-parameters.
# - `verbosity=0` is XGBoost's own switch for silent training. The previous
#   `verbose=-1` (a LightGBM argument) and `silent=True` (removed legacy flag)
#   are not XGBClassifier parameters and only produced an "unused parameters"
#   warning.
# - `random_state` pins the seed so reruns are reproducible.
# NOTE(review): XGBoost >= 2.0 deprecates tree_method='gpu_hist' in favour of
# tree_method='hist' together with device='cuda'; switch once the installed
# version is confirmed.
xgboost_basic_model = XGBClassifier(
    verbosity=0,
    tree_method='gpu_hist',
    random_state=1,
)

# Fit on the standardized training features and the encoded Segment labels.
xgboost_basic_model.fit(train_X, train_y)

In [5]:
# Predict on the training data itself and report accuracy plus the per-class
# precision / recall / F1 (these are training metrics, not validation ones).
y_train_pred = xgboost_basic_model.predict(train_X)
print("Train Accuracy:", accuracy_score(train_y, y_train_pred))
print(
    classification_report(
        train_y,
        y_train_pred,
        target_names=seg_le.classes_,
    )
)

# Predict encoded class ids for the test rows, then map them back to the
# original Segment labels.
test_preds_num = xgboost_basic_model.predict(test_X)
test_preds = seg_le.inverse_transform(test_preds_num)

# Pair each test ID with its predicted Segment.
submission = pd.DataFrame({'ID': test_ids}).assign(Segment=test_preds)

Train Accuracy: 0.80499125
              precision    recall  f1-score   support

           A       0.92      0.04      0.07       972
           B       1.00      0.17      0.29       144
           C       0.48      0.04      0.07    127590
           D       0.48      0.06      0.11    349242
           E       0.81      0.99      0.89   1922052

    accuracy                           0.80   2400000
   macro avg       0.74      0.26      0.29   2400000
weighted avg       0.75      0.80      0.74   2400000



In [6]:
# Write the predictions to CSV without the index column; 'utf-8-sig' adds a
# BOM to the file.
submission.to_csv(
    '마케팅정보_xgboost_predictions.csv',
    encoding='utf-8-sig',
    index=False,
)

In [7]:
# Pair each feature name with the fitted model's importance score and order
# the table from most to least important.
importance_df = (
    pd.DataFrame({
        'feature': feature_cols,
        'importance': xgboost_basic_model.feature_importances_,
    })
    .sort_values(by='importance', ascending=False)
)

# Save the ranked importances to CSV (no index column, UTF-8 with BOM).
importance_df.to_csv(
    '마케팅정보_XG_feature_importances.csv',
    encoding='utf-8-sig',
    index=False,
)

In [8]:
# Importance scores as exposed by the fitted scikit-learn wrapper.
# NOTE(review): these already appear to be normalized to sum to 1 (the printed
# table shows identical 'absolute' and 'relative' values), so 'absolute' is
# not a raw gain/weight score — confirm against the XGBoost docs.
imp = xgboost_basic_model.feature_importances_

# Share of the total importance per feature, as a fraction in [0, 1]
# (not multiplied by 100). If every importance is 0 the division would be
# 0/0 = NaN, so fall back to all zeros in that case.
imp_total = imp.sum()
rel_imp = imp / imp_total if imp_total > 0 else np.zeros_like(imp)

# One row per feature, most important first.
df_imp = pd.DataFrame({
    'feature': feature_cols,
    'absolute': imp,
    'relative': rel_imp
}).sort_values('relative', ascending=False)

# Show the ten most important features.
print(df_imp.head(10))

              feature  absolute  relative
11   컨택건수_이용유도_TM_R6M  0.163949  0.163949
7   컨택건수_이용유도_청구서_B0M  0.145161  0.145161
12   컨택건수_신용발급_TM_R6M  0.136814  0.136814
5    컨택건수_이용유도_EM_B0M  0.136090  0.136090
21   컨택건수_이용유도_EM_R6M  0.048114  0.048114
25  컨택건수_이용유도_인터넷_R6M  0.028737  0.028737
24  컨택건수_이용유도_청구서_R6M  0.028702  0.028702
15     컨택건수_보험_TM_R6M  0.028413  0.028413
14  컨택건수_포인트소진_TM_R6M  0.025042  0.025042
18   컨택건수_리볼빙_LMS_R6M  0.022482  0.022482


### 보통 1%(=0.01) 이상이면 “모델에서 어느 정도 의미 있는 피처”로, 5%(=0.05) 이상이면 “꽤 중요한 피처”로 해석한다.