<a href="https://colab.research.google.com/github/ljs7463/AnalysisProject/blob/master/%EB%8D%B0%EC%9D%B4%EC%BD%98/analytics/catboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os

# Plot font setup: choose a Korean-capable font according to the host OS.
# os.name is 'posix' on Linux as well as macOS, so it cannot tell the two apart;
# platform.system() can.
import platform

_host_system = platform.system()
if _host_system == 'Darwin':
    # macOS: the font the original posix branch was aiming for.
    plt.rc("font", family = "AppleGothic")
elif _host_system == 'Windows':
    plt.rc("font", family = "Malgun Gothic")
# Any other OS (e.g. Linux): neither font is assumed to be installed, so
# matplotlib's default font is left untouched instead of requesting a missing one.

# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings(action='ignore')

In [None]:
# When running on Colab: the four CSV files are read from the working directory.
df_train, df_test, df_sub, df_info = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv', 'data_info.csv')
)

In [None]:
## Label Encoding
# Turn each categorical column into integer codes. The category -> code mapping
# is learned from the training data and then reused for the test data, so a
# given category always receives the same code in both frames. (Factorizing the
# two frames independently numbers categories by order of first appearance,
# which can differ between train and test.)
for _col in ('preferred_difficulty_level', 'subscription_type'):
    # Train: codes follow order of first appearance, exactly as before.
    _train_codes, _train_categories = pd.factorize(df_train[_col])
    df_train[_col] = _train_codes

    # Test: look up each value in the train categories. Values not seen in
    # train (and missing values) become -1, the same sentinel factorize uses.
    _test_codes = pd.Categorical(df_test[_col], categories=_train_categories).codes
    df_test[_col] = _test_codes.astype(_train_codes.dtype)

In [None]:
from sklearn.preprocessing import StandardScaler

# Numeric feature columns that get standardized.
_scaled_columns = [
    'subscription_duration',
    'recent_login_time',
    'average_time_per_learning_session',
    'monthly_active_learning_days',
    'total_completed_courses',
    'recent_learning_achievement',
    'abandoned_learning_sessions',
    'community_engagement_level',
    'customer_inquiry_history',
    'payment_pattern',
]

scaler = StandardScaler()
# Learn mean / standard deviation from the training data only ...
df_train[_scaled_columns] = scaler.fit_transform(df_train[_scaled_columns])
# ... and apply those same statistics to the test data. Re-fitting on the test
# set would put train and test features on different scales.
df_test[_scaled_columns] = scaler.transform(df_test[_scaled_columns])

In [None]:
# Drop the identifier column so it is not used as a feature.
# errors='ignore' keeps this cell re-runnable once user_id has already been removed.
df_train = df_train.drop(columns = 'user_id', errors = 'ignore')

# Separate features from the label by column name, so the split does not depend
# on 'target' happening to be the last column.
x = df_train.drop(columns = 'target')
y = df_train['target']

In [None]:
# Remove the identifier column from the test frame.
df_test = df_test.drop('user_id', axis = 1)

# No label is split off here: the whole remaining frame is the prediction input.
new_x = df_test


# catboost(kfold5) + optuna

In [None]:
import optuna
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

def objective(trial,x,y):
    """Optuna objective: mean macro-F1 of a CatBoost classifier over 5-fold CV.

    Parameters
    ----------
    trial : object providing suggest_int / suggest_float / suggest_categorical
        Used to sample one hyperparameter configuration.
    x : feature table, row-indexed with ``.iloc``.
    y : labels, row-indexed with ``.iloc`` using the same positions as ``x``.

    Returns
    -------
    The mean of the per-fold macro F1 scores.
    """
    # Hyperparameter ranges explored by Optuna (sampled in this order).
    hyperparams = dict(
        iterations=trial.suggest_int('iterations', 100, 1000),
        depth=trial.suggest_int('depth', 4, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.5),
        random_strength=trial.suggest_int('random_strength', 0, 100),
        bagging_temperature=trial.suggest_float('bagging_temperature', 0.0, 1.0),
        od_type=trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
    )

    # Shuffled 5-fold split with a fixed seed, so every trial sees the same folds.
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)

    def _fold_f1(fit_idx, val_idx):
        """Train on one fold's training rows; return macro-F1 on its held-out rows."""
        x_fit, x_val = x.iloc[fit_idx], x.iloc[val_idx]
        y_fit, y_val = y.iloc[fit_idx], y.iloc[val_idx]

        clf = CatBoostClassifier(verbose=0, **hyperparams)
        # The held-out fold is also passed as the early-stopping evaluation set.
        clf.fit(x_fit, y_fit, eval_set=[(x_val, y_val)], early_stopping_rounds=100)

        return f1_score(y_val, clf.predict(x_val), average='macro')

    return np.mean([_fold_f1(fit_idx, val_idx) for fit_idx, val_idx in splitter.split(x)])

# Create the Optuna study and run the search.
# TPE is Optuna's default single-objective sampler; it is spelled out here only
# to give it a fixed seed, so the sampled hyperparameter sequence is repeatable
# (the CMA-ES study later in this notebook is seeded the same way).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# The lambda binds x and y so that Optuna only has to supply the trial.
study.optimize(lambda trial: objective(trial, x, y), n_trials=100)

# Print the best hyperparameters found.
print('Best trial:', study.best_trial.params)

[I 2023-12-02 14:12:46,952] A new study created in memory with name: no-name-810f1d9a-afd7-4d18-ac35-65a461345b73
[I 2023-12-02 14:12:51,125] Trial 0 finished with value: 0.3855098136502278 and parameters: {'iterations': 481, 'depth': 4, 'learning_rate': 0.09583642733931293, 'random_strength': 82, 'bagging_temperature': 0.8020474058518604, 'od_type': 'IncToDec'}. Best is trial 0 with value: 0.3855098136502278.
[W 2023-12-02 14:12:51,399] Trial 1 failed with parameters: {'iterations': 862, 'depth': 5, 'learning_rate': 0.48394122347341056, 'random_strength': 51, 'bagging_temperature': 0.9764249087738047, 'od_type': 'Iter'} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-23-4c5dc55af614>", line 41, in <lambda>
    study.optimize(lambda trial: objective(trial, x, y), n_trials=100)
  File "<ipyt

KeyboardInterrupt: ignored

In [None]:
# Best hyperparameters found by the Optuna study.
best_params = study.best_params

# CatBoost classifier configured with those hyperparameters (training log silenced).
final_model = CatBoostClassifier(verbose=0, **best_params)

# Fit on the complete training set.
final_model.fit(X=x, y=y)

# Predict labels for the test features.
predictions = final_model.predict(new_x)

# x / y are the full training features and labels used for fitting;
# new_x is the test feature table the final predictions are made for.

In [None]:
# Save the predictions: store them in the submission frame, then write it out
# with user_id as the index column.
df_sub['target'] = predictions
df_sub.set_index(keys='user_id').to_csv(path_or_buf='csv.csv', encoding="cp949")

# catboost(층화추출 10fold) + optuna(TPE(Tree-structured Parzen Estimator) 알고리즘)

In [None]:
#################### 층화추출 ###################


import optuna
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

def objective(trial,x,y):
    """Optuna objective: mean macro-F1 of a CatBoost classifier over stratified 10-fold CV.

    Parameters
    ----------
    trial : object providing suggest_int / suggest_float / suggest_categorical
        Used to sample one hyperparameter configuration.
    x : feature table, row-indexed with ``.iloc``.
    y : labels, row-indexed with ``.iloc``; also used to stratify the folds.

    Returns
    -------
    The mean of the per-fold macro F1 scores.
    """
    # Hyperparameter ranges explored by Optuna (sampled in this order).
    search = {}
    search['iterations'] = trial.suggest_int('iterations', 100, 1000)
    search['depth'] = trial.suggest_int('depth', 4, 10)
    search['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.5)
    search['random_strength'] = trial.suggest_int('random_strength', 0, 100)
    search['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0.0, 1.0)
    search['od_type'] = trial.suggest_categorical('od_type', ['IncToDec', 'Iter'])

    # Stratified, shuffled 10-fold split with a fixed seed.
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    fold_scores = []

    for fit_rows, val_rows in cv.split(x, y):
        x_fit = x.iloc[fit_rows]
        y_fit = y.iloc[fit_rows]
        x_val = x.iloc[val_rows]
        y_val = y.iloc[val_rows]

        # Train one CatBoost model per fold; the held-out fold is also the
        # early-stopping evaluation set.
        clf = CatBoostClassifier(verbose=0, **search)
        clf.fit(x_fit, y_fit, eval_set=[(x_val, y_val)], early_stopping_rounds=100)

        # Score the held-out fold with macro-averaged F1.
        val_pred = clf.predict(x_val)
        fold_scores.append(f1_score(y_val, val_pred, average='macro'))

    # Average over the folds.
    return np.mean(fold_scores)

# Create the Optuna study and run the search.
# TPE is Optuna's default single-objective sampler; it is spelled out here only
# to give it a fixed seed, so the sampled hyperparameter sequence is repeatable
# (the CMA-ES study later in this notebook is seeded the same way).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# The lambda binds x and y so that Optuna only has to supply the trial.
study.optimize(lambda trial: objective(trial, x, y), n_trials=100)

# Print the best hyperparameters found.
print('Best trial:', study.best_trial.params)

[I 2023-12-02 14:14:23,840] A new study created in memory with name: no-name-6c178284-4ce9-467d-bd13-d093661c2bc7
[I 2023-12-02 14:14:31,588] Trial 0 finished with value: 0.4005931304346049 and parameters: {'iterations': 215, 'depth': 8, 'learning_rate': 0.3487332129584561, 'random_strength': 44, 'bagging_temperature': 0.36347184396557497, 'od_type': 'IncToDec'}. Best is trial 0 with value: 0.4005931304346049.
[I 2023-12-02 14:14:39,510] Trial 1 finished with value: 0.4360993733763482 and parameters: {'iterations': 123, 'depth': 7, 'learning_rate': 0.48305654563930106, 'random_strength': 53, 'bagging_temperature': 0.6769223532929799, 'od_type': 'Iter'}. Best is trial 1 with value: 0.4360993733763482.
[I 2023-12-02 14:14:48,385] Trial 2 finished with value: 0.38806786990905134 and parameters: {'iterations': 238, 'depth': 5, 'learning_rate': 0.0890594390678084, 'random_strength': 21, 'bagging_temperature': 0.788444755421977, 'od_type': 'Iter'}. Best is trial 1 with value: 0.4360993733763

Best trial: {'iterations': 175, 'depth': 8, 'learning_rate': 0.4682356736360952, 'random_strength': 52, 'bagging_temperature': 0.44476065767315814, 'od_type': 'Iter'}


In [None]:
# Take the winning hyperparameters from the Optuna study.
best_params = study.best_params

# Initialise the final CatBoost classifier with them (no training log).
final_model = CatBoostClassifier(verbose=0, **best_params)

# Train on all of the training data.
final_model.fit(X=x, y=y)

# Predict labels for the test features.
predictions = final_model.predict(new_x)

# x / y are the full training features and labels used for fitting;
# new_x is the test feature table the final predictions are made for.

In [None]:
# Save the predictions: store them in the submission frame, then write it out
# with user_id as the index column.
df_sub['target'] = predictions
df_sub.set_index(keys='user_id').to_csv(path_or_buf='stratified_optuna(basic).csv', encoding="cp949")

# catboost(층화추출10fold)+ optuna(CMA-ES 샘플러)

In [None]:
# Install the cmaes package ahead of the CMA-ES sampler cell below.
# %pip is the explicit IPython magic: it does not rely on automagic and installs
# into the running kernel's environment. The version is pinned to the one
# recorded in this cell's saved output.
%pip install cmaes==0.10.0

Collecting cmaes
  Downloading cmaes-0.10.0-py3-none-any.whl (29 kB)
Installing collected packages: cmaes
Successfully installed cmaes-0.10.0


In [None]:
#################### 층화추출 ###################


import optuna
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

def objective(trial,x,y):
    """Optuna objective: mean macro-F1 of a CatBoost classifier over stratified 10-fold CV.

    This section's heading, the banner comment of this cell and the output file
    name all describe stratified 10-fold cross-validation, so the splitter
    matches that description (and the objective used for the TPE study) rather
    than a plain 5-fold KFold.

    Parameters
    ----------
    trial : object providing suggest_int / suggest_float / suggest_categorical
        Used to sample one hyperparameter configuration.
    x : feature table, row-indexed with ``.iloc``.
    y : labels, row-indexed with ``.iloc``; also used to stratify the folds.

    Returns
    -------
    The mean of the per-fold macro F1 scores.
    """
    # Function-scope import: this cell's own import line only brings in KFold.
    from sklearn.model_selection import StratifiedKFold

    # Hyperparameter ranges explored by Optuna.
    param = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5),
        'random_strength': trial.suggest_int('random_strength', 0, 100),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter'])
    }

    # Stratified, shuffled 10-fold split with a fixed seed.
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    f1_scores = []

    for train_index, test_index in kf.split(x, y):
        x_train, x_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train one CatBoost model per fold; the held-out fold is also passed
        # as the early-stopping evaluation set.
        model = CatBoostClassifier(**param, verbose=0)
        model.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=100)

        # Predict the held-out fold and score it with macro-averaged F1.
        preds = model.predict(x_test)
        f1 = f1_score(y_test, preds, average='macro')
        f1_scores.append(f1)

    # Average over the folds.
    return np.mean(f1_scores)
# Use the CMA-ES sampler, seeded so the search is repeatable.
# NOTE(review): the search space includes the categorical 'od_type'; confirm how
# CmaEsSampler handles categorical parameters in the installed Optuna version.
cmaes_sampler = optuna.samplers.CmaEsSampler(seed=42)

# Create the study with that sampler and run the search; the lambda binds
# x and y so that Optuna only has to supply the trial.
study = optuna.create_study(direction='maximize', sampler=cmaes_sampler)
study.optimize(func=lambda t: objective(t, x, y), n_trials=100)

# Print the best hyperparameters found.
print('Best trial:', study.best_params)

[I 2023-12-02 14:33:36,786] A new study created in memory with name: no-name-bf97c83c-1f07-4a04-a1e6-61aa6fa49670
[I 2023-12-02 14:33:52,869] Trial 0 finished with value: 0.39800216880261363 and parameters: {'iterations': 437, 'depth': 10, 'learning_rate': 0.3686770314875885, 'random_strength': 60, 'bagging_temperature': 0.15601864044243652, 'od_type': 'IncToDec'}. Best is trial 0 with value: 0.39800216880261363.
[I 2023-12-02 14:34:02,649] Trial 1 finished with value: 0.3967084362960036 and parameters: {'iterations': 334, 'depth': 6, 'learning_rate': 0.19211251088953868, 'random_strength': 52, 'bagging_temperature': 0.4687109142192646, 'od_type': 'IncToDec'}. Best is trial 0 with value: 0.39800216880261363.
[I 2023-12-02 14:34:07,793] Trial 2 finished with value: 0.38748765397549945 and parameters: {'iterations': 349, 'depth': 6, 'learning_rate': 0.3450976760287955, 'random_strength': 55, 'bagging_temperature': 0.489521949523845, 'od_type': 'IncToDec'}. Best is trial 0 with value: 0.3

Best trial: {'iterations': 765, 'depth': 7, 'learning_rate': 0.4215442884400564, 'random_strength': 53, 'bagging_temperature': 0.3908827467071071, 'od_type': 'Iter'}


In [None]:
# Fetch the best hyperparameters from the Optuna study.
best_params = study.best_params

# Set up the final CatBoost classifier with them (silent training).
final_model = CatBoostClassifier(verbose=0, **best_params)

# Train it on the whole training set.
final_model.fit(X=x, y=y)

# Predict labels for the test features.
predictions = final_model.predict(new_x)

# x / y are the full training features and labels used for fitting;
# new_x is the test feature table the final predictions are made for.

In [None]:
# Save the predictions: store them in the submission frame, then write it out
# with user_id as the index column.
df_sub['target'] = predictions
df_sub.set_index(keys='user_id').to_csv(path_or_buf='stratified_optuna(cmaes).csv', encoding="cp949")