<a href="https://colab.research.google.com/github/ljs7463/AnalysisProject/blob/master/%EB%8D%B0%EC%9D%B4%EC%BD%98/analytics/xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os

# 시각화 폰트 설정
if os.name =='posix':
    plt.rc("font", family = "AppleGothic")

else:
    plt.rc("font", family = "Malgun Gothic")

# 경고문자 무시
warnings.filterwarnings(action='ignore')

In [2]:
# 코랩 실행시
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_sub = pd.read_csv('sample_submission.csv')
df_info = pd.read_csv('data_info.csv')

In [3]:
## Label Encoding

## train데이터
df_train['preferred_difficulty_level'] = pd.factorize(df_train['preferred_difficulty_level'])[0]
df_train['subscription_type'] = pd.factorize(df_train['subscription_type'])[0]

## test데이터
df_test['preferred_difficulty_level'] = pd.factorize(df_test['preferred_difficulty_level'])[0]
df_test['subscription_type'] = pd.factorize(df_test['subscription_type'])[0]

In [4]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
for i in [['subscription_duration','recent_login_time','average_time_per_learning_session','monthly_active_learning_days','total_completed_courses','recent_learning_achievement','abandoned_learning_sessions','community_engagement_level','customer_inquiry_history','payment_pattern']]:
  df_train[i] = scaler.fit_transform(df_train[i])
for i in [['subscription_duration','recent_login_time','average_time_per_learning_session','monthly_active_learning_days','total_completed_courses','recent_learning_achievement','abandoned_learning_sessions','community_engagement_level','customer_inquiry_history','payment_pattern']]:
  df_test[i] = scaler.transform(df_test[i])

In [5]:
# Delete user_id
df_train = df_train.drop(columns = 'user_id')

# split target
x = df_train[list(df_train.columns[:-1])]
y = df_train['target']

In [6]:
# Delete user_id
df_test = df_test.drop(columns = 'user_id')

# split target
new_x = df_test


In [7]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import make_scorer, f1_score
from sklearn.datasets import load_iris
import pandas as pd

# 데이터 분할
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y,random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train,random_state=42)
# param_grid = {
#     'max_depth': [3,4,5,6],
#     'learning_rate': [0.1, 0.01, 0.05],
#     'n_estimators': [100,200,300,400,500, 600],
#     'colsample_bytree': [0.3, 0.7, 0.9]
# }

# XGBoost 분류기 초기화
xgb = XGBClassifier()

# parameter grid
xgb_param_grid={
    'objective':['binary:logistic'],
    'n_estimators' : [600,700],
    'learning_rate' : [0.01,0.05,0.1],
    'max_depth' : [3,5,7,10],
    'gamma' : [0,1,2,3],
    'colsample_bytree' : [0.3,0.6,0.9],

}


# GridSearchCV 설정
grid_search = GridSearchCV( xgb ,
                            param_grid =xgb_param_grid,
                            cv=5,
                            scoring = "f1_macro",
                            n_jobs=-1,
                            verbose=1
                            )

# GridSearchCV 실행
grid_search.fit(x_train, y_train,early_stopping_rounds = 100,eval_metric='logloss', eval_set = [(x_val,y_val)])

# 결과 출력
print("최적의 파라미터:", grid_search.best_params_)
print("최고 평균 정확도:", grid_search.best_score_)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits
[0]	validation_0-logloss:0.66375
[1]	validation_0-logloss:0.66128
[2]	validation_0-logloss:0.65999
[3]	validation_0-logloss:0.65817
[4]	validation_0-logloss:0.65859
[5]	validation_0-logloss:0.65847
[6]	validation_0-logloss:0.65856
[7]	validation_0-logloss:0.65890
[8]	validation_0-logloss:0.65889
[9]	validation_0-logloss:0.65870
[10]	validation_0-logloss:0.65773
[11]	validation_0-logloss:0.65615
[12]	validation_0-logloss:0.65613
[13]	validation_0-logloss:0.65615
[14]	validation_0-logloss:0.65674
[15]	validation_0-logloss:0.65685
[16]	validation_0-logloss:0.65644
[17]	validation_0-logloss:0.65583
[18]	validation_0-logloss:0.65588
[19]	validation_0-logloss:0.65585
[20]	validation_0-logloss:0.65575
[21]	validation_0-logloss:0.65503
[22]	validation_0-logloss:0.65513
[23]	validation_0-logloss:0.65450
[24]	validation_0-logloss:0.65403
[25]	validation_0-logloss:0.65386
[26]	validation_0-logloss:0.65372
[27]	validation_0-logloss:0.

In [8]:
grid_search.best_estimator_

In [9]:
# Save the best Model
best_model = grid_search.best_estimator_

# predictions
y_pred = best_model.predict(x_test)

# Estimate Macro F1 Scores
macro_f1 = f1_score(y_test, y_pred, average = 'macro')
print(f"Test SEt Macro F1 Score: {macro_f1}")

Test SEt Macro F1 Score: 0.4281248500905217


In [None]:
predictions = best_model.predict(new_x)

In [None]:
df_sub['target'] = predictions
df_sub.set_index('user_id').to_csv('xgboost.csv',encoding = 'cp949')