# 모델링

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import optuna

In [None]:
# Print a summary of `df` via its info() method.
# NOTE(review): `df` is not defined in this section; presumably a pandas
# DataFrame loaded in an earlier cell -- confirm.
df.info()

In [None]:
# Target: the "판매량등급" column of df.
y_df = df["판매량등급"]

# Features: every column except the target and "총구매수".
# NOTE(review): "총구매수" is presumably excluded because the target grade is
# derived from it -- confirm.
x_df = df.drop(columns=["총구매수", "판매량등급"])

# Last expression of the cell: display the remaining feature columns.
x_df.columns

In [None]:
# Hold out 20% of the rows for validation, stratified on the target and
# seeded so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    x_df,
    y_df,
    test_size=0.2,
    stratify=y_df,
    random_state=1000,
)

In [None]:
# Shuffled 5-fold splitter with a fixed seed, passed as `cv` to the
# cross-validation calls in this notebook.
kfold = KFold(n_splits=5, shuffle=True, random_state=1000)

### RandomForest - 시도 1

In [None]:
# Baseline: random forest with default hyperparameters (fixed seed),
# scored by accuracy on the training split with the shared K-fold splitter.
rf = RandomForestClassifier(random_state=1000, n_jobs=-1)
scores = cross_val_score(
    rf,
    X_train,
    y_train,
    scoring="accuracy",
    cv=kfold,
)
# Last expression of the cell: display the per-fold accuracies.
scores

In [None]:
def rf_objective(trial):
    """Optuna objective: mean cross-validated accuracy of a random forest.

    Samples ``n_estimators``, ``criterion``, ``max_depth`` and
    ``min_samples_split`` from ``trial``, builds a seeded
    ``RandomForestClassifier`` with them and scores it on the module-level
    ``X_train`` / ``y_train`` using the module-level ``kfold`` splitter.

    Args:
        trial: Object providing ``suggest_int`` and ``suggest_categorical``
            (an Optuna trial when driven by ``study.optimize``).

    Returns:
        The mean of the per-fold accuracy scores.
    """
    # The suggest calls stay in the original order so the sampler sees the
    # parameters in the same sequence.
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    criterion = trial.suggest_categorical("criterion", ['gini', 'entropy', 'log_loss'])
    max_depth = trial.suggest_int("max_depth", 2, 50)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)

    model = RandomForestClassifier(
        n_jobs=-1,
        random_state=1000,
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
    )
    fold_accuracies = cross_val_score(
        model, X_train, y_train, cv=kfold, scoring="accuracy"
    )
    return fold_accuracies.mean()

In [None]:
# Tune the random forest with Optuna, maximizing the mean CV accuracy
# returned by rf_objective.
#
# The sampler is seeded and trials run sequentially (n_jobs=1) so that a
# rerun reproduces the same search. Each trial's forest is already built with
# n_jobs=-1 inside rf_objective, so running trials in parallel threads as well
# would only oversubscribe the CPU and make the trial order non-deterministic.
rf_study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=1000),
)
rf_study.optimize(rf_objective, n_trials=30, n_jobs=1)

# `best_trial.values` is the sequence of objective values of the best trial
# (a single entry here, since the study has one objective).
rf1_values = rf_study.best_trial.values
rf1_params = rf_study.best_params
print(rf1_values, rf1_params)

In [None]:
# Refit a forest with the best hyperparameters found by the study on the whole
# training split, then score it on the held-out validation split.
#
# random_state is set to the same seed used inside rf_objective so that the
# refit model is reproducible and is the same configuration that produced the
# tuned cross-validation score.
best_rf1 = RandomForestClassifier(n_jobs=-1, random_state=1000, **rf1_params)
rf_kf_optuna_best = best_rf1.fit(X_train, y_train)
rf_kf_optuna_ypred = rf_kf_optuna_best.predict(X_val)
rf_kf_optuna_acc1 = accuracy_score(y_val, rf_kf_optuna_ypred)

# Best CV objective value(s) next to the validation accuracy.
print(rf1_values, rf_kf_optuna_acc1)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Rank and plot the feature importances of the fitted best_rf1 model.
# NOTE(review): assumes x_df still holds the same columns, in the same order,
# that best_rf1 was fitted on.
feature_names = x_df.columns

importances = best_rf1.feature_importances_
indices = np.argsort(importances)[::-1]  # feature positions, most important first
n_features = len(importances)

# Print the features from most to least important.
# The feature position is reported 1-based ("N번째 피처").
print("Feature ranking:")
for rank, idx in enumerate(indices, start=1):
    print(f"{rank}. {idx + 1}번째 피처 {feature_names[idx]} ({importances[idx]})")

# Bar chart of the importances in the same order as the printed ranking.
plt.figure()
plt.title("Feature importances")
plt.bar(range(n_features), importances[indices], color="r", align="center")
# Tick labels use the same 1-based feature position as the printed ranking so
# the chart and the text can be cross-referenced directly.
plt.xticks(range(n_features), indices + 1)
plt.xlim([-1, n_features])
plt.xlabel("Feature index")
plt.ylabel("Feature importance")
plt.show()

### RandomForest - 시도 2

In [None]:
# Target: the "판매량등급" column of df.
y_df = df["판매량등급"]

# Features for attempt 2: on top of the target and "총구매수", also drop the
# 최대_/최소_/중앙값_ columns for 리뷰수, 총조회수 and 총구매수.
x_df = df.drop(
    columns=[
        '최대_리뷰수',
        '최소_리뷰수',
        '중앙값_리뷰수',
        '최대_총조회수',
        '최소_총조회수',
        '중앙값_총조회수',
        '최대_총구매수',
        '최소_총구매수',
        '중앙값_총구매수',
        "총구매수",
        "판매량등급",
    ]
)

# Last expression of the cell: display the remaining feature columns.
x_df.columns

In [None]:
# Redo the stratified 80/20 split on the reduced feature set, with the same
# seed as before.
X_train, X_val, y_train, y_val = train_test_split(
    x_df,
    y_df,
    test_size=0.2,
    stratify=y_df,
    random_state=1000,
)

In [None]:
def rf_objective(trial):
    """Optuna objective: mean cross-validated accuracy of a random forest.

    Samples ``n_estimators``, ``criterion``, ``max_depth`` and
    ``min_samples_split`` from ``trial``, builds a seeded
    ``RandomForestClassifier`` with them and scores it on the module-level
    ``X_train`` / ``y_train`` using the module-level ``kfold`` splitter.

    Args:
        trial: Object providing ``suggest_int`` and ``suggest_categorical``
            (an Optuna trial when driven by ``study.optimize``).

    Returns:
        The mean of the per-fold accuracy scores.
    """
    # The suggest calls stay in the original order so the sampler sees the
    # parameters in the same sequence.
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    criterion = trial.suggest_categorical("criterion", ['gini', 'entropy', 'log_loss'])
    max_depth = trial.suggest_int("max_depth", 2, 50)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)

    model = RandomForestClassifier(
        n_jobs=-1,
        random_state=1000,
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
    )
    fold_accuracies = cross_val_score(
        model, X_train, y_train, cv=kfold, scoring="accuracy"
    )
    return fold_accuracies.mean()

In [None]:
# Tune the random forest with Optuna, maximizing the mean CV accuracy
# returned by rf_objective.
#
# The sampler is seeded and trials run sequentially (n_jobs=1) so that a
# rerun reproduces the same search. Each trial's forest is already built with
# n_jobs=-1 inside rf_objective, so running trials in parallel threads as well
# would only oversubscribe the CPU and make the trial order non-deterministic.
rf_study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=1000),
)
rf_study.optimize(rf_objective, n_trials=30, n_jobs=1)

# `best_trial.values` is the sequence of objective values of the best trial
# (a single entry here, since the study has one objective).
rf2_values = rf_study.best_trial.values
rf2_params = rf_study.best_params
print(rf2_values, rf2_params)

In [None]:
# Refit a forest with the best hyperparameters found by the study on the whole
# training split, then score it on the held-out validation split.
#
# random_state is set to the same seed used inside rf_objective so that the
# refit model is reproducible and is the same configuration that produced the
# tuned cross-validation score.
best_rf2 = RandomForestClassifier(n_jobs=-1, random_state=1000, **rf2_params)
rf_kf_optuna_best = best_rf2.fit(X_train, y_train)
rf_kf_optuna_ypred = rf_kf_optuna_best.predict(X_val)
rf_kf_optuna_acc2 = accuracy_score(y_val, rf_kf_optuna_ypred)

# Best CV objective value(s) next to the validation accuracy.
print(rf2_values, rf_kf_optuna_acc2)

In [None]:
# Rank and plot the feature importances of the fitted best_rf2 model.
# NOTE(review): assumes x_df still holds the same columns, in the same order,
# that best_rf2 was fitted on.
feature_names = x_df.columns

importances = best_rf2.feature_importances_
indices = np.argsort(importances)[::-1]  # feature positions, most important first
n_features = len(importances)

# Print the features from most to least important.
# The feature position is reported 1-based ("N번째 피처").
print("Feature ranking:")
for rank, idx in enumerate(indices, start=1):
    print(f"{rank}. {idx + 1}번째 피처 {feature_names[idx]} ({importances[idx]})")

# Bar chart of the importances in the same order as the printed ranking.
plt.figure()
plt.title("Feature importances")
plt.bar(range(n_features), importances[indices], color="r", align="center")
# Tick labels use the same 1-based feature position as the printed ranking so
# the chart and the text can be cross-referenced directly.
plt.xticks(range(n_features), indices + 1)
plt.xlim([-1, n_features])
plt.xlabel("Feature index")
plt.ylabel("Feature importance")
plt.show()

### RandomForest - 시도 3

In [None]:
# Target: the "판매량등급" column of df.
y_df = df["판매량등급"]

# Features for attempt 3: drop only the 최소_ columns (리뷰수, 총조회수,
# 총구매수) together with "총구매수" and the target.
x_df = df.drop(
    columns=[
        '최소_리뷰수',
        '최소_총조회수',
        '최소_총구매수',
        "총구매수",
        "판매량등급",
    ]
)

# Last expression of the cell: display the remaining feature columns.
x_df.columns

In [None]:
# Redo the stratified 80/20 split on this attempt's feature set, with the
# same seed as before.
X_train, X_val, y_train, y_val = train_test_split(
    x_df,
    y_df,
    test_size=0.2,
    stratify=y_df,
    random_state=1000,
)

In [None]:
def rf_objective(trial):
    """Optuna objective: mean cross-validated accuracy of a random forest.

    Samples ``n_estimators``, ``criterion``, ``max_depth`` and
    ``min_samples_split`` from ``trial``, builds a seeded
    ``RandomForestClassifier`` with them and scores it on the module-level
    ``X_train`` / ``y_train`` using the module-level ``kfold`` splitter.

    Args:
        trial: Object providing ``suggest_int`` and ``suggest_categorical``
            (an Optuna trial when driven by ``study.optimize``).

    Returns:
        The mean of the per-fold accuracy scores.
    """
    # The suggest calls stay in the original order so the sampler sees the
    # parameters in the same sequence.
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    criterion = trial.suggest_categorical("criterion", ['gini', 'entropy', 'log_loss'])
    max_depth = trial.suggest_int("max_depth", 2, 50)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)

    model = RandomForestClassifier(
        n_jobs=-1,
        random_state=1000,
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
    )
    fold_accuracies = cross_val_score(
        model, X_train, y_train, cv=kfold, scoring="accuracy"
    )
    return fold_accuracies.mean()

In [None]:
# Tune the random forest with Optuna, maximizing the mean CV accuracy
# returned by rf_objective.
#
# The sampler is seeded and trials run sequentially (n_jobs=1) so that a
# rerun reproduces the same search. Each trial's forest is already built with
# n_jobs=-1 inside rf_objective, so running trials in parallel threads as well
# would only oversubscribe the CPU and make the trial order non-deterministic.
rf_study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=1000),
)
rf_study.optimize(rf_objective, n_trials=30, n_jobs=1)

# `best_trial.values` is the sequence of objective values of the best trial
# (a single entry here, since the study has one objective).
rf3_values = rf_study.best_trial.values
rf3_params = rf_study.best_params
print(rf3_values, rf3_params)

In [None]:
# Refit a forest with the best hyperparameters found by the study on the whole
# training split, then score it on the held-out validation split.
#
# random_state is set to the same seed used inside rf_objective so that the
# refit model is reproducible and is the same configuration that produced the
# tuned cross-validation score.
best_rf3 = RandomForestClassifier(n_jobs=-1, random_state=1000, **rf3_params)
rf_kf_optuna_best = best_rf3.fit(X_train, y_train)
rf_kf_optuna_ypred = rf_kf_optuna_best.predict(X_val)
rf_kf_optuna_acc3 = accuracy_score(y_val, rf_kf_optuna_ypred)

# Best CV objective value(s) next to the validation accuracy.
print(rf3_values, rf_kf_optuna_acc3)

In [None]:
# Rank and plot the feature importances of the fitted best_rf3 model.
# NOTE(review): assumes x_df still holds the same columns, in the same order,
# that best_rf3 was fitted on.
feature_names = x_df.columns

importances = best_rf3.feature_importances_
indices = np.argsort(importances)[::-1]  # feature positions, most important first
n_features = len(importances)

# Print the features from most to least important.
# The feature position is reported 1-based ("N번째 피처").
print("Feature ranking:")
for rank, idx in enumerate(indices, start=1):
    print(f"{rank}. {idx + 1}번째 피처 {feature_names[idx]} ({importances[idx]})")

# Bar chart of the importances in the same order as the printed ranking.
plt.figure()
plt.title("Feature importances")
plt.bar(range(n_features), importances[indices], color="r", align="center")
# Tick labels use the same 1-based feature position as the printed ranking so
# the chart and the text can be cross-referenced directly.
plt.xticks(range(n_features), indices + 1)
plt.xlim([-1, n_features])
plt.xlabel("Feature index")
plt.ylabel("Feature importance")
plt.show()