In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import joblib
import os

# 1. Load the movie dataset (already preprocessed and normalized, per its file name).
df = pd.read_csv("C:\\Users\\SAMSUNG\\Desktop\\dataScience\\movies_preprocessed_normalized.csv")

# 2. Encode the string-valued columns as integer codes.
categorical_cols = [
    "rating", "runtime_category", "director_top10", "writer_top10",
    "star_top30", "genre_top10", "country_top5", "company_top10"
]

# Keep every fitted encoder, keyed by column name. If the encoders are thrown
# away, the string -> integer mapping is lost: new raw rows cannot be encoded
# the way the training data was, and the codes cannot be mapped back to labels.
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    # Values are cast to str first so the encoder sees one uniform type
    # (a missing value would presumably become the string "nan").
    df[col] = encoder.fit_transform(df[col].astype(str))
    label_encoders[col] = encoder

# Persist the encoders in the same directory the trained model is written to
# later in this script, so the two can be loaded together at prediction time.
os.makedirs("C:\\Users\\SAMSUNG\\Desktop\\dataScience\\model", exist_ok=True)
joblib.dump(
    label_encoders,
    "C:\\Users\\SAMSUNG\\Desktop\\dataScience\\model\\label_encoders.pkl",
)

# 3. Separate the target from the features.
# "is_hit" is the label; drop_cols lists it together with the other columns
# that are kept out of the feature matrix.
y = df["is_hit"]
drop_cols = ["is_hit", "gross", "name", "cluster_label"]
X = df.drop(drop_cols, axis=1)

# 4. Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Fit a gradient-boosting classifier on the training split.
gbc = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
gbc.fit(X_train, y_train)

# 6. 5-fold cross-validation on the training split, scored by accuracy.
cv_scores_gb = cross_val_score(
    estimator=gbc,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=5,
)
print(f"[Gradient Boosting] 5-Fold CV 평균 정확도: {cv_scores_gb.mean():.4f}")

# 7. Evaluate the fitted model on the held-out test split.
y_pred_gb = gbc.predict(X_test)
report_gb = classification_report(y_true=y_test, y_pred=y_pred_gb, digits=4)

# Header line followed by the per-class precision/recall/F1 table.
print("[Gradient Boosting] 테스트셋 평가", report_gb, sep="\n")

# Output locations, built once from a single base directory instead of
# repeating the full path in every call.
base_dir = "C:\\Users\\SAMSUNG\\Desktop\\dataScience"
result_dir = os.path.join(base_dir, "result")
model_dir = os.path.join(base_dir, "model")

# 8. Save the classification report as a text file.
os.makedirs(result_dir, exist_ok=True)
# The encoding is given explicitly so the file does not depend on the
# platform's default locale encoding.
with open(os.path.join(result_dir, "gradient_boosting_report.txt"), "w", encoding="utf-8") as f:
    f.write(report_gb)

# 9. Save the fitted model.
os.makedirs(model_dir, exist_ok=True)
joblib.dump(gbc, os.path.join(model_dir, "gradient_boosting_model.pkl"))
print("모델 저장 완료")

[Gradient Boosting] 5-Fold CV 평균 정확도: 0.7491
[Gradient Boosting] 테스트셋 평가
              precision    recall  f1-score   support

           0     0.7439    0.7930    0.7677       575
           1     0.7495    0.6940    0.7206       513

    accuracy                         0.7463      1088
   macro avg     0.7467    0.7435    0.7442      1088
weighted avg     0.7465    0.7463    0.7455      1088

모델 저장 완료
