In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support, roc_curve, roc_auc_score
import numpy as np

# -----------------------------
# 1) 데이터 준비
# -----------------------------
df = pd.read_csv("/Users/minseo/Documents/Streamlit/실전 프로젝트/반도체_결함_데이터_한글.csv")  # 실제 데이터 불러오기
X = df.drop(columns=['불량여부', '결함유형'])
y = df['불량여부']

X = pd.get_dummies(X, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# -----------------------------
# 2) 모델 학습 (GridSearch)
# -----------------------------
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

rf = RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced')

grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='recall', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)
best_rf = grid_search.best_estimator_

# -----------------------------
# 3) 예측값 & 확률 저장
# -----------------------------
y_pred = best_rf.predict(X_test)
y_pred_proba = best_rf.predict_proba(X_test)[:, 1]

# -----------------------------
# 4) Threshold별 지표 저장
# -----------------------------
thresholds = np.arange(0.0, 1.05, 0.05)
metrics_list = []

for t in thresholds:
    preds_t = (y_pred_proba >= t).astype(int)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, preds_t, average='binary')
    metrics_list.append({
        "Threshold": round(t, 2),
        "Precision": round(precision, 4),
        "Recall": round(recall, 4),
        "F1": round(f1, 4)
    })

metrics_df = pd.DataFrame(metrics_list)

# -----------------------------
# 5) ROC Curve 데이터 저장
# -----------------------------
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_df = pd.DataFrame({
    "FPR": fpr,
    "TPR": tpr
})
roc_auc = roc_auc_score(y_test, y_pred_proba)

# -----------------------------
# 6) Feature Importance 저장
# -----------------------------
feat_importance_df = pd.DataFrame({
    "Feature": X.columns,
    "Importance": best_rf.feature_importances_
}).sort_values(by="Importance", ascending=False)

# -----------------------------
# 7) 테스트셋 예측 결과 저장
# -----------------------------
results_df = X_test.copy()
results_df["Actual"] = y_test.values
results_df["Predicted"] = y_pred
results_df["Predicted_Prob"] = y_pred_proba

# -----------------------------
# 8) CSV 저장 (태블로용)
# -----------------------------
metrics_df.to_csv("threshold_metrics.csv", index=False)
roc_df.to_csv("roc_curve_data.csv", index=False)
feat_importance_df.to_csv("feature_importance.csv", index=False)
results_df.to_csv("predictions.csv", index=False)

print("✅ 태블로용 CSV 파일 저장 완료")
print(f"ROC-AUC Score: {roc_auc:.3f}")

Fitting 3 folds for each of 48 candidates, totalling 144 fits


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
        

TypeError: Labels in y_true and y_pred should be of the same type. Got y_true=['FALSE' 'REAL'] and y_pred=[1]. Make sure that the predictions provided by the classifier coincides with the true labels.