In [None]:
# 여러가지 시도(DI는 버리자)

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import optuna

import os

# 작업 디렉토리를 변경
os.chdir('/Users/yudayeon/Desktop/LGAimers')

# 변경된 디렉토리 확인
print("🔥 변경된 작업 디렉토리:", os.getcwd())

# 이제 상대 경로 사용 가능
X_train_encoded = pd.read_csv('data/train_encoded.csv')
X_test_encoded = pd.read_csv('data/test_encoded.csv')

# 시술 유형 컬럼명 설정 (실제 컬럼명이 다를 수도 있으니 확인 필요!)
procedure_column = "시술 유형"

# 시술 유형에 포함된 고유 값 출력
print("🔥 시술 유형별 고유 값 🔥")
print(X_train_encoded[procedure_column].unique())

# 시술 유형별 개수 확인
print("\n🔥 시술 유형별 개수 🔥")
print(X_train_encoded[procedure_column].value_counts())

🔥 변경된 작업 디렉토리: /Users/yudayeon/Desktop/LGAimers
🔥 시술 유형별 고유 값 🔥
[1. 0.]

🔥 시술 유형별 개수 🔥
시술 유형
1.0    250060
0.0      6291
Name: count, dtype: int64


In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier

# ✅ 타겟 변수와 특성 분리
y = X_train_encoded['임신 성공 여부']
X = X_train_encoded.drop('임신 성공 여부', axis=1)

# ✅ 데이터 분할 (훈련 / 검증)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# ✅ 최적 하이퍼파라미터 적용한 모델 정의
rf_model = RandomForestClassifier(
    n_estimators=215,
    max_depth=7,
    min_samples_split=2,
    min_samples_leaf=8,
    max_features=None,
    random_state=42
)

xgb_model = xgb.XGBClassifier(
    n_estimators=359,
    max_depth=5,
    learning_rate=0.045093620098282834,
    subsample=0.6669970717173888,
    colsample_bytree=0.934119080218835,
    random_state=42
)

lgb_model = lgb.LGBMClassifier(
    n_estimators=492,
    num_leaves=88,
    learning_rate=0.014666949438705097,
    subsample=0.8766412273868395,
    colsample_bytree=0.5848756135604947,
    random_state=42
)

# ✅ 모델 학습
rf_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
lgb_model.fit(X_train, y_train)

# ✅ 각 모델의 예측 확률 계산
rf_preds = rf_model.predict_proba(X_val)[:, 1]
xgb_preds = xgb_model.predict_proba(X_val)[:, 1]
lgb_preds = lgb_model.predict_proba(X_val)[:, 1]

# ✅ 가중 평균 앙상블 (가중치 설정)
rf_weight = 0.4
xgb_weight = 0.3
lgb_weight = 0.3

final_preds = (rf_weight * rf_preds) + (xgb_weight * xgb_preds) + (lgb_weight * lgb_preds)
final_preds_binary = (final_preds >= 0.5).astype(int)  # 0.5 이상이면 1, 아니면 0


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031973 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 720
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 62
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
🔥 Confusion Matrix 🔥
[[36974  1051]
 [12000  1246]]

🔥 Classification Report 🔥
              precision    recall  f1-score   support

           0       0.75      0.97      0.85     38025
           1       0.54      0.09      0.16     13246

    accuracy                           0.75     51271
   macro avg       0.65      0.53      0.51     51271
weighted avg       0.70      0.75      0.67     51271



In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# ✅ 기존 인코딩된 데이터 불러오기
X_train_encoded = pd.read_csv('data/train_encoded.csv')
X_test_encoded = pd.read_csv('data/test_encoded.csv')

# ✅ Target 변수 분리
y_train = X_train_encoded["임신 성공 여부"]  # 'target' 컬럼을 정답 레이블로 설정
X_train_encoded = X_train_encoded.drop(columns=["임신 성공 여부"])  # Feature만 남김

# ✅ 전체 데이터에서 8:2로 Train-Validation 분할 (Validation은 원본에서 20% 유지)
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train_encoded, y_train, test_size=0.2, random_state=42
)

# ✅ IVF(1) 데이터만 선택하여 학습 데이터로 사용
ivf_mask = (X_train_split["시술 유형"] == 1)
X_train_ivf = X_train_split[ivf_mask]
y_train_ivf = y_train_split[ivf_mask]

print(f"Train 데이터 수 (IVF=1): {X_train_ivf.shape[0]}")
print(f"Validation 데이터 수: {X_val.shape[0]}")

# ✅ RandomForest 모델 학습 (IVF(1) 데이터로만 학습)
rf_model = RandomForestClassifier(
    n_estimators=300, 
    max_depth=15, 
    min_samples_split=5,   # 클래스 불균형 해결
    random_state=42
)
rf_model.fit(X_train_ivf, y_train_ivf)


# ✅ 검증 데이터에 대한 성능 평가
print("\n🔥 Validation Set Performance 🔥")

y_val_pred_rf = rf_model.predict(X_val)

# ✅ 시술 유형이 DI(0)인 경우 강제로 0으로 설정
y_val_pred_rf[X_val["시술 유형"] == 0] = 0

# ✅ Confusion Matrix 출력
print("\n🔥 Confusion Matrix 🔥")
cm_rf = confusion_matrix(y_val, y_val_pred_rf)
print(cm_rf)

# ✅ Classification Report (소수점 4자리까지 출력)
print("\n🔥 Classification Report 🔥")
print(classification_report(y_val, y_val_pred_rf, digits=4))

Train 데이터 수 (IVF=1): 200045
Validation 데이터 수: 51271

🔥 Validation Set Performance 🔥

🔥 Confusion Matrix 🔥
[[37581   564]
 [12420   706]]

🔥 Classification Report 🔥
              precision    recall  f1-score   support

           0     0.7516    0.9852    0.8527     38145
           1     0.5559    0.0538    0.0981     13126

    accuracy                         0.7468     51271
   macro avg     0.6538    0.5195    0.4754     51271
weighted avg     0.7015    0.7468    0.6595     51271



In [29]:
rf_preds_test = rf_model.predict_proba(X_test_encoded)[:, 1]

# ✅ DI(0)인 경우 강제로 0으로 예측
final_preds_test_binary = (rf_preds_test >= 0.5).astype(int)
final_preds_test_binary[X_test_encoded["시술 유형"] == 0] = 0

# ✅ 제출 파일 로드
submission_path = '/Users/yudayeon/Desktop/LGAimers/submission/sample_submission.csv'
sample_submission = pd.read_csv(submission_path)

# ✅ 예측값 저장
sample_submission['probability'] = rf_preds_test

# ✅ CSV 파일로 저장
output_path = '/Users/yudayeon/Desktop/LGAimers/submission/baseline_submit333.csv'
sample_submission.to_csv(output_path, index=False)

print(f"\n✅ 예측 결과가 '{output_path}'에 저장되었습니다.")


✅ 예측 결과가 '/Users/yudayeon/Desktop/LGAimers/submission/baseline_submit333.csv'에 저장되었습니다.


In [35]:
ㅃ1 print(classification_report(y_val, y_val_pred_rf, digits=4))

Train 데이터 수 (IVF=1): 200045
Validation 데이터 수: 51271

🔥 Validation Set Performance 🔥

🔥 Confusion Matrix 🔥
[[36603  1542]
 [11563  1563]]

🔥 Classification Report 🔥
              precision    recall  f1-score   support

           0     0.7599    0.9596    0.8482     38145
           1     0.5034    0.1191    0.1926     13126

    accuracy                         0.7444     51271
   macro avg     0.6317    0.5393    0.5204     51271
weighted avg     0.6943    0.7444    0.6803     51271



In [45]:
####################################################################아몰랑
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# ✅ 기존 인코딩된 데이터 불러오기
X_train_encoded = pd.read_csv('data/train_encoded.csv')
X_test_encoded = pd.read_csv('data/test_encoded.csv')

# ✅ Target 변수 분리
y_train = X_train_encoded["임신 성공 여부"]  # 'target' 컬럼을 정답 레이블로 설정
X_train_encoded = X_train_encoded.drop(columns=["임신 성공 여부"])  # Feature만 남김

# ✅ 전체 데이터에서 8:2로 Train-Validation 분할 (Validation은 원본에서 20% 유지)
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train_encoded, y_train, test_size=0.2, random_state=42
)

# ✅ IVF(1) 데이터만 선택하여 학습 데이터로 사용
ivf_mask = (X_train_split["시술 유형"] == 1)
X_train_ivf = X_train_split[ivf_mask]
y_train_ivf = y_train_split[ivf_mask]

print(f"Train 데이터 수 (IVF=1): {X_train_ivf.shape[0]}")
print(f"Validation 데이터 수: {X_val.shape[0]}")

# ✅ RandomForest 모델 학습 (IVF(1) 데이터로만 학습)
rf_model = RandomForestClassifier(
    n_estimators=500,  # 더 많은 트리 사용
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=5,  # 리프 노드 최소 샘플 수 지정하여 과적합 방지
    class_weight='balanced',  # 클래스 불균형 보정
    random_state=42
)
rf_model.fit(X_train_ivf, y_train_ivf)

# ✅ XGBoost 모델 학습 (선택 사항: 앙상블 모델 개선)
xgb_model = XGBClassifier(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    scale_pos_weight=len(y_train_ivf[y_train_ivf == 0]) / len(y_train_ivf[y_train_ivf == 1]),
    random_state=42
)
xgb_model.fit(X_train_ivf, y_train_ivf)

# ✅ 검증 데이터에 대한 성능 평가
print("\n🔥 Validation Set Performance 🔥")

rf_preds_val = rf_model.predict_proba(X_val)[:, 1]
xgb_preds_val = xgb_model.predict_proba(X_val)[:, 1]

# ✅ 임계값 조정 (기본 0.5 -> 0.3으로 조정하여 Recall 향상)
threshold = 0.5
final_preds_val = (0.5 * rf_preds_val) + (0.5 * xgb_preds_val)
y_val_pred_rf = (final_preds_val >= threshold).astype(int)

# ✅ 시술 유형이 DI(0)인 경우 강제로 0으로 설정
y_val_pred_rf[X_val["시술 유형"] == 0] = 0

# ✅ Confusion Matrix 출력
print("\n🔥 Confusion Matrix 🔥")
cm_rf = confusion_matrix(y_val, y_val_pred_rf)
print(cm_rf)

# ✅ Classification Report (소수점 4자리까지 출력)
print("\n🔥 Classification Report 🔥")
print(classification_report(y_val, y_val_pred_rf, digits=4))

# ✅ 테스트 데이터에 대한 예측 수행
rf_preds_test = rf_model.predict_proba(X_test_encoded)[:, 1]
xgb_preds_test = xgb_model.predict_proba(X_test_encoded)[:, 1]

# ✅ 최종 앙상블 예측 (가중 평균)
final_preds_test = (0.5 * rf_preds_test) + (0.5 * xgb_preds_test)
final_preds_test_binary = (final_preds_test >= threshold).astype(int)
final_preds_test_binary[X_test_encoded["시술 유형"] == 0] = 0

# ✅ 제출 파일 로드
submission_path = '/Users/yudayeon/Desktop/LGAimers/submission/sample_submission.csv'
sample_submission = pd.read_csv(submission_path)

# ✅ 예측값 저장
sample_submission['probability'] = final_preds_test

# ✅ CSV 파일로 저장
output_path = '/Users/yudayeon/Desktop/LGAimers/submission/baseline_submit3.csv'
sample_submission.to_csv(output_path, index=False)

print(f"\n✅ 예측 결과가 '{output_path}'에 저장되었습니다.")


헬스케어 인공지능과 활용 

헬스케어 인공지능이란? 
헬스케어 인공지능의 영역
헬스케어 인공지능 시장의 현황 
헬스케어 패러다임 변화
헬스케어 인공지능 적용분야와 기술 
헬스케어 유망분야 
사용자에 따른 인공지능 적용분야와기술 
헹스케어에서의 인공지능 활용
헬스케어에서 인공지능의 가능성 현실화문제점
인공지능 보안의 위협



Train 데이터 수 (IVF=1): 200045
Validation 데이터 수: 51271

🔥 Validation Set Performance 🔥

🔥 Confusion Matrix 🔥
[[22961 15184]
 [ 3450  9676]]

🔥 Classification Report 🔥
              precision    recall  f1-score   support

           0     0.8694    0.6019    0.7114     38145
           1     0.3892    0.7372    0.5095     13126

    accuracy                         0.6366     51271
   macro avg     0.6293    0.6696    0.6104     51271
weighted avg     0.7464    0.6366    0.6597     51271


✅ 예측 결과가 '/Users/yudayeon/Desktop/LGAimers/submission/baseline_submit3.csv'에 저장되었습니다.


In [49]:
#에휴

In [51]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# ✅ 기존 인코딩된 데이터 불러오기
X_train_encoded = pd.read_csv('data/train_encoded.csv')
X_test_encoded = pd.read_csv('data/test_encoded.csv')

# ✅ Target 변수 분리
y_train = X_train_encoded["임신 성공 여부"]  # 'target' 컬럼을 정답 레이블로 설정
X_train_encoded = X_train_encoded.drop(columns=["임신 성공 여부"])  # Feature만 남김

# ✅ Train-Validation 분할 (Validation은 20%)
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train_encoded, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# ✅ IVF(1) 데이터만 선택하여 학습 데이터로 사용
ivf_mask = (X_train_split["시술 유형"] == 1)
X_train_ivf = X_train_split[ivf_mask]
y_train_ivf = y_train_split[ivf_mask]

print(f"Train 데이터 수 (IVF=1): {X_train_ivf.shape[0]}")
print(f"Validation 데이터 수: {X_val.shape[0]}")

# ✅ 하이퍼파라미터 최적화를 위한 Optuna 함수 정의
def objective(trial):
    model_type = trial.suggest_categorical("model_type", ["RandomForest", "XGBoost", "LightGBM"])
    
    if model_type == "RandomForest":
        model = RandomForestClassifier(
            n_estimators=trial.suggest_int("n_estimators", 100, 500, step=50),
            max_depth=trial.suggest_int("max_depth", 5, 20),
            min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
            min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 5),
            random_state=42,
            n_jobs=-1
        )
    elif model_type == "XGBoost":
        model = XGBClassifier(
            n_estimators=trial.suggest_int("n_estimators", 100, 500, step=50),
            max_depth=trial.suggest_int("max_depth", 3, 15),
            learning_rate=trial.suggest_loguniform("learning_rate", 0.01, 0.3),
            subsample=trial.suggest_uniform("subsample", 0.5, 1.0),
            colsample_bytree=trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
            random_state=42,
            n_jobs=-1
        )
    elif model_type == "LightGBM":
        model = LGBMClassifier(
            n_estimators=trial.suggest_int("n_estimators", 100, 500, step=50),
            max_depth=trial.suggest_int("max_depth", 3, 15),
            learning_rate=trial.suggest_loguniform("learning_rate", 0.01, 0.3),
            num_leaves=trial.suggest_int("num_leaves", 10, 100),
            subsample=trial.suggest_uniform("subsample", 0.5, 1.0),
            colsample_bytree=trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
            random_state=42
        )

    # ✅ Stratified K-Fold CV 적용
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    
    for train_idx, valid_idx in skf.split(X_train_ivf, y_train_ivf):
        X_train_fold, X_valid_fold = X_train_ivf.iloc[train_idx], X_train_ivf.iloc[valid_idx]
        y_train_fold, y_valid_fold = y_train_ivf.iloc[train_idx], y_train_ivf.iloc[valid_idx]
        
        model.fit(X_train_fold, y_train_fold)
        y_pred = model.predict(X_valid_fold)
        scores.append(f1_score(y_valid_fold, y_pred))
    
    return np.mean(scores)

# ✅ Optuna 실행 (최적의 모델 찾기)
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

# ✅ 최적의 모델 출력
best_params = study.best_params
print("\n🔥 Best Hyperparameters 🔥")
print(best_params)

# ✅ 최적 모델 학습 및 평가
if best_params["model_type"] == "RandomForest":
    final_model = RandomForestClassifier(
        n_estimators=best_params["n_estimators"],
        max_depth=best_params["max_depth"],
        min_samples_split=best_params["min_samples_split"],
        min_samples_leaf=best_params["min_samples_leaf"],
        random_state=42,
        n_jobs=-1
    )
elif best_params["model_type"] == "XGBoost":
    final_model = XGBClassifier(
        n_estimators=best_params["n_estimators"],
        max_depth=best_params["max_depth"],
        learning_rate=best_params["learning_rate"],
        subsample=best_params["subsample"],
        colsample_bytree=best_params["colsample_bytree"],
        random_state=42,
        n_jobs=-1
    )
elif best_params["model_type"] == "LightGBM":
    final_model = LGBMClassifier(
        n_estimators=best_params["n_estimators"],
        max_depth=best_params["max_depth"],
        learning_rate=best_params["learning_rate"],
        num_leaves=best_params["num_leaves"],
        subsample=best_params["subsample"],
        colsample_bytree=best_params["colsample_bytree"],
        random_state=42
    )

# ✅ 최적 모델 훈련
final_model.fit(X_train_ivf, y_train_ivf)

# ✅ 검증 데이터 예측
y_val_pred = final_model.predict(X_val)

# ✅ 시술 유형이 DI(0)인 경우 강제로 0으로 설정
y_val_pred[X_val["시술 유형"] == 0] = 0

# ✅ 모델 성능 평가
print("\n🔥 Final Model Performance 🔥")
print(classification_report(y_val, y_val_pred, digits=4))

[I 2025-02-19 23:46:48,371] A new study created in memory with name: no-name-1d691c59-96e8-4d4c-b1e3-355fa587bd7d


Train 데이터 수 (IVF=1): 200039
Validation 데이터 수: 51271


  learning_rate=trial.suggest_loguniform("learning_rate", 0.01, 0.3),
  subsample=trial.suggest_uniform("subsample", 0.5, 1.0),
  colsample_bytree=trial.suggest_uniform("colsample_bytree", 0.5, 1.0),


[LightGBM] [Info] Number of positive: 41864, number of negative: 118167
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018280 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 695
[LightGBM] [Info] Number of data points in the train set: 160031, number of used features: 61
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.261599 -> initscore=-1.037673
[LightGBM] [Info] Start training from score -1.037673
[LightGBM] [Info] Number of positive: 41865, number of negative: 118166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020987 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 693
[LightGBM] [Info] Number of data points in the train set: 160031, number of used features: 61
[LightGBM] [Info

[I 2025-02-19 23:46:56,190] Trial 0 finished with value: 0.22900250879355805 and parameters: {'model_type': 'LightGBM', 'n_estimators': 100, 'max_depth': 10, 'learning_rate': 0.19727502294246585, 'num_leaves': 100, 'subsample': 0.6479639056818496, 'colsample_bytree': 0.9247991250547889}. Best is trial 0 with value: 0.22900250879355805.




[I 2025-02-19 23:47:50,254] Trial 1 finished with value: 0.053886171764338166 and parameters: {'model_type': 'RandomForest', 'n_estimators': 250, 'max_depth': 12, 'min_samples_split': 8, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.22900250879355805.
  learning_rate=trial.suggest_loguniform("learning_rate", 0.01, 0.3),
  subsample=trial.suggest_uniform("subsample", 0.5, 1.0),
  colsample_bytree=trial.suggest_uniform("colsample_bytree", 0.5, 1.0),


[LightGBM] [Info] Number of positive: 41864, number of negative: 118167
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 695
[LightGBM] [Info] Number of data points in the train set: 160031, number of used features: 61
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.261599 -> initscore=-1.037673
[LightGBM] [Info] Start training from score -1.037673
[LightGBM] [Info] Number of positive: 41865, number of negative: 118166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028457 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 693
[LightGBM] [Info] Number of data points in the train set: 160031, number of used features: 61
[LightGBM] [Info

[I 2025-02-19 23:48:19,063] Trial 2 finished with value: 0.21282296808575346 and parameters: {'model_type': 'LightGBM', 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.11664997766173307, 'num_leaves': 82, 'subsample': 0.9909604922051969, 'colsample_bytree': 0.7563621375309375}. Best is trial 0 with value: 0.22900250879355805.




  learning_rate=trial.suggest_loguniform("learning_rate", 0.01, 0.3),
  subsample=trial.suggest_uniform("subsample", 0.5, 1.0),
  colsample_bytree=trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
[I 2025-02-19 23:48:35,720] Trial 3 finished with value: 0.2589178163586863 and parameters: {'model_type': 'XGBoost', 'n_estimators': 150, 'max_depth': 12, 'learning_rate': 0.12549331009072257, 'subsample': 0.9192858139622846, 'colsample_bytree': 0.9145848894920138}. Best is trial 3 with value: 0.2589178163586863.
[I 2025-02-19 23:49:52,425] Trial 4 finished with value: 0.1398473476361228 and parameters: {'model_type': 'RandomForest', 'n_estimators': 300, 'max_depth': 18, 'min_samples_split': 5, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.2589178163586863.
  learning_rate=trial.suggest_loguniform("learning_rate", 0.01, 0.3),
  subsample=trial.suggest_uniform("subsample", 0.5, 1.0),
  colsample_bytree=trial.suggest_uniform("colsample_bytree", 0.5, 1.0),


[LightGBM] [Info] Number of positive: 41864, number of negative: 118167
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015064 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 695
[LightGBM] [Info] Number of data points in the train set: 160031, number of used features: 61
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.261599 -> initscore=-1.037673
[LightGBM] [Info] Start training from score -1.037673
[LightGBM] [Info] Number of positive: 41865, number of negative: 118166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022473 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 693
[LightGBM] [Info] Number of data points in the train set: 160031, number of used features: 61
[LightGBM] [Info

[I 2025-02-19 23:50:02,642] Trial 5 finished with value: 0.20294038466575462 and parameters: {'model_type': 'LightGBM', 'n_estimators': 150, 'max_depth': 10, 'learning_rate': 0.10534574955036437, 'num_leaves': 56, 'subsample': 0.6792897512422902, 'colsample_bytree': 0.6864539399710219}. Best is trial 3 with value: 0.2589178163586863.
[I 2025-02-19 23:50:49,443] Trial 6 finished with value: 0.051756415122318024 and parameters: {'model_type': 'RandomForest', 'n_estimators': 250, 'max_depth': 12, 'min_samples_split': 10, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.2589178163586863.
  learning_rate=trial.suggest_loguniform("learning_rate", 0.01, 0.3),
  subsample=trial.suggest_uniform("subsample", 0.5, 1.0),
  colsample_bytree=trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
[I 2025-02-19 23:51:31,355] Trial 7 finished with value: 0.30188689297945726 and parameters: {'model_type': 'XGBoost', 'n_estimators': 500, 'max_depth': 9, 'learning_rate': 0.16278807493917966, 'subsample

[LightGBM] [Info] Number of positive: 41864, number of negative: 118167
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023134 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 695
[LightGBM] [Info] Number of data points in the train set: 160031, number of used features: 61
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.261599 -> initscore=-1.037673
[LightGBM] [Info] Start training from score -1.037673
[LightGBM] [Info] Number of positive: 41865, number of negative: 118166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028855 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 693
[LightGBM] [Info] Number of data points in the train set: 160031, number of used features: 61
[LightGBM] [Info

[I 2025-02-20 00:17:36,474] Trial 19 finished with value: 0.21014851801078765 and parameters: {'model_type': 'LightGBM', 'n_estimators': 450, 'max_depth': 6, 'learning_rate': 0.20742703659974276, 'num_leaves': 13, 'subsample': 0.7911104833439813, 'colsample_bytree': 0.7800170935279528}. Best is trial 12 with value: 0.34247214130946046.
  learning_rate=trial.suggest_loguniform("learning_rate", 0.01, 0.3),
  subsample=trial.suggest_uniform("subsample", 0.5, 1.0),
  colsample_bytree=trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
[I 2025-02-20 00:18:14,503] Trial 20 finished with value: 0.21460100567393342 and parameters: {'model_type': 'XGBoost', 'n_estimators': 350, 'max_depth': 11, 'learning_rate': 0.02315929712603991, 'subsample': 0.9078089437985957, 'colsample_bytree': 0.6492715529923977}. Best is trial 12 with value: 0.34247214130946046.
  learning_rate=trial.suggest_loguniform("learning_rate", 0.01, 0.3),
  subsample=trial.suggest_uniform("subsample", 0.5, 1.0),
  colsample_by

[LightGBM] [Info] Number of positive: 41864, number of negative: 118167
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014098 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 695
[LightGBM] [Info] Number of data points in the train set: 160031, number of used features: 61
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.261599 -> initscore=-1.037673
[LightGBM] [Info] Start training from score -1.037673
[LightGBM] [Info] Number of positive: 41865, number of negative: 118166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035527 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 693
[LightGBM] [Info] Number of data points in the train set: 160031, number of used features: 61
[LightGBM] [Info

[I 2025-02-20 00:26:54,194] Trial 29 finished with value: 0.2101695009729135 and parameters: {'model_type': 'LightGBM', 'n_estimators': 350, 'max_depth': 11, 'learning_rate': 0.15677472845749696, 'num_leaves': 25, 'subsample': 0.8206260118756996, 'colsample_bytree': 0.8019946341438818}. Best is trial 21 with value: 0.3437674044048072.



🔥 Best Hyperparameters 🔥
{'model_type': 'XGBoost', 'n_estimators': 400, 'max_depth': 15, 'learning_rate': 0.2965447421323962, 'subsample': 0.8261508025243548, 'colsample_bytree': 0.9760599209476746}

🔥 Final Model Performance 🔥
              precision    recall  f1-score   support

           0     0.7749    0.8457    0.8088     38025
           1     0.3997    0.2950    0.3394     13246

    accuracy                         0.7034     51271
   macro avg     0.5873    0.5703    0.5741     51271
weighted avg     0.6780    0.7034    0.6875     51271

