In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Cài đặt 
!pip install xgboost
!pip install scikit-learn==1.2.2 imbalanced-learn==0.10.1
import warnings
warnings.simplefilter('ignore', FutureWarning)

Collecting imbalanced-learn==0.10.1
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl.metadata (8.2 kB)
Downloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: imbalanced-learn
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.13.0
    Uninstalling imbalanced-learn-0.13.0:
      Successfully uninstalled imbalanced-learn-0.13.0
Successfully installed imbalanced-learn-0.10.1


In [2]:
import pandas as pd

df01 = pd.read_csv('/kaggle/input/clean-aws-month/final_data/filled_data_april.csv')
df02 = pd.read_csv('/kaggle/input/clean-aws-month/final_data/filled_data_october.csv')

# Ghép 2 file lại với nhau
data_filled = pd.concat([df01, df02], ignore_index=True)

# 1. Xử lý dữ liệu bị thiếu

In [3]:
# df_cleaned = data.copy()
# df_cleaned = df_cleaned.drop(columns=['B04B', 'B05B', 'B06B', 'VSB', 'CIN'])

# df_cleaned = df_cleaned[~((df_cleaned['AWS'] == -np.inf) | (df_cleaned['AWS'].isnull()))]
# df_cleaned = df_cleaned[~((df_cleaned['SLHF'] == 9999) | (df_cleaned['SSHF'] == 9999))]

In [4]:
# data_filled = df_cleaned.fillna(df_cleaned.median(numeric_only=True))

# 2. Chia train test

In [5]:
def split_data_by_multiple_ranges(df, train_ranges):
    train_mask = False
    for start, end in train_ranges:
        train_mask |= (df['datetime'] >= start) & (df['datetime'] < end)
    train_df = df[train_mask]
    test_df = df[~train_mask]
    return train_df, test_df

def convert_rain_label(df):
    df['AWS'] = df['AWS'].apply(lambda x: 1 if x > 0 else 0)
    return df

# Gắn nhãn
df_all1 = convert_rain_label(data_filled)

# Chuyển datetime nếu chưa
df_all1['datetime'] = pd.to_datetime(df_all1['datetime'])

# Chọn các khoảng train: tháng 4/2019, 10/2019, 4/2020
train_ranges = [
    ("2019-04-01", "2019-04-30"),
    ("2019-10-01", "2019-10-31"),
    ("2020-04-01", "2020-04-30"),
]

# Tách train/test theo mốc trên
train_df, test_df = split_data_by_multiple_ranges(df_all1, train_ranges)

# Giữ lại chỉ test tháng 10/2020
test_df = test_df[
    (test_df['datetime'] >= "2020-10-01") & (test_df['datetime'] <= "2020-10-31")
]

# Kết quả
print(f"Train set: {train_df.shape}")
print(f"Test set (October 2020): {test_df.shape}")

Train set: (428242, 33)
Test set (October 2020): (207094, 33)


In [6]:
X_train = train_df.drop(columns=['AWS', 'datetime', 'row','col'])
y_train = train_df['AWS']

X_test = test_df.drop(columns=['AWS', 'datetime', 'row','col'])
y_test = test_df['AWS']

# 3. Chuẩn hóa

In [7]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# 4. Chọn feature

In [8]:
top_features = ['B14B', 'I2B', 'TCLW', 'R500', 'R850', 'CAPE', 'U850', 'B10B', 'PEV', 'SLOR', 'R250', 'KX', 'V250', 'U250', 'TCWV']

In [9]:
# X_train_selected = X_train_scaled[top_features]
# X_test_selected = X_test_scaled[top_features]

In [10]:
#Performance evaluation
def print_scores(y_true, y_pred):
  print(classification_report(y_true, y_pred))

# 5. Thêm class weight

In [11]:
import xgboost as xgb

model_xgb = xgb.XGBClassifier(random_state=42, n_estimators = 200)
model_xgb.fit(X_train, y_train)

y_pred = model_xgb.predict(X_test)
print_scores( y_test, y_pred)

              precision    recall  f1-score   support

           0       0.85      0.93      0.89    159566
           1       0.66      0.47      0.55     47528

    accuracy                           0.82    207094
   macro avg       0.76      0.70      0.72    207094
weighted avg       0.81      0.82      0.81    207094



In [12]:
import xgboost as xgb

model_xgb = xgb.XGBClassifier(random_state=42, n_estimators = 200)
model_xgb.fit(X_train_scaled, y_train)

y_pred = model_xgb.predict(X_test_scaled)
print_scores( y_test, y_pred)

              precision    recall  f1-score   support

           0       0.85      0.93      0.89    159566
           1       0.66      0.47      0.55     47528

    accuracy                           0.82    207094
   macro avg       0.76      0.70      0.72    207094
weighted avg       0.81      0.82      0.81    207094



In [13]:
import xgboost as xgb

model_xgb = xgb.XGBClassifier(random_state=42, n_estimators = 200, scale_pos_weight=11.32)
model_xgb.fit(X_train_scaled, y_train)

y_pred = model_xgb.predict(X_test_scaled)
print_scores( y_test, y_pred)

              precision    recall  f1-score   support

           0       0.90      0.77      0.83    159566
           1       0.48      0.71      0.58     47528

    accuracy                           0.76    207094
   macro avg       0.69      0.74      0.70    207094
weighted avg       0.80      0.76      0.77    207094



In [14]:
import xgboost as xgb

model_xgb = xgb.XGBClassifier(random_state=42, n_estimators = 200, scale_pos_weight=11.32)
model_xgb.fit(X_train_scaled[top_features], y_train)

y_pred = model_xgb.predict(X_test_scaled[top_features])
print_scores( y_test, y_pred)

              precision    recall  f1-score   support

           0       0.86      0.76      0.81    159566
           1       0.43      0.60      0.50     47528

    accuracy                           0.72    207094
   macro avg       0.64      0.68      0.65    207094
weighted avg       0.76      0.72      0.74    207094



=> Giải pháp:

1. **Giữ mô hình đã tối ưu + threshold = 0.6** để triển khai hoặc đánh giá cuối cùng.
2. Thử thêm:

   * Tối ưu `max_depth`, `gamma`, `min_child_weight`, `subsample` bằng GridSearchCV hoặc Optuna.
   * Dùng **cross-validation phân tầng (StratifiedKFold)** để đánh giá mô hình ổn định hơn.
3. Nếu cần **giải thích mô hình**, có thể dùng SHAP để giải thích đóng góp các feature.

**Tuning XGBoost không chọn đặc trưng với optuna**

In [15]:
import xgboost as xgb

model_xgb = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=11.32,
    random_state=42
)
model_xgb.fit(X_train_scaled, y_train)


y_pred = model_xgb.predict(X_test_scaled)
print_scores( y_test, y_pred)

y_probs = model_xgb.predict_proba(X_test_scaled)[:,1]

best_thresh = 0.0
best_f1 = 0.0
for t in np.arange(0.1, 0.9, 0.05):
    y_pred = (y_probs >= t).astype(int)
    f1 = f1_score(y_test, y_pred)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

print(f"Best threshold: {best_thresh}, F1: {best_f1}")


              precision    recall  f1-score   support

           0       0.97      0.65      0.78    159566
           1       0.44      0.93      0.60     47528

    accuracy                           0.72    207094
   macro avg       0.71      0.79      0.69    207094
weighted avg       0.85      0.72      0.74    207094

Best threshold: 0.7000000000000002, F1: 0.6357567891037204


In [16]:
import optuna
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import xgboost as xgb


THRESHOLD = best_thresh
SCALE_POS_WEIGHT = 11.32 

def objective(trial):
    # Thử các tham số
    param = {
        'n_estimators': 200,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'scale_pos_weight': SCALE_POS_WEIGHT,
        'random_state': 42,
        'verbosity': 0,
        'n_jobs': -1,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }

    model = xgb.XGBClassifier(**param)
    model.fit(X_train_scaled, y_train)

    # Dự đoán xác suất
    y_probs = model.predict_proba(X_test_scaled)[:, 1]
    y_pred = (y_probs >= THRESHOLD).astype(int)

    # Trả về F1-score của lớp mưa (label=1)
    return f1_score(y_test, y_pred, pos_label=1)

# Tối ưu với Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # Có thể tăng trials lên 100+

# In kết quả tốt nhất
print("Best trial:")
print(study.best_trial)
print("Best hyperparameters:")
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")

# Huấn luyện lại với tham số tốt nhất
best_params = study.best_trial.params
best_params.update({
    'n_estimators': 200,
    'scale_pos_weight': SCALE_POS_WEIGHT,
    'random_state': 42,
    'use_label_encoder': False,
    'eval_metric': 'logloss'
})

best_model = xgb.XGBClassifier(**best_params)
best_model.fit(X_train_scaled, y_train)

# Dự đoán với threshold
y_probs = best_model.predict_proba(X_test_scaled)[:, 1]
y_pred_thresh = (y_probs >= THRESHOLD).astype(int)

# In kết quả
from sklearn.metrics import classification_report
print(f"\nClassification report with threshold = {THRESHOLD}:")
print(classification_report(y_test, y_pred_thresh))

[I 2025-05-04 03:58:07,842] A new study created in memory with name: no-name-e376fad6-f30c-48fd-873c-d9ad988f03d0
[I 2025-05-04 03:58:18,522] Trial 0 finished with value: 0.5837567697551387 and parameters: {'learning_rate': 0.12917314409259603, 'max_depth': 9, 'min_child_weight': 10, 'subsample': 0.6877735807070782, 'colsample_bytree': 0.7806369223507554, 'gamma': 1.0603062289311964}. Best is trial 0 with value: 0.5837567697551387.
[I 2025-05-04 03:58:31,434] Trial 1 finished with value: 0.5279821778167826 and parameters: {'learning_rate': 0.10441659759468691, 'max_depth': 9, 'min_child_weight': 2, 'subsample': 0.9124454375621692, 'colsample_bytree': 0.8395259105310269, 'gamma': 3.1208685454310485}. Best is trial 0 with value: 0.5837567697551387.
[I 2025-05-04 03:58:38,027] Trial 2 finished with value: 0.6600131275560865 and parameters: {'learning_rate': 0.1660740667868559, 'max_depth': 6, 'min_child_weight': 8, 'subsample': 0.9554404477221969, 'colsample_bytree': 0.8510629944282961, '

Best trial:
FrozenTrial(number=3, state=1, values=[0.6681908582501827], datetime_start=datetime.datetime(2025, 5, 4, 3, 58, 38, 28118), datetime_complete=datetime.datetime(2025, 5, 4, 3, 58, 45, 851666), params={'learning_rate': 0.14566812504662993, 'max_depth': 7, 'min_child_weight': 9, 'subsample': 0.6454889121369269, 'colsample_bytree': 0.7285396267841306, 'gamma': 3.917903582852751}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.3, log=False, low=0.01, step=None), 'max_depth': IntDistribution(high=10, log=False, low=3, step=1), 'min_child_weight': IntDistribution(high=10, log=False, low=1, step=1), 'subsample': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'gamma': FloatDistribution(high=5.0, log=False, low=0.0, step=None)}, trial_id=3, value=None)
Best hyperparameters:
learning_rate: 0.14566812504662993
max_depth: 7
min_ch

**Tuning XGBoost có chọn đặc trưng với optuna**

In [17]:
import xgboost as xgb

model_xgb = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=11.32,
    random_state=42
)
model_xgb.fit(X_train_scaled[top_features], y_train)


y_pred = model_xgb.predict(X_test_scaled[top_features])
print_scores( y_test, y_pred)

y_probs = model_xgb.predict_proba(X_test_scaled[top_features])[:,1]

best_thresh = 0.0
best_f1 = 0.0
for t in np.arange(0.1, 0.9, 0.05):
    y_pred = (y_probs >= t).astype(int)
    f1 = f1_score(y_test, y_pred)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

print(f"Best threshold: {best_thresh}, F1: {best_f1}")

              precision    recall  f1-score   support

           0       0.97      0.62      0.75    159566
           1       0.42      0.93      0.58     47528

    accuracy                           0.69    207094
   macro avg       0.69      0.78      0.67    207094
weighted avg       0.84      0.69      0.71    207094

Best threshold: 0.7500000000000002, F1: 0.6257327664585624


In [18]:
import optuna
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import xgboost as xgb


THRESHOLD = best_thresh
SCALE_POS_WEIGHT = 11.32 

def objective(trial):
    # Thử các tham số
    param = {
        'n_estimators': 200,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'scale_pos_weight': SCALE_POS_WEIGHT,
        'random_state': 42,
        'verbosity': 0,
        'n_jobs': -1,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }

    model = xgb.XGBClassifier(**param)
    model.fit(X_train_scaled[top_features], y_train)

    # Dự đoán xác suất
    y_probs = model.predict_proba(X_test_scaled[top_features])[:, 1]
    y_pred = (y_probs >= THRESHOLD).astype(int)

    # Trả về F1-score của lớp mưa (label=1)
    return f1_score(y_test, y_pred, pos_label=1)

# Tối ưu với Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # Có thể tăng trials lên 100+

# In kết quả tốt nhất
print("Best trial:")
print(study.best_trial)
print("Best hyperparameters:")
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")

# Huấn luyện lại với tham số tốt nhất
best_params = study.best_trial.params
best_params.update({
    'n_estimators': 200,
    'scale_pos_weight': SCALE_POS_WEIGHT,
    'random_state': 42,
    'use_label_encoder': False,
    'eval_metric': 'logloss'
})

best_model = xgb.XGBClassifier(**best_params)
best_model.fit(X_train_scaled, y_train)

# Dự đoán với threshold
y_probs = best_model.predict_proba(X_test_scaled)[:, 1]
y_pred_thresh = (y_probs >= THRESHOLD).astype(int)

# In kết quả
from sklearn.metrics import classification_report
# print("\nClassification report with threshold = 0.75:")
print(f"\nClassification report with threshold = {THRESHOLD}:")
print(classification_report(y_test, y_pred_thresh))

[I 2025-05-04 04:06:24,421] A new study created in memory with name: no-name-3e475a75-5c6e-4fde-8879-ed3490bc9bd9
[I 2025-05-04 04:06:30,979] Trial 0 finished with value: 0.6517376512639421 and parameters: {'learning_rate': 0.05501805548093067, 'max_depth': 8, 'min_child_weight': 5, 'subsample': 0.9281113947048185, 'colsample_bytree': 0.6904772711215474, 'gamma': 3.8101173736328446}. Best is trial 0 with value: 0.6517376512639421.
[I 2025-05-04 04:06:36,376] Trial 1 finished with value: 0.6447731529258877 and parameters: {'learning_rate': 0.021532949352925854, 'max_depth': 6, 'min_child_weight': 6, 'subsample': 0.7739285737167391, 'colsample_bytree': 0.8454808722709726, 'gamma': 4.790983412951543}. Best is trial 0 with value: 0.6517376512639421.
[I 2025-05-04 04:06:40,151] Trial 2 finished with value: 0.6381947240842946 and parameters: {'learning_rate': 0.16072264453401341, 'max_depth': 4, 'min_child_weight': 5, 'subsample': 0.7267155543365673, 'colsample_bytree': 0.6392656494318474, '

Best trial:
FrozenTrial(number=42, state=1, values=[0.6819692601502102], datetime_start=datetime.datetime(2025, 5, 4, 4, 10, 17, 664556), datetime_complete=datetime.datetime(2025, 5, 4, 4, 10, 28, 295777), params={'learning_rate': 0.011141169284554567, 'max_depth': 10, 'min_child_weight': 10, 'subsample': 0.770168196195663, 'colsample_bytree': 0.5980076206534548, 'gamma': 4.509153333740827}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.3, log=False, low=0.01, step=None), 'max_depth': IntDistribution(high=10, log=False, low=3, step=1), 'min_child_weight': IntDistribution(high=10, log=False, low=1, step=1), 'subsample': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'gamma': FloatDistribution(high=5.0, log=False, low=0.0, step=None)}, trial_id=42, value=None)
Best hyperparameters:
learning_rate: 0.011141169284554567
max_depth: 10

**Đánh giá:**

Lớp mưa:
* Recall = 0.80: Mô hình phát hiện được 80% các trường hợp có mưa → rất tốt trong bối cảnh nhãn lệch (khó học).
* F1-score = 0.70: Mức F1-score cao hơn mô hình mặc định (~0.57) hoặc các cấu hình thủ công trước đó.
* Precision = 0.61: Có một số false positives (dự đoán sai là mưa)

Lớp không mưa: Rất tốt, vẫn giữ precision 0.93 và F1-score 0.89

**Không bắt buộc phải cân bằng dữ liệu khi dùng XGBoost vì đã dùng scale_pos_weight, threshold và tham số bằng optuna) nhưng vẫn nên thử**

=> Chọn đặc trưng hay không thì kết quả cũng tương tự nhau.

# 6. Cân bằng dữ liệu (Oversampling)

## 6.1 SMOTE

In [19]:
# SMOTE
from imblearn.over_sampling import SMOTE

sm = SMOTE()

X_sm, y_sm = sm.fit_resample(X_train, y_train)
X_sm.shape, y_sm.shape

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_sm)
X_test_scaled = scaler.transform(X_test)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [20]:
import xgboost as xgb

model_xgb = xgb.XGBClassifier(random_state=42, n_estimators = 200)
model_xgb.fit(X_sm, y_sm)

y_pred = model_xgb.predict(X_test)
print_scores( y_test, y_pred)

              precision    recall  f1-score   support

           0       0.89      0.85      0.87    159566
           1       0.56      0.66      0.61     47528

    accuracy                           0.81    207094
   macro avg       0.73      0.75      0.74    207094
weighted avg       0.82      0.81      0.81    207094



In [21]:
import xgboost as xgb

model_xgb = xgb.XGBClassifier(random_state=42, n_estimators = 200)
model_xgb.fit(X_train_scaled, y_sm)

y_pred = model_xgb.predict(X_test_scaled)
print_scores( y_test, y_pred)

              precision    recall  f1-score   support

           0       0.87      0.87      0.87    159566
           1       0.57      0.58      0.58     47528

    accuracy                           0.80    207094
   macro avg       0.72      0.72      0.72    207094
weighted avg       0.80      0.80      0.80    207094



In [22]:
import xgboost as xgb

model_xgb = xgb.XGBClassifier(random_state=42, n_estimators = 200)
model_xgb.fit(X_train_scaled[top_features], y_sm)

y_pred = model_xgb.predict(X_test_scaled[top_features])
print_scores( y_test, y_pred)

              precision    recall  f1-score   support

           0       0.89      0.84      0.86    159566
           1       0.54      0.65      0.59     47528

    accuracy                           0.79    207094
   macro avg       0.72      0.74      0.73    207094
weighted avg       0.81      0.79      0.80    207094



In [23]:
import xgboost as xgb

model_xgb = xgb.XGBClassifier(random_state=42, n_estimators = 200, min_child_weight=5)
model_xgb.fit(X_train_scaled, y_sm)

y_pred = model_xgb.predict(X_test_scaled)
print_scores( y_test, y_pred)

              precision    recall  f1-score   support

           0       0.90      0.85      0.88    159566
           1       0.58      0.67      0.62     47528

    accuracy                           0.81    207094
   macro avg       0.74      0.76      0.75    207094
weighted avg       0.82      0.81      0.82    207094



=> Thêm min_child_weight có cải thiện

In [24]:
import xgboost as xgb

model_xgb = xgb.XGBClassifier(random_state=42, n_estimators = 200, scale_pos_weight=11.32)
model_xgb.fit(X_train_scaled, y_sm)

y_pred = model_xgb.predict(X_test_scaled)
print_scores( y_test, y_pred)

              precision    recall  f1-score   support

           0       0.90      0.73      0.81    159566
           1       0.45      0.73      0.56     47528

    accuracy                           0.73    207094
   macro avg       0.68      0.73      0.68    207094
weighted avg       0.80      0.73      0.75    207094



=> Thêm scale_pos_weight giảm

In [25]:
import xgboost as xgb

model_xgb = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    # scale_pos_weight=11.32,
    random_state=42,
    min_child_weight=5
    
)

model_xgb.fit(X_train_scaled, y_sm)

y_pred = model_xgb.predict(X_test_scaled)
print_scores(y_test, y_pred)

y_probs = model_xgb.predict_proba(X_test_scaled)[:,1]

best_thresh = 0.0
best_f1 = 0.0
for t in np.arange(0.1, 0.9, 0.05):
    y_pred = (y_probs >= t).astype(int)
    f1 = f1_score(y_test, y_pred)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

print(f"Best threshold: {best_thresh}, F1: {best_f1}")

              precision    recall  f1-score   support

           0       0.93      0.81      0.87    159566
           1       0.55      0.79      0.65     47528

    accuracy                           0.81    207094
   macro avg       0.74      0.80      0.76    207094
weighted avg       0.84      0.81      0.82    207094

Best threshold: 0.5500000000000002, F1: 0.6522934690260636


In [26]:
import optuna
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

THRESHOLD = best_thresh
# SCALE_POS_WEIGHT = 11.32 

def objective(trial):
    # Thử các tham số
    param = {
        'n_estimators': 200,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        # 'scale_pos_weight': SCALE_POS_WEIGHT,
        'random_state': 42,
        'verbosity': 0,
        'n_jobs': -1,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }

    model = xgb.XGBClassifier(**param)
    model.fit(X_train_scaled, y_sm)

    # Dự đoán xác suất
    y_probs = model.predict_proba(X_test_scaled)[:, 1]
    y_pred = (y_probs >= THRESHOLD).astype(int)

    # Trả về F1-score của lớp mưa (label=1)
    return f1_score(y_test, y_pred, pos_label=1)


# Tối ưu với Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)  # Có thể tăng trials lên 100+

# In kết quả tốt nhất
print("Best trial:")
print(study.best_trial)
print("Best hyperparameters:")
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")

# Huấn luyện lại với tham số tốt nhất
best_params = study.best_trial.params
best_params.update({
    'n_estimators': 200,
    # 'scale_pos_weight': SCALE_POS_WEIGHT,
    'random_state': 42,
    'use_label_encoder': False,
    'eval_metric': 'logloss'
})

best_model = xgb.XGBClassifier(**best_params)
best_model.fit(X_train_scaled, y_sm)

# Dự đoán với threshold
y_probs = best_model.predict_proba(X_test_scaled)[:, 1]
y_pred_thresh = (y_probs >= THRESHOLD).astype(int)

# In kết quả
from sklearn.metrics import classification_report
print(f"\nClassification report with threshold = {THRESHOLD}:")
print(classification_report(y_test, y_pred_thresh))

[I 2025-05-04 04:13:03,248] A new study created in memory with name: no-name-d52f9b2d-1e12-43dd-bd17-36608483284d
[I 2025-05-04 04:13:20,637] Trial 0 finished with value: 0.6062899920538664 and parameters: {'learning_rate': 0.11062457482259704, 'max_depth': 9, 'min_child_weight': 5, 'subsample': 0.9975969928966021, 'colsample_bytree': 0.877056036092343, 'gamma': 2.9717146301115527}. Best is trial 0 with value: 0.6062899920538664.
[I 2025-05-04 04:13:34,869] Trial 1 finished with value: 0.6555413388506247 and parameters: {'learning_rate': 0.09190919201932185, 'max_depth': 7, 'min_child_weight': 1, 'subsample': 0.9775060773980844, 'colsample_bytree': 0.5633930057326808, 'gamma': 4.945860882234348}. Best is trial 1 with value: 0.6555413388506247.
[I 2025-05-04 04:13:52,323] Trial 2 finished with value: 0.5619798761432708 and parameters: {'learning_rate': 0.17655065310260842, 'max_depth': 9, 'min_child_weight': 5, 'subsample': 0.673222961066432, 'colsample_bytree': 0.9909510445868477, 'gam

Best trial:
FrozenTrial(number=1, state=1, values=[0.6555413388506247], datetime_start=datetime.datetime(2025, 5, 4, 4, 13, 20, 638281), datetime_complete=datetime.datetime(2025, 5, 4, 4, 13, 34, 869321), params={'learning_rate': 0.09190919201932185, 'max_depth': 7, 'min_child_weight': 1, 'subsample': 0.9775060773980844, 'colsample_bytree': 0.5633930057326808, 'gamma': 4.945860882234348}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.3, log=False, low=0.01, step=None), 'max_depth': IntDistribution(high=10, log=False, low=3, step=1), 'min_child_weight': IntDistribution(high=10, log=False, low=1, step=1), 'subsample': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'gamma': FloatDistribution(high=5.0, log=False, low=0.0, step=None)}, trial_id=1, value=None)
Best hyperparameters:
learning_rate: 0.09190919201932185
max_depth: 7
min_c

In [27]:
import xgboost as xgb

model_xgb = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    # scale_pos_weight=11.32,
    random_state=42,
    min_child_weight=5
    
)

model_xgb.fit(X_train_scaled[top_features], y_sm)

y_pred = model_xgb.predict(X_test_scaled[top_features])
print_scores(y_test, y_pred)

y_probs = model_xgb.predict_proba(X_test_scaled[top_features])[:,1]

best_thresh = 0.0
best_f1 = 0.0
for t in np.arange(0.1, 0.9, 0.05):
    y_pred = (y_probs >= t).astype(int)
    f1 = f1_score(y_test, y_pred)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

print(f"Best threshold: {best_thresh}, F1: {best_f1}")

              precision    recall  f1-score   support

           0       0.93      0.77      0.84    159566
           1       0.51      0.79      0.62     47528

    accuracy                           0.78    207094
   macro avg       0.72      0.78      0.73    207094
weighted avg       0.83      0.78      0.79    207094

Best threshold: 0.5500000000000002, F1: 0.6221290384065569


In [28]:
import optuna
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import xgboost as xgb


THRESHOLD = best_thresh
# SCALE_POS_WEIGHT = 11.32 

def objective(trial):
    # Thử các tham số
    param = {
        'n_estimators': 200,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        # 'scale_pos_weight': SCALE_POS_WEIGHT,
        'random_state': 42,
        'verbosity': 0,
        'n_jobs': -1,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }

    model = xgb.XGBClassifier(**param)
    model.fit(X_train_scaled[top_features], y_sm)

    # Dự đoán xác suất
    y_probs = model.predict_proba(X_test_scaled[top_features])[:, 1]
    y_pred = (y_probs >= THRESHOLD).astype(int)

    # Trả về F1-score của lớp mưa (label=1)
    return f1_score(y_test, y_pred, pos_label=1)


# Tối ưu với Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)  # Có thể tăng trials lên 100+

# In kết quả tốt nhất
print("Best trial:")
print(study.best_trial)
print("Best hyperparameters:")
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")

# Huấn luyện lại với tham số tốt nhất
best_params = study.best_trial.params
best_params.update({
    'n_estimators': 200,
    # 'scale_pos_weight': SCALE_POS_WEIGHT,
    'random_state': 42,
    'use_label_encoder': False,
    'eval_metric': 'logloss'
})

best_model = xgb.XGBClassifier(**best_params)
best_model.fit(X_train_scaled[top_features], y_sm)

# Dự đoán với threshold
y_probs = best_model.predict_proba(X_test_scaled[top_features])[:, 1]
y_pred_thresh = (y_probs >= THRESHOLD).astype(int)

# In kết quả
from sklearn.metrics import classification_report
print(f"\nClassification report with threshold = {THRESHOLD}:")
print(classification_report(y_test, y_pred_thresh))

[I 2025-05-04 04:20:50,786] A new study created in memory with name: no-name-625cde8b-b432-4fab-af4e-dac9cf0e937f
[I 2025-05-04 04:20:57,934] Trial 0 finished with value: 0.6243075821186653 and parameters: {'learning_rate': 0.069851239162954, 'max_depth': 4, 'min_child_weight': 7, 'subsample': 0.9570542853394668, 'colsample_bytree': 0.9948635984206922, 'gamma': 3.3253010977094166}. Best is trial 0 with value: 0.6243075821186653.
[I 2025-05-04 04:21:09,721] Trial 1 finished with value: 0.49735637801572846 and parameters: {'learning_rate': 0.17663118151584933, 'max_depth': 9, 'min_child_weight': 10, 'subsample': 0.8683556718337033, 'colsample_bytree': 0.7019010949541067, 'gamma': 0.6815190847116609}. Best is trial 0 with value: 0.6243075821186653.
[I 2025-05-04 04:21:16,383] Trial 2 finished with value: 0.5902333732551136 and parameters: {'learning_rate': 0.011253036969929795, 'max_depth': 3, 'min_child_weight': 10, 'subsample': 0.529323343362218, 'colsample_bytree': 0.8695413804868146, 

Best trial:
FrozenTrial(number=15, state=1, values=[0.6364667273758182], datetime_start=datetime.datetime(2025, 5, 4, 4, 23, 3, 308004), datetime_complete=datetime.datetime(2025, 5, 4, 4, 23, 11, 843284), params={'learning_rate': 0.10897418069897816, 'max_depth': 6, 'min_child_weight': 4, 'subsample': 0.7984487060260363, 'colsample_bytree': 0.592385534450679, 'gamma': 4.082380111470863}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.3, log=False, low=0.01, step=None), 'max_depth': IntDistribution(high=10, log=False, low=3, step=1), 'min_child_weight': IntDistribution(high=10, log=False, low=1, step=1), 'subsample': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'gamma': FloatDistribution(high=5.0, log=False, low=0.0, step=None)}, trial_id=15, value=None)
Best hyperparameters:
learning_rate: 0.10897418069897816
max_depth: 6
min_c

**Đối với dùng SMOTE:** chỉ scale cao hơn scale xong chọn đặc trưng nhưng không SMOTE cao hơn đã SMOTE.

## 6.2 SMOTE ENN

In [29]:
# SMOTE ENN
from imblearn.combine import SMOTEENN

smenn = SMOTEENN()

X_smenn, y_smenn = smenn.fit_resample(X_train, y_train)
X_smenn.shape, y_sm.shape

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_smenn)
X_test_scaled = scaler.transform(X_test)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [30]:
import xgboost as xgb

model_xgb = xgb.XGBClassifier(random_state=42, n_estimators = 200)
model_xgb.fit(X_smenn, y_smenn)

y_pred = model_xgb.predict(X_test)
print_scores(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.90      0.84      0.87    159566
           1       0.55      0.68      0.61     47528

    accuracy                           0.80    207094
   macro avg       0.73      0.76      0.74    207094
weighted avg       0.82      0.80      0.81    207094



In [31]:
import xgboost as xgb

model_xgb = xgb.XGBClassifier(random_state=42, n_estimators = 200)
model_xgb.fit(X_train_scaled, y_smenn)

y_pred = model_xgb.predict(X_test_scaled)
print_scores(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.91      0.83      0.87    159566
           1       0.56      0.71      0.63     47528

    accuracy                           0.81    207094
   macro avg       0.73      0.77      0.75    207094
weighted avg       0.83      0.81      0.81    207094



In [32]:
import xgboost as xgb

model_xgb = xgb.XGBClassifier(random_state=42, n_estimators = 200)
model_xgb.fit(X_train_scaled[top_features], y_smenn)

y_pred = model_xgb.predict(X_test_scaled[top_features])
print_scores(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.89      0.81      0.85    159566
           1       0.51      0.68      0.58     47528

    accuracy                           0.78    207094
   macro avg       0.70      0.74      0.71    207094
weighted avg       0.80      0.78      0.79    207094



=> Không chọn đặc trưng, scale hay không cũng như nhau.

In [33]:
import xgboost as xgb

model_xgb = xgb.XGBClassifier(random_state=42, n_estimators = 200, min_child_weight=5)
model_xgb.fit(X_train_scaled, y_smenn)

y_pred = model_xgb.predict(X_test_scaled)
print_scores(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.90      0.83      0.86    159566
           1       0.55      0.70      0.62     47528

    accuracy                           0.80    207094
   macro avg       0.73      0.77      0.74    207094
weighted avg       0.82      0.80      0.81    207094



In [34]:
import xgboost as xgb

model_xgb = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    # scale_pos_weight=11.32,
    random_state=42
)
model_xgb.fit(X_train_scaled, y_smenn)

y_pred = model_xgb.predict(X_test_scaled)
print_scores( y_test, y_pred)

y_probs = model_xgb.predict_proba(X_test_scaled)[:,1]

best_thresh = 0.0
best_f1 = 0.0
for t in np.arange(0.1, 0.9, 0.05):
    y_pred = (y_probs >= t).astype(int)
    f1 = f1_score(y_test, y_pred)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

print(f"Best threshold: {best_thresh}, F1: {best_f1}")

              precision    recall  f1-score   support

           0       0.95      0.76      0.84    159566
           1       0.52      0.86      0.64     47528

    accuracy                           0.78    207094
   macro avg       0.73      0.81      0.74    207094
weighted avg       0.85      0.78      0.80    207094

Best threshold: 0.7000000000000002, F1: 0.6607958855197676


In [35]:
import optuna
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

THRESHOLD = best_thresh
# SCALE_POS_WEIGHT = 11.32

def objective(trial):
    # Thử các tham số
    param = {
        'n_estimators': 200,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        # 'scale_pos_weight': SCALE_POS_WEIGHT,
        'random_state': 42,
        'verbosity': 0,
        'n_jobs': -1,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }

    model = xgb.XGBClassifier(**param)
    model.fit(X_train_scaled, y_smenn)

    # Dự đoán xác suất
    y_probs = model.predict_proba(X_test_scaled)[:, 1]
    y_pred = (y_probs >= THRESHOLD).astype(int)

    # Trả về F1-score của lớp mưa (label=1)
    return f1_score(y_test, y_pred, pos_label=1)


# Tối ưu với Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)  # Có thể tăng trials lên 100+

# In kết quả tốt nhất
print("Best trial:")
print(study.best_trial)
print("Best hyperparameters:")
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")

# Huấn luyện lại với tham số tốt nhất
best_params = study.best_trial.params
best_params.update({
    'n_estimators': 200,
    # 'scale_pos_weight': SCALE_POS_WEIGHT,
    'random_state': 42,
    'use_label_encoder': False,
    'eval_metric': 'logloss'
})

best_model = xgb.XGBClassifier(**best_params)
best_model.fit(X_train_scaled, y_smenn)

# Dự đoán với threshold 
y_probs = best_model.predict_proba(X_test_scaled)[:, 1]
y_pred_thresh = (y_probs >= THRESHOLD).astype(int)

# In kết quả
from sklearn.metrics import classification_report
print(f"\nClassification report with threshold = {THRESHOLD}:")
print(classification_report(y_test, y_pred_thresh))

[I 2025-05-04 04:53:52,771] A new study created in memory with name: no-name-1fb200fe-0772-4d23-983b-24ac96e5e6c6
[I 2025-05-04 04:54:01,222] Trial 0 finished with value: 0.6451217670062603 and parameters: {'learning_rate': 0.10173756277259545, 'max_depth': 4, 'min_child_weight': 5, 'subsample': 0.7505247483175883, 'colsample_bytree': 0.5442605960408098, 'gamma': 3.2374930930885117}. Best is trial 0 with value: 0.6451217670062603.
[I 2025-05-04 04:54:13,928] Trial 1 finished with value: 0.5697583966321954 and parameters: {'learning_rate': 0.2667013479035525, 'max_depth': 8, 'min_child_weight': 4, 'subsample': 0.8689759940338972, 'colsample_bytree': 0.9112882239957628, 'gamma': 1.9757545464427357}. Best is trial 0 with value: 0.6451217670062603.
[I 2025-05-04 04:54:26,688] Trial 2 finished with value: 0.6068104331820368 and parameters: {'learning_rate': 0.26630603080984805, 'max_depth': 9, 'min_child_weight': 7, 'subsample': 0.7482758440863413, 'colsample_bytree': 0.7278139326859422, 'g

Best trial:
FrozenTrial(number=22, state=1, values=[0.6635276106383736], datetime_start=datetime.datetime(2025, 5, 4, 4, 58, 6, 294344), datetime_complete=datetime.datetime(2025, 5, 4, 4, 58, 14, 571990), params={'learning_rate': 0.15831976529718217, 'max_depth': 4, 'min_child_weight': 8, 'subsample': 0.5781021981205159, 'colsample_bytree': 0.776009200044843, 'gamma': 0.3723572466435703}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.3, log=False, low=0.01, step=None), 'max_depth': IntDistribution(high=10, log=False, low=3, step=1), 'min_child_weight': IntDistribution(high=10, log=False, low=1, step=1), 'subsample': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'gamma': FloatDistribution(high=5.0, log=False, low=0.0, step=None)}, trial_id=22, value=None)
Best hyperparameters:
learning_rate: 0.15831976529718217
max_depth: 4
min_

In [36]:
import xgboost as xgb

model_xgb = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    # scale_pos_weight=11.32,
    random_state=42
)
model_xgb.fit(X_train_scaled[top_features], y_smenn)


y_pred = model_xgb.predict(X_test_scaled[top_features])
print_scores(y_test, y_pred)

y_probs = model_xgb.predict_proba(X_test_scaled[top_features])[:,1]

best_thresh = 0.0
best_f1 = 0.0
for t in np.arange(0.1, 0.9, 0.05):
    y_pred = (y_probs >= t).astype(int)
    f1 = f1_score(y_test, y_pred)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

print(f"Best threshold: {best_thresh}, F1: {best_f1}")

              precision    recall  f1-score   support

           0       0.95      0.72      0.82    159566
           1       0.48      0.87      0.62     47528

    accuracy                           0.75    207094
   macro avg       0.71      0.79      0.72    207094
weighted avg       0.84      0.75      0.77    207094

Best threshold: 0.7500000000000002, F1: 0.6505599104143338


In [37]:
import optuna
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

THRESHOLD = best_thresh
# SCALE_POS_WEIGHT = 11.32

def objective(trial):
    # Thử các tham số
    param = {
        'n_estimators': 200,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        # 'scale_pos_weight': SCALE_POS_WEIGHT,
        'random_state': 42,
        'verbosity': 0,
        'n_jobs': -1,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }

    model = xgb.XGBClassifier(**param)
    model.fit(X_train_scaled[top_features], y_smenn)

    # Dự đoán xác suất
    y_probs = model.predict_proba(X_test_scaled[top_features])[:, 1]
    y_pred = (y_probs >= THRESHOLD).astype(int)

    # Trả về F1-score của lớp mưa (label=1)
    return f1_score(y_test, y_pred, pos_label=1)


# Tối ưu với Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)  # Có thể tăng trials lên 100+

# In kết quả tốt nhất
print("Best trial:")
print(study.best_trial)
print("Best hyperparameters:")
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")

# Huấn luyện lại với tham số tốt nhất
best_params = study.best_trial.params
best_params.update({
    'n_estimators': 200,
    # 'scale_pos_weight': SCALE_POS_WEIGHT,
    'random_state': 42,
    'use_label_encoder': False,
    'eval_metric': 'logloss'
})

best_model = xgb.XGBClassifier(**best_params)
best_model.fit(X_train_scaled[top_features], y_smenn)

# Dự đoán với threshold
y_probs = best_model.predict_proba(X_test_scaled[top_features])[:, 1]
y_pred_thresh = (y_probs >= THRESHOLD).astype(int)

# In kết quả
from sklearn.metrics import classification_report
print(f"\nClassification report with threshold = {THRESHOLD}:")
print(classification_report(y_test, y_pred_thresh))

[I 2025-05-04 04:59:33,910] A new study created in memory with name: no-name-e3d8a4fb-d246-4867-aabf-f9274090d51f
[I 2025-05-04 04:59:44,506] Trial 0 finished with value: 0.49340167451353295 and parameters: {'learning_rate': 0.23999753530562806, 'max_depth': 8, 'min_child_weight': 3, 'subsample': 0.5069463319353236, 'colsample_bytree': 0.56205689541778, 'gamma': 1.5645382833343264}. Best is trial 0 with value: 0.49340167451353295.
[I 2025-05-04 04:59:53,814] Trial 1 finished with value: 0.5831251734665557 and parameters: {'learning_rate': 0.07911671646508799, 'max_depth': 8, 'min_child_weight': 6, 'subsample': 0.6529614109903809, 'colsample_bytree': 0.6919299226229674, 'gamma': 2.073126153197962}. Best is trial 1 with value: 0.5831251734665557.
[I 2025-05-04 05:00:00,943] Trial 2 finished with value: 0.6098351962144082 and parameters: {'learning_rate': 0.13377602262172292, 'max_depth': 6, 'min_child_weight': 1, 'subsample': 0.7242269093063951, 'colsample_bytree': 0.5578146540992005, 'g

Best trial:
FrozenTrial(number=19, state=1, values=[0.6492858923674182], datetime_start=datetime.datetime(2025, 5, 4, 5, 1, 52, 651932), datetime_complete=datetime.datetime(2025, 5, 4, 5, 1, 59, 52578), params={'learning_rate': 0.08663175933599757, 'max_depth': 5, 'min_child_weight': 7, 'subsample': 0.6817556451733409, 'colsample_bytree': 0.8567862350060883, 'gamma': 3.8828372418848125}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.3, log=False, low=0.01, step=None), 'max_depth': IntDistribution(high=10, log=False, low=3, step=1), 'min_child_weight': IntDistribution(high=10, log=False, low=1, step=1), 'subsample': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'gamma': FloatDistribution(high=5.0, log=False, low=0.0, step=None)}, trial_id=19, value=None)
Best hyperparameters:
learning_rate: 0.08663175933599757
max_depth: 5
min_c

# 7. Kết luận

Mô hình XGBoost ban đầu (không SMOTE hay SMOTE ENN) là tốt nhất.