In [9]:
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import shap



In [2]:
# Show all rows when printing
pd.set_option("display.max_rows", None)

In [10]:
# Load data
df = pd.read_csv("../data/processed/train_data_new_v8.csv")
X = df.drop(columns=['high_booking_rate'])
y = df['high_booking_rate']



In [11]:
# Train-test split (same as your code)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)



In [None]:
# XGBoost model
model = XGBClassifier(
        use_label_encoder=False,
        eval_metric='auc',
        random_state=42,
        colsample_bytree=0.8,
        learning_rate=0.01,
        max_depth=9,
        n_estimators=3500,
        subsample=0.8,
        scale_pos_weight=1.5,
        gamma=0,
        min_child_weight=1
    )
model.fit(X_train, y_train)

# Evaluate
y_pred_proba = model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"AUC: {auc_score:.4f}")

# ---------------- SHAP ----------------
# Use TreeExplainer for XGBoost
explainer = shap.Explainer(model, X_train)
shap_values = explainer(X_test)

# Summary Plot (top 20 features)
shap.summary_plot(shap_values, X_test, max_display=20)

# Optional: Dependency Plot for top feature
top_feature = X_test.columns[shap_values.abs.values.mean(0).argmax()]
shap.dependence_plot(top_feature, shap_values.values, X_test)


# # Show all feature importances
# feature_importance = pd.Series(model.feature_importances_, index=X.columns)
# feature_importance = feature_importance.sort_values(ascending=False)

# print("\nFull Feature Importances (sorted):\n")
# print(feature_importance)

# # Optional: plot (keep it if you want a visual glance at the spread)
# plt.figure(figsize=(10, max(6, len(feature_importance) // 3)))
# feature_importance.plot(kind='barh')
# plt.title("All Feature Importances (XGBoost)")
# plt.tight_layout()
# plt.show()

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


AUC: 0.9127


  9%|==                  | 2148/23017 [09:45<94:43]        

In [6]:
# Load data
df = pd.read_csv("../data/processed/train_data_new_v7.csv")
X = df.drop(columns=['high_booking_rate'])
y = df['high_booking_rate']



In [7]:
# Train-test split (same as your code)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)


In [None]:
# XGBoost model
model = XGBClassifier(
        use_label_encoder=False,
        eval_metric='auc',
        random_state=42,
        colsample_bytree=0.8,
        learning_rate=0.01,
        max_depth=9,
        n_estimators=3500,
        subsample=0.8,
        scale_pos_weight=1.5,
        gamma=0,
        min_child_weight=1
    )
model.fit(X_train, y_train)

# Evaluate
y_pred_proba = model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"AUC: {auc_score:.4f}")


# ---------------- SHAP ----------------
# Use TreeExplainer for XGBoost
explainer = shap.Explainer(model, X_train)
shap_values = explainer(X_test)

# Summary Plot (top 20 features)
shap.summary_plot(shap_values, X_test, max_display=20)

# Optional: Dependency Plot for top feature
top_feature = X_test.columns[shap_values.abs.values.mean(0).argmax()]
shap.dependence_plot(top_feature, shap_values.values, X_test)

# # Show all feature importances
# feature_importance = pd.Series(model.feature_importances_, index=X.columns)
# feature_importance = feature_importance.sort_values(ascending=False)

# print("\nFull Feature Importances (sorted):\n")
# print(feature_importance)

# # Optional: plot (keep it if you want a visual glance at the spread)
# plt.figure(figsize=(10, max(6, len(feature_importance) // 3)))
# feature_importance.plot(kind='barh')
# plt.title("All Feature Importances (XGBoost)")
# plt.tight_layout()
# plt.show()