In [24]:
# Imports
import os
import numpy as np
import pandas as pd
import xgboost as xgb  # Import XGBoost
import shap  # Import the SHAP library
import matplotlib.pyplot as plt

# Modeling
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Metrics
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Reproducibility seeds
# This seeds NumPy's legacy global random number generator only. The
# train/test split, the CV folds and the XGBoost model further down each
# receive their own explicit random_state argument.
np.random.seed(87)


In [25]:
# Load the raw data and print a quick overview of it
df = pd.read_csv('bc_data.csv')

print(df.shape)                        # (n_rows, n_columns)
print(df.describe(include='all'))      # summary stats for every column
print(df['Outcome'].value_counts())    # number of rows per target class

Unnamed: 0.1,Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,EXT_SOURCE_AVG,AGE,REGISTRATION_YEARS,ID_PUBLISH_YEARS,DAYS_EMPLOYED_ANOMALY
0,0,0,Cash loans,M,Y,N,1,225000.0,918468.0,28966.5,...,0.0,0.0,0.0,0.0,1.0,0.668497,38,1,13,False
1,1,0,Cash loans,M,Y,Y,0,157500.0,299772.0,20160.0,...,0.0,0.0,0.0,0.0,1.0,0.512601,23,9,3,False
2,2,0,Cash loans,M,N,N,0,108000.0,509602.5,26149.5,...,0.0,0.0,0.0,1.0,0.0,0.14924,35,17,10,False
3,3,0,Cash loans,F,N,Y,0,90000.0,225000.0,11074.5,...,0.0,0.0,0.0,0.0,0.0,0.556444,52,6,7,False
4,4,0,Cash loans,F,N,Y,0,112500.0,512064.0,25033.5,...,0.0,0.0,0.0,2.0,5.0,0.581288,30,21,7,False


In [26]:
# Build the feature matrix: drop the target column, then turn every
# categorical variable into dummy (0/1) columns. drop_first=True omits one
# level per variable so the dummy columns are not perfectly collinear.
X = pd.get_dummies(df.drop(columns=["Outcome"]), drop_first=True)

# Target variable
Y = df["Outcome"]

# Hold out 30% of the rows as a test set.
# stratify=Y keeps the class proportions of Y the same in the train and test
# partitions, consistent with the StratifiedKFold cross-validation used for
# hyperparameter tuning.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=0.3,     # 30% test, 70% train
    random_state=99,   # reproducibility
    shuffle=True,      # shuffle before splitting (required when stratifying)
    stratify=Y         # preserve the class balance in both partitions
)

# Check shapes
X_train.shape, X_test.shape

In [None]:

# 1) Cross-validation scheme: 5 shuffled folds that preserve the class ratio
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)


# 2) Pipeline with the XGBoost classifier as its only step.
#    The class is reached through the `xgb` module alias because the notebook
#    only does `import xgboost as xgb`; a bare `XGBClassifier` raises NameError.
xgb_pipe = Pipeline([
    ("model", xgb.XGBClassifier(
        objective="binary:logistic",
        n_jobs=-1,          # use all cores
        eval_metric="auc",  # pairs well with scoring="roc_auc"
        # Fast CPU histogram algorithm. For a GPU, XGBoost >= 2.0 keeps
        # tree_method="hist" and adds device="cuda"; older releases used
        # tree_method="gpu_hist".
        tree_method="hist",
        random_state=99
    ))
])

# 3) Hyperparameter grid (reasonable, compact search).
#    Keys use the "<step name>__<parameter>" form so GridSearchCV can route
#    each value to the "model" step of the pipeline.
param_grid = {
    "model__n_estimators": [400, 800],
    "model__max_depth":    [4, 6, 8],
    "model__learning_rate":[0.05, 0.1],
    "model__subsample":    [0.8, 1.0],
    "model__colsample_bytree": [0.8, 1.0],
    "model__min_child_weight": [1, 3]
}

# 4) Cross-validated grid search scored by ROC-AUC.
#    refit=True retrains the best parameter combination on all of the data
#    passed to fit(), which is what best_estimator_ exposes afterwards.
# NOTE(review): n_jobs=-1 is set on both the model and the search; if this
# oversubscribes the CPU, consider n_jobs=1 on one of the two -- TODO confirm.
xgb_cv = GridSearchCV(
    estimator=xgb_pipe,
    param_grid=param_grid,
    cv=cv,
    scoring="roc_auc",
    n_jobs=-1,
    refit=True
)

# 5) Fit the search on the training data (no per-sample weights are passed)
xgb_cv.fit(X_train, Y_train)

print("Positive class used by AUC:", xgb_cv.best_estimator_.named_steps["model"].classes_[1])
print("Best params selected by CV:", xgb_cv.best_params_)
print("Best CV ROC-AUC:", round(xgb_cv.best_score_, 3))

In [None]:
# Get best fitted model from GridSearchCV (the "model" step of the best pipeline)
best_xgb = xgb_cv.best_estimator_.named_steps["model"]

# Get feature importances (aligned with columns of X_train)
importances = best_xgb.feature_importances_
features = X_train.columns

# Put in DataFrame for clarity, most important feature first
feat_imp = pd.DataFrame({
    "feature": features,
    "importance": importances
}).sort_values(by="importance", ascending=False)

print(feat_imp.head(15))  # top 15 features

# Plot the top features.
# The Axes is created first and handed to pandas through ax=...; without it
# DataFrame.plot opens a figure of its own, which leaves the figsize figure
# empty and the requested size unused.
fig, ax = plt.subplots(figsize=(8, 6))
feat_imp.head(15).plot(kind="barh", x="feature", y="importance", legend=False, ax=ax)
ax.set_title("Top 15 XGBoost Feature Importances")
ax.invert_yaxis()  # put the largest importance at the top of the chart
plt.show()



In [None]:
# SHAP analysis. The `shap` package is imported in the notebook's first cell;
# install it beforehand (e.g. `pip install shap`) if it is missing.

# TreeExplainer is the SHAP explainer for tree-based models
# (e.g., XGBoost, LightGBM, RandomForest).
explainer = shap.TreeExplainer(best_xgb)

# SHAP values for the training set.
# Each value shows how much a feature pushes a prediction toward
# class 1 (positive) or class 0 (negative).
shap_values = explainer.shap_values(X_train)

# Overall beeswarm plot: shows both which features matter most and the
# direction of their influence.
shap.summary_plot(
    shap_values,
    X_train,
    max_display=15,
    show=True,
)

In [None]:
# Manual dependence plot for one feature:
# raw feature value on the x-axis, its SHAP contribution on the y-axis.
bare_nuclei_values = X_train["Bare_Nuclei"]
bare_nuclei_col = X_train.columns.get_loc("Bare_Nuclei")
plt.scatter(bare_nuclei_values, shap_values[:, bare_nuclei_col])
plt.xlabel("Bare_Nuclei")
plt.ylabel("SHAP value")
plt.title("Dependence plot (Bare_Nuclei)")


# The same feature through SHAP's own dependence plot, with the points
# colored by a second feature.
shap.dependence_plot(
    "Bare_Nuclei",
    shap_values,
    X_train,
    interaction_index="Cell_Size_Uniformity",
    show=True,
)

In [None]:
# Global importance of each feature: the mean absolute SHAP value of its column
mean_abs = np.mean(np.abs(shap_values), axis=0)

# Order the feature indices from most to least important and keep the first 3
# (widen the slice to inspect more features)
importance_order = np.argsort(mean_abs)[::-1]
top_idx = importance_order[:3]
top_feats = X_train.columns[top_idx]


# One dependence plot per top feature:
# each shows how the feature's value relates to its SHAP impact
for feature_name in top_feats:
    shap.dependence_plot(feature_name, shap_values, X_train, show=True)

In [None]:
# Prediction setup: look up the order in which the fitted classifier lists
# the labels of the response variable.
fitted_clf = xgb_cv.best_estimator_.named_steps["model"]
classes = fitted_clf.classes_
print("Class order:", classes)

In [None]:

# Position of the class being predicted within `classes`
# (replace 1 with the label you are trying to predict)
pos_idx = list(classes).index(1)

# Predicted probabilities for the test rows; keep only the column at pos_idx
test_probabilities = xgb_cv.predict_proba(X_test)
proba = test_probabilities[:, pos_idx]


In [None]:
# ROC curve + AUC on the test set (tell roc_curve which label is positive)
pos_label = 1
fpr, tpr, _ = roc_curve(Y_test, proba, pos_label=pos_label)

# Binarize the target against pos_label so the AUC is computed for the same
# positive class that roc_curve was given.
roc_auc = roc_auc_score((Y_test == pos_label).astype(int), proba)
print(f"ROC AUC (test): {roc_auc:.3f}")

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.plot([0, 1], [0, 1], '--')  # diagonal reference line
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
# The probabilities come from the tuned XGBoost search (xgb_cv), not from a
# random forest, so the title names the model that was actually scored.
ax.set_title("ROC Curve (XGBoost)")
plt.show()