In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import shap
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    roc_curve,
    precision_recall_curve,
    classification_report
)
import json
import os

# --- Configuration -----------------------------------------------------------
# NOTE(review): these are absolute, machine-specific Windows paths; anyone else
# running the notebook has to edit them first.
MODEL_PATH = r"C:\Users\kamog\Downloads\final_model_Random Forest_20251106_223359.joblib"
TEST_DATA_PATH = r"C:\Users\kamog\Downloads\dataset\test_data2.csv"
OUTPUT_DIR = r"C:\Users\kamog\Downloads\dataset\dashboard_data"
# Name of the label column inside the test CSV.
TARGET_COL = "Fraud"

# Every artefact this cell writes is placed under OUTPUT_DIR, so create it
# up front (a no-op when it already exists).
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Loading model and test data")

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only point MODEL_PATH at an artefact you trust.
model = joblib.load(MODEL_PATH)
test_data = pd.read_csv(TEST_DATA_PATH)

# Split the frame into model inputs (every column but the label) and the label.
X_test = test_data.drop(columns=TARGET_COL)
y_test = test_data[TARGET_COL]

print("Generating predictions and calculating metrics")

# Hard class predictions, plus column 1 of predict_proba as the ranking score.
# NOTE(review): column 1 is presumably the "fraud" class -- confirm against
# model.classes_.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# The label-based scorers all share the (y_true, y_pred) call shape, so they
# are driven from a table; ROC AUC is appended last because it consumes the
# score column instead of the hard labels.
_label_scorers = (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1_Score", f1_score),
)
metrics = {name: scorer(y_test, y_pred) for name, scorer in _label_scorers}
metrics["ROC_AUC"] = roc_auc_score(y_test, y_pred_proba)

# Persist the headline metrics as JSON for the dashboard.
metrics_path = os.path.join(OUTPUT_DIR, "model_metrics.json")
with open(metrics_path, "w") as f:
    f.write(json.dumps(metrics, indent=4))

# Echo the same numbers to the cell output, four decimals each.
print("\nModel Performance:")
print("\n".join(f"{k}: {v:.4f}" for k, v in metrics.items()))

# Per-class precision / recall / F1 / support table, exported with one row per
# class or aggregate.
report = classification_report(y_test, y_pred, output_dict=True)
pd.DataFrame(report).transpose().to_csv(os.path.join(OUTPUT_DIR, "classification_report.csv"))

# Pin the label set to the classes the model was trained on. Without `labels`,
# confusion_matrix sizes itself from the values that happen to occur in
# y_test / y_pred, so a test slice containing a single class would yield a 1x1
# matrix and the hard-coded 2x2 index/columns below would raise ValueError.
# NOTE(review): the row/column captions assume model.classes_ is ordered
# [no fraud, fraud] -- confirm.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
cm_df = pd.DataFrame(
    cm,
    index=["Actual: No Fraud", "Actual: Fraud"],
    columns=["Predicted: No Fraud", "Predicted: Fraud"],
)
cm_df.to_csv(os.path.join(OUTPUT_DIR, "confusion_matrix.csv"))

# Render the same matrix as an annotated heatmap; the figure is closed after
# saving so it is not displayed inline and does not accumulate in memory.
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm_df, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Confusion Matrix")
fig.tight_layout()
fig.savefig(os.path.join(OUTPUT_DIR, "confusion_matrix.png"))
plt.close(fig)

# --- ROC curve ---------------------------------------------------------------
# Thresholds are not needed for the plot, hence the throw-away `_`.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

roc_fig, roc_ax = plt.subplots(figsize=(5, 4))
roc_ax.plot(fpr, tpr, label=f"AUC = {metrics['ROC_AUC']:.3f}")
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
roc_ax.plot([0, 1], [0, 1], linestyle="--", color="gray")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve")
roc_ax.legend()
roc_fig.tight_layout()
roc_fig.savefig(os.path.join(OUTPUT_DIR, "roc_curve.png"))
plt.close(roc_fig)

# --- Precision-recall curve ----------------------------------------------------
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)

pr_fig, pr_ax = plt.subplots(figsize=(5, 4))
pr_ax.plot(recall, precision, color="purple")
pr_ax.set_xlabel("Recall")
pr_ax.set_ylabel("Precision")
pr_ax.set_title("Precision–Recall Curve")
pr_fig.tight_layout()
pr_fig.savefig(os.path.join(OUTPUT_DIR, "precision_recall_curve.png"))
plt.close(pr_fig)

# Pair each input column with the model's importance score and rank them,
# highest first.
# NOTE(review): assumes model.feature_importances_ is aligned one-to-one with
# X_test.columns (same features, same order as at training time) -- confirm.
feature_importances = pd.DataFrame({
    "Feature": X_test.columns,
    "Importance": model.feature_importances_
}).sort_values(by="Importance", ascending=False)

feature_importances.to_csv(os.path.join(OUTPUT_DIR, "feature_importance.csv"), index=False)

# Horizontal bar chart of the 15 highest-ranked features.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    data=feature_importances.head(15),
    x="Importance",
    y="Feature",
    # Passing `palette` without `hue` is deprecated (removal announced for
    # seaborn v0.14.0); mapping the y variable to `hue` and hiding the legend
    # is the replacement seaborn's own warning recommends for the same look.
    hue="Feature",
    palette="viridis",
    legend=False,
    ax=ax,
)
ax.set_title("Top 15 Most Important Features")
fig.tight_layout()
fig.savefig(os.path.join(OUTPUT_DIR, "feature_importance.png"))
plt.close(fig)

print("Running SHAP explainability analysis...")

# Explain a fixed-seed random sample of at most 300 rows rather than the whole
# test set.
SAMPLE_SIZE = min(300, len(X_test))
X_sample = X_test.sample(SAMPLE_SIZE, random_state=42)

explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent")
shap_values = explainer.shap_values(X_sample)

# Normalise the return value to one (n_samples, n_features) matrix for class
# index 1. Older SHAP releases return a list with one array per class, newer
# ones return a single (n_samples, n_features, n_classes) array. Indexing the
# latter with [1] would silently pick *sample* 1 instead of *class* 1 and the
# summary frame below would fail on a length mismatch.
if isinstance(shap_values, list):
    shap_values_fraud = np.asarray(shap_values[1])
else:
    shap_values_fraud = np.asarray(shap_values)
    if shap_values_fraud.ndim == 3:
        shap_values_fraud = shap_values_fraud[:, :, 1]
    # A 2-D array is already a single (n_samples, n_features) matrix; use as is.

# Mean absolute SHAP value per feature, ranked highest first.
mean_abs_shap = np.abs(shap_values_fraud).mean(axis=0)
shap_summary = pd.DataFrame({
    "Feature": X_sample.columns,
    "Mean_Abs_SHAP": mean_abs_shap
}).sort_values(by="Mean_Abs_SHAP", ascending=False)

shap_summary.to_csv(os.path.join(OUTPUT_DIR, "shap_summary.csv"), index=False)

# Plots: show=False keeps each figure open so it can be saved to disk and then
# closed explicitly.
plt.figure()
shap.summary_plot(shap_values_fraud, X_sample, show=False, plot_size=(10, 6))
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "shap_summary_plot.png"))
plt.close()

plt.figure()
shap.summary_plot(shap_values_fraud, X_sample, show=False, plot_type="bar", plot_size=(10, 6))
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "shap_bar_plot.png"))
plt.close()

print(f"SHAP analysis completed on {SAMPLE_SIZE} samples.")

Loading model and test data
Generating predictions and calculating metrics

Model Performance:
Accuracy: 0.9488
Precision: 0.9253
Recall: 0.9417
F1_Score: 0.9335
ROC_AUC: 0.9916




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



Running SHAP explainability analysis...
SHAP analysis completed on 300 samples.


In [1]:
import pandas as pd
import joblib
import os

# --- Configuration -----------------------------------------------------------
# NOTE(review): this MODEL_PATH points inside ...\Downloads\dataset\, whereas
# the evaluation cell earlier in this notebook loads the same file name from
# ...\Downloads\ -- confirm which copy is the intended one.
MODEL_PATH = r"C:\Users\kamog\Downloads\dataset\final_model_Random Forest_20251106_223359.joblib"
TEST_DATA_PATH = r"C:\Users\kamog\Downloads\dataset\test_data2.csv"
OUTPUT_DIR = r"C:\Users\kamog\Downloads\dataset\dashboard_data"
TARGET_COL = "Fraud"

print("Loading model and test data...")
# NOTE(review): joblib.load unpickles the file -- only load trusted artefacts.
model = joblib.load(MODEL_PATH)
df = pd.read_csv(TEST_DATA_PATH)

# Model inputs are every column except the label.
X_test = df.drop(columns=TARGET_COL)
y_test = df[TARGET_COL]

print("Generating predictions...")
y_pred = model.predict(X_test)
# Column 1 of predict_proba is exported as the fraud probability.
y_pred_proba = model.predict_proba(X_test)[:, 1]

# One row per test record: true label, predicted label, and the class-1 score.
predictions_df = pd.DataFrame({"Actual": y_test}).assign(
    Predicted=y_pred,
    Fraud_Probability=y_pred_proba,
)

os.makedirs(OUTPUT_DIR, exist_ok=True)
predictions_path = os.path.join(OUTPUT_DIR, "predictions.csv")
predictions_df.to_csv(predictions_path, index=False)

n_rows = len(predictions_df)
print(f"predictions.csv saved in '{OUTPUT_DIR}/' — {n_rows} rows")

Loading model and test data...
Generating predictions...
predictions.csv saved in 'C:\Users\kamog\Downloads\dataset\dashboard_data/' — 55822 rows
