In [None]:
# Split the data into train, valid, and test sets
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
import plotly.express as px

In [None]:
# === Load Data ===
import pandas as pd

# Features and labels are stored in two separate CSV files; both are read as DataFrames.
# NOTE(review): presumably the two files are row-aligned -- verify against the data source.
FEATURES_CSV, LABELS_CSV = "X_transaction_data.csv", "y_transaction_labels.csv"
X = pd.read_csv(FEATURES_CSV)
y = pd.read_csv(LABELS_CSV)

In [None]:
# Split into train (80%), validation (10%), test (10%)
# Stage 1 holds out 20% of the rows; stage 2 halves that hold-out into validation and test.
SPLIT_SEED = 42
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=SPLIT_SEED
)

In [None]:
# === EDA ===
# Report the dimensions of every split in one call (one line per split).
print("Shapes:")
print(
    f"X_train: {X_train.shape}, y_train: {y_train.shape}",
    f"X_valid: {X_valid.shape}, y_valid: {y_valid.shape}",
    f"X_test: {X_test.shape}, y_test: {y_test.shape}",
    sep="\n",
)

# Show the five training columns with the most missing entries.
print("\nMissing Values:")
missing_per_column = X_train.isnull().sum()
print(missing_per_column.sort_values(ascending=False).head())

In [None]:
# Pairwise correlations between the numeric training features, drawn as a heatmap.
numeric_train = X_train.select_dtypes(include='number')
corr_matrix = numeric_train.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=False, cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()

In [None]:
# NOTE(review): 'mode.use_inf_as_na' is deprecated in recent pandas releases; it is kept here
# because it is a session-wide option that later cells may rely on -- confirm before removing.
pd.set_option('mode.use_inf_as_na', True)

# Convert the categorical features to integer codes for the pairplot.
# This encoding is for visualization only; X_train itself is left untouched.
EDA_CATEGORICAL_COLS = [
    'type',
    'sender_location',
    'receiver_location',
    'time_of_day',
    'device_type',
    'category',
]
X_eda = X_train.copy()
for col in EDA_CATEGORICAL_COLS:
    X_eda[col] = X_eda[col].astype('category').cat.codes

# Combine features with the target for pairplot.
# Use the training labels (not the full label frame) so the EDA only touches training rows.
X_eda['fraudulent'] = y_train

# Pairplot for numerical relationships
sns.pairplot(X_eda, hue='fraudulent', diag_kind='kde', plot_kws={'alpha': 0.5})
# plt.title() would only label the last subplot of the grid; suptitle labels the whole figure.
plt.suptitle("Pairplot for Transaction Data (Colored by Fraudulent Transactions)", y=1.02)
plt.show()

In [None]:
# Combine train + valid
# Features: numeric columns only, stacked row-wise (training rows first, then validation rows).
X_trainval = np.vstack([X_train.select_dtypes(include='number'), X_valid.select_dtypes(include='number')])
# Labels: flatten each split to 1-D before joining. np.vstack on the label frames produced an
# (n, 1) column vector, whereas scikit-learn estimators and splitters expect y of shape (n,).
y_trainval = np.concatenate([np.ravel(y_train), np.ravel(y_valid)])

In [None]:
# === Define TNR Scorer ===
def tnr_score(y_true, y_pred):
    """Return the true negative rate, TN / (TN + FP), for binary 0/1 labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 = negative, 1 = positive).
    y_pred : array-like
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    float
        Share of actual negatives that were predicted negative. Evaluates to 0 when
        ``y_true`` contains no negatives (the epsilon keeps the division defined).
    """
    # Pin the label order so the matrix is always 2x2; without it, inputs that contain a
    # single class give a 1x1 matrix and the four-way unpack below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return tn / (tn + fp + 1e-10)

# === Hyperopt Objective Function ===
def objective_rf(params):
    """Hyperopt objective: cross-validated TNR of a random forest built from ``params``.

    ``params`` must provide 'n_estimators', 'max_depth', 'min_samples_split' and
    'min_samples_leaf'; each is cast to int before being handed to the estimator.
    Reads the notebook-level ``X_trainval`` / ``y_trainval`` arrays.

    Returns a dict with 'loss' (1 - TNR), 'status' and the raw 'tnr'.

    NOTE(review): TNR alone can be maximised by predicting every sample as negative --
    confirm this is the intended optimisation target.
    """
    int_hyperparams = {
        key: int(params[key])
        for key in ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')
    }
    forest = RandomForestClassifier(
        class_weight='balanced', random_state=42, n_jobs=-1, **int_hyperparams
    )
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    # Out-of-fold hard predictions for every row of the train+validation set.
    oof_pred = cross_val_predict(forest, X_trainval, y_trainval, cv=folds, method='predict')
    tnr = tnr_score(y_trainval, oof_pred)
    return {'loss': 1 - tnr, 'status': STATUS_OK, 'tnr': tnr}

In [None]:
# === Hyperopt Search Space ===
# Each entry is (label, low, high, step) for one hp.quniform dimension.
RF_QUNIFORM_BOUNDS = (
    ('n_estimators', 50, 300, 10),
    ('max_depth', 3, 20, 1),
    ('min_samples_split', 2, 10, 1),
    ('min_samples_leaf', 1, 5, 1),
)
space_rf = {
    label: hp.quniform(label, low, high, step)
    for label, low, high, step in RF_QUNIFORM_BOUNDS
}

In [None]:
# === Run Hyperopt ===
# 50 TPE evaluations of objective_rf; every trial is recorded in trials_rf.
trials_rf = Trials()
search_rng = np.random.default_rng(42)
best_rf = fmin(
    objective_rf,
    space_rf,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials_rf,
    rstate=search_rng,
)
print("\nBest RF Hyperparameters:", best_rf, sep="\n")

In [None]:
# === Parallel Coordinates Plot ===
# One record per hyperopt trial: the sampled hyperparameters (as ints) plus the achieved TNR,
# recovered from the stored loss (loss = 1 - TNR).
RF_PARAM_NAMES = ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf']
results = [
    {
        **{name: int(trial['misc']['vals'][name][0]) for name in RF_PARAM_NAMES},
        'TNR': 1 - trial['result']['loss'],
    }
    for trial in trials_rf.trials
]
df_rf = pd.DataFrame(results)

# Compatibility shim: expose DataFrame.items under the old `iteritems` name.
# NOTE(review): presumably needed because the installed plotly still calls iteritems -- confirm.
pd.DataFrame.iteritems = pd.DataFrame.items

axis_labels = {name: name for name in RF_PARAM_NAMES + ["TNR"]}
fig = px.parallel_coordinates(
    df_rf,
    color="TNR",
    labels=axis_labels,
    color_continuous_scale=px.colors.sequential.Viridis,
)
fig.update_layout(title="Random Forest Hyperparameter Tuning (TNR Optimized)")
fig.show()

In [None]:
# === Final Evaluation ===
# fmin reports the quniform values as floats; cast them back to the ints the estimator expects.
best_rf_clean = {
    name: int(best_rf[name])
    for name in ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')
}

final_rf = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
    **best_rf_clean,
)
final_rf.fit(X_trainval, y_trainval)

# The model was fitted on numeric columns only, so score the same subset of the test frame.
X_test_numeric = X_test.select_dtypes(include='number')
y_pred_rf = final_rf.predict(X_test_numeric)

cm = confusion_matrix(y_test, y_pred_rf)
tn, fp, fn, tp = cm.ravel()
tnr = tn / (tn + fp + 1e-10)

print("\nConfusion Matrix:", cm, sep="\n")
print(f"\nTrue Negative Rate (TNR): {tnr:.4f}")
print("\nClassification Report:", classification_report(y_test, y_pred_rf), sep="\n")

In [None]:
# === Feature Importance ===
# Pair each numeric feature name with the fitted forest's importance and rank them.
feature_names = X_train.select_dtypes(include='number').columns
importances = final_rf.feature_importances_
feat_df = (
    pd.DataFrame({"Feature": feature_names, "Importance": importances})
    .sort_values(by="Importance", ascending=False)
)

top_features = feat_df.head(15)
plt.figure(figsize=(10, 6))
sns.barplot(data=top_features, x="Importance", y="Feature")
plt.title("Top 15 Feature Importances (RF)")
plt.show()

In [None]:
# === SHAP Plot ===
import shap

# Explain the numeric test features; X_trainval is handed to the explainer as its data argument.
X_test_numeric = X_test.select_dtypes(include='number')
explainer = shap.Explainer(final_rf, X_trainval, feature_names=feature_names)
shap_values = explainer(X_test_numeric)

# SHAP summary plot: first the bar variant, then the default variant.
for extra_kwargs in ({"plot_type": "bar"}, {}):
    shap.summary_plot(
        shap_values,
        features=X_test_numeric,
        feature_names=feature_names,
        **extra_kwargs,
    )

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score

In [None]:
# === Predict Probabilities ===
# Keep only column 1 of predict_proba (presumably the fraud class -- confirm via final_rf.classes_).
class_probabilities = final_rf.predict_proba(X_test.select_dtypes(include='number'))
y_proba_rf = class_probabilities[:, 1]

In [None]:
# === ROC Curve and AUC ===
fpr, tpr, roc_thresholds = roc_curve(y_test, y_proba_rf)
roc_auc = roc_auc_score(y_test, y_proba_rf)

roc_fig, roc_ax = plt.subplots(figsize=(6, 5))
roc_ax.plot(fpr, tpr, label=f"ROC AUC = {roc_auc:.3f}")
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
roc_ax.plot([0, 1], [0, 1], linestyle="--", color="gray")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve")
roc_ax.legend()
roc_ax.grid()
plt.show()

In [None]:
# === Precision-Recall Curve and PR AUC ===
# The "PR AUC" shown in the legend is the average precision score.
precision, recall, pr_thresholds = precision_recall_curve(y_test, y_proba_rf)
pr_auc = average_precision_score(y_test, y_proba_rf)

pr_fig, pr_ax = plt.subplots(figsize=(6, 5))
pr_ax.plot(recall, precision, label=f"PR AUC = {pr_auc:.3f}", color="green")
pr_ax.set_xlabel("Recall")
pr_ax.set_ylabel("Precision")
pr_ax.set_title("Precision-Recall Curve")
pr_ax.legend()
pr_ax.grid()
plt.show()

In [None]:
# === TNR vs. Threshold Plot ===
# Sweep 200 evenly spaced cut-offs over [0, 1] and record the TNR obtained at each one.
thresholds = np.linspace(0.0, 1.0, 200)

tnr_list = []
for cutoff in thresholds:
    hard_preds = (y_proba_rf >= cutoff).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, hard_preds).ravel()
    tnr_list.append(tn / (tn + fp + 1e-10))

tnr_fig, tnr_ax = plt.subplots(figsize=(7, 5))
tnr_ax.plot(thresholds, tnr_list, color='purple')
tnr_ax.set_xlabel("Threshold")
tnr_ax.set_ylabel("True Negative Rate (TNR)")
tnr_ax.set_title("TNR vs. Classification Threshold")
tnr_ax.grid()
plt.show()

In [None]:
# === FNR vs. Threshold Plot ===
# Sweep 200 evenly spaced cut-offs over [0, 1] and record the false negative rate at each one.
thresholds = np.linspace(0.0, 1.0, 200)
fnr_list = []

for thresh in thresholds:
    preds = (y_proba_rf >= thresh).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
    # FNR = FN / (FN + TP): the share of actual positives predicted as negative.
    # (The earlier formula tn / (tn + fp) is the TNR, so the "FNR" figure repeated the TNR curve.)
    fnr = fn / (fn + tp + 1e-10)
    fnr_list.append(fnr)

plt.figure(figsize=(7, 5))
plt.plot(thresholds, fnr_list, color='red')
plt.xlabel("Threshold")
plt.ylabel("False Negative Rate (FNR)")
plt.title("FNR vs. Classification Threshold")
plt.grid()
plt.show()

In [None]:
# === Imports ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, precision_recall_curve, average_precision_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

# === Load Data ===
# NOTE(review): the per-split CSV loads are disabled, so this cell works on whatever
# X_train / X_valid / X_test and y_* objects already exist in the kernel.
#X_train = pd.read_csv("X_train_transaction_data.csv")
#y_train = pd.read_csv("y_train_transaction_labels.csv")["fraudulent"]
#X_valid = pd.read_csv("X_valid_transaction_data.csv")
#y_valid = pd.read_csv("y_valid_transaction_labels.csv")["fraudulent"]
#X_test = pd.read_csv("X_test_transaction_data.csv")
#y_test = pd.read_csv("y_test_transaction_labels.csv")["fraudulent"]

# === EDA ===
print("Shapes:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")

#sns.countplot(x=y_train)
#plt.title("Class Distribution in y_train")
#plt.show()

# Integer-encode text columns on a copy so the EDA plots can include them.
X_corr = X_train.copy()
for col in X_corr.select_dtypes(include='object'):
    X_corr[col] = X_corr[col].astype("category").cat.codes

plt.figure(figsize=(10, 8))
sns.heatmap(X_corr.corr(), cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

X_pair = X_corr.copy()
X_pair["fraudulent"] = y_train
# Cap the sample at the number of available rows (DataFrame.sample raises when asked for more
# rows than exist) and seed it so the figure is the same on every run.
n_pair = min(500, len(X_pair))
sns.pairplot(X_pair.sample(n_pair, random_state=42), hue="fraudulent", diag_kind="kde")
plt.suptitle(f"Pairplot ({n_pair} samples)", y=1.02)
plt.show()

# === Preprocessing ===
cat_cols = X_train.select_dtypes(include="object").columns
for col in cat_cols:
    # Learn the category -> code mapping from the training split only and reuse it for all
    # three splits. Encoding each split independently can give the same string different
    # codes per split; values unseen in training are encoded as -1.
    train_categories = X_train[col].astype("category").cat.categories
    for df in [X_train, X_valid, X_test]:
        df[col] = pd.Categorical(df[col], categories=train_categories).codes

imputer = SimpleImputer(strategy="median")
scaler = StandardScaler()

# Fit the imputer and scaler on the training split only, then apply them to valid/test.
X_train_imp = imputer.fit_transform(X_train)
X_valid_imp = imputer.transform(X_valid)
X_test_imp = imputer.transform(X_test)

X_train_scaled = scaler.fit_transform(X_train_imp)
X_valid_scaled = scaler.transform(X_valid_imp)
X_test_scaled = scaler.transform(X_test_imp)

X_trainval = np.vstack([X_train_scaled, X_valid_scaled])
# Flatten the labels to 1-D before joining: np.vstack would give an (n, 1) column vector for
# label frames, and a 2-row matrix (or a shape error) for 1-D label Series.
y_trainval = np.concatenate([np.ravel(y_train), np.ravel(y_valid)])

# === Define TNR Scorer ===
def tnr_score(y_true, y_pred):
    """Return the true negative rate, TN / (TN + FP), for binary 0/1 labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 = negative, 1 = positive).
    y_pred : array-like
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    float
        Share of actual negatives that were predicted negative. Evaluates to 0 when
        ``y_true`` contains no negatives (the epsilon keeps the division defined).
    """
    # Pin the label order so the matrix is always 2x2; without it, inputs that contain a
    # single class give a 1x1 matrix and the four-way unpack below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return tn / (tn + fp + 1e-10)

# === Hyperopt for Random Forest ===
def objective_rf(params):
    """Hyperopt objective: 1 - cross-validated TNR of a random forest built from ``params``.

    ``params`` must provide 'n_estimators', 'max_depth', 'min_samples_split' and
    'min_samples_leaf'; each is cast to int before being handed to the estimator.
    Reads the notebook-level ``X_trainval`` / ``y_trainval`` arrays.

    NOTE(review): TNR alone can be maximised by predicting every sample as negative --
    confirm this is the intended optimisation target.
    """
    int_hyperparams = {
        key: int(params[key])
        for key in ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')
    }
    forest = RandomForestClassifier(
        class_weight='balanced', random_state=42, n_jobs=-1, **int_hyperparams
    )
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    # Out-of-fold predictions for every row of the train+validation set.
    oof_pred = cross_val_predict(forest, X_trainval, y_trainval, cv=folds)
    loss = 1 - tnr_score(y_trainval, oof_pred)
    return {'loss': loss, 'status': STATUS_OK}

# Search space for the random forest. Every dimension is an hp.quniform(label, low, high, q);
# the sampled values are cast with int() before they reach the estimator.
space_rf = {
    'n_estimators': hp.quniform('n_estimators', 50, 300, 10),
    'max_depth': hp.quniform('max_depth', 3, 20, 1),
    'min_samples_split': hp.quniform('min_samples_split', 2, 10, 1),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 10, 1)
}

# Run 50 TPE evaluations of objective_rf; every trial is recorded in trials_rf.
trials_rf = Trials()
best_rf = fmin(fn=objective_rf, space=space_rf, algo=tpe.suggest, max_evals=50, trials=trials_rf, rstate=np.random.default_rng(42))

# === Train Final RF ===
# Cast the best values returned by fmin to int before building the final estimator.
best_params_rf = {
    'n_estimators': int(best_rf['n_estimators']),
    'max_depth': int(best_rf['max_depth']),
    'min_samples_split': int(best_rf['min_samples_split']),
    'min_samples_leaf': int(best_rf['min_samples_leaf'])
}

# Refit on train+validation, then produce hard labels and column-1 probabilities on the test set.
final_rf = RandomForestClassifier(**best_params_rf, class_weight='balanced', random_state=42, n_jobs=-1)
final_rf.fit(X_trainval, y_trainval)
y_pred_rf = final_rf.predict(X_test_scaled)
y_proba_rf = final_rf.predict_proba(X_test_scaled)[:, 1]

# === Evaluation ===
cm = confusion_matrix(y_test, y_pred_rf)
tn, fp, fn, tp = cm.ravel()
print("Confusion Matrix:\n", cm)
print(f"TNR: {tn / (tn + fp + 1e-10):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))

# === Feature Importance ===
# X_trainval is a plain array, so feature names are taken from X_train's columns.
feat_df = pd.DataFrame({"Feature": X_train.columns, "Importance": final_rf.feature_importances_})
feat_df = feat_df.sort_values(by="Importance", ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(data=feat_df.head(15), x="Importance", y="Feature")
plt.title("Top 15 Feature Importances (RF)")
plt.show()

# === SHAP ===
# X_trainval is passed as the explainer's second argument (presumably background data -- TODO confirm).
explainer = shap.Explainer(final_rf, X_trainval)
shap_values = explainer(X_test_scaled)
shap.summary_plot(shap_values, features=X_test_scaled, feature_names=X_train.columns, plot_type="bar")
shap.summary_plot(shap_values, features=X_test_scaled, feature_names=X_train.columns)

# === Threshold Sensitivity ===
# ROC curve with its AUC in the legend.
fpr, tpr, _ = roc_curve(y_test, y_proba_rf)
roc_auc = roc_auc_score(y_test, y_proba_rf)
plt.plot(fpr, tpr, label=f"ROC AUC = {roc_auc:.3f}")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve")
plt.legend()
plt.grid()
plt.show()

# Precision-recall curve; the "PR AUC" in the legend is the average precision score.
precision, recall, _ = precision_recall_curve(y_test, y_proba_rf)
pr_auc = average_precision_score(y_test, y_proba_rf)
plt.plot(recall, precision, label=f"PR AUC = {pr_auc:.3f}", color='green')
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend()
plt.grid()
plt.show()

# Sweep 200 evenly spaced cut-offs over [0, 1] and record TNR and FNR at each one.
thresholds = np.linspace(0.0, 1.0, 200)
tnr_list = []
fnr_list = []
for thresh in thresholds:
    # A sample is labelled positive when its probability is at or above the cut-off.
    preds = (y_proba_rf >= thresh).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
    # The 1e-10 term keeps both ratios defined when a denominator would be zero.
    tnr_list.append(tn / (tn + fp + 1e-10))
    fnr_list.append(fn / (fn + tp + 1e-10))

plt.plot(thresholds, tnr_list, color='purple')
plt.xlabel("Threshold")
plt.ylabel("TNR")
plt.title("TNR vs. Threshold")
plt.grid()
plt.show()

plt.plot(thresholds, fnr_list, color='red')
plt.xlabel("Threshold")
plt.ylabel("FNR")
plt.title("FNR vs. Threshold")
plt.grid()
plt.show()

In [None]:
# === Imports ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import lightgbm as lgb
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, precision_recall_curve, average_precision_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
import warnings
warnings.filterwarnings("ignore")

# === Load Data ===
# Pre-split feature files plus the "fraudulent" column of each matching label file.
X_train = pd.read_csv("X_train_transaction_data.csv")
y_train = pd.read_csv("y_train_transaction_labels.csv")["fraudulent"]
X_valid = pd.read_csv("X_valid_transaction_data.csv")
y_valid = pd.read_csv("y_valid_transaction_labels.csv")["fraudulent"]
X_test = pd.read_csv("X_test_transaction_data.csv")
y_test = pd.read_csv("y_test_transaction_labels.csv")["fraudulent"]

# === EDA ===
print("Shapes:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")

sns.countplot(x=y_train)
plt.title("Class Distribution in y_train")
plt.show()

# Integer-encode text columns on a copy so the EDA plots can include them.
X_corr = X_train.copy()
for col in X_corr.select_dtypes(include='object'):
    X_corr[col] = X_corr[col].astype("category").cat.codes

plt.figure(figsize=(10, 8))
sns.heatmap(X_corr.corr(), cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

X_pair = X_corr.copy()
X_pair["fraudulent"] = y_train
# Cap the sample at the number of available rows (DataFrame.sample raises when asked for more
# rows than exist) and seed it so the figure is the same on every run.
n_pair = min(500, len(X_pair))
sns.pairplot(X_pair.sample(n_pair, random_state=42), hue="fraudulent", diag_kind="kde")
plt.suptitle(f"Pairplot ({n_pair} samples)", y=1.02)
plt.show()

# === Preprocessing ===
cat_cols = X_train.select_dtypes(include="object").columns
for col in cat_cols:
    # Learn the category -> code mapping from the training split only and reuse it for all
    # three splits. Encoding each split independently can give the same string different
    # codes per split; values unseen in training are encoded as -1.
    train_categories = X_train[col].astype("category").cat.categories
    for df in [X_train, X_valid, X_test]:
        df[col] = pd.Categorical(df[col], categories=train_categories).codes

imputer = SimpleImputer(strategy="median")
scaler = StandardScaler()

# Fit the imputer and scaler on the training split only, then apply them to valid/test.
X_train_imp = imputer.fit_transform(X_train)
X_valid_imp = imputer.transform(X_valid)
X_test_imp = imputer.transform(X_test)

X_train_scaled = scaler.fit_transform(X_train_imp)
X_valid_scaled = scaler.transform(X_valid_imp)
X_test_scaled = scaler.transform(X_test_imp)

# Train and validation rows are pooled for cross-validated tuning and the final fit.
X_trainval = np.vstack([X_train_scaled, X_valid_scaled])
y_trainval = np.hstack([y_train, y_valid])

# === Define TNR Scorer ===
def tnr_score(y_true, y_pred):
    """Return the true negative rate, TN / (TN + FP), for binary 0/1 labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 = negative, 1 = positive).
    y_pred : array-like
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    float
        Share of actual negatives that were predicted negative. Evaluates to 0 when
        ``y_true`` contains no negatives (the epsilon keeps the division defined).
    """
    # Pin the label order so the matrix is always 2x2; without it, inputs that contain a
    # single class give a 1x1 matrix and the four-way unpack below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return tn / (tn + fp + 1e-10)

# === Hyperopt for LightGBM ===
def objective_lgb(params):
    """Hyperopt objective: 1 - cross-validated TNR of a LightGBM classifier built from ``params``.

    'n_estimators' and 'max_depth' are cast to int; 'learning_rate', 'subsample',
    'colsample_bytree' and 'scale_pos_weight' are passed through unchanged.
    Reads the notebook-level ``X_trainval`` / ``y_trainval`` arrays.

    NOTE(review): TNR alone can be maximised by predicting every sample as negative --
    confirm this is the intended optimisation target.
    """
    hyperparams = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        **{
            key: params[key]
            for key in ('learning_rate', 'subsample', 'colsample_bytree', 'scale_pos_weight')
        },
    }
    booster = lgb.LGBMClassifier(**hyperparams, random_state=42, n_jobs=-1)
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    # Out-of-fold predictions for every row of the train+validation set.
    oof_pred = cross_val_predict(booster, X_trainval, y_trainval, cv=folds)
    loss = 1 - tnr_score(y_trainval, oof_pred)
    return {'loss': loss, 'status': STATUS_OK}

# Search space for LightGBM: quantised ranges for the tree count and depth, a log-uniform
# learning rate, and uniform ranges for the sampling ratios and the positive-class weight.
space_lgb = {
    'n_estimators': hp.quniform('n_estimators', 50, 300, 10),
    'max_depth': hp.quniform('max_depth', 3, 15, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10)
}

# Run 50 TPE evaluations of objective_lgb; every trial is recorded in trials_lgb.
trials_lgb = Trials()
best_lgb = fmin(fn=objective_lgb, space=space_lgb, algo=tpe.suggest, max_evals=50, trials=trials_lgb, rstate=np.random.default_rng(42))

# === Train Final LightGBM ===
# Only the two quantised parameters are cast to int; the others are used as returned by fmin.
best_params_lgb = {
    'n_estimators': int(best_lgb['n_estimators']),
    'max_depth': int(best_lgb['max_depth']),
    'learning_rate': best_lgb['learning_rate'],
    'subsample': best_lgb['subsample'],
    'colsample_bytree': best_lgb['colsample_bytree'],
    'scale_pos_weight': best_lgb['scale_pos_weight']
}

# Refit on train+validation, then produce hard labels and column-1 probabilities on the test set.
final_lgb = lgb.LGBMClassifier(**best_params_lgb, random_state=42, n_jobs=-1)
final_lgb.fit(X_trainval, y_trainval)
y_pred_lgb = final_lgb.predict(X_test_scaled)
y_proba_lgb = final_lgb.predict_proba(X_test_scaled)[:, 1]

# === Evaluation ===
cm = confusion_matrix(y_test, y_pred_lgb)
tn, fp, fn, tp = cm.ravel()
print("Confusion Matrix:\n", cm)
print(f"TNR: {tn / (tn + fp + 1e-10):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_lgb))

# === Feature Importance ===
# X_trainval is a plain array, so feature names are taken from X_train's columns.
feat_df = pd.DataFrame({"Feature": X_train.columns, "Importance": final_lgb.feature_importances_})
feat_df = feat_df.sort_values(by="Importance", ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(data=feat_df.head(15), x="Importance", y="Feature")
plt.title("Top 15 Feature Importances (LightGBM)")
plt.show()

# === SHAP ===
# X_trainval is passed as the explainer's second argument (presumably background data -- TODO confirm).
explainer = shap.Explainer(final_lgb, X_trainval)
shap_values = explainer(X_test_scaled)
shap.summary_plot(shap_values, features=X_test_scaled, feature_names=X_train.columns, plot_type="bar")
shap.summary_plot(shap_values, features=X_test_scaled, feature_names=X_train.columns)

# === Threshold Sensitivity ===
# ROC curve with its AUC in the legend.
fpr, tpr, _ = roc_curve(y_test, y_proba_lgb)
roc_auc = roc_auc_score(y_test, y_proba_lgb)
plt.plot(fpr, tpr, label=f"ROC AUC = {roc_auc:.3f}")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve")
plt.legend()
plt.grid()
plt.show()

# Precision-recall curve; the "PR AUC" in the legend is the average precision score.
precision, recall, _ = precision_recall_curve(y_test, y_proba_lgb)
pr_auc = average_precision_score(y_test, y_proba_lgb)
plt.plot(recall, precision, label=f"PR AUC = {pr_auc:.3f}", color='green')
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend()
plt.grid()
plt.show()

# Sweep 200 evenly spaced cut-offs over [0, 1] and record TNR and FNR at each one.
thresholds = np.linspace(0.0, 1.0, 200)
tnr_list = []
fnr_list = []
for thresh in thresholds:
    # A sample is labelled positive when its probability is at or above the cut-off.
    preds = (y_proba_lgb >= thresh).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
    # The 1e-10 term keeps both ratios defined when a denominator would be zero.
    tnr_list.append(tn / (tn + fp + 1e-10))
    fnr_list.append(fn / (fn + tp + 1e-10))

plt.plot(thresholds, tnr_list, color='purple')
plt.xlabel("Threshold")
plt.ylabel("TNR")
plt.title("TNR vs. Threshold")
plt.grid()
plt.show()

plt.plot(thresholds, fnr_list, color='red')
plt.xlabel("Threshold")
plt.ylabel("FNR")
plt.title("FNR vs. Threshold")
plt.grid()
plt.show()

In [None]:
# === Imports ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, precision_recall_curve, average_precision_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
import warnings
warnings.filterwarnings("ignore")

# === Load Data ===
# Pre-split feature files plus the "fraudulent" column of each matching label file.
X_train = pd.read_csv("X_train_transaction_data.csv")
y_train = pd.read_csv("y_train_transaction_labels.csv")["fraudulent"]
X_valid = pd.read_csv("X_valid_transaction_data.csv")
y_valid = pd.read_csv("y_valid_transaction_labels.csv")["fraudulent"]
X_test = pd.read_csv("X_test_transaction_data.csv")
y_test = pd.read_csv("y_test_transaction_labels.csv")["fraudulent"]

# === EDA ===
print("Shapes:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")

sns.countplot(x=y_train)
plt.title("Class Distribution in y_train")
plt.show()

# Integer-encode text columns on a copy so the EDA plots can include them.
X_corr = X_train.copy()
for col in X_corr.select_dtypes(include='object'):
    X_corr[col] = X_corr[col].astype("category").cat.codes

plt.figure(figsize=(10, 8))
sns.heatmap(X_corr.corr(), cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

X_pair = X_corr.copy()
X_pair["fraudulent"] = y_train
# Cap the sample at the number of available rows (DataFrame.sample raises when asked for more
# rows than exist) and seed it so the figure is the same on every run.
n_pair = min(500, len(X_pair))
sns.pairplot(X_pair.sample(n_pair, random_state=42), hue="fraudulent", diag_kind="kde")
plt.suptitle(f"Pairplot ({n_pair} samples)", y=1.02)
plt.show()

# === Preprocessing ===
cat_cols = X_train.select_dtypes(include="object").columns
for col in cat_cols:
    # Learn the category -> code mapping from the training split only and reuse it for all
    # three splits. Encoding each split independently can give the same string different
    # codes per split; values unseen in training are encoded as -1.
    train_categories = X_train[col].astype("category").cat.categories
    for df in [X_train, X_valid, X_test]:
        df[col] = pd.Categorical(df[col], categories=train_categories).codes

imputer = SimpleImputer(strategy="median")
scaler = StandardScaler()

# Fit the imputer and scaler on the training split only, then apply them to valid/test.
X_train_imp = imputer.fit_transform(X_train)
X_valid_imp = imputer.transform(X_valid)
X_test_imp = imputer.transform(X_test)

X_train_scaled = scaler.fit_transform(X_train_imp)
X_valid_scaled = scaler.transform(X_valid_imp)
X_test_scaled = scaler.transform(X_test_imp)

# Train and validation rows are pooled for cross-validated tuning and the final fit.
X_trainval = np.vstack([X_train_scaled, X_valid_scaled])
y_trainval = np.hstack([y_train, y_valid])

# === Define TNR Scorer ===
def tnr_score(y_true, y_pred):
    """Return the true negative rate, TN / (TN + FP), for binary 0/1 labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 = negative, 1 = positive).
    y_pred : array-like
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    float
        Share of actual negatives that were predicted negative. Evaluates to 0 when
        ``y_true`` contains no negatives (the epsilon keeps the division defined).
    """
    # Pin the label order so the matrix is always 2x2; without it, inputs that contain a
    # single class give a 1x1 matrix and the four-way unpack below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return tn / (tn + fp + 1e-10)

# === Hyperopt for XGBoost ===
def objective_xgb(params):
    """Hyperopt objective: 1 - cross-validated TNR of an XGBoost classifier built from ``params``.

    'n_estimators' and 'max_depth' are cast to int; 'learning_rate', 'subsample',
    'colsample_bytree' and 'scale_pos_weight' are passed through unchanged.
    Reads the notebook-level ``X_trainval`` / ``y_trainval`` arrays.

    NOTE(review): TNR alone can be maximised by predicting every sample as negative --
    confirm this is the intended optimisation target.
    """
    tuned = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        **{
            key: params[key]
            for key in ('learning_rate', 'subsample', 'colsample_bytree', 'scale_pos_weight')
        },
    }
    # Settings that stay the same for every trial.
    fixed = dict(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1)
    booster = XGBClassifier(**tuned, **fixed)
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    # Out-of-fold predictions for every row of the train+validation set.
    oof_pred = cross_val_predict(booster, X_trainval, y_trainval, cv=folds)
    loss = 1 - tnr_score(y_trainval, oof_pred)
    return {'loss': loss, 'status': STATUS_OK}

# Search space for XGBoost: quantised ranges for the tree count and depth, a log-uniform
# learning rate, and uniform ranges for the sampling ratios and the positive-class weight.
space_xgb = {
    'n_estimators': hp.quniform('n_estimators', 50, 300, 10),
    'max_depth': hp.quniform('max_depth', 3, 15, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10)
}

# Run 50 TPE evaluations of objective_xgb; every trial is recorded in trials_xgb.
trials_xgb = Trials()
best_xgb = fmin(fn=objective_xgb, space=space_xgb, algo=tpe.suggest, max_evals=50, trials=trials_xgb, rstate=np.random.default_rng(42))

# === Train Final XGBoost ===
# Only the two quantised parameters are cast to int; the others are used as returned by fmin.
best_params_xgb = {
    'n_estimators': int(best_xgb['n_estimators']),
    'max_depth': int(best_xgb['max_depth']),
    'learning_rate': best_xgb['learning_rate'],
    'subsample': best_xgb['subsample'],
    'colsample_bytree': best_xgb['colsample_bytree'],
    'scale_pos_weight': best_xgb['scale_pos_weight']
}

# Refit on train+validation, then produce hard labels and column-1 probabilities on the test set.
final_xgb = XGBClassifier(**best_params_xgb, use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1)
final_xgb.fit(X_trainval, y_trainval)
y_pred_xgb = final_xgb.predict(X_test_scaled)
y_proba_xgb = final_xgb.predict_proba(X_test_scaled)[:, 1]

# === Evaluation ===
cm = confusion_matrix(y_test, y_pred_xgb)
tn, fp, fn, tp = cm.ravel()
print("Confusion Matrix:\n", cm)
print(f"TNR: {tn / (tn + fp + 1e-10):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_xgb))

# === Feature Importance ===
# X_trainval is a plain array, so feature names are taken from X_train's columns.
feat_df = pd.DataFrame({"Feature": X_train.columns, "Importance": final_xgb.feature_importances_})
feat_df = feat_df.sort_values(by="Importance", ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(data=feat_df.head(15), x="Importance", y="Feature")
plt.title("Top 15 Feature Importances (XGBoost)")
plt.show()

# === SHAP ===
# Unlike the RF and LightGBM cells, no data argument is passed to the explainer here.
explainer = shap.Explainer(final_xgb)
shap_values = explainer(X_test_scaled)
shap.summary_plot(shap_values, features=X_test_scaled, feature_names=X_train.columns, plot_type="bar")
shap.summary_plot(shap_values, features=X_test_scaled, feature_names=X_train.columns)

# === Threshold Sensitivity ===
# ROC curve with its AUC in the legend.
fpr, tpr, _ = roc_curve(y_test, y_proba_xgb)
roc_auc = roc_auc_score(y_test, y_proba_xgb)
plt.plot(fpr, tpr, label=f"ROC AUC = {roc_auc:.3f}")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve")
plt.legend()
plt.grid()
plt.show()

# Precision-recall curve; the "PR AUC" in the legend is the average precision score.
precision, recall, _ = precision_recall_curve(y_test, y_proba_xgb)
pr_auc = average_precision_score(y_test, y_proba_xgb)
plt.plot(recall, precision, label=f"PR AUC = {pr_auc:.3f}", color='green')
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend()
plt.grid()
plt.show()

# Sweep 200 evenly spaced cut-offs over [0, 1] and record TNR and FNR at each one.
thresholds = np.linspace(0.0, 1.0, 200)
tnr_list = []
fnr_list = []
for thresh in thresholds:
    # A sample is labelled positive when its probability is at or above the cut-off.
    preds = (y_proba_xgb >= thresh).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
    # The 1e-10 term keeps both ratios defined when a denominator would be zero.
    tnr_list.append(tn / (tn + fp + 1e-10))
    fnr_list.append(fn / (fn + tp + 1e-10))

plt.plot(thresholds, tnr_list, color='purple')
plt.xlabel("Threshold")
plt.ylabel("TNR")
plt.title("TNR vs. Threshold")
plt.grid()
plt.show()

plt.plot(thresholds, fnr_list, color='red')
plt.xlabel("Threshold")
plt.ylabel("FNR")
plt.title("FNR vs. Threshold")
plt.grid()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Example metrics per model — replace with your actual calculated values
metrics = {
    'RF':    {'Precision': 0.88, 'Recall': 0.70, 'Accuracy': 0.91, 'TNR': 0.97, 'FNR': 0.30, 'TPR': 0.70, 'FPR': 0.03, 'ROC AUC': 0.93, 'PR AUC': 0.78},
    'XGB':   {'Precision': 0.86, 'Recall': 0.72, 'Accuracy': 0.90, 'TNR': 0.96, 'FNR': 0.28, 'TPR': 0.72, 'FPR': 0.04, 'ROC AUC': 0.92, 'PR AUC': 0.80},
    'LGBM':  {'Precision': 0.89, 'Recall': 0.75, 'Accuracy': 0.92, 'TNR': 0.98, 'FNR': 0.25, 'TPR': 0.75, 'FPR': 0.02, 'ROC AUC': 0.94, 'PR AUC': 0.83},
}

# Metric names come from the RF entry; every model is expected to expose the same keys.
labels = list(metrics['RF'])
models = list(metrics)

# One spoke per metric; the first angle is appended again so each outline closes on itself.
spoke_angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
angles = spoke_angles + spoke_angles[:1]

fig, ax = plt.subplots(figsize=(8, 6), subplot_kw={"polar": True})
for model in models:
    scores = metrics[model]
    values = [scores[m] for m in labels]
    values = values + values[:1]  # close the circle
    ax.plot(angles, values, label=model)
    ax.fill(angles, values, alpha=0.1)

ax.set_thetagrids(np.degrees(spoke_angles), labels)
ax.set_title("Model Comparison Radar Chart", size=14)
ax.grid(True)
plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
plt.show()


In [None]:
# === Imports ===
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, precision_recall_curve, average_precision_score, roc_curve, confusion_matrix

# === Placeholder Predictions (load your actual predictions here) ===
# Replace with real probability outputs from your models on the same test set
#y_test = pd.read_csv("y_test_transaction_labels.csv")["fraudulent"]

# Example placeholders (replace with your predictions)
#y_proba_rf = pd.read_csv("rf_probs.csv")["prob"]
#y_proba_xgb = pd.read_csv("xgb_probs.csv")["prob"]
#y_proba_lgb = pd.read_csv("lgb_probs.csv")["prob"]

# === Compute Metrics ===
def compute_metrics(y_true, y_proba):
    """Return ``(roc_auc, pr_auc, tnr)`` for one model's positive-class probabilities.

    ROC AUC and average precision are computed from the raw probabilities. The TNR is
    computed from hard labels obtained with a strict ``> 0.5`` cut-off, so a probability
    of exactly 0.5 counts as negative. ``y_proba`` must support elementwise comparison
    and ``.astype`` (e.g. a NumPy array or pandas Series).
    """
    roc_auc_value = roc_auc_score(y_true, y_proba)
    avg_precision = average_precision_score(y_true, y_proba)
    hard_preds = (y_proba > 0.5).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, hard_preds).ravel()
    specificity = tn / (tn + fp + 1e-10)
    return roc_auc_value, avg_precision, specificity

# Positive-class probabilities of the three tuned models, keyed by display name.
# NOTE(review): the Random Forest cell has its CSV loads commented out, so y_proba_rf may have
# been scored on a different test split than the one y_test refers to here -- confirm that all
# three probability vectors are aligned with y_test before trusting this comparison.
models = {
    "Random Forest": y_proba_rf,
    "XGBoost": y_proba_xgb,
    "LightGBM": y_proba_lgb
}

results = {}
for name, y_proba in models.items():
    auc, pr_auc, tnr = compute_metrics(y_test, y_proba)
    results[name] = {"ROC AUC": auc, "PR AUC": pr_auc, "TNR": tnr}

# === Display Results ===
# One row per model, best TNR first.
results_df = pd.DataFrame(results).T
print("\nModel Comparison:\n")
print(results_df.sort_values("TNR", ascending=False))

# === Plot Comparison ===
plt.figure(figsize=(10, 6))
for name, y_proba in models.items():
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    plt.plot(fpr, tpr, label=f"{name}")
plt.title("ROC Curves")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
plt.grid()
plt.show()

plt.figure(figsize=(10, 6))
for name, y_proba in models.items():
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    plt.plot(recall, precision, label=f"{name}")
plt.title("Precision-Recall Curves")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend()
plt.grid()
plt.show()

# === Optional: Ensemble ===
# Unweighted average of the three models' probabilities.
ensemble_proba = (y_proba_rf + y_proba_xgb + y_proba_lgb) / 3
ensemble_metrics = compute_metrics(y_test, ensemble_proba)
ens_auc, ens_pr_auc, ens_tnr = ensemble_metrics
print("\nEnsemble Performance:")
# The header used to be printed with nothing after it; report the computed metrics.
print(f"ROC AUC: {ens_auc:.4f} | PR AUC: {ens_pr_auc:.4f} | TNR: {ens_tnr:.4f}")


In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix, roc_auc_score, average_precision_score

# --- Function to calculate metrics ---
def calculate_metrics(y_true, y_pred, y_proba):
    """Build the metric dictionary used by the radar chart for one model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_pred : array-like
        Hard predicted labels.
    y_proba : array-like
        Positive-class scores, used for the two AUC-style metrics.

    Returns
    -------
    dict
        Keys, in order: 'Precision', 'Recall', 'Accuracy', 'TNR', 'FNR', 'TPR', 'FPR',
        'ROC AUC', 'PR AUC'. 'TPR' is the same value as 'Recall'; 'PR AUC' is the
        average precision score.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    report = {
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'Accuracy': accuracy_score(y_true, y_pred),
        # The 1e-10 term keeps each ratio defined when its denominator would be zero.
        'TNR': tn / (tn + fp + 1e-10),
        'FNR': fn / (fn + tp + 1e-10),
    }
    report['TPR'] = report['Recall']  # True Positive Rate is equivalent to Recall
    report['FPR'] = fp / (fp + tn + 1e-10)

    # ROC AUC and PR AUC scores
    report['ROC AUC'] = roc_auc_score(y_true, y_proba)
    report['PR AUC'] = average_precision_score(y_true, y_proba)
    return report

# --- Calculate metrics for each model ---
# Score the tuned models fitted earlier in this notebook (final_rf, final_xgb, final_lgb).
# The template names rf_model / xgb_model / lgbm_model were never defined, so this cell
# raised NameError on a fresh run.
# NOTE(review): X_test_scaled is whatever the most recently executed model cell produced --
# confirm it matches the preprocessing each model was trained with (final_rf in particular).
y_pred_rf = final_rf.predict(X_test_scaled)
y_proba_rf = final_rf.predict_proba(X_test_scaled)[:, 1]

# Repeat for other models (XGBoost, LGBM):
y_pred_xgb = final_xgb.predict(X_test_scaled)
y_proba_xgb = final_xgb.predict_proba(X_test_scaled)[:, 1]

y_pred_lgbm = final_lgb.predict(X_test_scaled)
y_proba_lgbm = final_lgb.predict_proba(X_test_scaled)[:, 1]

# --- Store metrics in a dictionary ---
# One calculate_metrics() result per model, keyed by the short name shown in the radar legend.
metrics = {
    'RF': calculate_metrics(y_test, y_pred_rf, y_proba_rf),
    'XGB': calculate_metrics(y_test, y_pred_xgb, y_proba_xgb),
    'LGBM': calculate_metrics(y_test, y_pred_lgbm, y_proba_lgbm),
}

# --- Radar Plot ---
# Metric names come from the RF entry; every model is expected to expose the same keys.
labels = list(metrics['RF'])
models = list(metrics)

# One spoke per metric; the first angle is appended again so each outline closes on itself.
spoke_angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
angles = spoke_angles + spoke_angles[:1]

fig, ax = plt.subplots(figsize=(8, 6), subplot_kw={"polar": True})
for model in models:
    scores = metrics[model]
    values = [scores[m] for m in labels]
    values = values + values[:1]  # close the circle
    ax.plot(angles, values, label=model)
    ax.fill(angles, values, alpha=0.1)

ax.set_thetagrids(np.degrees(spoke_angles), labels)
ax.set_title("Model Comparison Radar Chart", size=14)
ax.grid(True)
plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
plt.show()

In [None]:
# Function to calculate TNR and FNR at different thresholds
def calculate_tnr_fnr(y_true, y_proba, thresholds):
    """Return ``(tnr_list, fnr_list)``, one entry per value in ``thresholds``.

    For each threshold a sample is labelled positive when ``y_proba >= threshold``.
    TNR is TN / (TN + FP) and FNR is FN / (FN + TP); a 1e-10 term keeps each ratio
    defined when its denominator would be zero. ``y_proba`` must support elementwise
    comparison and ``.astype`` (e.g. a NumPy array or pandas Series).
    """
    rate_pairs = []
    for cutoff in thresholds:
        hard_labels = (y_proba >= cutoff).astype(int)
        tn, fp, fn, tp = confusion_matrix(y_true, hard_labels).ravel()
        rate_pairs.append((
            tn / (tn + fp + 1e-10),  # True Negative Rate
            fn / (fn + tp + 1e-10),  # False Negative Rate
        ))

    tnr_list = [pair[0] for pair in rate_pairs]
    fnr_list = [pair[1] for pair in rate_pairs]
    return tnr_list, fnr_list

# === Calculate TNR and FNR for different thresholds for each model ===
# 101 evenly spaced cut-offs over [0, 1] (a step of 0.01).
thresholds = np.linspace(0, 1, 101)

tnr_rf, fnr_rf = calculate_tnr_fnr(y_test, y_proba_rf, thresholds)
tnr_xgb, fnr_xgb = calculate_tnr_fnr(y_test, y_proba_xgb, thresholds)
tnr_lgbm, fnr_lgbm = calculate_tnr_fnr(y_test, y_proba_lgbm, thresholds)

# === Plot TNR and FNR vs Threshold for Each Model ===
plt.figure(figsize=(8, 6))

# One colour per model: solid line for its TNR curve, dashed line for its FNR curve.
# Rows are (TNR label, FNR label, colour, TNR values, FNR values).
model_curves = (
    ('RF TNR', 'RF FNR', 'blue', tnr_rf, fnr_rf),        # Random Forest
    ('XGB TNR', 'XGB FNR', 'green', tnr_xgb, fnr_xgb),   # XGBoost
    ('LGBM TNR', 'LGBM FNR', 'red', tnr_lgbm, fnr_lgbm), # LightGBM
)
for tnr_label, fnr_label, colour, tnr_curve, fnr_curve in model_curves:
    plt.plot(thresholds, tnr_curve, label=tnr_label, color=colour)
    plt.plot(thresholds, fnr_curve, label=fnr_label, color=colour, linestyle='--')

plt.xlabel('Threshold')
plt.ylabel('Rate')
plt.title('TNR and FNR vs. Classification Threshold')
plt.legend()
plt.grid(True)
plt.show()