**CREDIT CARD FRAUD**

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_curve
from xgboost import XGBClassifier

# 🚀 Load Data
df = pd.read_csv('creditcard.csv')

# ✅ Data Preprocessing
df.drop_duplicates(inplace=True)  # Remove duplicates
df.drop(columns=['Time'], inplace=True)  # Drop 'Time' as it's not useful

# 🚀 Split Features & Target
X = df.drop(columns=['Class'])
y = df['Class']

# ✅ Train-Test Split (Stratified)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# ✅ Compute Fraud-to-Non-Fraud Ratio
fraud_ratio = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

# 🔹 Optimized XGBoost Hyperparameters (No SMOTE)
param_grid_xgb = {
    'n_estimators': [300],  # More trees for stability
    'max_depth': [4, 6, 8],  # Prevent overfitting while allowing learning
    'learning_rate': [0.01, 0.05, 0.1],  # Balanced learning rate
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.7, 0.8],
    'scale_pos_weight': [fraud_ratio * 0.5, fraud_ratio, fraud_ratio * 1.5]  # Tuning weight for imbalance
}

# ✅ Initialize XGBoost Model
xgb = XGBClassifier(tree_method='hist', random_state=42)

# 🔍 Grid Search for Best Parameters
grid_search_xgb = RandomizedSearchCV(xgb, param_distributions=param_grid_xgb,
                                     cv=3, scoring='f1', n_iter=10, n_jobs=-1, random_state=42)
grid_search_xgb.fit(X_train, y_train)

# ✅ Best Model from Grid Search
best_xgb = grid_search_xgb.best_estimator_

# 🚀 Predict Probabilities
y_pred_proba_xgb = best_xgb.predict_proba(X_test)[:, 1]

# ✅ Adjust Decision Threshold Using Precision-Recall Curve
precisions, recalls, thresholds = precision_recall_curve(y_test, y_pred_proba_xgb)

# Find the best threshold where Precision & Recall are balanced
best_index = (precisions * recalls).argmax()
best_threshold = thresholds[best_index]

# 🚀 Fine-tune: Lower the threshold slightly to improve recall
adjusted_threshold = max(0.35, best_threshold - 0.06)  # Ensuring it doesn’t go too low
print(f"Optimal Threshold (Adjusted): {adjusted_threshold}")

# ✅ Apply New Threshold
y_pred_xgb_adj = (y_pred_proba_xgb > adjusted_threshold).astype(int)


# 📌 Model Performance
print("📌 Fine-Tuned XGBoost Performance (Without SMOTE):")
print(classification_report(y_test, y_pred_xgb_adj))
print(confusion_matrix(y_test, y_pred_xgb_adj))
print("Accuracy:", accuracy_score(y_test, y_pred_xgb_adj))


Optimal Threshold (Adjusted): 0.4350561201572418
📌 Fine-Tuned XGBoost Performance (Without SMOTE):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56651
           1       0.94      0.79      0.86        95

    accuracy                           1.00     56746
   macro avg       0.97      0.89      0.93     56746
weighted avg       1.00      1.00      1.00     56746

[[56646     5]
 [   20    75]]
Accuracy: 0.9995594403129736
