In [2]:
# ========================= Imports =========================
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_selection import mutual_info_classif
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# ========================= Load Dataset =========================
df = pd.read_csv(r"C:\Users\NEVIN\PycharmProjects\DataPreprocessing\Group_ID\data\raw\creditcard.csv")
print("\n=== Dataset Loaded ===")

# ========================= Handle Missing Values =========================
if df.isnull().sum().sum() > 0:
    for col in df.columns:
        if df[col].dtype in ['float64', 'int64']:
            df[col] = df[col].fillna(df[col].median())
        else:
            df[col] = df[col].fillna(df[col].mode()[0])

# ========================= Encode Categorical Variables =========================
categorical_cols = df.select_dtypes(include=['object']).columns
if len(categorical_cols) > 0:
    le = LabelEncoder()
    for col in categorical_cols:
        df[col] = le.fit_transform(df[col])

# ========================= Outlier Handling =========================
num_cols = df.select_dtypes(include=['float64', 'int64']).columns.drop('Class', errors='ignore')
for col in num_cols:
    lower = df[col].quantile(0.01)
    upper = df[col].quantile(0.99)
    df[col] = np.clip(df[col], lower, upper)

# ========================= Feature Engineering =========================
if 'Amount' in df.columns and 'Time' in df.columns:
    df['Amount_per_Time'] = df['Amount'] / (df['Time'] + 1)

# ========================= Feature Selection (Mutual Information) =========================
X_temp = df.drop('Class', axis=1)
y_temp = df['Class']
mi_scores = mutual_info_classif(X_temp, y_temp, random_state=42)
mi_series = pd.Series(mi_scores, index=X_temp.columns).sort_values(ascending=False)
selected_features = mi_series[mi_series > 0].index.tolist()

X = df[selected_features]
y = df['Class']

# ========================= Train/Test Split + Scaling =========================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ========================= Compute scale_pos_weight =========================
neg, pos = np.bincount(y_train)
scale_pos_weight = neg / pos

# ========================= FAST SUBSET FOR HYPERPARAMETER TUNING =========================
sample_size = int(0.1 * len(X_train_scaled))
X_tune = X_train_scaled[:sample_size]
y_tune = y_train[:sample_size]

# ========================= Hyperparameter Tuning =========================
xgb = XGBClassifier(
    random_state=42,
    scale_pos_weight=scale_pos_weight,
    eval_metric='logloss',
    n_jobs=-1
)

param_dist = {
    'n_estimators': [100, 150, 200],
    'max_depth': [3, 5, 6],
    'learning_rate': [0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2]
}

xgb_random = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=5,               # ultra-fast
    scoring='roc_auc',
    cv=2,                   # 2-fold CV
    random_state=42,
    n_jobs=-1,
    verbose=1
)

print("\n=== Running FAST Hyperparameter Tuning (on 10% sample) ===")
xgb_random.fit(X_tune, y_tune)

print("\n=== Best Parameters Found ===")
print(xgb_random.best_params_)

# ========================= Retrain Final Model on Full Training Data =========================
best_xgb = xgb_random.best_estimator_
best_xgb.fit(X_train_scaled, y_train)

# ========================= Predictions & Evaluation =========================
y_pred = best_xgb.predict(X_test_scaled)
y_pred_prob = best_xgb.predict_proba(X_test_scaled)[:, 1]

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred, digits=4))

print("\n=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f"\nROC-AUC Score: {roc_auc:.4f}")

# ========================= Feature Importance =========================
feature_importance = pd.Series(best_xgb.feature_importances_, index=selected_features).sort_values(ascending=False)
print("\n=== XGBoost Feature Importance ===")
print(feature_importance)



=== Dataset Loaded ===

=== Running FAST Hyperparameter Tuning (on 10% sample) ===
Fitting 2 folds for each of 5 candidates, totalling 10 fits

=== Best Parameters Found ===
{'subsample': 0.8, 'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.6}

=== Classification Report ===
              precision    recall  f1-score   support

           0     0.9997    0.9994    0.9996     56864
           1     0.7094    0.8469    0.7721        98

    accuracy                         0.9991     56962
   macro avg     0.8546    0.9232    0.8858     56962
weighted avg     0.9992    0.9991    0.9992     56962


=== Confusion Matrix ===
[[56830    34]
 [   15    83]]

ROC-AUC Score: 0.9796

=== XGBoost Feature Importance ===
V14                0.327924
V4                 0.150161
V10                0.070812
Amount             0.032151
V11                0.030215
V17                0.029575
V8                 0.029288
V20                0.026409
V12      