In [3]:
# ========================= Imports =========================
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# ========================= Load Dataset =========================
df = pd.read_csv(r"C:\Users\NEVIN\PycharmProjects\DataPreprocessing\Group_ID\data\raw\creditcard.csv")
print("\n=== Dataset Loaded ===")

# ========================= Handle Missing Values =========================
if df.isnull().sum().sum() > 0:
    for col in df.columns:
        if df[col].dtype in ['float64', 'int64']:
            df[col] = df[col].fillna(df[col].median())
        else:
            df[col] = df[col].fillna(df[col].mode()[0])

# ========================= Encode Categorical Variables =========================
categorical_cols = df.select_dtypes(include=['object']).columns
if len(categorical_cols) > 0:
    le = LabelEncoder()
    for col in categorical_cols:
        df[col] = le.fit_transform(df[col])

# ========================= Outlier Handling =========================
num_cols = df.select_dtypes(include=['float64', 'int64']).columns.drop('Class', errors='ignore')
for col in num_cols:
    lower = df[col].quantile(0.01)
    upper = df[col].quantile(0.99)
    df[col] = np.clip(df[col], lower, upper)

# ========================= Feature Engineering =========================
if 'Amount' in df.columns and 'Time' in df.columns:
    df['Amount_per_Time'] = df['Amount'] / (df['Time'] + 1)

# ========================= Feature Selection (Mutual Information) =========================
X_temp = df.drop('Class', axis=1)
y_temp = df['Class']
mi_scores = mutual_info_classif(X_temp, y_temp, random_state=42)
mi_series = pd.Series(mi_scores, index=X_temp.columns).sort_values(ascending=False)
selected_features = mi_series[mi_series > 0].index.tolist()

X = df[selected_features]
y = df['Class']

# ========================= Train/Test Split + Scaling =========================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ========================= Apply SMOTE on Training Set =========================
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_scaled, y_train)

# ========================= FAST SUBSET FOR TUNING =========================
# Use only 10% of training data to find best params
sample_size = int(0.1 * len(X_train_res))
X_tune = X_train_res[:sample_size]
y_tune = y_train_res[:sample_size]

# ========================= Random Forest Hyperparameter Tuning =========================
rf = RandomForestClassifier(random_state=42, class_weight='balanced', n_jobs=-1)

param_dist = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt']
}

rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=5,               # Try only 5 random combos
    scoring='roc_auc',
    cv=2,                   # 2-fold CV (faster)
    random_state=42,
    n_jobs=-1,
    verbose=1
)

print("\n=== Running FAST Hyperparameter Tuning (on 10% sample) ===")
rf_random.fit(X_tune, y_tune)

print("\n=== Best Parameters Found ===")
print(rf_random.best_params_)

# Retrain best model on full resampled dataset
best_rf = rf_random.best_estimator_
best_rf.fit(X_train_res, y_train_res)

# ========================= Model Evaluation =========================
y_pred = best_rf.predict(X_test_scaled)
y_pred_prob = best_rf.predict_proba(X_test_scaled)[:, 1]

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred, digits=4))

print("\n=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f"\nROC-AUC Score: {roc_auc:.4f}")

# ========================= Feature Importance =========================
feature_importance = pd.Series(best_rf.feature_importances_, index=selected_features).sort_values(ascending=False)
print("\n=== Random Forest Feature Importance ===")
print(feature_importance)



=== Dataset Loaded ===

=== Running FAST Hyperparameter Tuning (on 10% sample) ===
Fitting 2 folds for each of 5 candidates, totalling 10 fits

=== Best Parameters Found ===
{'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20}

=== Classification Report ===
              precision    recall  f1-score   support

           0     0.9997    0.9996    0.9996     56864
           1     0.7843    0.8163    0.8000        98

    accuracy                         0.9993     56962
   macro avg     0.8920    0.9080    0.8998     56962
weighted avg     0.9993    0.9993    0.9993     56962


=== Confusion Matrix ===
[[56842    22]
 [   18    80]]

ROC-AUC Score: 0.9724

=== Random Forest Feature Importance ===
V14                0.194232
V10                0.117677
V12                0.103545
V4                 0.095357
V17                0.071955
V11                0.071320
V3                 0.052374
V16                0.044462
V7         