In [1]:
# ========================= Imports =========================
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# ========================= Load Dataset =========================
df = pd.read_csv(r"C:\Users\NEVIN\PycharmProjects\DataPreprocessing\Group_ID\data\raw\creditcard.csv")
print("\n=== Dataset Loaded ===")

# ========================= Handle Missing Values =========================
for col in df.columns:
    if df[col].dtype in ['float64', 'int64']:
        df[col] = df[col].fillna(df[col].median())
    else:
        df[col] = df[col].fillna(df[col].mode()[0])

# ========================= Encode Categorical Variables =========================
categorical_cols = df.select_dtypes(include=['object']).columns
if len(categorical_cols) > 0:
    le = LabelEncoder()
    for col in categorical_cols:
        df[col] = le.fit_transform(df[col])

# ========================= Outlier Handling =========================
num_cols = df.select_dtypes(include=['float64', 'int64']).columns.drop('Class', errors='ignore')
for col in num_cols:
    lower = df[col].quantile(0.01)
    upper = df[col].quantile(0.99)
    df[col] = np.clip(df[col], lower, upper)

# ========================= Feature Engineering =========================
if 'Amount' in df.columns and 'Time' in df.columns:
    df['Amount_per_Time'] = df['Amount'] / (df['Time'] + 1)

# ========================= Feature Selection =========================
X_temp = df.drop('Class', axis=1)
y_temp = df['Class']
mi_scores = mutual_info_classif(X_temp, y_temp, random_state=42)
mi_series = pd.Series(mi_scores, index=X_temp.columns).sort_values(ascending=False)
selected_features = mi_series[mi_series > 0].index.tolist()

X = df[selected_features]
y = df['Class']

# ========================= Train/Test Split + Scaling =========================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ========================= Subset for Hyperparameter Tuning =========================
X_train_normal = X_train_scaled[y_train == 0]  # only normal samples
subset_size = min(60000, len(X_train_normal))
X_train_subset = X_train_normal[:subset_size]

# ========================= Hyperparameter Tuning =========================
param_dist = {
    'n_estimators': [100, 150, 200, 250],
    'max_samples': [0.6, 0.7, 0.8, 0.9, 1.0],
    'contamination': [y_train.sum() / len(y_train)],  # actual fraud fraction
    'max_features': [0.6, 0.7, 0.8, 0.9, 1.0]
}

iso = IsolationForest(random_state=42, n_jobs=-1)

iso_random = RandomizedSearchCV(
    estimator=iso,
    param_distributions=param_dist,
    n_iter=10,          # test 10 random combinations
    scoring='roc_auc',
    cv=2,               # 2-fold CV
    random_state=42,
    verbose=1,
    n_jobs=-1
)

print("\n=== Running Hyperparameter Tuning (~10-15 mins) ===")
iso_random.fit(X_train_subset, np.zeros(len(X_train_subset)))  # unsupervised labels

print("\n=== Best Parameters ===")
print(iso_random.best_params_)

# ========================= Train Best Model on Full Training Data =========================
best_iso = iso_random.best_estimator_
best_iso.fit(X_train_scaled)

# ========================= Predictions & Evaluation =========================
y_pred_if = best_iso.predict(X_test_scaled)
y_pred = np.where(y_pred_if == -1, 1, 0)  # convert to 0=non-fraud, 1=fraud

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred, digits=4))

print("\n=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

y_scores = -best_iso.decision_function(X_test_scaled)
roc_auc = roc_auc_score(y_test, y_scores)
print(f"\nROC-AUC Score: {roc_auc:.4f}")



=== Dataset Loaded ===

=== Running Hyperparameter Tuning (~10-15 mins) ===
Fitting 2 folds for each of 10 candidates, totalling 20 fits





=== Best Parameters ===
{'n_estimators': 250, 'max_samples': 0.6, 'max_features': 1.0, 'contamination': np.float64(0.001729245759178389)}

=== Classification Report ===
              precision    recall  f1-score   support

           0     0.9986    0.9983    0.9985     56864
           1     0.1754    0.2041    0.1887        98

    accuracy                         0.9970     56962
   macro avg     0.5870    0.6012    0.5936     56962
weighted avg     0.9972    0.9970    0.9971     56962


=== Confusion Matrix ===
[[56770    94]
 [   78    20]]

ROC-AUC Score: 0.9571
