In [5]:
# ========================= Imports =========================
%matplotlib inline

import os
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, auc
from sklearn.metrics import roc_curve
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import seaborn as sns

# ========================= Load Dataset =========================
# Location of the raw CSV. Set the CREDITCARD_CSV environment variable to run
# the notebook on another machine; when it is unset the original local path is
# used, so existing runs behave exactly as before.
DATA_PATH = Path(os.environ.get(
    "CREDITCARD_CSV",
    r"C:\Users\NEVIN\PycharmProjects\DataPreprocessing\Group_ID\data\raw\creditcard.csv",
))
if not DATA_PATH.is_file():
    # Fail early with an actionable message instead of a bare read_csv error.
    raise FileNotFoundError(
        f"Dataset not found at {DATA_PATH}. "
        "Set the CREDITCARD_CSV environment variable to the CSV location."
    )

df = pd.read_csv(DATA_PATH)
print("\n=== Dataset Loaded ===")

# ========================= Handle Missing Values =========================
# Impute column by column: numeric columns take their median, every other
# column takes its most frequent value. Columns without gaps are skipped.
for col in df.columns:
    if not df[col].isna().any():
        continue

    is_numeric = (
        pd.api.types.is_numeric_dtype(df[col])
        and not pd.api.types.is_bool_dtype(df[col])
    )
    if is_numeric:
        # Covers every numeric width (float32, int32, ...), not only
        # float64/int64.
        fill_value = df[col].median()
    else:
        modes = df[col].mode()
        if modes.empty:
            # The column is entirely missing, so there is no mode to impute
            # with; leave it as-is rather than crash on modes[0].
            continue
        fill_value = modes.iloc[0]

    df[col] = df[col].fillna(fill_value)

# ========================= Encode Categorical Variables =========================
# Every object-dtype column is replaced by integer codes. A separate
# LabelEncoder is fitted for each column, so codes are not shared between
# columns.
categorical_cols = df.select_dtypes(include=['object']).columns
for name in categorical_cols:
    encoder = LabelEncoder()
    df[name] = encoder.fit_transform(df[name])

# ========================= Outlier Handling =========================
# Clip each float64/int64 feature to its own 1st-99th percentile range. The
# 'Class' label is excluded when present.
# NOTE(review): the percentiles are taken over the whole frame, i.e. before
# the train/test split later in the script, so test rows influence the
# bounds - confirm this is intended.
num_cols = df.select_dtypes(include=['float64', 'int64']).columns.drop('Class', errors='ignore')
for name in num_cols:
    low_q, high_q = df[name].quantile([0.01, 0.99])
    df[name] = df[name].clip(lower=low_q, upper=high_q)

# ========================= Feature Engineering =========================
# Derive Amount / (Time + 1) when both source columns are available. Adding 1
# keeps the denominator away from zero for rows where Time == 0.
if {'Amount', 'Time'}.issubset(df.columns):
    shifted_time = df['Time'].add(1)
    df['Amount_per_Time'] = df['Amount'].div(shifted_time)

# ========================= Train/Test Split =========================
# Split BEFORE selecting features: mutual information is computed from the
# labels, so scoring it on the full frame would let test-set labels influence
# which columns the model sees. The split depends only on `y` and
# `random_state`, so the train/test rows are the same as when the split was
# done after selection.
features_all = df.drop('Class', axis=1)
y = df['Class']

X_train_all, X_test_all, y_train, y_test = train_test_split(
    features_all, y, test_size=0.2, random_state=42, stratify=y
)

# ========================= Feature Selection =========================
# Rank features by mutual information with the label on the training rows
# only, and keep those with a strictly positive score.
mi_scores = mutual_info_classif(X_train_all, y_train, random_state=42)
mi_series = pd.Series(mi_scores, index=X_train_all.columns).sort_values(ascending=False)
selected_features = mi_series[mi_series > 0].index.tolist()
if not selected_features:
    # Scaling and the autoencoder both need at least one input column.
    raise ValueError("Feature selection kept no columns: every mutual-information score is 0.")

X = df[selected_features]
X_train = X_train_all[selected_features]
X_test = X_test_all[selected_features]

# ========================= Scaling =========================
# Fit the scaler on the training rows only and reuse its statistics for the
# test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ========================= Train only on normal transactions =========================
# Hold out part of the training data for model selection, so the test set is
# used only for the final report. Choosing the best configuration by its
# test-set ROC-AUC would make the reported test metrics optimistically biased.
# The hold-out is stratified so it contains both classes, which ROC-AUC needs.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_scaled, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# The autoencoder only ever sees non-fraud rows (label 0) during fitting.
X_train_normal = X_fit[np.asarray(y_fit == 0)]
input_dim = X_train_normal.shape[1]


def _reconstruction_error(model, data):
    """Return the per-row mean squared difference between `data` and `model.predict(data)`."""
    reconstructed = model.predict(data)
    return np.mean(np.power(data - reconstructed, 2), axis=1)


# ========================= Manual Hyperparameter Tuning =========================
# Each entry is one configuration: widths of the outer and bottleneck hidden
# layers, Adam learning rate, and mini-batch size.
param_grid = [
    {'hidden1': 16, 'hidden2': 8, 'lr': 0.001, 'batch_size': 256},
    {'hidden1': 32, 'hidden2': 16, 'lr': 0.001, 'batch_size': 256},
    {'hidden1': 16, 'hidden2': 8, 'lr': 0.005, 'batch_size': 128},
]

best_threshold = None
best_roc = -np.inf  # any finite score beats this, so a valid run always picks a model
best_model = None

# NOTE(review): no TensorFlow seed is set in this script, so weight
# initialisation and therefore the scores below can differ between runs.
for params in param_grid:
    # Symmetric dense autoencoder: input -> hidden1 -> hidden2 -> hidden1 -> input.
    input_layer = Input(shape=(input_dim,))
    encoded = Dense(params['hidden1'], activation='relu')(input_layer)
    encoded = Dense(params['hidden2'], activation='relu')(encoded)
    decoded = Dense(params['hidden1'], activation='relu')(encoded)
    decoded = Dense(input_dim, activation='linear')(decoded)

    autoencoder = Model(inputs=input_layer, outputs=decoded)
    autoencoder.compile(optimizer=Adam(learning_rate=params['lr']), loss='mse')

    # Input and target are the same array: the network learns to reproduce
    # normal transactions.
    autoencoder.fit(
        X_train_normal, X_train_normal,
        epochs=20,
        batch_size=params['batch_size'],
        shuffle=True,
        validation_split=0.1,
        verbose=0
    )

    # Decision threshold: mean + 3 standard deviations of the reconstruction
    # error on the normal rows used for fitting.
    mse_train = _reconstruction_error(autoencoder, X_train_normal)
    threshold = mse_train.mean() + 3 * mse_train.std()

    # Rank configurations by how well the reconstruction error separates the
    # classes on the validation hold-out (the test set is not touched here).
    mse_val = _reconstruction_error(autoencoder, X_val)
    roc = roc_auc_score(y_val, mse_val)
    print(f"Params: {params}, Validation ROC-AUC: {roc:.4f}")

    if roc > best_roc:
        best_roc = roc
        best_threshold = threshold
        best_model = autoencoder

if best_model is None:
    # Only reachable with an empty grid or non-comparable (NaN) scores.
    raise RuntimeError("Hyperparameter search did not select a model.")

# ========================= Final Evaluation =========================
# Score the test rows with the selected autoencoder. A row is flagged
# (label 1) when its reconstruction error exceeds the stored threshold.
X_test_pred = best_model.predict(X_test_scaled)
residuals = X_test_scaled - X_test_pred
mse = np.power(residuals, 2).mean(axis=1)
y_pred = (mse > best_threshold).astype(int)

report_text = classification_report(y_test, y_pred, digits=4)
print("\n=== Classification Report ===")
print(report_text)

conf_matrix = confusion_matrix(y_test, y_pred)
print("\n=== Confusion Matrix ===")
print(conf_matrix)

# ROC-AUC is computed from the continuous error, not the thresholded labels.
roc_auc = roc_auc_score(y_test, mse)
print(f"\nROC-AUC Score: {roc_auc:.4f}")

# ========================= Additional Testing & Evaluation =========================
# Three diagnostic figures built from the test-set reconstruction error `mse`.
# Each figure uses its own explicit Figure/Axes pair instead of the implicit
# pyplot "current figure". `roc_curve` comes from the import block at the top
# of the script.

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, mse)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.4f})')
ax.plot([0, 1], [0, 1], '--', color='gray')  # chance-level diagonal
ax.set(xlabel='False Positive Rate', ylabel='True Positive Rate', title='ROC Curve')
ax.legend()
plt.show()

# Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_test, mse)
pr_auc = auc(recall, precision)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, label=f'PR curve (AUC = {pr_auc:.4f})')
ax.set(xlabel='Recall', ylabel='Precision', title='Precision-Recall Curve')
ax.legend()
plt.show()

# MSE Distribution
# Plain boolean arrays index `mse` by position, independent of the label
# container's own index.
test_labels = np.asarray(y_test)
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(mse[test_labels == 0], color='green', label='Normal', kde=True, stat='density', ax=ax)
sns.histplot(mse[test_labels == 1], color='red', label='Fraud', kde=True, stat='density', ax=ax)
ax.axvline(best_threshold, color='blue', linestyle='--', label='Threshold')
ax.set(
    xlabel='Reconstruction Error (MSE)',
    ylabel='Density',
    title='Reconstruction Error Distribution',
)
ax.legend()
plt.show()


AttributeError: partially initialized module 'pandas' has no attribute 'core' (most likely due to a circular import)