In [None]:
pip install pandas scikit-learn imbalanced-learn matplotlib seaborn tensorflow



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, UpSampling1D
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K
import warnings

warnings.filterwarnings("ignore")

# Load dataset
df = pd.read_csv('creditcard.csv')
X = df.drop('Class', axis=1)
y = df['Class']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Sampling strategies
sampling_methods = {
    "RUS": RandomUnderSampler(random_state=42),
    "SMOTE": SMOTE(random_state=42),
    "SMOTE_Tomek": SMOTETomek(random_state=42)
}

# PCA extraction
def apply_pca(X_train, n_components=20):
    pca = PCA(n_components=n_components)
    return pca.fit_transform(X_train), pca

# CAE architecture with cropping
def build_cae(input_shape):
    input_layer = Input(shape=input_shape)

    # Encoder
    x = Conv1D(16, 3, activation='relu', padding='same')(input_layer)
    x = MaxPooling1D(2, padding='same')(x)  # → 15
    x = Conv1D(8, 3, activation='relu', padding='same')(x)
    encoded = MaxPooling1D(2, padding='same')(x)  # → 8

    # Decoder
    x = Conv1D(8, 3, activation='relu', padding='same')(encoded)
    x = UpSampling1D(2)(x)  # → 16
    x = Conv1D(16, 3, activation='relu', padding='same')(x)
    x = UpSampling1D(2)(x)  # → 32
    x = Conv1D(1, 3, activation='sigmoid', padding='same')(x)
    decoded = x[:, :input_shape[0], :]  # crop to match input size

    autoencoder = Model(input_layer, decoded)
    encoder = Model(input_layer, encoded)
    autoencoder.compile(optimizer=Adam(), loss='mse')
    return autoencoder, encoder

# Run pipeline
results = []

for sampling_name, sampler in sampling_methods.items():
    print(f"\nProcessing: {sampling_name}")

    # Apply sampling
    X_resampled, y_resampled = sampler.fit_resample(X_scaled, y)
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

    # ===== PCA =====
    X_train_pca, pca = apply_pca(X_train, n_components=20)
    X_test_pca = pca.transform(X_test)

    clf_pca = RandomForestClassifier(random_state=42)
    clf_pca.fit(X_train_pca, y_train)
    y_pred_pca = clf_pca.predict(X_test_pca)
    y_proba_pca = clf_pca.predict_proba(X_test_pca)[:, 1]

    results.append({
        "Method": f"{sampling_name} + PCA",
        "F1": f1_score(y_test, y_pred_pca),
        "AUC": roc_auc_score(y_test, y_proba_pca),
        "AUPRC": average_precision_score(y_test, y_proba_pca)
    })

    # ===== CAE =====
    X_train_cae = np.expand_dims(X_train, axis=2)
    X_test_cae = np.expand_dims(X_test, axis=2)

    cae_model, encoder_model = build_cae(X_train_cae.shape[1:])
    cae_model.fit(X_train_cae, X_train_cae, epochs=10, batch_size=256, shuffle=True, verbose=0)

    X_train_encoded = encoder_model.predict(X_train_cae).reshape(X_train.shape[0], -1)
    X_test_encoded = encoder_model.predict(X_test_cae).reshape(X_test.shape[0], -1)

    clf_cae = RandomForestClassifier(random_state=42)
    clf_cae.fit(X_train_encoded, y_train)
    y_pred_cae = clf_cae.predict(X_test_encoded)
    y_proba_cae = clf_cae.predict_proba(X_test_encoded)[:, 1]

    results.append({
        "Method": f"{sampling_name} + CAE",
        "F1": f1_score(y_test, y_pred_cae),
        "AUC": roc_auc_score(y_test, y_proba_cae),
        "AUPRC": average_precision_score(y_test, y_proba_cae)
    })

# Output results
results_df = pd.DataFrame(results)
print("\nSummary of Results:")
print(results_df.sort_values(by="F1", ascending=False))



Processing: RUS
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 

Processing: SMOTE
[1m12439/12439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 1ms/step
[1m5331/5331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step

Processing: SMOTE_Tomek
[1m12439/12439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 2ms/step
[1m5331/5331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step

Summary of Results:
              Method        F1       AUC     AUPRC
5  SMOTE_Tomek + CAE  0.999889  0.999987  0.999974
3        SMOTE + CAE  0.999865  0.999998  0.999998
2        SMOTE + PCA  0.999807  0.999998  0.999998
4  SMOTE_Tomek + PCA  0.999807  0.999998  0.999998
0          RUS + PCA  0.930070  0.974201  0.979036
1          RUS + CAE  0.921986  0.973562  0.978216
