In [3]:
=== Best Parameters ===


SyntaxError: invalid syntax (89081168.py, line 1)

In [1]:
# ========================= Imports =========================
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# ========================= Load Dataset =========================
df = pd.read_csv(r"C:\Users\NEVIN\PycharmProjects\DataPreprocessing\Group_ID\data\raw\creditcard.csv")
print("\n=== Dataset Loaded ===")

# ========================= Handle Missing Values =========================
if df.isnull().sum().sum() > 0:
    for col in df.columns:
        if df[col].dtype in ['float64', 'int64']:
            df[col] = df[col].fillna(df[col].median())
        else:
            df[col] = df[col].fillna(df[col].mode()[0])

# ========================= Encode Categorical Variables =========================
categorical_cols = df.select_dtypes(include=['object']).columns
if len(categorical_cols) > 0:
    le = LabelEncoder()
    for col in categorical_cols:
        df[col] = le.fit_transform(df[col])

# ========================= Outlier Handling =========================
num_cols = df.select_dtypes(include=['float64', 'int64']).columns.drop('Class', errors='ignore')
for col in num_cols:
    lower = df[col].quantile(0.01)
    upper = df[col].quantile(0.99)
    df[col] = np.clip(df[col], lower, upper)

# ========================= Feature Engineering =========================
if 'Amount' in df.columns and 'Time' in df.columns:
    df['Amount_per_Time'] = df['Amount'] / (df['Time'] + 1)

# ========================= Feature Selection =========================
X_temp = df.drop('Class', axis=1)
y_temp = df['Class']
mi_scores = mutual_info_classif(X_temp, y_temp, random_state=42)
mi_series = pd.Series(mi_scores, index=X_temp.columns).sort_values(ascending=False)
selected_features = mi_series[mi_series > 0].index.tolist()

X = df[selected_features]
y = df['Class']

# ========================= Train/Test Split + Scaling =========================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ========================= One-Class SVM =========================
X_train_normal = X_train_scaled[y_train == 0]

# ========================= Subset Normal Data for Faster Training =========================
subset_size = 20000  # train on 20k normal transactions only
X_train_normal_subset = X_train_normal[:subset_size]

# ========================= Optimized Manual Hyperparameter Search =========================
nu_list = [0.005, 0.01, 0.02, 0.05]
gamma_list = ['scale', 0.01, 0.05]

best_auc = 0
best_params = {}

print("\n=== Running Fast Hyperparameter Search (~3-5 mins expected) ===")
for nu in nu_list:
    for gamma in gamma_list:
        ocsvm = OneClassSVM(nu=nu, gamma=gamma, kernel='rbf')
        ocsvm.fit(X_train_normal_subset)  # train on subset
        y_scores = -ocsvm.decision_function(X_test_scaled)
        auc = roc_auc_score(y_test, y_scores)
        print(f"nu={nu}, gamma={gamma} --> ROC-AUC: {auc:.4f}")
        if auc > best_auc:
            best_auc = auc
            best_params = {'nu': nu, 'gamma': gamma}

print("\n=== Best Parameters ===")
print(best_params)
print(f"Best ROC-AUC: {best_auc:.4f}")

# ========================= Train Best Model on Full Normal Subset =========================
best_ocsvm = OneClassSVM(nu=best_params['nu'], gamma=best_params['gamma'], kernel='rbf')
best_ocsvm.fit(X_train_normal_subset)

# ========================= Predictions & Evaluation =========================
y_pred_svm = best_ocsvm.predict(X_test_scaled)
y_pred = np.where(y_pred_svm == -1, 1, 0)

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred, digits=4))

print("\n=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

roc_auc = roc_auc_score(y_test, -best_ocsvm.decision_function(X_test_scaled))
print(f"\nROC-AUC Score: {roc_auc:.4f}")



=== Dataset Loaded ===

=== Running Fast Hyperparameter Search (~3-5 mins expected) ===
nu=0.005, gamma=scale --> ROC-AUC: 0.9540
nu=0.005, gamma=0.01 --> ROC-AUC: 0.9496
nu=0.005, gamma=0.05 --> ROC-AUC: 0.9518
nu=0.01, gamma=scale --> ROC-AUC: 0.9540
nu=0.01, gamma=0.01 --> ROC-AUC: 0.9533
nu=0.01, gamma=0.05 --> ROC-AUC: 0.9518
nu=0.02, gamma=scale --> ROC-AUC: 0.9548
nu=0.02, gamma=0.01 --> ROC-AUC: 0.9558
nu=0.02, gamma=0.05 --> ROC-AUC: 0.9518
nu=0.05, gamma=scale --> ROC-AUC: 0.9578
nu=0.05, gamma=0.01 --> ROC-AUC: 0.9562
nu=0.05, gamma=0.05 --> ROC-AUC: 0.9545

=== Best Parameters ===
{'nu': 0.05, 'gamma': 'scale'}
Best ROC-AUC: 0.9578

=== Classification Report ===
              precision    recall  f1-score   support

           0     0.9998    0.9435    0.9708     56864
           1     0.0258    0.8673    0.0500        98

    accuracy                         0.9433     56962
   macro avg     0.5128    0.9054    0.5104     56962
weighted avg     0.9981    0.9433    0.9692 