In [None]:
import joblib
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import OneClassSVM
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.models import Sequential, load_model
from tqdm import tqdm


# ---------------------------------------------------------------------------
# Data loading and feature engineering
# ---------------------------------------------------------------------------
df = pd.read_csv("/content/realistic_credit_card_transactions (1).csv")

# Parse the timestamp so calendar features can be derived from it.
df["Transaction_Date"] = pd.to_datetime(df["Transaction_Date"])

df["Transaction_Hour"] = df["Transaction_Date"].dt.hour
df["Transaction_Day"] = df["Transaction_Date"].dt.dayofweek
df["Transaction_Month"] = df["Transaction_Date"].dt.month

# Integer-code the merchant names (this column is dropped from X further down).
df["Merchant_Name"] = df["Merchant_Name"].astype("category").cat.codes

# Standardise the amount; ravel() so a 1-D column is assigned instead of an
# (n, 1) array.
df["Transaction_Amount"] = StandardScaler().fit_transform(df[["Transaction_Amount"]]).ravel()

# Per-customer aggregates.
df["Transaction_Frequency"] = df.groupby("Customer_ID")["Transaction_ID"].transform("count")
# Cast to int: a boolean column is not matched by
# select_dtypes(include=[np.number]) below, so without the cast this feature
# was silently dropped from X.
df["Location_Mismatch"] = (
    df.groupby("Customer_ID")["Transaction_Location"]
    .transform(lambda x: x != x.mode()[0])
    .astype(int)
)

# One-hot encode the categorical columns. The encoded frame reuses df's index
# so that the concat below aligns row by row.
categorical_columns = ["Merchant_Category", "Transaction_Location"]
encoder = OneHotEncoder(sparse_output=False, drop="first")
encoded_categories = encoder.fit_transform(df[categorical_columns])
encoded_categories_df = pd.DataFrame(
    encoded_categories,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=df.index,
)
df = pd.concat([df, encoded_categories_df], axis=1)

# ---------------------------------------------------------------------------
# Feature matrix / label split
# ---------------------------------------------------------------------------
X = df.drop(columns=["Fraud_Flag", "Transaction_Date", "Transaction_ID", "Customer_ID", "Merchant_Name", "Merchant_Category", "Transaction_Location"])
y = df["Fraud_Flag"]

# Keep numeric columns only.
X = X.select_dtypes(include=[np.number])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ---------------------------------------------------------------------------
# Unsupervised anomaly detectors
# ---------------------------------------------------------------------------
iso_forest = IsolationForest(contamination=0.01, random_state=42)
iso_forest.fit(X_train)

oc_svm = OneClassSVM(nu=0.01, kernel="rbf", gamma="auto")
oc_svm.fit(X_train)

# Autoencoder: 16 -> 8 -> 16 bottleneck trained to reproduce its input.
# * An explicit Input layer replaces `input_shape=` on the first Dense layer.
# * The output layer is linear: a sigmoid can only emit values in [0, 1], so it
#   cannot reconstruct features that fall outside that range.
# NOTE(review): apart from Transaction_Amount the features are not scaled here;
# consider scaling them consistently at train and predict time.
input_dim = X_train.shape[1]
autoencoder = Sequential([
    Input(shape=(input_dim,)),
    Dense(16, activation="relu"),
    Dense(8, activation="relu"),
    Dense(16, activation="relu"),
    Dense(input_dim, activation="linear"),
])
autoencoder.compile(optimizer="adam", loss="mse")
autoencoder.fit(X_train, X_train, epochs=10, batch_size=32, shuffle=True, validation_data=(X_test, X_test), verbose=1)

# Persist the fitted models; detect_fraud_with_reason() loads them by these names.
joblib.dump(iso_forest, "isolation_forest.pkl")
joblib.dump(oc_svm, "one_class_svm.pkl")
autoencoder.save("autoencoder_model.h5")


def detect_fraud_with_reason(transaction, models=None, auto_threshold=0.01):
    """Score one transaction with three anomaly models and explain the verdict.

    A transaction is reported as fraud when at least two of the three models
    (Isolation Forest, One-Class SVM, autoencoder) flag it.

    Parameters
    ----------
    transaction : array-like or pandas.Series
        Feature vector for a single transaction. When a Series labelled with
        the engineered column names (``Transaction_Amount``,
        ``Transaction_Hour``, ``Transaction_Frequency``, ``Location_Mismatch``)
        is given, the rule-based reasons look features up by name and skip any
        that are absent. Otherwise the legacy fixed positions are used.
    models : tuple, optional
        ``(iso_forest, oc_svm, autoencoder)``, each exposing ``predict``.
        When omitted, the models are loaded from ``isolation_forest.pkl``,
        ``one_class_svm.pkl`` and ``autoencoder_model.h5`` on every call;
        pass preloaded models to avoid that cost when scoring many rows.
    auto_threshold : float, optional
        Mean squared reconstruction error above which the autoencoder counts
        as flagging the transaction.

    Returns
    -------
    list
        ``[1, real_world_reasons, model_reasons]`` for fraud, where both
        reasons are comma-separated strings, or ``[0, "", ""]`` otherwise.
    """
    # Positions used when the input carries no column labels.
    # NOTE(review): these are the original hard-coded indices; confirm they
    # match the column order of the feature matrix used for training.
    legacy_positions = {
        "Transaction_Hour": 2,
        "Transaction_Amount": 3,
        "Location_Mismatch": 6,
        "Transaction_Frequency": 7,
    }

    # Decide on label-based lookup before the labels are lost in the ndarray
    # conversion below.
    use_labels = isinstance(transaction, pd.Series) and any(
        name in transaction.index for name in legacy_positions
    )
    features = np.array(transaction).reshape(1, -1)

    if models is None:
        iso_forest = joblib.load("isolation_forest.pkl")
        oc_svm = joblib.load("one_class_svm.pkl")
        autoencoder = load_model("autoencoder_model.h5", custom_objects={"mse": MeanSquaredError()})
    else:
        iso_forest, oc_svm, autoencoder = models

    # Both detectors are read as "-1 means anomaly".
    iso_flag = iso_forest.predict(features)[0] == -1
    svm_flag = oc_svm.predict(features)[0] == -1
    reconstruction_error = np.mean((autoencoder.predict(features) - features) ** 2)
    auto_flag = reconstruction_error > auto_threshold

    def feature(name):
        """Return the named feature, or None when a labelled input lacks it."""
        if use_labels:
            return transaction.get(name)
        return features[0][legacy_positions[name]]

    # Rule-based, human-readable reasons.
    # The former `== "Unusual Merchant"` check was removed: it compared a
    # numeric feature with a string and therefore could never be true.
    real_world_reasons = []
    amount = feature("Transaction_Amount")
    if amount is not None and amount > 3:  # scaled amount > 3
        real_world_reasons.append("Unusual Transaction Amount")
    mismatch = feature("Location_Mismatch")
    if mismatch is not None and mismatch == 1:  # 1 = mismatch
        real_world_reasons.append("Geographic Anomaly")
    frequency = feature("Transaction_Frequency")
    if frequency is not None and frequency > 5:
        real_world_reasons.append("High-Frequency Transactions")
    hour = feature("Transaction_Hour")
    if hour is not None and (hour < 6 or hour > 22):  # late night / early morning
        real_world_reasons.append("Unusual Time of Day")

    # Which models raised a flag.
    model_reasons = []
    if iso_flag:
        model_reasons.append("Isolation Forest flagged as anomaly")
    if svm_flag:
        model_reasons.append("One-Class SVM flagged as anomaly")
    if auto_flag:
        model_reasons.append("Autoencoder detected high reconstruction error")

    # Majority vote: previously the autoencoder alone was enough to mark a
    # transaction as fraud, contradicting the intended "at least two models".
    votes = int(iso_flag) + int(svm_flag) + int(auto_flag)
    if votes >= 2:
        return [1, ", ".join(real_world_reasons), ", ".join(model_reasons)]
    return [0, "", ""]


def process_row(row):
    """Cast one feature row to float and run fraud detection on it.

    Returns whatever ``detect_fraud_with_reason`` returns for the cast row.
    """
    return detect_fraud_with_reason(row.astype(float))


# Score every row of X in parallel; each result is
# [fraud_flag, real_world_reasons, model_reasons].
results = Parallel(n_jobs=-1)(delayed(process_row)(row) for _, row in tqdm(X.iterrows(), total=len(X), desc="Processing Rows"))

# Split the per-row results into one list per output column.
fraud_predictions = [result[0] for result in results]
real_world_reasons = [result[1] for result in results]
model_reasons = [result[2] for result in results]

# Attach the results to the original DataFrame.
df["Fraud_Prediction"] = fraud_predictions
df["Real_World_Reason"] = real_world_reasons
df["Model_Reason"] = model_reasons

# Save only the rows predicted as fraud.
fraud_cases = df[df["Fraud_Prediction"] == 1]
fraud_cases.to_csv("fraud_cases_with_reasons.csv", index=False)

print(f"Fraud cases detected and saved to fraud_cases_with_reasons.csv ({len(fraud_cases)} cases)")

# Offer the saved file as a browser download when running inside Colab.
# Only a missing google.colab package is treated as "not in Colab"; any other
# failure is allowed to surface instead of being hidden behind that message.
try:
    from google.colab import files
except ImportError:
    print(" Run this in Colab for auto-download, or check fraud_cases_with_reasons.csv in your directory.")
else:
    files.download("fraud_cases_with_reasons.csv")
    print(" File ready for download!")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 3945.7920 - val_loss: 3953.9412
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 3943.5535 - val_loss: 3953.9414
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 3943.8987 - val_loss: 3953.9407
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 3947.6506 - val_loss: 3953.9407
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 3943.8638 - val_loss: 3953.9407
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 3945.1687 - val_loss: 3953.9407
Epoch 7/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 3945.0833 - val_loss: 3953.9407
Epoch 8/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 3941.0662 - val_loss: 39

Processing Rows: 100%|██████████| 10000/10000 [39:16<00:00,  4.24it/s]


✅ Fraud cases detected and saved to fraud_cases_with_reasons.csv (10000 cases)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

📥 File ready for download!
