<a href="https://colab.research.google.com/github/madhav2348/web3_fraud_detection/blob/main/web3_fraud_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Kaggle API token (kaggle.json, expected in the working directory)
# into the per-user config directory.
!mkdir -p ~/.kaggle
# Only move the token while it is still in the working directory, so re-running
# this cell after the first run is a no-op instead of an `mv` error.
!test -f kaggle.json && mv kaggle.json ~/.kaggle/
# Restrict the token to owner read/write.
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q kaggle



In [None]:
# Fetch the dataset archive from Kaggle.
!kaggle datasets download -d chusmman/synthetic-cryptoweb3-transaction-dataset
# -o overwrites an already-extracted CSV without prompting, so the cell can be
# re-run non-interactively.
!unzip -o synthetic-cryptoweb3-transaction-dataset.zip

Dataset URL: https://www.kaggle.com/datasets/chusmman/synthetic-cryptoweb3-transaction-dataset
License(s): CC0-1.0
Downloading synthetic-cryptoweb3-transaction-dataset.zip to /content
  0% 0.00/51.4M [00:00<?, ?B/s]
100% 51.4M/51.4M [00:00<00:00, 1.08GB/s]
Archive:  synthetic-cryptoweb3-transaction-dataset.zip
  inflating: crypto_transactions.csv  


In [None]:
# Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
import joblib
from datetime import datetime
import os

# Config
# Input data and the review file that flagged transactions are appended to.
CSV = "crypto_transactions.csv"
ANOMALY_OUTPUT = "anomalies_review.csv"

# Directory holding the fitted encoder/scaler/models; created up front so the
# save step can write into it.
ARTIFACT_DIR = "artifacts"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

In [None]:
# Load dataset and parse the timestamp column into datetimes
df = (
    pd.read_csv(CSV)
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
)

In [None]:
# Feature Engineering Function
def featurize(df_in, fit_encoders=None):
    """Turn raw transactions into the numeric feature matrix used by the models.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Must contain a datetime ``timestamp`` column, the numeric columns
        ``amount`` and ``gas_fee_usd``, and the categorical columns
        ``from_wallet``, ``to_wallet``, ``token``, ``platform`` and ``tx_type``.
        The input frame is not modified.
    fit_encoders : object, optional
        An already-fitted encoder exposing ``transform``. When ``None`` a new
        ``OrdinalEncoder`` is fitted on ``df_in`` (unknown categories map to -1).

    Returns
    -------
    tuple
        ``(X, encoder, feature_names)`` where ``X`` has a fresh 0..n-1 index and
        its columns are the numeric features followed by the encoded
        categorical features, in the order given by ``feature_names``.
    """
    categorical = ['from_wallet', 'to_wallet', 'token', 'platform', 'tx_type']
    numeric = ['amount', 'gas_fee_usd', 'year', 'month', 'day', 'hour']

    # Derive calendar features on a new frame so the caller's data is untouched.
    stamps = df_in['timestamp'].dt
    frame = df_in.assign(
        year=stamps.year,
        month=stamps.month,
        day=stamps.day,
        hour=stamps.hour,
    )

    # Reuse the supplied encoder, or fit a new one on this data.
    enc = fit_encoders
    if enc is None:
        enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        enc.fit(frame[categorical])

    encoded = pd.DataFrame(
        enc.transform(frame[categorical]),
        columns=categorical,
        index=frame.index,
    )

    # Both halves are re-indexed from 0 so they line up row by row.
    numeric_part = frame[numeric].reset_index(drop=True)
    categorical_part = encoded.reset_index(drop=True)
    X = pd.concat([numeric_part, categorical_part], axis=1)
    return X, enc, numeric + categorical

In [None]:
# Prepare Training Data
# featurize() also returns the fitted encoder, which is saved later for scoring.
X, encoder, feature_cols = featurize(df)

# Standardise every feature; the fitted scaler is kept so new transactions can
# be transformed the same way.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

## Train Models

In [None]:
# Isolation Forest
# contamination=0.01 sets the share of training rows treated as anomalies;
# random_state pins the tree construction for reproducible results.
iso = IsolationForest(n_estimators=200, contamination=0.01, random_state=42).fit(X_scaled)

# Continuous scores and the corresponding 1 / -1 labels for the training rows.
iso_scores = iso.decision_function(X_scaled)
iso_labels = iso.predict(X_scaled)

In [None]:
# Local Outlier Factor (novelty=True allows later prediction on unseen transactions)
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.01, novelty=True)
lof.fit(X_scaled)

# With novelty=True, predict()/decision_function() are only meant for unseen
# data: called on the training rows, each row would count itself among its own
# neighbours and the scores would be wrong. The scores for the training rows are
# the ones computed during fit (negative_outlier_factor_), shifted by offset_
# so that negative values mean "outlier", matching decision_function().
lof_scores = lof.negative_outlier_factor_ - lof.offset_
lof_labels = np.where(lof.negative_outlier_factor_ < lof.offset_, -1, 1)

In [None]:
# Attach both models' labels and scores to the transactions for plotting.
df = df.assign(
    iso_anomaly=iso_labels,
    lof_anomaly=lof_labels,
    iso_score=iso_scores,
    lof_score=lof_scores,
)

## Visualization

In [None]:
import matplotlib.pyplot as plt

# Split amounts by the Isolation Forest label (1 is plotted as normal, -1 as anomaly).
normal_amounts = df.loc[df['iso_anomaly'] == 1, 'amount']
anomaly_amounts = df.loc[df['iso_anomaly'] == -1, 'amount']

# Use one set of bin edges for both groups: letting each hist() call pick its
# own 50 bins gives bars of different widths that cannot be compared.
amount_bins = np.histogram_bin_edges(df['amount'].dropna(), bins=50)

fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(normal_amounts, bins=amount_bins, alpha=0.6, label='Normal')
ax.hist(anomaly_amounts, bins=amount_bins, alpha=0.6, label='Anomaly')
ax.set_xlabel('Transaction Amount')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Amounts (Isolation Forest)')
ax.legend()
plt.show()


In [None]:
# Colour each transaction by whether the Isolation Forest flagged it (-1).
is_anomaly = df['iso_anomaly'].eq(-1)

fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(df['amount'], df['gas_fee_usd'], c=is_anomaly, cmap='coolwarm', alpha=0.5)
ax.set_xlabel('Amount')
ax.set_ylabel('Gas Fee (USD)')
ax.set_title('Amount vs Gas Fee — Red = Anomalies')
fig.colorbar(points, ax=ax, label='Anomaly Flag')
plt.show()


In [None]:
import seaborn as sns

# Contingency table: rows are Isolation Forest labels, columns are LOF labels.
agreement = pd.crosstab(df['iso_anomaly'], df['lof_anomaly'])

fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(agreement, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('LOF Anomaly')
ax.set_ylabel('Isolation Forest Anomaly')
ax.set_title('Model Agreement Matrix')
plt.show()


In [None]:
# Persist the fitted encoder, scaler and both models so transactions can be
# scored later without retraining.
artifacts = {
    "ordinal_encoder.joblib": encoder,
    "scaler.joblib": scaler,
    "isolation_forest.joblib": iso,
    "lof_novelty.joblib": lof,
}
for filename, fitted in artifacts.items():
    joblib.dump(fitted, os.path.join(ARTIFACT_DIR, filename))

In [None]:
# Real-time transaction processing
def process_transaction(tx: dict,
                        encoder_path=os.path.join(ARTIFACT_DIR, "ordinal_encoder.joblib"),
                        scaler_path=os.path.join(ARTIFACT_DIR, "scaler.joblib"),
                        iso_path=os.path.join(ARTIFACT_DIR, "isolation_forest.joblib"),
                        lof_path=os.path.join(ARTIFACT_DIR, "lof_novelty.joblib"),
                        output_csv=ANOMALY_OUTPUT,
                        anomaly_threshold=None):
    """Score a single transaction with the saved models and log it if flagged.

    Parameters
    ----------
    tx : dict
        One transaction. Must provide ``timestamp`` (anything
        ``pd.to_datetime`` accepts) plus the columns ``featurize`` reads;
        any extra keys are carried through to the review CSV.
    encoder_path, scaler_path, iso_path, lof_path : str
        Locations of the joblib artifacts to load.
    output_csv : str
        CSV file that flagged transactions are appended to.
    anomaly_threshold : float, optional
        When given, a transaction is also flagged if the mean of the two
        decision scores is below this value.

    Returns
    -------
    dict
        ``iso_label``/``lof_label`` (int, -1 means outlier),
        ``iso_score``/``lof_score``/``combined_score`` (float) and
        ``flagged`` (bool). All values are native Python types so the result
        can be serialised (e.g. to JSON) directly.

    Side effects
    ------------
    Appends one row to ``output_csv`` when the transaction is flagged.
    """
    # Load saved models.
    # NOTE(security): joblib files are pickle-based and can execute arbitrary
    # code when loaded; only point these paths at artifacts you produced.
    enc = joblib.load(encoder_path)
    scaler = joblib.load(scaler_path)
    iso = joblib.load(iso_path)
    lof = joblib.load(lof_path)

    # Convert to a one-row DataFrame and build the same features as in training.
    tx_df = pd.DataFrame([tx])
    tx_df['timestamp'] = pd.to_datetime(tx_df['timestamp'])
    X_tx, _, _ = featurize(tx_df, fit_encoders=enc)
    X_tx_scaled = scaler.transform(X_tx)

    # Predict with both models; convert NumPy scalars to native types once so
    # the returned dict and the CSV row agree.
    iso_label = int(iso.predict(X_tx_scaled)[0])
    iso_score = float(iso.decision_function(X_tx_scaled)[0])
    lof_label = int(lof.predict(X_tx_scaled)[0])
    lof_score = float(lof.decision_function(X_tx_scaled)[0])

    # Combine results: either model voting "outlier" flags the transaction.
    combined_score = (iso_score + lof_score) / 2.0
    flagged = iso_label == -1 or lof_label == -1
    if anomaly_threshold is not None:
        flagged = flagged or combined_score < anomaly_threshold

    result = {
        "iso_label": iso_label,
        "iso_score": iso_score,
        "lof_label": lof_label,
        "lof_score": lof_score,
        "combined_score": combined_score,
        "flagged": flagged
    }

    # Save flagged tx for review.
    if flagged:
        # Write the header for a new file, and also for an existing but empty
        # one, which would otherwise end up with data rows and no header.
        needs_header = (not os.path.exists(output_csv)
                        or os.path.getsize(output_csv) == 0)
        pd.DataFrame([{**tx, **result}]).to_csv(
            output_csv, mode='a', index=False, header=needs_header)

    return result


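### Example

Score the first five rows of the dataset through `process_transaction` to check that the saved artifacts load and produce results.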

In [None]:

# Rebuild each of the first five rows as a plain dict (the shape an incoming
# transaction would have) and score it with the saved models.
example_results = [
    process_transaction({
        "tx_hash": record['tx_hash'],
        "from_wallet": record['from_wallet'],
        "to_wallet": record['to_wallet'],
        "token": record['token'],
        "amount": float(record['amount']),
        "timestamp": record['timestamp'].isoformat(),
        "gas_fee_usd": float(record['gas_fee_usd']),
        "platform": record['platform'],
        "tx_type": record['tx_type']
    })
    for _, record in df.head(5).iterrows()
]

pd.DataFrame(example_results)
