In [None]:
import pandas as pd

df = pd.read_csv('labels_transactionsagg.csv')

df = df.rename(columns={'label': 'category'})

risky_categories = [
    'mixer', 'gambling', 'darknet', 'ponzi', 'scam', 'blacklist', 'malware', 'ransomware', 'illegal'
]

# Create a label column: 1 for risky, 0 for non-risky
df['label'] = df['category'].apply(lambda x: 1 if str(x).lower() in risky_categories else 0)

# Drop other columns
df = df[['address', 'category', 'label']].dropna()

# Save to new CSV
df.to_csv('wallets_labeled.csv', index=False)

print(df.head())


                              address       category  label
0  1129yN1itF5a5Hqgw7aenPKrom3fv8Aid1  coinjoin-like      0
1  112b8ZjaJWGobxT7rAE8Qyw17bJa91jaec       gambling      1
2  112LhEwGZk5mqpLz9Q33QG5FxZNbpgDwTL       exchange      0
3  1136GYGTdySKCocdjqZphXiW4zoskXHqML       gambling      1
4  113Nu2g1dE4d3oiUH1ozUysK2RBgC98pxx          miner      0


Extracting from normal dataset

In [None]:
import os
import csv
import time
from blockcypher import get_address_details
from dotenv import load_dotenv

# Load API key
load_dotenv()
API_token= os.getenv("BLOCKCYPHER_API_KEY")

wallets_df = pd.read_csv('wallets_labeled.csv')
output_file = 'api_extracted_wallet_features.csv'

columns = [
    'address', 'balance', 'total_received', 'total_sent', 'n_tx', 'final_balance', 'unconfirmed_balance', 'final_n_tx',
    'num_txrefs', 'total_confirmations', 'avg_tx_value', 'label', 'category'
]

if os.path.exists(output_file):
    with open(output_file, 'r') as f:
        file_lines = sum(1 for line in f)
    processed_count = max(0, file_lines - 1)
    resume_from = processed_count + 1
    print(f"Resuming from address row {resume_from + 1}")
else:
    processed_count = 0
    with open(output_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(columns)
    print("Starting fresh")

batch_df = wallets_df.iloc[resume_from:]

def extract_features(address):
    while True:
        try:
            data = get_address_details(address, api_key=API_token)

            balance = data.get('balance', 0)
            received = data.get('total_received', 0)
            sent = data.get('total_sent', 0)
            tx_count = data.get('n_tx', 0)
            final_balance = data.get('final_balance', 0)
            unconfirmed_balance = data.get('unconfirmed_balance', 0)
            final_n_tx = data.get('final_n_tx', 0)

            txrefs = data.get('txrefs', [])
            num_txrefs = len(txrefs)
            total_confirmations = sum(tx.get('confirmations', 0) for tx in txrefs)
            avg_tx_value = sum(tx.get('value', 0) for tx in txrefs) / num_txrefs if num_txrefs > 0 else 0

            return [
                balance, received, sent, tx_count,
                final_balance, unconfirmed_balance, final_n_tx,
                num_txrefs, total_confirmations, avg_tx_value
            ]

        except Exception as e:
            if '429' in str(e):
                print("Max rate limit reached...")
                time.sleep(120)
                continue
            else:
                print(f"Failed for {address}: {e}")
                return [0]*10


with open(output_file, 'a', newline='') as file:
    writer = csv.writer(file)

    for idx, row in batch_df.iterrows():
        address = row['address']
        label = row['label']
        category = row['category']

        print(f"[{idx + 1}] Processing {address}")
        features = extract_features(address)
        writer.writerow([address] + features + [label, category])
        file.flush()
        time.sleep(1)

print(f"Completed batch of {len(batch_df)} addresses.")



Resuming from address row 5949
[5949] Processing 1PoG76dsBxa1ezijijXzsLLovdRh2e3sD4
[5950] Processing 1poLkQ6bfYek3y75i19wu7Uqxk2c9sZfk
[5951] Processing 1PorSy4F9KReWCzuSkqaqip8tDdCvM9B28


KeyboardInterrupt: 

Extracting from suspicious dataset

In [3]:
output_file2 = 'api_extracted_suspicious_features.csv'
suspicious_df = pd.read_csv('btc_wallets_data.csv')
suspicious_df['label'] = 1
suspicious_df['category'] = 'suspicious'

existing_cols = ['address', 'total_received', 'total_sent', 'n_tx', 'final_balance']
api_cols = ['balance', 'unconfirmed_balance', 'final_n_tx', 'num_txrefs', 'total_confirmations', 'avg_tx_value']
all_columns = ['address'] + api_cols + existing_cols[1:] + ['label', 'category']

# Handle resuming
if os.path.exists(output_file2):
    with open(output_file2, 'r') as f2:
        file_lines = sum(1 for line in f2)
    resume_index = max(0, file_lines - 1)
    print(f"Resuming from suspicious row {resume_index + 1}")
else:
    resume_index = 0
    with open(output_file2, 'w', newline='') as f2:
        writer = csv.writer(f2)
        writer.writerow(all_columns)
    print("Starting new suspicious extraction file.")

# Filter for unprocessed addresses
remaining_df = suspicious_df
batch_df = remaining_df.iloc[resume_index:]

def extract_api_features(address):
    while True:
        try:
            data = get_address_details(address, api_key=API_token)
            txrefs = data.get('txrefs', [])
            num_txrefs = len(txrefs)
            total_confirmations = sum(tx.get('confirmations', 0) for tx in txrefs)
            avg_tx_value = sum(tx.get('value', 0) for tx in txrefs) / num_txrefs if num_txrefs > 0 else 0
            return [
                data.get('balance', 0),
                data.get('unconfirmed_balance', 0),
                data.get('final_n_tx', 0),
                num_txrefs,
                total_confirmations,
                avg_tx_value
            ]
        except Exception as e:
            if '429' in str(e):
                print("Max rate limit reached...")
                time.sleep(120)
                continue
            else:
                print(f"Failed for {address}: {e}")
                return [0] * 6

with open(output_file2, 'a', newline='') as f:
    writer = csv.writer(f)

    for idx, row in batch_df.iterrows():
        address = row['address']
        print(f"[{idx + 1}] Processing {address}")

        api_data = extract_api_features(address)
        existing_data = [
            row.get('total_received', 0),
            row.get('total_sent', 0),
            row.get('n_tx', 0),
            row.get('final_balance', 0)
        ]
        writer.writerow([address] + api_data + existing_data + [row['label'], row['category']])
        f.flush()
        time.sleep(1)

print("Suspicious wallet feature extraction complete.")


Resuming from suspicious row 5685
Suspicious wallet feature extraction complete.


Clean out empty rows and merge datasets

In [4]:
import pandas as pd

normal_file = 'api_extracted_wallet_features.csv'
suspicious_file = 'api_extracted_suspicious_features.csv'

normal_df = pd.read_csv(normal_file)
suspicious_df = pd.read_csv(suspicious_file)

numeric_cols = [
    'balance', 'total_received', 'total_sent', 'n_tx', 'final_balance', 'unconfirmed_balance', 'final_n_tx',
    'num_txrefs', 'total_confirmations', 'avg_tx_value'
]

normal_df = normal_df[~(normal_df[numeric_cols] == 0).all(axis=1)]
suspicious_df = suspicious_df[~(suspicious_df[numeric_cols] == 0).all(axis=1)]

# Match column order
suspicious_df = suspicious_df[normal_df.columns]

# Merge both
combined_df = pd.concat([normal_df, suspicious_df], ignore_index=True)
# Drop duplicates
combined_df = combined_df.drop_duplicates(subset='address')
combined_df.to_csv('combined_wallet_features.csv', index=False)

total_rows = len(combined_df)
num_risky = combined_df['label'].sum()
num_nonrisky = total_rows - num_risky

print(f"Combined dataset saved: {total_rows} rows after removing zero-feature addresses.")
print(f"Risky addresses: {num_risky}")
print(f"Non-risky addresses: {num_nonrisky}")

Combined dataset saved: 10022 rows after removing zero-feature addresses.
Risky addresses: 4976
Non-risky addresses: 5046


Train model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import joblib

df = pd.read_csv("combined_wallet_features.csv")
df = df.drop(columns=["address", "category"], errors="ignore")
df = df.dropna()

X = df.drop(columns=["label"])
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Define models to compare
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

best_score = 0
best_model = None
best_name = ""
# Train and evaluate
for name, model in models.items():
    print(f"\n\t--- {name} ---")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    macro_f1 = report["macro avg"]["f1-score"]  
    print(classification_report(y_test, y_pred))

    print(f"Macro F1-Score for {name}: {macro_f1:.4f}")

    if macro_f1 > best_score:
        best_score = macro_f1
        best_model = model
        best_name = name

print(f"\nSaved Best Model: {best_name} with Macro F1-Score = {best_score:.4f}")
joblib.dump(best_model, "wallet_risk_model.pkl")

# Show each feature's importance
if hasattr(best_model, "feature_importances_"):
    importance = best_model.feature_importances_
    features = X.columns
    sorted_idx = importance.argsort()[::-1]

    print("\nFeature Importances:")
    for i in sorted_idx:
        print(f"{features[i]}: {importance[i]:.4f}")


	--- Random Forest ---
              precision    recall  f1-score   support

           0       0.92      0.96      0.94      1010
           1       0.96      0.92      0.94       995

    accuracy                           0.94      2005
   macro avg       0.94      0.94      0.94      2005
weighted avg       0.94      0.94      0.94      2005

Macro F1-Score for Random Forest: 0.9401

	--- Logistic Regression ---
              precision    recall  f1-score   support

           0       0.53      0.98      0.69      1010
           1       0.85      0.10      0.18       995

    accuracy                           0.55      2005
   macro avg       0.69      0.54      0.43      2005
weighted avg       0.69      0.55      0.44      2005

Macro F1-Score for Logistic Regression: 0.4332

	--- K-Nearest Neighbors ---
              precision    recall  f1-score   support

           0       0.76      0.88      0.81      1010
           1       0.85      0.72      0.78       995

    accura

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Clustering

In [15]:
from sklearn.cluster import KMeans

df = pd.read_csv("combined_wallet_features.csv")
df = df.drop(columns=["address", "category"], errors="ignore")
df = df.dropna()

# Extract features and label
X = df.drop(columns=["label"])
y = df["label"]

# Add clustering
kmeans = KMeans(n_clusters=5, random_state=42)
df["cluster"] = kmeans.fit_predict(X)

# Analyze cluster riskiness
cluster_risk_summary = df.groupby("cluster")["label"].value_counts(normalize=True).unstack().fillna(0)
print(cluster_risk_summary)

joblib.dump(kmeans, 'kmeans_cluster_model.pkl')

label           0         1
cluster                    
0        0.502803  0.497197
1        1.000000  0.000000
2        1.000000  0.000000
3        1.000000  0.000000
4        0.692308  0.307692


['kmeans_cluster_model.pkl']