# Elliptic AML Dataset Exploration
Initial exploration and visualization of the dataset.

In [None]:
import os, json, numpy as np, pandas as pd
from pathlib import Path
# Seed NumPy's global RNG once so later cells that use it are repeatable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Data locations, relative to the notebook's working directory.
DATA_RAW = Path("../data/elliptic_bitcoin_dataset")  # raw CSV files
DATA_INTERIM = Path("../data/interim")               # intermediate artifacts
DATA_PROCESSED = Path("../data/processed")           # for modelling

# Create the output directories up front; already-existing ones are left alone.
for out_dir in (DATA_INTERIM, DATA_PROCESSED):
    out_dir.mkdir(parents=True, exist_ok=True)

### 2. 加载与完整性校验
校验三件套：elliptic_txs_features.csv, elliptic_txs_edgelist.csv, elliptic_txs_classes.csv 
检查：形状、重复 txId、空值、取值域；edges 的节点是否都在 classes 里

In [None]:
# Load the three raw tables. The features file is read with header=None, so
# its columns get integer labels (0, 1, 2, ...) that later cells rely on.
features = pd.read_csv(DATA_RAW / "elliptic_txs_features.csv", header=None)
edges = pd.read_csv(DATA_RAW / "elliptic_txs_edgelist.csv")
classes = pd.read_csv(DATA_RAW / "elliptic_txs_classes.csv")

# Integrity checks raise explicitly instead of using `assert`: assertions are
# stripped under `python -O` and carry no diagnostic message.
if features.shape[0] != classes.shape[0]:
    raise ValueError(
        f"row count mismatch: features has {features.shape[0]} rows, "
        f"classes has {classes.shape[0]}"
    )
if not classes['txId'].is_unique:
    n_dup_ids = int(classes['txId'].duplicated().sum())
    raise ValueError(f"classes contains {n_dup_ids} duplicated txId values")

# Missing values / duplicated edges (reported only, not treated as fatal).
nulls = features.isna().sum().sum()
dups_edges = edges.duplicated().sum()
print(f"NULLs in features: {nulls}, duplicate edges: {dups_edges}")

# Every edge endpoint must be a transaction listed in the class table.
# The lookup set is built once and reused for both endpoint columns.
known_ids = set(classes['txId'])
for endpoint in ('txId1', 'txId2'):
    missing = set(edges[endpoint]) - known_ids
    if missing:
        raise ValueError(
            f"{len(missing)} ids in edges['{endpoint}'] are absent from "
            f"classes['txId'], e.g. {sorted(missing)[:5]}"
        )

### 3. 标签清洗与映射
- 建议：针对三类标签（illicit / licit / unknown）保留两套数据：
  - A) 二分类（丢弃 unknown）→ 给监督学习
  - B) 半监督/异常检测（保留 unknown）→ 给 One-Class/IF/GNN 半监督
- 建议映射：{'illicit':1, 'licit':0}（原始 class 列中 '1' 对应 illicit、'2' 对应 licit），unknown 先丢或单独存

In [None]:
# Attach txId to the feature rows. The assignment aligns on the default
# RangeIndex of both frames, so it relies on `features` and `classes` listing
# the transactions in the same row order.
# NOTE(review): that ordering is assumed, not verified here - confirm.
features['txId'] = classes['txId']

# Raw class code -> binary target; 'unknown' becomes NaN (unlabelled).
# Presumably raw 1 = illicit and raw 2 = licit, per the notebook notes.
# Both str and int spellings are listed because read_csv yields strings when
# the column also contains 'unknown' and ints when it does not.
label_map = {'1': 1, 1: 1, '2': 0, 2: 0, 'unknown': np.nan}
raw_class = classes['class']

# Series.map turns any value missing from label_map into NaN, which would be
# indistinguishable from 'unknown' and silently dropped below. Fail loudly.
unmapped = raw_class[raw_class.notna() & ~raw_class.isin(list(label_map))]
if not unmapped.empty:
    raise ValueError(
        "unexpected values in classes['class']: "
        f"{sorted(unmapped.astype(str).unique())}"
    )

y_raw = raw_class.map(label_map)

# Rows with a known label form the supervised subset.
mask_sup = y_raw.notna()

# Features and labels are both re-indexed 0..n-1 so they stay row-aligned.
feat_sup = features.loc[mask_sup].reset_index(drop=True)
y_sup = y_raw[mask_sup].astype(int).reset_index(drop=True)

### 4. 基础特征工程与清洗
- 丢掉常数/近零方差特征
- 可选：对高缺失/极端值做处理（你的 features 通常无缺失）
- 共线筛选（相关系数绝对值 > 0.98 的特征对只保留其一）

In [None]:
# Every column except the txId key is treated as a raw numeric feature.
num_cols = [col for col in feat_sup.columns if col != 'txId']

# Constant columns: keep only those with more than one distinct value.
nunique = feat_sup[num_cols].nunique()
keep = [col for col, n_distinct in nunique.items() if n_distinct > 1]
feat_sup = feat_sup[['txId', *keep]]

# De-duplicate highly correlated columns. Only the strict upper triangle of
# the absolute correlation matrix is inspected, so within each correlated
# pair the later column is the one that gets dropped.
corr = feat_sup[keep].corr().abs()
above_diagonal = np.triu(np.ones(corr.shape, dtype=bool), k=1)
upper = corr.where(above_diagonal)
exceeds_cutoff = (upper > 0.98).any(axis=0)
to_drop = [col for col, redundant in exceeds_cutoff.items() if redundant]
feat_sup = feat_sup.drop(columns=to_drop)
print(f"dropped {len(to_drop)} highly correlated columns")

### 5. 时序信息与切分

Elliptic 数据自带 time step（官方 Kaggle 版本的 features 文件无表头：第 0 列是 txId，time step 在第 1 列；请以你手头的数据为准），建议基于时间切分：
- 训练：前 70% 的时间步
- 验证：中间 10%
- 测试：最后 20%

In [None]:
# Raw column holding the time step. In the public Elliptic features CSV
# (no header row) column 0 is the transaction id and column 1 is the time
# step, so splitting on column 0 would order rows by id instead of by time.
# Adjust this index if your copy of the data is laid out differently.
timestep_col = 1
if timestep_col not in feat_sup.columns:
    raise KeyError(
        f"time-step column {timestep_col!r} is missing from feat_sup; it may "
        "have been removed by the constant/correlation filters"
    )

df = feat_sup.copy()
df['timestep'] = df[timestep_col]

# Sanity check: a time step repeats across many rows, whereas an identifier
# (or a continuous feature) is unique per row.
if len(df) > 1 and df['timestep'].is_unique:
    raise ValueError(
        f"column {timestep_col!r} has a distinct value on every row, so it "
        "does not look like a time step; check timestep_col"
    )

# Attach labels; feat_sup and y_sup share the same 0..n-1 index.
df['label'] = y_sup

# Order chronologically, then cut at the 70% / 80% quantiles of the per-row
# time-step values. All rows of one time step land in the same split.
df = df.sort_values('timestep').reset_index(drop=True)
ts = df['timestep'].values

t1 = np.quantile(ts, 0.7)
t2 = np.quantile(ts, 0.8)

train_idx = df['timestep'] <= t1
valid_idx = (df['timestep'] > t1) & (df['timestep'] <= t2)
test_idx = df['timestep'] > t2

train, valid, test = df[train_idx], df[valid_idx], df[test_idx]

# With very few distinct time steps the two cut points can coincide and
# leave a split empty, which would only fail later with an obscure error.
for split_name, part in (("train", train), ("valid", valid), ("test", test)):
    if part.empty:
        raise ValueError(
            f"{split_name} split is empty (t1={t1}, t2={t2}); "
            "not enough distinct time steps"
        )
print(train.shape, valid.shape, test.shape)

### 6. 标准化/缩放（仅在训练集拟合）
- 强烈建议用 sklearn 的 Pipeline 保存 scaler，并只用训练集拟合

In [None]:
from sklearn.preprocessing import StandardScaler

# Bookkeeping columns that are not model inputs. Besides the derived
# 'timestep' column this also excludes its raw source column (timestep_col);
# otherwise the time step would still reach the model under its raw label.
non_feature_cols = {'txId', 'label', 'timestep', timestep_col}

# Also exclude any raw column that is a verbatim copy of txId: a row
# identifier is not a meaningful predictor and must not be scaled or learned.
tx_ids = df['txId'].to_numpy()
id_copies = {
    c for c in df.columns
    if c not in non_feature_cols and np.array_equal(df[c].to_numpy(), tx_ids)
}

feature_cols = [
    c for c in df.columns
    if c not in non_feature_cols and c not in id_copies
]
scaler = StandardScaler()

X_train = train[feature_cols].copy()
X_valid = valid[feature_cols].copy()
X_test = test[feature_cols].copy()

# Fit on the training split only, so validation/test statistics never
# influence the scaling parameters.
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_valid_s = scaler.transform(X_valid)
X_test_s = scaler.transform(X_test)

y_train = train['label'].values
y_valid = valid['label'].values
y_test = test['label'].values

### 7. 类别不平衡处理（思路）
- 基线：模型里用 class_weight='balanced'
- 采样：如果要用 SMOTE/Under-sampling，只在训练集做
- 替代：Focal Loss（在后续 03_modeling/深度模型中再上）

在预处理阶段先不做采样，避免把选择“写死”到数据层；把采样放到建模 Pipeline 更灵活。


### 8. 保存产物
把可训练矩阵与对照表、scaler 参数、列名、切分边界全部存起来。

In [None]:
import joblib

# The features were read with header=None, so feature_cols holds integer
# labels. Parquet column names are strings: older pandas releases and the
# fastparquet engine reject non-string names outright. Convert once and use
# the same string names in the metadata so both artifacts agree.
feature_names = [str(c) for c in feature_cols]

# Scaled matrices and label vectors, one pair of files per split.
for split_name, X_scaled, y_split in (
    ("train", X_train_s, y_train),
    ("valid", X_valid_s, y_valid),
    ("test", X_test_s, y_test),
):
    pd.DataFrame(X_scaled, columns=feature_names).to_parquet(
        DATA_PROCESSED / f"X_{split_name}.parquet"
    )
    pd.Series(y_split).to_frame("label").to_parquet(
        DATA_PROCESSED / f"y_{split_name}.parquet"
    )

# txId / timestep of every row in each split, for plotting and traceability.
for split_name, part in (("train", train), ("valid", valid), ("test", test)):
    part[['txId', 'timestep']].to_parquet(
        DATA_INTERIM / f"{split_name}_ids.parquet"
    )

# Fitted scaler plus the metadata needed to reproduce the preprocessing.
joblib.dump(scaler, DATA_PROCESSED / "scaler.joblib")
meta = {
    "random_seed": RANDOM_SEED,
    "dropped_high_corr": [str(c) for c in to_drop],
    "feature_cols": feature_names,
    "timestep_quantiles": {"train<=q0.7": float(t1), "valid<=q0.8": float(t2)},
    "samples": {"train": int(len(train)), "valid": int(len(valid)), "test": int(len(test))},
}
with open(DATA_PROCESSED / "metadata.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

### 9. 质量闸（QA）与可复现性
- 检查各 split 标签比例
- 检查没有数据泄漏（验证/测试与训练的时间边界）
- 固定随机种子；记录依赖版本（pip freeze > requirements.txt）

In [None]:
# Label counts per split. Values are converted to built-in ints so the
# printed dict stays readable (NumPy >= 2 prints scalars as `np.int64(...)`).
for name, y in [("train", y_train), ("valid", y_valid), ("test", y_test)]:
    unique, counts = np.unique(y, return_counts=True)
    print(name, dict(zip(unique.tolist(), counts.tolist())))

# Temporal leakage gate: each split must end strictly before the next begins.
# Skipped when a split is empty, because min/max would then be NaN.
if len(train) and len(valid) and len(test):
    train_end = train['timestep'].max()
    valid_start, valid_end = valid['timestep'].min(), valid['timestep'].max()
    test_start = test['timestep'].min()
    print(
        f"timestep ranges: train<={train_end}, "
        f"valid=[{valid_start}, {valid_end}], test>={test_start}"
    )
    if not (train_end < valid_start and valid_end < test_start):
        raise ValueError("time steps overlap between splits (temporal leakage)")