In [None]:
import pandas as pd
# Load the combined wallet features + class table from the Kaggle input directory.
# NOTE(review): the path is hard-coded to the Kaggle runtime layout.
df_wallets_features_classes_combined = pd.read_csv('/kaggle/input/pretrain1/wallets_features_classes_combined.csv')

In [None]:
# Remove the 'Time step' column, then discard rows that are exact duplicates
# once that column is gone. The raw frame is left untouched.
df_wallets_classification = (
    df_wallets_features_classes_combined
    .drop(columns=['Time step'])
    .drop_duplicates()
)
df_wallets_classification

In [None]:
# Drop unknown wallets (class == 3): only classes 1 and 2 are classified.
# `data` collects the addresses that have at least one row whose class is not 3;
# every row belonging to one of those addresses is then kept.
is_labelled = df_wallets_classification['class'] != 3
data = df_wallets_classification.loc[is_labelled, 'address']
keep_row = df_wallets_classification['address'].isin(data)
df_wallets_feature_selected = df_wallets_classification.loc[keep_row]
# NOTE(review): the original trailing comment here was cut off ("Cuz the actor ...");
# it presumably explained why rows are selected by address rather than by class - confirm.
df_wallets_feature_selected

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import VotingClassifier
from sklearn.base import clone 
import xgboost as xgb

In [None]:
# Goal: binary classification with labels 0 and 1.
#   0: licit   (original class 2)
#   1: illicit (every other class still present after the unknown filter)
# NOTE(review): the earlier wording called class 2 "illicit", which contradicted the
# 0 = licit legend and the mapping below - confirm against the dataset documentation.
y = (df_wallets_feature_selected['class'] != 2).astype('int64')

# shuffle=False keeps the row order: the first 70% of rows form the training split and
# the last 30% the test split. random_state is kept as-is although nothing is shuffled.
X_train, X_test, y_train, y_test = train_test_split(
    df_wallets_feature_selected,
    y,
    test_size=0.30,
    random_state=15,
    shuffle=False,
)

# The identifier and the target must not be fed to the models as features.
non_feature_cols = ['address', 'class']
X_train = X_train.drop(columns=non_feature_cols)
X_test = X_test.drop(columns=non_feature_cols)

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits so no statistic is learned from the test split.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# The scaler returns bare arrays; wrap them back into DataFrames carrying the
# column names of the corresponding unscaled split.
X_train_scaled, X_test_scaled = (
    pd.DataFrame(scaled_values, columns=unscaled.columns)
    for scaled_values, unscaled in ((X_train_scaled, X_train), (X_test_scaled, X_test))
)
X_train_scaled

In [None]:
def _as_flat_labels(labels):
    """Return `labels.values.ravel()` when `labels` exposes `.values`, else `labels` unchanged."""
    if hasattr(labels, 'values'):
        return labels.values.ravel()
    return labels


# Make sure both label vectors are 1-D arrays.
y_train = _as_flat_labels(y_train)
y_test = _as_flat_labels(y_test)

## Train model again

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_fscore_support, f1_score
from lightgbm import LGBMClassifier

In [None]:
def evaluate_model(model_name, y_true, y_pred):
    """Print precision, recall and F1 for label 1, plus the micro-averaged F1.

    Args:
        model_name: Label printed as the header of the report.
        y_true: Ground-truth labels (0 / 1).
        y_pred: Predicted labels (0 / 1).

    Returns:
        None. The four metrics are printed to stdout.
    """
    # labels=[0, 1] pins the per-class arrays to exactly two entries, so index 1 is
    # always label 1. Without it the arrays only cover the labels that actually occur,
    # and indexing [1] raises IndexError when a single class is present.
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, labels=[0, 1])
    micro_f1 = f1_score(y_true, y_pred, average='micro')
    print(f"\n 📌 {model_name}")
    print("Precision: %.3f" % prec[1])
    print("Recall: %.3f" % rec[1])
    print("F1 Score: %.3f" % f1[1])
    print("Micro-Average F1: %.3f" % micro_f1)

In [None]:
# Random forest: 100 trees with a fixed seed, fitted on the scaled training
# split and evaluated on the scaled test split.
rf_settings = {'n_estimators': 100, 'random_state': 42}
rf = RandomForestClassifier(**rf_settings)
rf.fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)
evaluate_model("Random Forest", y_test, y_pred_rf)

In [None]:
# XGBoost classifier fitted and evaluated on the same splits as the random forest.
# NOTE(review): binding the model to `xgb` shadows the `import xgboost as xgb`
# module alias from the import cell; later cells rely on `xgb` being the fitted model.
xgb_settings = {
    'use_label_encoder': False,
    'eval_metric': 'logloss',
    'random_state': 42,
}
xgb = XGBClassifier(**xgb_settings)
xgb.fit(X_train_scaled, y_train)
y_pred_xgb = xgb.predict(X_test_scaled)
evaluate_model("XGBoost", y_test, y_pred_xgb)

# Use best model for feature selection

## Feature importance

In [None]:
# Author's note: RF and XGB were the two best models, XGB had the higher recall and the
# RF + XGB ensemble beat RF alone, so feature selection uses the ensemble of both.
# Built-in (impurity / gain based) feature importances of the two fitted models.
rf_importances, xgb_importances = rf.feature_importances_, xgb.feature_importances_

In [None]:
# Feature labels: real column names when available, otherwise positional placeholders.
if hasattr(X_train_scaled, 'columns'):
    feature_labels = X_train_scaled.columns
else:
    feature_labels = [f'Feature_{i}' for i in range(X_train_scaled.shape[1])]

# One row per feature with the RF and XGB importances, plus two ensemble views:
#   Imp     - arithmetic mean of the two importances
#   Imp_max - the larger of the two importances
imp_df = pd.DataFrame({
    'Feature': feature_labels,
    'RF_Imp': rf_importances,
    'XGB_Imp': xgb_importances,
}).assign(
    Imp=lambda frame: (frame['RF_Imp'] + frame['XGB_Imp']) / 2,
    Imp_max=lambda frame: frame[['RF_Imp', 'XGB_Imp']].max(axis=1),
)

# Two rankings of the same table, most important feature first.
imp_df_sorted = imp_df.sort_values(by="Imp", ascending=False)
imp_df_max_sorted = imp_df.sort_values(by="Imp_max", ascending=False)

In [None]:
imp_df_sorted.head(20)  # top 20 features ranked by the mean of the RF and XGB importances

In [None]:
# Top 20 features ranked by the larger of the RF and XGB importances.
imp_df_max_sorted.head(20)


## Lấy top & bot 20 feature theo Feature importance theo max & mean

In [None]:
# Built-in feature importance (FI) ranked by the RF/XGB mean: first and last 20 names.
fi_names_by_mean = imp_df_sorted['Feature']
top20_fi = fi_names_by_mean.head(20).tolist()
bottom20_fi = fi_names_by_mean.tail(20).tolist()

# Same lists for the ranking by the RF/XGB maximum.
fi_names_by_max = imp_df_max_sorted['Feature']
top20_fi_max = fi_names_by_max.head(20).tolist()
bottom20_fi_max = fi_names_by_max.tail(20).tolist()

# The equivalent lists for PFI and DCFI are built once those scores are computed
# (top20_pfi, bottom20_pfi, top20_dcfi, bottom20_dcfi).

## PFI (Permutation feature importance)

In [None]:
import eli5
from eli5.sklearn import PermutationImportance
import pandas as pd
import matplotlib.pyplot as plt


def _permutation_weights(model, weight_col):
    """Fit eli5's PermutationImportance for `model` on the scaled test split.

    Returns the fitted PermutationImportance object and a two-column frame
    ('feature', `weight_col`) holding one row for EVERY feature.

    The weights are read from `feature_importances_` instead of going through
    `eli5.explain_weights_df`, because that helper applies eli5's default `top`
    limit and therefore returns only the highest-ranked features; the inner merge
    below would then silently drop most features from the PFI ranking.
    """
    perm = PermutationImportance(model, random_state=42).fit(X_test_scaled, y_test)
    weights = pd.DataFrame({
        'feature': X_test_scaled.columns.tolist(),
        weight_col: perm.feature_importances_,
    })
    return perm, weights


# 1. PFI for the random forest
perm_rf, pfi_rf = _permutation_weights(rf, 'weight_rf')

# 2. PFI for XGBoost
perm_xgb, pfi_xgb = _permutation_weights(xgb, 'weight_xgb')

# 3. Combine the PFI scores of the two models per feature
pfi_merged = pd.merge(pfi_rf, pfi_xgb, on='feature')

# 4. Mean and max of the two PFI scores
pfi_merged['weight_mean'] = (pfi_merged['weight_rf'] + pfi_merged['weight_xgb']) / 2
pfi_merged['weight_max'] = pfi_merged[['weight_rf', 'weight_xgb']].max(axis=1)

# 5. Top 20 / bottom 20 features by mean PFI
top20_pfi_mean = pfi_merged.sort_values('weight_mean', ascending=False).head(20)['feature'].tolist()
bottom20_pfi_mean = pfi_merged.sort_values('weight_mean', ascending=True).head(20)['feature'].tolist()

# 6. Top 20 / bottom 20 features by max PFI
top20_pfi_max = pfi_merged.sort_values('weight_max', ascending=False).head(20)['feature'].tolist()
bottom20_pfi_max = pfi_merged.sort_values('weight_max', ascending=True).head(20)['feature'].tolist()

## DCFI (Drop Column Feature Importance)

In [None]:
from sklearn.base import clone
import pandas as pd

def compute_dcfi(model, X_train, y_train, random_state=42):
    """Drop-column feature importance (DCFI) of `model`.

    A fresh clone of `model` is fitted on all features and scored; then, for each
    column in turn, another fresh clone is fitted and scored with that column
    removed. The importance of a column is `full_score - score_without_column`.

    Note that every score is computed on the same data the clone was fitted on
    (`X_train`, `y_train`), i.e. it is a training score.

    Args:
        model: Estimator accepted by `sklearn.base.clone`, exposing `fit` and `score`.
        X_train: Feature matrix, either a DataFrame or a 2-D array.
        y_train: Target vector (Series or array).
        random_state: Value assigned to each clone's `random_state` attribute, when
            the clone has one.

    Returns:
        DataFrame with columns 'Feature' and 'Imp', sorted by 'Imp' in descending
        order with a fresh 0..n-1 index. The row order is therefore NOT the column
        order of `X_train`; align on 'Feature' when combining results.
    """
    # Imported locally because the notebook never imports numpy at the top level;
    # previously the array branch failed with NameError on `np`.
    import numpy as np

    # Work on plain arrays: unwrap pandas objects, leave arrays untouched.
    X_train_array = X_train.values if hasattr(X_train, 'values') else X_train
    y_train_array = y_train.values if hasattr(y_train, 'values') else y_train

    def _fit_and_score(features):
        """Fit a fresh, seeded clone of `model` on `features` and return its score there."""
        estimator = clone(model)  # never touch the caller's fitted model
        if hasattr(estimator, 'random_state'):
            estimator.random_state = random_state
        estimator.fit(features, y_train_array)
        return estimator.score(features, y_train_array)

    benchmark_score = _fit_and_score(X_train_array)

    # Columns are removed by position for DataFrames and arrays alike, so exactly one
    # column is dropped per iteration even if column names are duplicated.
    importances = [
        benchmark_score - _fit_and_score(np.delete(X_train_array, col, axis=1))
        for col in range(X_train_array.shape[1])
    ]

    # Real column names when available, positional placeholders otherwise.
    if hasattr(X_train, 'columns'):
        features = X_train.columns
    else:
        features = [f'Feature_{i}' for i in range(X_train_array.shape[1])]

    dcfi_df = pd.DataFrame({
        'Feature': features,
        'Imp': importances
    }).sort_values('Imp', ascending=False).reset_index(drop=True)

    return dcfi_df


# Compute DCFI for RF and XGB
dcfi_rf = compute_dcfi(rf, X_train_scaled, y_train)
dcfi_xgb = compute_dcfi(xgb, X_train_scaled, y_train)

# Combine both into one DataFrame.
# compute_dcfi returns its rows sorted by importance, each model in its own order, so the
# scores must be re-aligned by feature name before being placed next to the column list.
# Pairing the raw `.values` positionally would attach importances to the wrong features.
# (Assumes feature names are unique.)
feature_order = X_train_scaled.columns
dcfi_df = pd.DataFrame({
    'Feature': feature_order,
    'RF_Imp': dcfi_rf.set_index('Feature')['Imp'].reindex(feature_order).values,
    'XGB_Imp': dcfi_xgb.set_index('Feature')['Imp'].reindex(feature_order).values,
})

# Aggregate the two models: mean and max
dcfi_df['Imp_mean'] = dcfi_df[['RF_Imp', 'XGB_Imp']].mean(axis=1)
dcfi_df['Imp_max'] = dcfi_df[['RF_Imp', 'XGB_Imp']].max(axis=1)

# Rank by mean or by max, depending on the purpose
dcfi_df_sorted_mean = dcfi_df.sort_values('Imp_mean', ascending=False).reset_index(drop=True)
dcfi_df_sorted_max = dcfi_df.sort_values('Imp_max', ascending=False).reset_index(drop=True)

print("Top features theo DCFI mean:")
print(dcfi_df_sorted_mean.head(20))

print("\nTop features theo DCFI max:")
print(dcfi_df_sorted_max.head(20))


In [None]:
from sklearn.preprocessing import MinMaxScaler

# 1. FI - built-in importances. The max-based table must take the 'Imp_max' column;
#    selecting 'Imp' here would relabel the MEAN importance as FI_Max.
fi_mean_df = imp_df_sorted[['Feature', 'Imp']].rename(columns={'Imp': 'FI_Mean'})
fi_max_df = imp_df_max_sorted[['Feature', 'Imp_max']].rename(columns={'Imp_max': 'FI_Max'})

# 2. PFI - permutation importances (column renamed to match the other tables' key)
pfi_mean_df = pfi_merged[['feature', 'weight_mean']].rename(columns={'feature': 'Feature', 'weight_mean': 'PFI_Mean'})
pfi_max_df = pfi_merged[['feature', 'weight_max']].rename(columns={'feature': 'Feature', 'weight_max': 'PFI_Max'})

# 3. DCFI - drop-column importances
dcfi_mean_df = dcfi_df[['Feature', 'Imp_mean']].rename(columns={'Imp_mean': 'DCFI_Mean'})
dcfi_max_df = dcfi_df[['Feature', 'Imp_max']].rename(columns={'Imp_max': 'DCFI_Max'})

# Merge everything on the feature name (default inner join: a feature missing from any
# of the three tables is dropped from the result).
merged = fi_mean_df.merge(pfi_mean_df, on='Feature').merge(dcfi_mean_df, on='Feature')
merged_max = fi_max_df.merge(pfi_max_df, on='Feature').merge(dcfi_max_df, on='Feature')

# Normalise each score column to [0, 1] so the three methods are comparable.
# NOTE(review): this rebinds the global `scaler`, replacing the feature scaler that was
# fitted on X_train earlier in the notebook.
scaler = MinMaxScaler()
merged[['FI_Mean', 'PFI_Mean', 'DCFI_Mean']] = scaler.fit_transform(merged[['FI_Mean', 'PFI_Mean', 'DCFI_Mean']])
merged_max[['FI_Max', 'PFI_Max', 'DCFI_Max']] = scaler.fit_transform(merged_max[['FI_Max', 'PFI_Max', 'DCFI_Max']])


In [None]:
# Aggregate the three normalised scores per feature:
#   Mean_Score - average of the mean-based FI / PFI / DCFI columns
#   Max_Score  - largest of the max-based FI / PFI / DCFI columns
mean_score_parts = ['FI_Mean', 'PFI_Mean', 'DCFI_Mean']
max_score_parts = ['FI_Max', 'PFI_Max', 'DCFI_Max']
merged['Mean_Score'] = merged.loc[:, mean_score_parts].mean(axis=1)
merged_max['Max_Score'] = merged_max.loc[:, max_score_parts].max(axis=1)


In [None]:
def _first20_features(scores, score_col, ascending):
    """Sort `scores` by `score_col` and return the 'Feature' names of the first 20 rows."""
    ranked = scores.sort_values(score_col, ascending=ascending)
    return ranked.head(20)['Feature'].tolist()


# Top and bottom 20 by the mean-based aggregate score
top20_mean = _first20_features(merged, 'Mean_Score', ascending=False)
bottom20_mean = _first20_features(merged, 'Mean_Score', ascending=True)

# Top and bottom 20 by the max-based aggregate score
top20_max = _first20_features(merged_max, 'Max_Score', ascending=False)
bottom20_max = _first20_features(merged_max, 'Max_Score', ascending=True)
