# Transaction Static Feature

## Setup

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
import pandas as pd
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

from utils.comparing import report_for_multiple_model

# Project root: the directory two levels above the current working directory.
# NOTE: this depends on where the kernel was started; parents[1] raises
# IndexError when the working directory has fewer than two ancestors.
PATH = Path.cwd().parents[1]
# Directory holding the labeled CSV inputs, kept as a plain string path.
DATA_PATH = os.path.join(PATH, 'data/labeled')

## Load Data

In [3]:
# Locations of the two labeled inputs.
feature_csv = os.path.join(DATA_PATH, 'transaction_feature.csv')
groundtruth_csv = os.path.join(DATA_PATH, 'groundtruth.csv')

# Both tables are keyed by the 'Address' column.
feature_df = pd.read_csv(feature_csv).set_index('Address')
groundtruth_df = pd.read_csv(groundtruth_csv).set_index('Address')

# Remember which columns are inputs and which are targets before merging.
feature_cols = list(feature_df.columns)
label_cols = list(groundtruth_df.columns)

# Keep only the addresses present in both tables (inner join on the index).
merged_df = groundtruth_df.merge(
    feature_df,
    left_index=True,
    right_index=True,
    how='inner',
)


## Separate features and labels

In [4]:
# Feature matrix and label matrix, row-aligned because both come from merged_df.
X, y = merged_df[feature_cols], merged_df[label_cols]

## Standardize Features

In [5]:
# Standardize every feature column; the result is what the model comparison
# below receives as input.
# NOTE(review): the scaler is fitted on all rows of X. If
# report_for_multiple_model performs the train/test split internally, the
# test rows contribute to the fitted statistics -- confirm this is acceptable.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)


## Classification report

In [6]:
# Run the project's multi-model comparison on the scaled features.
# presumably the helper also performs the train/test split that the tuning
# cells reuse -- verify in utils.comparing
(
    report_df,
    X_train,
    X_test,
    y_train,
    y_test,
) = report_for_multiple_model(X_scaled, y)

[LightGBM] [Info] Number of positive: 14, number of negative: 41
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000118 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 342
[LightGBM] [Info] Number of data points in the train set: 55, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.254545 -> initscore=-1.074515
[LightGBM] [Info] Start training from score -1.074515
[LightGBM] [Info] Number of positive: 6, number of negative: 49
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000064 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 342
[LightGBM] [Info] Number of data points in the train set: 55, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.109091 -> initscore=-2.100061
[LightGBM] [Info] Start training from score -2.100061
[LightGBM] [Info] Number of posit

In [7]:
# Show the comparison table ordered from highest to lowest 'macro avg f1'.
report_df.sort_values('macro avg f1', ascending=False)

Unnamed: 0,micro avg f1,macro avg f1,Mint f1,Leak f1,Limit f1
OneVsRest(GaussianNB),0.62963,0.607814,0.6,0.461538,0.761905
MultiOutput(GaussianNB),0.62963,0.607814,0.6,0.461538,0.761905
MultiOutput(LogisticRegression),0.608696,0.590058,0.533333,0.5,0.736842
OneVsRest(LogisticRegression),0.608696,0.590058,0.533333,0.5,0.736842
OneVsRest(RandomForest),0.555556,0.425926,0.0,0.5,0.777778
MultiOutput(RandomForest),0.555556,0.425926,0.0,0.5,0.777778
MultiOutput(MLP),0.190476,0.155844,0.285714,0.0,0.181818
OneVsRest(MLP),0.190476,0.155844,0.285714,0.0,0.181818
OneVsRest(XGBoost),0.105263,0.095238,0.285714,0.0,0.0
MultiOutput(XGBoost),0.105263,0.095238,0.285714,0.0,0.0


## Tuning

In [8]:
from sklearn.naive_bayes import GaussianNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV

# One GaussianNB per label column, wrapped in a one-vs-rest classifier.
base_model = GaussianNB()
ovr = OneVsRestClassifier(base_model)

# The 'estimator__' prefix addresses parameters of the wrapped GaussianNB.
param_grid = {'estimator__var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]}

# 5-fold grid search on the training split, scored by micro-averaged F1
# (other scorers such as 'f1_macro' or 'roc_auc' can be substituted).
grid = GridSearchCV(
    estimator=ovr,
    param_grid=param_grid,
    scoring='f1_micro',
    cv=5,
).fit(X_train, y_train)

# Report the winning configuration and its cross-validated score.
print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)


Best params: {'estimator__var_smoothing': 1e-09}
Best score: 0.4551571739807033


In [9]:
# The grid search reports parameters under the names used to address them on
# the OneVsRestClassifier wrapper ("estimator__<param>"). Strip only that
# leading prefix to recover keyword arguments for the wrapped GaussianNB;
# str.replace would also delete any later occurrence of the substring inside
# a parameter name.
_WRAPPER_PREFIX = "estimator__"
best_params_clean = {
    (k[len(_WRAPPER_PREFIX):] if k.startswith(_WRAPPER_PREFIX) else k): v
    for k, v in grid.best_params_.items()
}

# Rebuild the winning configuration explicitly and fit it on the training split.
# NOTE(review): with GridSearchCV's default refit=True, grid.best_estimator_
# should already hold an equivalent fitted model -- confirm before relying on it.
base_model = GaussianNB(**best_params_clean)

final_model = OneVsRestClassifier(base_model)
final_model.fit(X_train, y_train)

In [10]:
# Predict on the held-out split and print per-label precision/recall/F1.
y_pred = final_model.predict(X_test)
# Name each report row after its label column instead of the positional
# indices 0/1/2, so the numbers can be read without cross-referencing.
# assumes y_test keeps the column order of label_cols -- TODO confirm against
# report_for_multiple_model
print(classification_report(y_test, y_pred, target_names=label_cols))

              precision    recall  f1-score   support

           0       0.43      1.00      0.60         6
           1       0.30      1.00      0.46         3
           2       0.67      0.89      0.76         9

   micro avg       0.47      0.94      0.63        18
   macro avg       0.47      0.96      0.61        18
weighted avg       0.53      0.94      0.66        18
 samples avg       0.48      0.68      0.53        18

