# TF-IDF Static Feature

## Set up

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
import pandas as pd
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

from utils.comparing import report_for_multiple_model

PATH = Path.cwd().parents[1]
DATA_PATH = os.path.join(PATH, 'data/labeled')

## Load Data

In [3]:
feature_df = pd.read_csv(os.path.join(DATA_PATH, 'tf_idf.csv')).set_index('Address')
groundtruth_df = pd.read_csv(os.path.join(DATA_PATH, 'groundtruth.csv')).set_index('Address')
feature_cols = [col for col in feature_df.columns]
label_cols = [col for col in groundtruth_df.columns]
merged_df = pd.merge(groundtruth_df, feature_df, left_index=True, right_index=True, how='inner')


## Separate features and labels

In [4]:
X = merged_df[feature_cols]
y = merged_df[label_cols]

## Scale/Normalize Features

In [5]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


## Classification report

In [6]:
report_df, X_train, X_test, y_train, y_test = report_for_multiple_model(X_scaled, y)

[LightGBM] [Info] Number of positive: 15, number of negative: 39
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002250 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5504
[LightGBM] [Info] Number of data points in the train set: 54, number of used features: 409
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.277778 -> initscore=-0.955511
[LightGBM] [Info] Start training from score -0.955511
[LightGBM] [Info] Number of positive: 8, number of negative: 46
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000651 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5504
[LightGBM] [Info] Number of data points in the train set: 54, number of used features: 409
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.148148 -> initscore=-1.749200
[LightGBM] [Info] Start training from score -1.749200
[LightGBM] [Info] Number of p

In [7]:
report_df.sort_values(by='macro avg f1', ascending=False)

Unnamed: 0,micro avg f1,macro avg f1,Mint f1,Leak f1,Limit f1
MultiOutput(LogisticRegression),0.72,0.766667,0.5,1.0,0.8
OneVsRest(LogisticRegression),0.72,0.766667,0.5,1.0,0.8
OneVsRest(AdaBoost),0.695652,0.75641,0.5,1.0,0.769231
MultiOutput(AdaBoost),0.695652,0.75641,0.5,1.0,0.769231
MultiOutput(XGBoost),0.56,0.671795,0.4,1.0,0.615385
OneVsRest(XGBoost),0.56,0.671795,0.4,1.0,0.615385
OneVsRest(ExtraTrees),0.608696,0.666667,0.285714,1.0,0.714286
MultiOutput(ExtraTrees),0.608696,0.666667,0.285714,1.0,0.714286
MultiOutput(GaussianNB),0.692308,0.645299,0.666667,0.5,0.769231
OneVsRest(GaussianNB),0.692308,0.645299,0.666667,0.5,0.769231


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV

# Base model
base_model = LogisticRegression(max_iter=500, class_weight='balanced', random_state=42)

# Wrap with MultiOutput
multi_model = MultiOutputClassifier(base_model)

# Define the grid
param_grid = {
    'estimator__C': [0.01, 0.1, 1, 10],                   # Regularization strength
    'estimator__penalty': ['l2'],                         # 'l1' only with 'liblinear' or 'saga'
    'estimator__solver': ['lbfgs', 'saga'],               # 'saga' supports l1, elasticnet
    'estimator__tol': [1e-4, 1e-3],                       # convergence tolerance
}

grid = GridSearchCV(
    multi_model,
    param_grid,
    scoring='f1_macro',  # You can try 'f1_samples' or others
    cv=5,
    verbose=2,
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("Best Params:", grid.best_params_)
print("Best Score:", grid.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END estimator__C=0.01, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.0001; total time=   0.1s
[CV] END estimator__C=0.01, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.001; total time=   0.1s
[CV] END estimator__C=0.01, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.0001; total time=   0.1s
[CV] END estimator__C=0.01, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.001; total time=   0.1s
[CV] END estimator__C=0.01, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.0001; total time=   0.1s
[CV] END estimator__C=0.01, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.0001; total time=   0.1s
[CV] END estimator__C=0.01, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.0001; total time=   0.1s
[CV] END estimator__C=0.01, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.001; total time=   0.1



[CV] END estimator__C=0.1, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.0001; total time=   0.1s
[CV] END estimator__C=0.1, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.001; total time=   0.1s
[CV] END estimator__C=0.01, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.001; total time=   0.4s
[CV] END estimator__C=0.1, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.001; total time=   0.1s
[CV] END estimator__C=0.1, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.001; total time=   0.1s
[CV] END estimator__C=0.1, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.001; total time=   0.0s
[CV] END estimator__C=0.1, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.001; total time=   0.0s
[CV] END estimator__C=0.01, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.0001; total time=   1.2s
[CV] END estimator__C=0.01, estimator__penalty=l2, estimator__solver=s



[CV] END estimator__C=0.01, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.0001; total time=   1.5s




[CV] END estimator__C=0.01, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.0001; total time=   1.9s
[CV] END estimator__C=0.01, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.0001; total time=   1.9s




[CV] END estimator__C=0.1, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.001; total time=   0.9s
[CV] END estimator__C=0.1, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.001; total time=   0.6s
[CV] END estimator__C=0.1, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.0001; total time=   1.8s
[CV] END estimator__C=1, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.0001; total time=   0.1s
[CV] END estimator__C=1, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.0001; total time=   0.1s
[CV] END estimator__C=1, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.0001; total time=   0.1s




[CV] END estimator__C=0.1, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.001; total time=   1.1s
[CV] END estimator__C=1, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.0001; total time=   0.1s
[CV] END estimator__C=0.1, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.0001; total time=   2.1s
[CV] END estimator__C=1, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.0001; total time=   0.1s
[CV] END estimator__C=1, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.001; total time=   0.1s
[CV] END estimator__C=1, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.001; total time=   0.1s
[CV] END estimator__C=1, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.001; total time=   0.1s
[CV] END estimator__C=1, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.001; total time=   0.0s




[CV] END estimator__C=1, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.001; total time=   0.1s
[CV] END estimator__C=0.1, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.0001; total time=   2.3s
[CV] END estimator__C=0.1, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.0001; total time=   2.2s




[CV] END estimator__C=0.1, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.001; total time=   1.1s
[CV] END estimator__C=0.1, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.001; total time=   1.1s
[CV] END estimator__C=0.1, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.0001; total time=   2.3s




[CV] END estimator__C=1, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.001; total time=   1.2s




[CV] END estimator__C=1, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.001; total time=   1.7s
[CV] END estimator__C=1, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.001; total time=   1.8s
[CV] END estimator__C=10, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.0001; total time=   0.1s
[CV] END estimator__C=10, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.0001; total time=   0.1s
[CV] END estimator__C=1, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.0001; total time=   2.5s
[CV] END estimator__C=1, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.0001; total time=   2.4s
[CV] END estimator__C=1, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.0001; total time=   2.5s
[CV] END estimator__C=10, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.0001; total time=   0.1s
[CV] END estimator__C=10, estimator__penalty=l2, estimator__solver=lbfgs, estimator_



[CV] END estimator__C=10, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.0001; total time=   0.1s
[CV] END estimator__C=10, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.001; total time=   0.0s
[CV] END estimator__C=10, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.001; total time=   0.1s
[CV] END estimator__C=1, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.0001; total time=   2.5s
[CV] END estimator__C=10, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.001; total time=   0.1s
[CV] END estimator__C=10, estimator__penalty=l2, estimator__solver=lbfgs, estimator__tol=0.001; total time=   0.1s




[CV] END estimator__C=1, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.0001; total time=   2.6s
[CV] END estimator__C=1, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.001; total time=   1.8s




[CV] END estimator__C=1, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.001; total time=   1.7s




[CV] END estimator__C=10, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.001; total time=   1.6s
[CV] END estimator__C=10, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.001; total time=   1.1s




[CV] END estimator__C=10, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.0001; total time=   2.3s
[CV] END estimator__C=10, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.0001; total time=   2.4s
[CV] END estimator__C=10, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.0001; total time=   2.4s
[CV] END estimator__C=10, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.0001; total time=   2.3s
[CV] END estimator__C=10, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.0001; total time=   2.3s
[CV] END estimator__C=10, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.001; total time=   1.5s
[CV] END estimator__C=10, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.001; total time=   1.2s
[CV] END estimator__C=10, estimator__penalty=l2, estimator__solver=saga, estimator__tol=0.001; total time=   1.2s
Best Params: {'estimator__C': 0.01, 'estimator__penalty': 'l2', 'estimator__solver'

In [9]:
# Strip prefix 'estimator__'
best_params_clean = {k.replace("estimator__", ""): v for k, v in grid.best_params_.items()}

# Rebuild and refit final model
final_model = MultiOutputClassifier(LogisticRegression(**best_params_clean, max_iter=500, class_weight='balanced', random_state=42))
final_model.fit(X_train, y_train)


In [10]:
y_pred = final_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.36      1.00      0.53         4
           1       0.09      1.00      0.17         1
           2       0.62      0.83      0.71         6

   micro avg       0.33      0.91      0.49        11
   macro avg       0.36      0.94      0.47        11
weighted avg       0.48      0.91      0.60        11
 samples avg       0.35      0.57      0.40        11

