# Baseline Models using TF-IDF for Binary Classification

## Set up

In [1]:
import warnings
# Suppresses every Python warning raised after this point (all categories),
# which keeps cell output short but also hides library diagnostics.
warnings.filterwarnings("ignore")

In [2]:
import os
import pandas as pd
import numpy as np
from scipy.sparse import load_npz
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score


In [3]:
# Third ancestor of the current working directory (three levels up), presumably
# the repository root; the value depends on where the kernel is launched from.
PATH = Path.cwd().parents[2]
# Directory that holds the TF-IDF artifacts read by the loading cell.
DATA_PATH = os.path.join(PATH, 'data/processed/tf_idf')

## Traditional ML models to compare

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier

# Shared seed for every estimator that exposes `random_state`, so that
# re-running the notebook reproduces the same comparison tables.
RANDOM_STATE = 42

# Candidate classifiers keyed by the label shown in the report tables.
# GaussianNB and KNeighborsClassifier take no `random_state` argument and are
# therefore constructed with defaults only.
models = {
  "LogisticRegression()": LogisticRegression(random_state=RANDOM_STATE),
  "DecisionTreeClassifier()": DecisionTreeClassifier(random_state=RANDOM_STATE),
  "RandomForestClassifier()": RandomForestClassifier(random_state=RANDOM_STATE),
  "AdaBoostClassifier()": AdaBoostClassifier(random_state=RANDOM_STATE),
  "ExtraTreesClassifier()": ExtraTreesClassifier(random_state=RANDOM_STATE),
  "XGBClassifier()": XGBClassifier(random_state=RANDOM_STATE),
  "LGBMClassifier()": LGBMClassifier(random_state=RANDOM_STATE),
  "SVC()": SVC(random_state=RANDOM_STATE),
  "GaussianNB()": GaussianNB(),
  "KNeighborsClassifier()": KNeighborsClassifier(),
  "SGDClassifier()": SGDClassifier(random_state=RANDOM_STATE),
  "MLPClassifier()": MLPClassifier(random_state=RANDOM_STATE),
}

## Load Dataset

In [5]:
# Resolve both artifact locations first, then read them.
features_file = os.path.join(DATA_PATH, "tfidf_vector_hex.npz")
labels_file = os.path.join(DATA_PATH, "labels_hex.npy")

X = load_npz(features_file)  # sparse TF-IDF feature matrix
y = np.load(labels_file)     # label array, indexed by column in later cells

In [6]:
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3133 stored elements and shape (69, 78)>

In [7]:
y[:, 0]

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0])

In [8]:
y[:, 1]

array([0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0])

In [9]:
y[:, 2]

array([0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 0])

## Prepare Features and Labels

In [10]:
# One binary target per label column: 0 -> mint, 1 -> leak, 2 -> limit.
y_mint, y_leak, y_limit = (y[:, column] for column in range(3))

## Train/Test Split

In [11]:
# Every label is split with the same settings (20% held out, fixed seed).
_split_options = {"test_size": 0.2, "random_state": 42}

X_mint_train, X_mint_test, y_mint_train, y_mint_test = train_test_split(
    X, y_mint, **_split_options
)
X_leak_train, X_leak_test, y_leak_train, y_leak_test = train_test_split(
    X, y_leak, **_split_options
)
X_limit_train, X_limit_test, y_limit_train, y_limit_test = train_test_split(
    X, y_limit, **_split_options
)

## Run all models and collect reports

## Show report

In [12]:
# Train/test partitions for each label, in the order mint, leak, limit.
_task_splits = {
    'mint': (X_mint_train, X_mint_test, y_mint_train, y_mint_test),
    'leak': (X_leak_train, X_leak_test, y_leak_train, y_leak_test),
    'limit': (X_limit_train, X_limit_test, y_limit_train, y_limit_test),
}
data = {
    task: {"X_train": X_tr, "X_test": X_te, "y_train": y_tr, "y_test": y_te}
    for task, (X_tr, X_te, y_tr, y_te) in _task_splits.items()
}


def maybe_dense(model, X):
    """Return a dense copy of ``X`` for GaussianNB / MLPClassifier, else ``X`` unchanged."""
    needs_dense = isinstance(model, (GaussianNB, MLPClassifier))
    return X.toarray() if needs_dense else X


def get_report_all_ml_dense(X_train, y_train, X_test, y_test):
    """Fit every entry of ``models`` and tabulate macro-averaged test scores.

    Each estimator is fitted on the training split and scored on the test
    split; the macro-average precision, recall and F1 from
    ``classification_report`` become one row. Rows are returned as a DataFrame
    sorted by F1-score, best first, with a fresh 0..n-1 index.
    """
    rows = []
    for label, estimator in models.items():
        estimator.fit(maybe_dense(estimator, X_train), y_train)
        predictions = estimator.predict(maybe_dense(estimator, X_test))
        macro = classification_report(
            y_test, predictions, output_dict=True, zero_division=0
        )["macro avg"]
        rows.append({
            "Model": label,
            "Precision": macro["precision"],
            "Recall": macro["recall"],
            "F1-score": macro["f1-score"]
        })
    ranked = pd.DataFrame(rows).sort_values("F1-score", ascending=False)
    return ranked.reset_index(drop=True)


# One report per task, in the iteration order of `data` (mint, leak, limit).
# NOTE: X_train, X_test, y_train and y_test are plain globals here, so after the
# loop they stay bound to the last task ('limit').
reports = []
for d in data.values():
    X_train, X_test = d['X_train'], d['X_test']
    y_train, y_test = d['y_train'], d['y_test']

    df_report = get_report_all_ml_dense(X_train, y_train, X_test, y_test)
    reports.append(df_report)

[LightGBM] [Info] Number of positive: 14, number of negative: 41
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.043028 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 849
[LightGBM] [Info] Number of data points in the train set: 55, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.254545 -> initscore=-1.074515
[LightGBM] [Info] Start training from score -1.074515
[LightGBM] [Info] Number of positive: 7, number of negative: 48
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000119 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 849
[LightGBM] [Info] Number of data points in the train set: 55, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.127273 -> initscore=-1.925291
[LightGBM] [Info] Start training from score -1.925291
[LightGBM] [Info] Number of posit

In [13]:
reports[0]

Unnamed: 0,Model,Precision,Recall,F1-score
0,AdaBoostClassifier(),0.9,0.833333,0.844444
1,KNeighborsClassifier(),0.788889,0.770833,0.775401
2,RandomForestClassifier(),0.863636,0.75,0.754386
3,DecisionTreeClassifier(),0.863636,0.75,0.754386
4,ExtraTreesClassifier(),0.863636,0.75,0.754386
5,GaussianNB(),0.729167,0.729167,0.714286
6,LGBMClassifier(),0.725,0.6875,0.688889
7,XGBClassifier(),0.725,0.6875,0.688889
8,SGDClassifier(),0.651515,0.604167,0.590643
9,LogisticRegression(),0.285714,0.5,0.363636


In [14]:
reports[1]

Unnamed: 0,Model,Precision,Recall,F1-score
0,AdaBoostClassifier(),0.961538,0.75,0.813333
1,LogisticRegression(),0.428571,0.5,0.461538
2,DecisionTreeClassifier(),0.428571,0.5,0.461538
3,RandomForestClassifier(),0.428571,0.5,0.461538
4,ExtraTreesClassifier(),0.428571,0.5,0.461538
5,XGBClassifier(),0.428571,0.5,0.461538
6,LGBMClassifier(),0.428571,0.5,0.461538
7,SVC(),0.428571,0.5,0.461538
8,KNeighborsClassifier(),0.428571,0.5,0.461538
9,SGDClassifier(),0.428571,0.5,0.461538


In [15]:
reports[2]

Unnamed: 0,Model,Precision,Recall,F1-score
0,AdaBoostClassifier(),0.875,0.875,0.857143
1,ExtraTreesClassifier(),0.854167,0.854167,0.854167
2,LGBMClassifier(),0.785714,0.791667,0.784615
3,RandomForestClassifier(),0.785714,0.791667,0.784615
4,XGBClassifier(),0.729167,0.729167,0.714286
5,KNeighborsClassifier(),0.729167,0.729167,0.714286
6,MLPClassifier(),0.708333,0.708333,0.708333
7,GaussianNB(),0.642857,0.645833,0.641026
8,DecisionTreeClassifier(),0.522222,0.520833,0.497436
9,LogisticRegression(),0.285714,0.5,0.363636


## MLP

### Build model

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.losses import MeanSquaredError

2025-07-15 19:02:57.028092: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [17]:
def Model(input_dim, output_dim, learning_rate=1e-6):
    """Build and compile the dense sigmoid classifier used in this notebook.

    The network is three hidden stages of 512, 256 and 128 units, each made of
    Dense -> BatchNormalization -> LeakyReLU(0.01) -> Dropout (rates 0.4, 0.3,
    0.2), followed by a sigmoid output layer. It is compiled with binary
    cross-entropy, the Adam optimizer and an accuracy metric.

    Args:
        input_dim: Number of input features.
        output_dim: Number of sigmoid output units (one per label).
        learning_rate: Adam step size. Defaults to 1e-6, the value that used to
            be hard-coded, so existing calls behave exactly as before.
            NOTE(review): 1e-6 is three orders of magnitude below Adam's usual
            default of 1e-3; pass a larger value if the loss barely moves.

    Returns:
        The compiled ``Sequential`` model.
    """
    # (hidden units, dropout rate) for each hidden stage, in order.
    hidden_stages = ((512, 0.4), (256, 0.3), (128, 0.2))

    layers = [Input(shape=(input_dim,))]
    for units, dropout_rate in hidden_stages:
        layers.extend([
            # Only the first positional argument of l1_l2 is given; the other
            # penalty keeps the library default -- confirm this is intended.
            Dense(units, kernel_regularizer=regularizers.l1_l2(1e-6)),
            BatchNormalization(),
            LeakyReLU(0.01),
            Dropout(dropout_rate),
        ])
    layers.append(Dense(output_dim, activation='sigmoid'))  # sigmoid for multi-label

    model = Sequential(layers)
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=learning_rate),
        metrics=['accuracy']
    )
    return model

In [18]:
# Single-output classifier whose input width is the number of TF-IDF columns.
model = Model(input_dim=X.shape[1], output_dim=1)

### Train

In [19]:
# Stop after 5 epochs without a val_loss improvement and restore the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
# Halve the learning rate after 5 epochs without a val_loss improvement.
# NOTE(review): both callbacks use the same patience of 5.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    verbose=1,
)

model.fit(
    X_mint_train,
    y_mint_train,
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping, reduce_lr],
)

Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 396ms/step - accuracy: 0.5256 - loss: 0.8227 - val_accuracy: 0.7273 - val_loss: 0.7024 - learning_rate: 1.0000e-06
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step - accuracy: 0.5758 - loss: 0.7046 - val_accuracy: 0.8182 - val_loss: 0.6980 - learning_rate: 1.0000e-06
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step - accuracy: 0.6174 - loss: 0.6437 - val_accuracy: 0.8182 - val_loss: 0.6944 - learning_rate: 1.0000e-06
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step - accuracy: 0.6686 - loss: 0.7246 - val_accuracy: 0.8182 - val_loss: 0.6907 - learning_rate: 1.0000e-06
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step - accuracy: 0.5663 - loss: 0.7612 - val_accuracy: 0.8182 - val_loss: 0.6893 - learning_rate: 1.0000e-06
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x13c308350>

### Predict

In [20]:
# Sigmoid outputs of the MLP for the held-out mint split.
y_mint_test_prob = model.predict(X_mint_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 381ms/step


In [21]:
def tune_thresholds(y_true, y_pred_prob, metric='f1'):
    """Pick, for each label column, the decision threshold with the best score.

    For every label the thresholds 0.00, 0.01, ..., 1.00 are tried; a sample is
    predicted positive when its probability is ``>=`` the threshold. When
    several thresholds tie for the best score, the lowest one is kept. One
    summary line is printed per label.

    Args:
        y_true: Ground-truth 0/1 labels, shape ``(n_samples, n_labels)``. A 1-D
            array is treated as a single label column.
        y_pred_prob: Predicted probabilities with the same shape as ``y_true``
            (1-D input is likewise treated as one column).
        metric: Score to maximise. Only ``'f1'`` is supported.

    Returns:
        ``(best_thresholds, best_scores)``: two lists with one entry per label.

    Raises:
        ValueError: If ``metric`` is not supported, or if the two inputs do not
            have the same shape after 1-D inputs are expanded.
    """
    # Reject unknown metrics up front; previously they reached the scoring loop
    # and failed on an unbound `score` variable.
    if metric != 'f1':
        raise ValueError(f"Unsupported metric {metric!r}; only 'f1' is implemented")

    y_true = np.asarray(y_true)
    y_pred_prob = np.asarray(y_pred_prob)

    # Accept plain 1-D vectors for the single-label case.
    if y_true.ndim == 1:
        y_true = y_true.reshape(-1, 1)
    if y_pred_prob.ndim == 1:
        y_pred_prob = y_pred_prob.reshape(-1, 1)

    if y_true.shape != y_pred_prob.shape:
        raise ValueError(
            f"Shape mismatch: y_true has shape {y_true.shape}, "
            f"y_pred_prob has shape {y_pred_prob.shape}"
        )

    # The candidate grid is the same for every label, so build it once.
    thresholds = np.linspace(0.0, 1.0, 101)

    best_thresholds = []
    best_scores = []

    for i in range(y_true.shape[1]):
        label_true = y_true[:, i]
        label_probs = y_pred_prob[:, i]

        scores = [
            f1_score(label_true, (label_probs >= t).astype(int), zero_division=0)
            for t in thresholds
        ]

        # argmax returns the first maximum, i.e. the lowest tied threshold.
        best_index = int(np.argmax(scores))
        best_t = thresholds[best_index]
        best_score = scores[best_index]

        best_thresholds.append(best_t)
        best_scores.append(best_score)

        print(f"Label {i}: Best threshold = {best_t:.2f}, Best {metric} = {best_score:.4f}")

    return best_thresholds, best_scores


In [22]:
# Reshape to (n_samples, 1) for binary classification
# NOTE(review): the threshold is selected using the test labels and test
# probabilities, and the report below is computed on that same split, so the
# reported score is optimistically biased. Consider tuning on a validation split.
best_thresholds, _ = tune_thresholds(y_mint_test.reshape(-1, 1), y_mint_test_prob.reshape(-1, 1))

Label 0: Best threshold = 0.00, Best f1 = 0.6000


In [23]:
# Binarise the MLP outputs: `best_thresholds` is a one-element list, which
# broadcasts against the prediction array in the comparison.
y_mint_pred = (model.predict(X_mint_test) >= best_thresholds).astype(int)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 262ms/step


### Report

In [24]:
# Per-class and averaged precision/recall/F1 of the thresholded MLP on the mint test split.
print(classification_report(y_mint_test, y_mint_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         8
           1       0.43      1.00      0.60         6

    accuracy                           0.43        14
   macro avg       0.21      0.50      0.30        14
weighted avg       0.18      0.43      0.26        14



## Autoencoder + MLP

In [25]:
def Autoencoder(input_dim=256, learning_rate=1e-6):
    """Build and compile a dense autoencoder that reconstructs its input.

    Layer widths are ``input_dim -> 128 -> 32 -> 128 -> input_dim`` with
    LeakyReLU(0.01) activations; the 32- and second 128-unit stages also use
    batch normalisation. The model is compiled with Adam and mean squared
    error.

    Args:
        input_dim: Width of the input and of the reconstructed output.
        learning_rate: Adam step size. Defaults to 1e-6, the value that used to
            be hard-coded, so existing calls behave exactly as before.
            NOTE(review): 1e-6 is three orders of magnitude below Adam's usual
            default of 1e-3; pass a larger value if the loss barely moves.

    Returns:
        The compiled ``Sequential`` model. Calling ``predict`` on it yields the
        reconstruction (width ``input_dim``), not the 32-unit bottleneck.
    """
    model = Sequential([
        Input(shape=(input_dim,)),

        # Encoder
        Dense(128),
        LeakyReLU(0.01),

        # Bottleneck
        Dense(32),
        BatchNormalization(),
        LeakyReLU(0.01),

        # Decoder
        Dense(128),
        BatchNormalization(),
        LeakyReLU(0.01),

        # Reconstruction layer: sigmoid keeps every output in [0, 1].
        Dense(input_dim, activation='sigmoid')
    ])

    model.compile(optimizer=Adam(learning_rate), loss=MeanSquaredError())
    return model


In [26]:
autoencoder = Autoencoder(input_dim=X.shape[1])

# Densify once; the same array serves as both input and reconstruction target.
X_mint_train_dense = X_mint_train.toarray()
autoencoder.fit(
    X_mint_train_dense,
    X_mint_train_dense,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/50


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 255ms/step - loss: 0.2620 - val_loss: 0.2287
Epoch 2/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - loss: 0.2631 - val_loss: 0.2287
Epoch 3/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step - loss: 0.2676 - val_loss: 0.2287
Epoch 4/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - loss: 0.2629 - val_loss: 0.2286
Epoch 5/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step - loss: 0.2632 - val_loss: 0.2286
Epoch 6/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step - loss: 0.2625 - val_loss: 0.2286
Epoch 7/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - loss: 0.2617 - val_loss: 0.2286
Epoch 8/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step - loss: 0.2681 - val_loss: 0.2286
Epoch 9/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1

<keras.src.callbacks.history.History at 0x13c3b3290>

In [27]:
# NOTE(review): `autoencoder.predict` returns the output of the final layer,
# which has `input_dim` units, so these arrays are reconstructions with the same
# width as the input rather than the 32-unit bottleneck representation.
X_train_encoded = autoencoder.predict(X_mint_train)
X_test_encoded = autoencoder.predict(X_mint_test)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 181ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step


In [28]:
# Fresh classifier sized to the autoencoder output; this rebinds `model`,
# replacing the MLP that was trained directly on the TF-IDF features.
model = Model(input_dim=X_train_encoded.shape[1], output_dim=1)

In [29]:
# Stop after 5 epochs without a val_loss improvement and restore the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
# Halve the learning rate after 5 epochs without a val_loss improvement.
# NOTE(review): both callbacks use the same patience of 5.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    verbose=1,
)

model.fit(
    X_train_encoded,
    y_mint_train,
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping, reduce_lr],
)

Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 338ms/step - accuracy: 0.6941 - loss: 0.6242 - val_accuracy: 0.1818 - val_loss: 0.7176 - learning_rate: 1.0000e-06
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step - accuracy: 0.4593 - loss: 0.7634 - val_accuracy: 0.1818 - val_loss: 0.7157 - learning_rate: 1.0000e-06
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - accuracy: 0.6165 - loss: 0.7616 - val_accuracy: 0.1818 - val_loss: 0.7134 - learning_rate: 1.0000e-06
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step - accuracy: 0.6884 - loss: 0.6584 - val_accuracy: 0.1818 - val_loss: 0.7108 - learning_rate: 1.0000e-06
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.5303 - loss: 0.7758 - val_accuracy: 0.1818 - val_loss: 0.7093 - learning_rate: 1.0000e-06
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x13d2fc140>

In [30]:
# Sigmoid outputs of the second classifier on the autoencoder-transformed test split.
y_pred_prob = model.predict(X_test_encoded)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 162ms/step


In [31]:
# NOTE(review): as with the plain MLP, the threshold is chosen using the test
# labels and test probabilities, so a score computed on that same split is
# optimistically biased. Consider tuning on a validation split.
best_thresholds, _ = tune_thresholds(y_mint_test.reshape(-1, 1), y_pred_prob.reshape(-1, 1))

Label 0: Best threshold = 0.00, Best f1 = 0.6000


In [32]:
# Predict on the autoencoder-transformed test split, i.e. the same kind of
# input this classifier was trained on. The previous `X_test` was a variable
# left over from the model-comparison loop and held raw TF-IDF features.
y_pred = (model.predict(X_test_encoded) >= best_thresholds).astype(int)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 299ms/step


### Report

In [33]:
# Score against the mint test labels, which is the target this classifier was
# trained on. The previous `y_test` was a variable left over from the
# model-comparison loop and held the labels of its last task ('limit').
print(classification_report(y_mint_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         8
           1       0.43      1.00      0.60         6

    accuracy                           0.43        14
   macro avg       0.21      0.50      0.30        14
weighted avg       0.18      0.43      0.26        14

