# Baseline Model using TF-IDF for binary

## Set up

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
import pandas as pd
import numpy as np
from scipy.sparse import load_npz
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score


In [3]:
PATH = Path.cwd().parents[2]
DATA_PATH = os.path.join(PATH, 'data/processed/tf_idf')

## List Traditional ML model to compare

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier

models = {
  "LogisticRegression()": LogisticRegression(),
  "DecisionTreeClassifier()": DecisionTreeClassifier(),
  "RandomForestClassifier()": RandomForestClassifier(),
  "AdaBoostClassifier()": AdaBoostClassifier(),
  "ExtraTreesClassifier()": ExtraTreesClassifier(),
  "XGBClassifier()": XGBClassifier(),
  "LGBMClassifier()": LGBMClassifier(),
  "SVC()": SVC(),
  "GaussianNB()": GaussianNB(),
  "KNeighborsClassifier()": KNeighborsClassifier(),
  "SGDClassifier()": SGDClassifier(),
  "MLPClassifier()": MLPClassifier(),
}

## Load Dataset

In [5]:
X = load_npz(os.path.join(DATA_PATH, "tfidf_vector_sol.npz"))
y = np.load(os.path.join(DATA_PATH, "labels_sol.npy"))

In [6]:
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 33333 stored elements and shape (69, 5426)>

In [7]:
y[:, 0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 1])

In [8]:
y[:, 1]

array([0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0])

In [9]:
y[:, 2]

array([0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 0])

## Prepare Features and Labels

In [10]:
y_mint = y[:, 0]
y_leak = y[:, 1]
y_limit = y[:, 2]

## Train/Test Split

In [11]:
X_mint_train, X_mint_test, y_mint_train, y_mint_test = train_test_split(
    X, y_mint, test_size=0.2, random_state=42
)

X_leak_train, X_leak_test, y_leak_train, y_leak_test = train_test_split(
    X, y_leak, test_size=0.2, random_state=42
)

X_limit_train, X_limit_test, y_limit_train, y_limit_test = train_test_split(
    X, y_limit, test_size=0.2, random_state=42
)

## Run all models and collect reports

## Show report

In [12]:
data = {
  'mint': {
    "X_train": X_mint_train,
    "X_test": X_mint_test,
    "y_train": y_mint_train,
    "y_test": y_mint_test
  },
  'leak': {
    "X_train": X_leak_train,
    "X_test": X_leak_test,
    "y_train": y_leak_train,
    "y_test": y_leak_test
  },
  'limit': {
    "X_train": X_limit_train,
    "X_test": X_limit_test,
    "y_train": y_limit_train,
    "y_test": y_limit_test
  }
}

reports = []

for d in data.values():
    X_train = d['X_train']
    X_test = d['X_test']
    y_train = d['y_train']
    y_test = d['y_test']

    # Convert to dense for models that require it
    def maybe_dense(model, X):
        # List of models that require dense input
        dense_models = (GaussianNB, MLPClassifier)
        if isinstance(model, dense_models):
            return X.toarray()
        return X

    def get_report_all_ml_dense(X_train, y_train, X_test, y_test):
        report_list = []
        for name, model in models.items():
            X_train_mod = maybe_dense(model, X_train)
            X_test_mod = maybe_dense(model, X_test)
            model.fit(X_train_mod, y_train)
            y_pred = model.predict(X_test_mod)
            report_dict = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
            avg_scores = report_dict["macro avg"]
            report_list.append({
                "Model": name,
                "Precision": avg_scores["precision"],
                "Recall": avg_scores["recall"],
                "F1-score": avg_scores["f1-score"]
            })
        df_report = pd.DataFrame(report_list)
        df_report = df_report.sort_values("F1-score", ascending=False).reset_index(drop=True)
        return df_report

    df_report = get_report_all_ml_dense(X_train, y_train, X_test, y_test)
    reports.append(df_report)

[LightGBM] [Info] Number of positive: 17, number of negative: 38
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000884 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5158
[LightGBM] [Info] Number of data points in the train set: 55, number of used features: 420
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.309091 -> initscore=-0.804373
[LightGBM] [Info] Start training from score -0.804373
[LightGBM] [Info] Number of positive: 7, number of negative: 48
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000281 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5158
[LightGBM] [Info] Number of data points in the train set: 55, number of used features: 420
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.127273 -> initscore=-1.925291
[LightGBM] [Info] S

In [13]:
reports[0]

Unnamed: 0,Model,Precision,Recall,F1-score
0,DecisionTreeClassifier(),1.0,1.0,1.0
1,XGBClassifier(),0.875,0.954545,0.904762
2,LGBMClassifier(),0.958333,0.833333,0.878261
3,AdaBoostClassifier(),0.958333,0.833333,0.878261
4,GaussianNB(),0.923077,0.666667,0.708333
5,RandomForestClassifier(),0.666667,0.621212,0.634783
6,SGDClassifier(),0.666667,0.621212,0.634783
7,ExtraTreesClassifier(),0.666667,0.621212,0.634783
8,KNeighborsClassifier(),0.604167,0.651515,0.590643
9,MLPClassifier(),0.575758,0.575758,0.575758


In [14]:
reports[1]

Unnamed: 0,Model,Precision,Recall,F1-score
0,MLPClassifier(),0.961538,0.75,0.813333
1,DecisionTreeClassifier(),0.621212,0.666667,0.634783
2,AdaBoostClassifier(),0.621212,0.666667,0.634783
3,GaussianNB(),0.544444,0.583333,0.52381
4,SGDClassifier(),0.544444,0.583333,0.52381
5,LogisticRegression(),0.428571,0.5,0.461538
6,RandomForestClassifier(),0.428571,0.5,0.461538
7,ExtraTreesClassifier(),0.428571,0.5,0.461538
8,SVC(),0.428571,0.5,0.461538
9,LGBMClassifier(),0.428571,0.5,0.461538


In [15]:
reports[2]

Unnamed: 0,Model,Precision,Recall,F1-score
0,RandomForestClassifier(),0.9,0.95,0.918129
1,SVC(),0.9,0.95,0.918129
2,LGBMClassifier(),0.825,0.825,0.825
3,LogisticRegression(),0.825,0.825,0.825
4,KNeighborsClassifier(),0.785714,0.85,0.775401
5,DecisionTreeClassifier(),0.744444,0.775,0.754386
6,ExtraTreesClassifier(),0.744444,0.775,0.754386
7,AdaBoostClassifier(),0.6875,0.725,0.688889
8,MLPClassifier(),0.6875,0.725,0.688889
9,GaussianNB(),0.642857,0.675,0.625668


## MLP

### Build model

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.losses import MeanSquaredError

2025-07-15 15:46:31.925555: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [17]:
def Model(input_dim, output_dim):
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(512, kernel_regularizer=regularizers.l1_l2(1e-6)),
        BatchNormalization(),
        LeakyReLU(0.01),
        Dropout(0.4),

        Dense(256, kernel_regularizer=regularizers.l1_l2(1e-6)),
        BatchNormalization(),
        LeakyReLU(0.01),
        Dropout(0.3),

        Dense(128, kernel_regularizer=regularizers.l1_l2(1e-6)),
        BatchNormalization(),
        LeakyReLU(0.01),
        Dropout(0.2),

        Dense(output_dim, activation='sigmoid')  # sigmoid for multi-label
    ])

    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=1e-6),
        metrics=['accuracy']
    )
    return model

In [18]:
model = Model(input_dim=X.shape[1], output_dim=1)

### Train

In [19]:
model.fit(X_mint_train, y_mint_train, validation_split=0.2, epochs=100, batch_size=32,
                    callbacks=[
                                EarlyStopping(monitor='val_loss',
                                             patience=5,
                                             restore_best_weights=True),
                                ReduceLROnPlateau(
                                  monitor='val_loss',
                                  factor=0.5,
                                  patience=5,
                                  verbose=1)
                               ]
                    )

Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 470ms/step - accuracy: 0.4081 - loss: 0.9304 - val_accuracy: 0.4545 - val_loss: 0.7463 - learning_rate: 1.0000e-06
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 164ms/step - accuracy: 0.3267 - loss: 1.0895 - val_accuracy: 0.5455 - val_loss: 0.7461 - learning_rate: 1.0000e-06
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163ms/step - accuracy: 0.4536 - loss: 0.9324 - val_accuracy: 0.5455 - val_loss: 0.7459 - learning_rate: 1.0000e-06
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163ms/step - accuracy: 0.3826 - loss: 0.9003 - val_accuracy: 0.3636 - val_loss: 0.7457 - learning_rate: 1.0000e-06
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step - accuracy: 0.4129 - loss: 1.0080 - val_accuracy: 0.3636 - val_loss: 0.7456 - learning_rate: 1.0000e-06
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x13345cd10>

### Predict

In [20]:
y_mint_test_prob = model.predict(X_mint_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 180ms/step


In [21]:
def tune_thresholds(y_true, y_pred_prob, metric='f1'):
    y_true = np.asarray(y_true)          # Fix: convert to NumPy
    y_pred_prob = np.asarray(y_pred_prob)

    best_thresholds = []
    best_scores = []

    for i in range(y_true.shape[1]):
        label_true = y_true[:, i]
        label_probs = y_pred_prob[:, i]  # Fix here too

        thresholds = np.linspace(0.0, 1.0, 101)
        scores = []

        for t in thresholds:
            label_pred = (label_probs >= t).astype(int)
            if metric == 'f1':
                score = f1_score(label_true, label_pred, zero_division=0)
            scores.append(score)

        best_t = thresholds[np.argmax(scores)]
        best_score = np.max(scores)

        best_thresholds.append(best_t)
        best_scores.append(best_score)

        print(f"Label {i}: Best threshold = {best_t:.2f}, Best {metric} = {best_score:.4f}")

    return best_thresholds, best_scores


In [22]:
# Reshape to (n_samples, 1) for binary classification
best_thresholds, _ = tune_thresholds(y_mint_test.reshape(-1, 1), y_mint_test_prob.reshape(-1, 1))

Label 0: Best threshold = 0.00, Best f1 = 0.3529


In [23]:
y_mint_pred = (model.predict(X_mint_test) >= best_thresholds).astype(int)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step


### Report

In [24]:
print(classification_report(y_mint_test, y_mint_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        11
           1       0.21      1.00      0.35         3

    accuracy                           0.21        14
   macro avg       0.11      0.50      0.18        14
weighted avg       0.05      0.21      0.08        14



## Autoencoder + MPL

In [25]:
def Autoencoder(input_dim=256):
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(128),
        LeakyReLU(0.01),

        Dense(32),
        BatchNormalization(),
        LeakyReLU(0.01),

        Dense(128),
        BatchNormalization(),
        LeakyReLU(0.01),

        Dense(input_dim, activation='sigmoid')  # sigmoid for multi-label
    ])

    model.compile(optimizer=Adam(1e-6), loss=MeanSquaredError())
    return model


In [27]:
autoencoder = Autoencoder(input_dim=X.shape[1])
autoencoder.fit(X_mint_train.toarray(), X_mint_train.toarray(), epochs=50, batch_size=32, validation_split=0.2)

Epoch 1/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 226ms/step - loss: 0.2496 - val_loss: 0.2481
Epoch 2/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - loss: 0.2495 - val_loss: 0.2481
Epoch 3/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - loss: 0.2496 - val_loss: 0.2481
Epoch 4/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step - loss: 0.2495 - val_loss: 0.2481
Epoch 5/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step - loss: 0.2495 - val_loss: 0.2481
Epoch 6/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step - loss: 0.2495 - val_loss: 0.2481
Epoch 7/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - loss: 0.2494 - val_loss: 0.2481
Epoch 8/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - loss: 0.2494 - val_loss: 0.2481
Epoch 9/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

<keras.src.callbacks.history.History at 0x1336f5910>

In [28]:
X_train_encoded = autoencoder.predict(X_mint_train)
X_test_encoded = autoencoder.predict(X_mint_test)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 342ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step


In [29]:
model = Model(input_dim=X_train_encoded.shape[1], output_dim=1)

In [30]:
model.fit(X_train_encoded, y_mint_train, validation_split=0.2, epochs=100, batch_size=32,
                    callbacks=[
                                EarlyStopping(monitor='val_loss',
                                             patience=5,
                                             restore_best_weights=True),
                                ReduceLROnPlateau(
                                  monitor='val_loss',
                                  factor=0.5,
                                  patience=5,
                                  verbose=1)
                               ]
                    )

Epoch 1/100


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 337ms/step - accuracy: 0.7244 - loss: 0.6808 - val_accuracy: 0.5455 - val_loss: 0.7433 - learning_rate: 1.0000e-06
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step - accuracy: 0.6430 - loss: 0.6618 - val_accuracy: 0.5455 - val_loss: 0.7434 - learning_rate: 1.0000e-06
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step - accuracy: 0.6941 - loss: 0.6851 - val_accuracy: 0.5455 - val_loss: 0.7436 - learning_rate: 1.0000e-06
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step - accuracy: 0.7604 - loss: 0.6127 - val_accuracy: 0.5455 - val_loss: 0.7437 - learning_rate: 1.0000e-06
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step - accuracy: 0.6174 - loss: 0.7717 - val_accuracy: 0.5455 - val_loss: 0.7439 - learning_rate: 1.0000e-06
Epoch 6/100
[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[

<keras.src.callbacks.history.History at 0x1341739b0>

In [31]:
y_pred_prob = model.predict(X_test_encoded)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 234ms/step


In [33]:
best_thresholds, _ = tune_thresholds(y_mint_test.reshape(-1, 1), y_pred_prob.reshape(-1, 1))

Label 0: Best threshold = 0.00, Best f1 = 0.3529


In [34]:
y_pred = (model.predict(X_test) >= best_thresholds).astype(int)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 358ms/step


### Report

In [35]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.29      1.00      0.44         4

    accuracy                           0.29        14
   macro avg       0.14      0.50      0.22        14
weighted avg       0.08      0.29      0.13        14

