## XGBoost

To ensure consistency and standardization across different models, each model inherits from a base `Model` class. This approach allows for the centralization of methods and code, ensuring that all models follow the same structure.

### Base Model Class

In [None]:
from abc import ABC, abstractmethod


# Root folder for everything this notebook writes (plots, metric files, saved
# models); output paths are built by appending to this string.
MAIN_DIR = "/kaggle/working/"


class Model(ABC):
    """Abstract interface shared by every model in this notebook.

    Concrete models must implement all four methods below before they can be
    instantiated.
    """

    @abstractmethod
    def fit(self, *args) -> None:
        """Train the model."""

    @abstractmethod
    def evaluate(self, *args) -> float:
        """Score the model and return the metric as a float."""

    @abstractmethod
    def make_plots(self, *args) -> None:
        """Produce the model's diagnostic plots."""

    @abstractmethod
    def create_confusion_matrix(self, *args) -> None:
        """Compute the model's confusion matrix."""

### Callbacks

Once the model structure is defined, it's time to start training. However, for our purposes, we need to define the appropriate callbacks. In the case of XGBoost, these callbacks must follow a specific structure.

#### Callback Classes for XGBoost

In [None]:
from xgboost import callback
from typing import Any
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from xgboost import DMatrix

import numpy as np
import pandas as pd



class XgbCallback(callback.TrainingCallback):
    """Base class for training callbacks that work on a validation set.

    Stores the validation features and labels and builds a ``DMatrix`` from
    them once, at construction time.
    """

    def __init__(self, x_val: pd.DataFrame, y_val: pd.DataFrame) -> None:
        super().__init__()
        self.x_val, self.y_val = x_val, y_val
        self.d_matrix = DMatrix(data=self.x_val, label=self.y_val)



class F1ScoreCallback(XgbCallback):
    """Record the weighted F1-score on the validation set after every round."""

    def __init__(self, x_val: pd.DataFrame, y_val: pd.DataFrame) -> None:
        super().__init__(x_val, y_val)
        # One entry per boosting round, in training order.
        self.f1_scores = []

    def after_iteration(self, model: Any, epoch: int, evals_log: dict[str, dict[str, list[float] | list[tuple[float, float]]]]) -> bool:
        """Score the model built so far and log the result.

        Returns:
            Always ``False``.
        """
        # Predict with the rounds trained up to and including this one.
        rounds_so_far = (0, epoch + 1)
        class_probabilities = model.predict(self.d_matrix, iteration_range=rounds_so_far)
        predicted_labels = np.argmax(class_probabilities, axis=1)

        score = f1_score(self.y_val, predicted_labels, average='weighted')
        self.f1_scores.append(score)

        print(f"Epoch: {epoch}, F1-Score: {score:.4f}")

        return False

    def get_scores(self) -> list[float]:
        """Return the per-round F1-scores collected so far."""
        return self.f1_scores



class HistoryCallback(XgbCallback):
    """Collect per-round loss and accuracy values during training.

    ``history`` holds four lists that grow by one entry per boosting round:
    ``train_loss``, ``val_loss``, ``train_acc`` and ``val_acc``. Loss values and
    the training accuracy are read from ``evals_log`` and are ``None`` for a
    round when the corresponding entry is not present there. Validation
    accuracy is computed here from predictions on the stored validation set.
    """

    def __init__(self, x_val: pd.DataFrame, y_val: pd.DataFrame) -> None:
        super().__init__(x_val, y_val)
        self.history = {
            "train_loss": [],
            "val_loss": [],
            "train_acc": [],
            "val_acc": []
        }

    @staticmethod
    def _latest(evals_log, dataset: str, metric: str):
        """Return the newest ``metric`` value logged for ``dataset``, or None."""
        values = evals_log.get(dataset, {}).get(metric)
        return values[-1] if values else None

    def after_iteration(self, model: Any, epoch: int,
                        evals_log: dict[str, dict[str, list[float] | list[tuple[float, float]]]]) -> bool:
        """Append this round's metrics to ``history`` and print a summary.

        Returns:
            Always ``False``.
        """
        train_loss = self._latest(evals_log, 'train', 'mlogloss')
        val_loss = self._latest(evals_log, 'validation_0', 'mlogloss')
        train_acc = self._latest(evals_log, 'train', 'accuracy')

        # Validation accuracy, using the rounds trained up to and including
        # this one.
        y_pred_probs = model.predict(self.d_matrix, iteration_range=(0, epoch + 1))
        y_pred = np.argmax(y_pred_probs, axis=1)
        val_acc = accuracy_score(self.y_val, y_pred)

        self.history['train_loss'].append(train_loss)
        self.history['val_loss'].append(val_loss)
        self.history['train_acc'].append(train_acc)
        self.history['val_acc'].append(val_acc)

        # A missing loss cannot be formatted with ``:.4f``; print a placeholder
        # instead of raising in the middle of training.
        val_loss_text = "n/a" if val_loss is None else f"{val_loss:.4f}"
        print(f"Epoch: {epoch}, Val Loss: {val_loss_text}, Val Acc: {val_acc:.4f}")

        return False

    def get_history(self) -> dict:
        """Return the collected history dictionary."""
        return self.history

### Import statements

In [None]:
from typing import Tuple

import pandas as pd
import json
import logging
import numpy as np

from xgboost import XGBClassifier

from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt
import seaborn as sns

from nltk.stem import WordNetLemmatizer

from rapidfuzz import process

from src.models.model import Model
from xgboostCallbacks import F1ScoreCallback, HistoryCallback

## Class Description

### Overview of Key Parameters:

- **vectorizer**: It is used to transform textual or categorical data into a numerical format. In our case, it will be the TF-IDF matrix, which is commonly used to represent text as vectors.
- **known_words**: When saving the model, it is necessary to store the words the model was trained on so that when the model is loaded later, the transformation can be applied appropriately.

#### Model Parameters
- **n_estimators**: Specifies the number of decision trees to be used in the XGBoost model. Increasing this value can improve model accuracy but also increases the training time.
- **max_depth**: Sets the maximum depth of each tree. A high depth can lead to overfitting, while a very low depth may limit the model’s ability to learn.
- **subsample**: Defines the percentage of samples that will be used to train each tree. This can help avoid overfitting.
- **colsample_bytree**: Specifies the percentage of features that will be randomly selected for each tree. This also helps improve the model’s generalization.
- **verbosity**: Controls the printing of progress during training, which is useful for debugging and monitoring the training process.

### `XGBoostModel` Class

In [None]:
class XGBoostModel(Model):
    """XGBoost classifier together with the state needed to prepare its input."""

    def __init__(self, model_name: str, model_category: str) -> None:
        """Create an untrained model.

        Args:
            model_name: Name used in output file names.
            model_category: Category used in output file and folder names.
        """
        # Identification, used when building output paths.
        self.model_name = model_name
        self.model_category = model_category

        # Text-processing state, filled in on first use.
        self.vectorizer = None
        self.known_words = None

        # Results filled in by training and evaluation.
        self.cm: list[list[float]] = []
        self.f1_scores: list[float] = []
        self.history = {}

        # To run on GPU, add device='cuda' to the settings below.
        classifier_settings = dict(
            n_estimators=1000,
            max_depth=10,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric='mlogloss',
            early_stopping_rounds=20,
            verbosity=3,
        )
        self.model = XGBClassifier(**classifier_settings)

### Preprocessing Words to Known

**This step is only required if the model has been previously trained** and we are evaluating it on new data (e.g., when loading the model). This process transforms the input text string into one that most closely matches the `known_words` (the words used to construct the training TF-IDF matrix).

In [None]:
def preprocess_to_known(self, df: pd.DataFrame, column: str) -> pd.DataFrame:
        """Rewrite ``df[column]`` so that every word comes from ``self.known_words``.

        Each whitespace-separated word is lower-cased and lemmatized. A lemma that
        is already known is kept as is; any other lemma is replaced by its best
        fuzzy match when that match scores above 80, and dropped otherwise.

        Args:
            df: Frame holding the text column. The column is overwritten in place.
            column: Name of the text column to rewrite.

        Returns:
            The same ``df`` object, with ``column`` holding the processed text.
        """
        lemmatizer = WordNetLemmatizer()
        known_words = self.known_words

        # Makes ``progress_apply`` available on pandas objects.
        tqdm.pandas()

        # The same words recur across many rows, so each lemma's replacement is
        # remembered and the fuzzy search runs once per distinct lemma.
        resolved: dict[str, str | None] = {}

        def resolve(lemma):
            """Return the known word standing in for ``lemma``, or None to drop it."""
            if lemma not in resolved:
                if lemma in known_words:
                    resolved[lemma] = lemma
                else:
                    closest_match = process.extractOne(lemma, known_words)
                    # extractOne yields None when it has no candidate to offer
                    # (for example an empty vocabulary).
                    if closest_match is not None and closest_match[1] > 80:
                        resolved[lemma] = closest_match[0]
                    else:
                        resolved[lemma] = None
            return resolved[lemma]

        def process_text(text):
            replacements = (
                resolve(lemmatizer.lemmatize(word.lower())) for word in text.split()
            )
            return " ".join(word for word in replacements if word is not None)

        df[column] = df[column].progress_apply(process_text)

        return df

### Converting Input Data to TF-IDF

The first step before **starting training** is to **create the TF-IDF matrix**. If the model is being run for the first time, a new vectorizer needs to be created to construct the matrix. However, if the model has been used previously, it must utilize the already saved vectorizer. Finally, the TF-IDF matrix is transformed into a *pandas DataFrame*.

In [None]:
def convert_to_tfidf(self, df: pd.DataFrame, column: str, remove_stop_words: bool) -> pd.DataFrame:
        """Replace the text column ``column`` with its TF-IDF feature columns.

        The vectorizer is created on first use and fitted on ``df[column]``. Once
        it has been fitted, later calls only transform the new text, so every
        matrix produced by this instance shares the vocabulary and IDF weights
        learned on the first call.

        Args:
            df: Frame containing the text column.
            column: Name of the text column to vectorize.
            remove_stop_words: Whether a newly created vectorizer should drop
                English stop words. Ignored when a vectorizer already exists.

        Returns:
            A new frame: ``df`` without ``column``, followed by one sparse column
            per vocabulary term, indexed like ``df``.
        """
        if self.vectorizer is None:
            stop_words = 'english' if remove_stop_words else None
            self.vectorizer = TfidfVectorizer(stop_words=stop_words)

        logging.info(f"Start preprocess column: {column}")

        documents = tqdm(df[column], desc=f'Preprocessing: {column}')
        # ``vocabulary_`` only exists after fitting. Re-fitting on evaluation
        # data would silently replace the vocabulary and IDF weights the model
        # was trained with.
        if hasattr(self.vectorizer, "vocabulary_"):
            tfidf_matrix = self.vectorizer.transform(documents)
        else:
            tfidf_matrix = self.vectorizer.fit_transform(documents)

        logging.info(f"Shape of tfidf_matrix: {tfidf_matrix.shape}")

        tfidf_df = pd.DataFrame.sparse.from_spmatrix(
            tfidf_matrix,
            columns=self.vectorizer.get_feature_names_out(),
            index=df.index
        )

        df = df.drop(columns=[column])
        df = pd.concat([df, tfidf_df], axis=1)

        return df

### Adding Missing Columns

Every input matrix must have the same columns as the training matrix, so tokens that are missing from a newly generated matrix are added as columns filled with zeros. The columns must also appear in the same order every time; to make this reproducible, the token columns are sorted alphabetically. Because sorting changes the column order, the name of the label column is passed in as well, so that the label can always be placed as the last column.

In [None]:
def add_missing_columns(self, df: pd.DataFrame, label_name: str) -> pd.DataFrame:
        """Align ``df`` to the known vocabulary, keeping the label column last.

        On the first call (``self.known_words`` is None) the vocabulary becomes
        the set of columns of ``df``. Known words that ``df`` lacks are added as
        zero-filled columns; columns outside the vocabulary are not carried
        over.

        Args:
            df: Feature frame to align.
            label_name: Column placed after all other columns in the result.

        Returns:
            A frame with the known words in sorted order, then ``label_name``.
        """
        present = set(df.columns)

        if self.known_words is None:
            self.known_words = set(df.columns)

        absent: set[str] = self.known_words - present
        if absent:
            zero_block = pd.DataFrame(data=0, index=df.index, columns=list(absent))
            df = pd.concat([df, zero_block], axis=1)

        # Sorted vocabulary without the label, then the label at the very end.
        ordered = [name for name in sorted(self.known_words) if name != label_name]
        ordered.append(label_name)

        return df[ordered]


# Multi-evaluation purposes only
def domain_matches(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``domain`` column: 1 where ``sender`` equals ``receiver``, else 0.

    The column is written into ``df`` itself, and that same frame is returned.
    """
    same_domain = df['sender'] == df['receiver']
    df['domain'] = same_domain.astype(int)
    return df

### Training model

In [None]:
def fit(self, x_train: pd.DataFrame, y_train: pd.DataFrame, x_val: pd.DataFrame, y_val: pd.DataFrame) -> None:
        """Train the classifier and record per-round validation metrics.

        Stores the callbacks' results in ``self.f1_scores`` and ``self.history``.
        If ``self.known_words`` has not been set yet, it becomes the set of
        ``x_train`` column names.

        Args:
            x_train: Training features.
            y_train: Training labels.
            x_val: Validation features, used as the evaluation set and by the
                metric callbacks.
            y_val: Validation labels.
        """
        if self.known_words is None:
            self.known_words = set(x_train.columns)

        f1_score_callback = F1ScoreCallback(x_val, y_val)
        history_callback = HistoryCallback(x_val, y_val)

        # Callbacks are an estimator parameter, like the early_stopping_rounds
        # already given to the constructor. Recent XGBoost releases no longer
        # accept them as a fit() argument.
        self.model.set_params(callbacks=[f1_score_callback, history_callback])

        self.model.fit(
            x_train,
            y_train,
            verbose=True,
            eval_set=[(x_val, y_val)]
        )

        self.f1_scores = f1_score_callback.get_scores()
        self.history = history_callback.get_history()


def evaluate(self, x_test: pd.DataFrame, y_test: pd.DataFrame) -> float:
        """Return the accuracy of the model's predictions on ``x_test``.

        Feature-name validation is switched off for the prediction call.
        """
        predicted_labels = self.model.predict(x_test, validate_features=False)
        return accuracy_score(y_test, predicted_labels)


def predict_probabilities(self, x_test: pd.DataFrame) -> np.ndarray:
        """Return the model's ``predict_proba`` output for ``x_test``."""
        return self.model.predict_proba(x_test)

### Evaluation and plots

In [None]:
def create_confusion_matrix(self, x_test: pd.DataFrame, y_test: pd.DataFrame) -> None:
        """Compute the confusion matrix for ``x_test`` and store it in ``self.cm``.

        The predicted class of each row is the one with the highest predicted
        probability.
        """
        class_probabilities = self.model.predict_proba(x_test)
        predicted_labels = np.argmax(class_probabilities, axis=1)

        # Labels exposing ``.values`` (pandas objects) are flattened to 1-D;
        # anything else is passed through unchanged.
        if hasattr(y_test, 'values'):
            true_labels = y_test.values.flatten()
        else:
            true_labels = y_test

        self.cm = confusion_matrix(true_labels, predicted_labels)


def make_plots(self) -> None:
        """Save the confusion-matrix plot, the training curves and a metrics file.

        Writes, using ``MAIN_DIR`` as the root:

        * ``saved-results/xgb/plots/<category>/<name>-conf-matrix.png``
        * ``saved-results/xgb/plots/<category>/<name>-accuracy.png`` with three
          panels: validation loss, validation F1-score, validation accuracy
        * ``<name>-<category>``, a text file with the last recorded validation
          accuracy and F1-score

        Expects ``self.cm``, ``self.history`` and ``self.f1_scores`` to be
        populated already.
        """
        import os

        class_names = ['legal', 'spam', 'phishing', 'fraud']

        plots_dir = f"{MAIN_DIR}saved-results/xgb/plots/{self.model_category}"
        # savefig() does not create missing folders.
        os.makedirs(plots_dir, exist_ok=True)

        confusion_figure = plt.figure(figsize=(8, 6))
        sns.heatmap(self.cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names,
                    yticklabels=class_names)
        plt.xlabel("Predicted Labels")
        plt.ylabel("True Labels")
        plt.title("Confusion Matrix")
        plt.savefig(f"{plots_dir}/{self.model_name}-conf-matrix.png")
        # close() releases the figure. clf() only empties it and leaves it
        # registered with pyplot, so figures pile up across models.
        plt.close(confusion_figure)

        epochs = len(self.history["train_loss"])

        curves_figure = plt.figure(figsize=(18, 6))

        # (subplot position, x values, y values, legend label, y-axis label, title)
        panels = [
            (1, range(epochs), self.history['val_loss'],
             "Validation Loss", "Loss", "Loss over Epochs"),
            (3, range(epochs), self.history['val_acc'],
             "Validation Accuracy", "Accuracy", "Accuracy over Epochs"),
            (2, range(len(self.f1_scores)), self.f1_scores,
             "Validation F1", "F1-Score", "F1-Score over Epochs"),
        ]
        for position, x_values, y_values, legend_label, y_label, title in panels:
            plt.subplot(1, 3, position)
            plt.plot(x_values, y_values, label=legend_label, color='orange')
            plt.xlabel("Epochs")
            plt.ylabel(y_label)
            plt.legend()
            plt.title(title)

        plt.tight_layout()
        plt.savefig(f"{plots_dir}/{self.model_name}-accuracy.png")
        plt.close(curves_figure)

        final_accuracy = self.history['val_acc'][-1]
        final_f1_score = self.f1_scores[-1]
        with open(f"{MAIN_DIR}{self.model_name}-{self.model_category}", "w") as file:
            file.write(f"Accuracy: {final_accuracy}\n")
            file.write(f"F1-Score: {final_f1_score}\n")

### Loading and saving

When the model is saved, the known words are written to a JSON file next to it. When the model is loaded, they are read back so that TF-IDF matrices built from new data can be aligned with the columns the model was trained on.

In [None]:
def save(self, model_save_path: str = "") -> None:
        """Save the trained model and its known words.

        Two files are written: ``<path>.json`` with the XGBoost model and
        ``<path>-known-words.json`` with the vocabulary as a JSON list.

        Args:
            model_save_path: Path prefix without extension. When empty, a default
                under ``MAIN_DIR`` built from the model category and name is
                used.
        """
        import os

        if model_save_path == "":
            model_save_path = f"{MAIN_DIR}/models/xgb/saved/{self.model_category}-{self.model_name}"

        # Make sure the target folder exists before anything is written.
        parent_dir = os.path.dirname(model_save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        self.model.save_model(f"{model_save_path}.json")
        with open(f"{model_save_path}-known-words.json", "w") as f:
            # Sorted so that equal vocabularies always produce the same file;
            # set iteration order is not stable between runs.
            json.dump(sorted(self.known_words), f)


def load_model(self, model_load_path: str) -> None:
        """Restore a saved model and its known words.

        Args:
            model_load_path: Path prefix without extension. ``<path>.json`` and
                ``<path>-known-words.json`` are read.
        """
        model_file = f"{model_load_path}.json"
        words_file = f"{model_load_path}-known-words.json"

        # Start from a fresh classifier and load the stored model into it.
        self.model = XGBClassifier()
        self.model.load_model(model_file)

        with open(words_file, "r") as handle:
            stored_words = json.load(handle)
        self.known_words = set(stored_words)

### Multi-Evaluation

To get the best results, a multi-model strategy is used: several models classify each message, each one based on a different feature (column). Every model returns an array of class probabilities, which is multiplied by that model's accuracy (used as a weight) so that more accurate models have more influence. The weighted arrays are summed, and the class with the highest total is the final prediction. Finally, the combined model's confusion matrix and metrics are saved.

In [None]:
def perform_multi_evaluation(xgb_boost_model_list: list[Tuple[XGBoostModel, str, float, bool, bool]], test_data: pd.DataFrame, test_labels: pd.DataFrame, label_name: str) -> None:
    """Evaluate an accuracy-weighted ensemble of models and save the results.

    Each model's class probabilities are multiplied by that model's accuracy and
    summed; the class with the highest total is the ensemble prediction. A
    confusion-matrix plot and a text file with accuracy and weighted F1-score
    are written under ``MAIN_DIR``, named after the first model's category.

    Args:
        xgb_boost_model_list: One tuple per model:
            ``(model, train_column, model_accuracy, remove_stop_words, use_domain)``.
        test_data: Evaluation rows; must contain every ``train_column`` (and
            ``sender``/``receiver`` for models with ``use_domain``).
        test_labels: True labels for ``test_data``.
        label_name: Name of the label column used when aligning TF-IDF columns.
    """
    # One column per class; four classes, matching the plot labels below.
    cumulative_probabilities = np.zeros((test_data.shape[0], 4))

    for xgb_model, train_column, model_accuracy, remove_stop_words, use_domain in xgb_boost_model_list:

        if use_domain:
            # domain_matches() writes a new column into the frame it is given;
            # pass a copy so the caller's test_data stays untouched.
            final_test_data = domain_matches(test_data.copy())
            final_test_data = final_test_data[[train_column]]
        else:
            temp_data = test_data[[train_column]]
            tfidf_matrix = xgb_model.convert_to_tfidf(temp_data, column=train_column,
                                                      remove_stop_words=remove_stop_words)
            final_test_data = xgb_model.add_missing_columns(tfidf_matrix, label_name)
            final_test_data = final_test_data.drop(columns=label_name)

        model_probabilities = xgb_model.model.predict_proba(final_test_data)
        cumulative_probabilities += model_probabilities * model_accuracy

    final_predictions = np.argmax(cumulative_probabilities, axis=1)
    final_accuracy = accuracy_score(test_labels, final_predictions)
    final_f1 = f1_score(test_labels, final_predictions, average='weighted')
    cm = confusion_matrix(test_labels, final_predictions)

    class_names = ['legal', 'spam', 'phishing', 'fraud']
    model_category = xgb_boost_model_list[0][0].model_category

    figure = plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")
    plt.title("Confusion Matrix")
    plt.savefig(f"{MAIN_DIR}final-{model_category}-conf-matrix.png")
    # close() releases the figure; clf() would leave it open.
    plt.close(figure)

    with open(f"{MAIN_DIR}final-{model_category}-accuracy", "w") as file:
        file.write(f"Accuracy: {final_accuracy}\n")
        file.write(f"F1-Score: {final_f1}\n")


### Testing Different Configurations

Five different configurations were tested. Based on initial model performance, it was found that models removing stop words performed better, so this approach was applied to all other models. Different types of models worked on different dataset portions due to varying training times and limited hardware resources (GPU P100 on Kaggle).

| Model Category           | Used Models                     | Remove Stop Words | Dataset                           |
|--------------------------|---------------------------------|-------------------|-----------------------------------|
| only-body                | body                            | ❌                | final-with-stop-words.csv         |
| only-body-with-stop-words| body                            | ✅                | final.csv                         |
| body-subject             | body, subject                   | ✅                | final.csv                         |
| body-domain              | body, domain                    | ✅                | final-domain-only.csv             |
| full                     | body, domain, subject           | ✅                | final-domain-only.csv             |



In [None]:
# One dictionary per tested configuration. Every tuple in "models" is
# (model_name, train_column, data_length, remove_stop_words, use_domain).
# NOTE(review): the configuration table in the accompanying text pairs the two
# datasets of the first two entries the other way round and calls the second
# category "only-body-with-stop-words"; confirm which side is intended.
MODELS_PARAMS = [
    {
        "models": [
            ("body-model", "body", 4_000, False, False),
        ],
        "model_category": "only-body",
        "dataset": "/kaggle/input/xgboost-phishing/final.csv",
        "add_subject": False,
        "add_domain": False,
    },
    {
        "models": [
            ("body-model", "body", 4_000, True, False),
        ],
        "model_category": "only-body-without-stop-words",
        "dataset": "/kaggle/input/xgboost-phishing/final-with-stop-words.csv",
        "add_subject": False,
        "add_domain": False,
    },
    {
        "models": [
            ("body-model", "body", 4_000, True, False),
            ("subject-model", "subject", 16_000, True, False),
        ],
        "model_category": "body-subject",
        "dataset": "/kaggle/input/xgboost-phishing/final.csv",
    },
    {
        "models": [
            ("body-model", "body", 4_000, True, False),
            ("domain-model", "domain", 32_000, True, True),
        ],
        "model_category": "body-domain",
        "dataset": "/kaggle/input/xgboost-phishing/final-domain-only.csv",
    },
    {
        "models": [
            ("body-model", "body", 4_000, True, False),
            ("domain-model", "domain", 32_000, True, True),
            ("subject-model", "subject", 16_000, True, False),
        ],
        "model_category": "full",
        "dataset": "/kaggle/input/xgboost-phishing/final-domain-only.csv",
    },
]


# Train every model of every configuration, then evaluate each configuration's
# models together as a weighted ensemble.
for param in MODELS_PARAMS:

    # Trained models of this configuration, in the tuple layout that
    # perform_multi_evaluation() unpacks.
    models: list[Tuple[XGBoostModel, str, float, bool, bool]] = []

    data = pd.read_csv(param["dataset"])
    # The label column is renamed from 'label' to '$label'.
    # Presumably this keeps it from colliding with a TF-IDF token column that
    # is also called 'label' - TODO confirm.
    label_column = "$label"
    data = data.rename(columns={'label': label_column})

    for model_name, train_column, data_length, remove_stop_words, use_domain in param["models"]:

        # Fixed seed: the same rows are drawn on every run.
        input_data = data.sample(n=data_length, random_state=43)
        xgb_model = XGBoostModel(model_name, param["model_category"])

        if use_domain:
            # Domain model: the single feature is the sender/receiver match flag.
            train_data = domain_matches(input_data)
            train_labels = train_data[[label_column]]
            train_data = train_data[[train_column]]
        else:
            # Text model: vectorize the text column, then order the columns.
            input_data = input_data[[train_column, label_column]]
            tfidf_matrix = xgb_model.convert_to_tfidf(input_data, column=train_column, remove_stop_words=remove_stop_words)
            train_data = xgb_model.add_missing_columns(tfidf_matrix, label_column)
            train_labels = train_data[[label_column]]
            train_data = train_data.drop(columns=[label_column])

        # 70% train; the remaining 30% is halved into validation and test.
        X_train, X_temp, Y_train, Y_temp = train_test_split(train_data, train_labels, test_size=0.3, random_state=28)
        X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=28)

        xgb_model.fit(X_train, Y_train, X_val, Y_val)
        xgb_model.create_confusion_matrix(X_test, Y_test)
        xgb_model.make_plots()
        xgb_model.save()

        # Debug output: a hash of the vocabulary.
        print(hash(frozenset(xgb_model.known_words)))
        model_accuracy = xgb_model.evaluate(X_test, Y_test)

        print(model_accuracy)
        # The test accuracy is stored with the model; it is the model's weight
        # in the ensemble.
        models.append((xgb_model, train_column, model_accuracy, remove_stop_words, use_domain))

    # NOTE(review): data_length here is whatever the last iteration of the
    # inner loop left behind, i.e. the sample size of the configuration's last
    # model - confirm this is the intended ensemble test size.
    # NOTE(review): this sample comes from the same `data` with the same
    # random_state as the training samples above, so it likely overlaps with
    # rows the models were trained on - confirm whether that is intended.
    test_data = data.sample(n=data_length, random_state=43)
    test_labels = test_data[[label_column]]
    test_data = test_data.drop(columns=[label_column])
    perform_multi_evaluation(models, test_data, test_labels, label_column)