## Install packages

In [None]:
%%time
# Install the third-party packages used by this notebook.
!pip install sentence-transformers hf_xet
# jina-embeddings benefit from FlashAttention-2 (optional, left disabled)
#!pip install flash-attn --no-build-isolation
#!pip install -v -U flash-attn
!pip install ninja pyarrow
!pip install pyod catboost
!pip install ftfy emoji einops
!pip install jupyter_capture_output
# dask-expr is installed on its own, without pulling in its dependencies
!pip install --no-deps dask-expr
!pip install scipy betacal
print("\n")

## Load packages

In [None]:
%%time
import time, os, re, torch, ftfy, emoji, joblib, jupyter_capture_output
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dask.dataframe as dd
import multiprocessing as mp
from tqdm import tqdm
from tqdm.dask import TqdmCallback
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.decomposition import PCA
from pyod.models.hbos import HBOS
from catboost import CatBoostClassifier
from betacal import BetaCalibration
from sklearn.metrics import classification_report, roc_curve, precision_recall_curve
from sklearn.metrics import accuracy_score, cohen_kappa_score, roc_auc_score, average_precision_score
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", message="`torch_dtype` is deprecated")

## Experiment settings

In [None]:
# research code for HIPSTer project: Hybrid, Information, Psychological, Societal Threats
# handling system for public security domain practitioners, businesses, and education (HIPSTer)

dataset = "LtHate_v1" # choose text dataset: LtHate LtHate_v1 LtEmocionalumas LtManipuliacijos RuToxic DynaHate EnBerkeley EnToxiGen EnSuperset
datasetFolder = './data/'
resultsFolder = './results/'
vectors = ["potion", "snow", "jina", "e5"] # choose among modern vectorizers: potion snow jina e5 gte
chunk_size_setting = 1024 # texts encoded per chunk: large=8192 (if many texts or non-gte vectorizers + GPU) or small=512
batch_size_setting = 32 # encoder batch size: large=1024 (if many texts or non-gte vectorizers + GPU) or small=64
compDevice = 'cuda' if torch.cuda.is_available() else 'cpu'
useDimensionalityReduction = False # if True, PCA reduces the embeddings to num_vars components
useCatBoostOutputCallibration = False # if True, CatBoost probabilities are passed through beta calibration

k = 5 # Number of folds for StratifiedKFold cross-validation (k=5 means 5-fold CV)
num_vars = 32 # Number of variables after dimensionality reduction with PCA
num_tree = 500 # Maximum possible number of trees to grow for CatBoost model
eval_kpi = "AUC" # Success metric to track performance on the validation set

In [None]:
# Output artefact paths, all derived from the results folder and the dataset name
cvLogFilename = resultsFolder + dataset + "-CV-log.txt"
resultsTableFile = resultsFolder + dataset + "-table.txt"
resultsTableSummary = resultsFolder + dataset + "-table.csv"
rocPlotFilename = resultsFolder + dataset + "-fig-ROC.png"
prcPlotFilename = resultsFolder + dataset + "-fig-PRC.png"

# exist_ok avoids the check-then-create race and keeps the cell safely re-runnable
os.makedirs(resultsFolder, exist_ok=True)

## Helper functions

In [None]:
def fix_punctuation(text, toneDown=True):
    """
    Normalise encoding, links, punctuation, spacing and emoji in one text.

    Parameters:
    - text: Text to clean. Anything that is not a `str` (for example NaN coming
      from a missing cell) yields an empty string.
    - toneDown (bool, default=True): If True, every remaining `?` and `!` is
      replaced by `.` as the very last rule.

    Returns:
    - str: The cleaned and stripped text, or '' for non-string input.
    """
    # Only real strings can be cleaned. The former `__len__` probe also let
    # lists, dicts and bytes through to ftfy, which cannot handle them.
    if not isinstance(text, str):
        return ''

    # First, use ftfy to fix any encoding issues
    text = ftfy.fix_text(text)

    # Custom rules for punctuation fixing (applied in this exact order)
    rules = [
        # Remove http and https links
        (r'https?://\S+', ''),
        # Remove consecutive repetitive punctuation, but keep a maximum of two for emphasis (e.g., !!)
        (r'([,\.?!])\1{2,}', r'\1\1'),
        # Add space after comma, period, question mark, or exclamation mark if not followed by space
        (r'([,\.?!])(?=[^\s])', r'\1 '),
        # Remove space before comma, period, question mark, or exclamation mark
        (r'\s+([,\.?!])', r'\1'),
        # Fix multiple spaces
        (r'\s{2,}', ' '),
        # Ensure numbers have space before and after, except when punctuation or hyphen follows
        (r'(\d)(?=[^\s\d,\.?!-])', r'\1 '),
        (r'(?<=[^\s\d-])(\d)', r' \1'),
    ]

    if toneDown:
        # Flatten emphasis: question and exclamation marks become periods
        rules.append((r'[?!]', '.'))

    # Replace emoji with " ::shortcode:: " (the surplus spaces are collapsed by the rules below)
    text = emoji.demojize(text, delimiters=(" ::", ":: "))

    # Apply each rule
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)

    return text.strip()

def get_vectorizer_link(name: str) -> str:
    """Map a short vectorizer name to its full Hugging Face model path.

    Raises:
        ValueError: If `name` is not one of the known short names.
    """
    hub_paths = {
        "jina": "jinaai/jina-embeddings-v3",
        "snow": "Snowflake/snowflake-arctic-embed-l-v2.0",
        "gte": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
        "potion": "minishlab/potion-base-2M",
        "e5": "intfloat/multilingual-e5-large-instruct",
    }
    try:
        return hub_paths[name]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments, which are unknown names as well
        raise ValueError(f"Unknown vectorizer name: {name}") from None

def clean_and_vectorize(df, fix_punctuation, sentvec="e5", device="cuda", normalize_embeddings=False, chunk_size=256, batch_size=32):
    """
    Cleans text data and vectorizes it using a sentence transformer model.

    Parameters:
    - df (pd.DataFrame): Input pandas DataFrame; the text is read from its second column (position 1).
    - fix_punctuation (function): Function applied to each text before encoding.
    - sentvec (str, default="e5"): Short vectorizer name understood by `get_vectorizer_link`.
    - device (str, default="cuda"): Device to use for the sentence transformer model, e.g., "cpu" or "cuda".
    - normalize_embeddings (bool, default=False): Whether to normalize embeddings.
    - chunk_size (int, default=256): Number of text samples per chunk for encoding.
    - batch_size (int, default=32): Batch size for encoding in the transformer.

    Returns:
    - pd.DataFrame: One row per input text, embedding columns named X1..Xd.

    Raises:
    - ValueError: If `df` has no rows, or if `sentvec` is not a known vectorizer name.
    """
    # Fail early and clearly; otherwise the model is loaded only for
    # np.concatenate to fail on an empty list further down.
    if len(df) == 0:
        raise ValueError("Cannot vectorize an empty DataFrame.")

    # Load the sentence transformer model; only jina is loaded with trust_remote_code=True
    extra_kwargs = {"trust_remote_code": True} if sentvec == "jina" else {}
    st = SentenceTransformer(get_vectorizer_link(sentvec), device=device, **extra_kwargs)

    # Partition data for parallel processing (one partition per CPU core)
    n_partitions = mp.cpu_count()
    ddf = dd.from_pandas(df, npartitions=n_partitions)

    with TqdmCallback(desc="Cleaning text"):
        texts = ddf.apply(lambda x: fix_punctuation(x.iloc[1]), axis=1, meta=pd.Series(dtype="str")).compute(scheduler="processes").tolist()

    # NOTE(review): a "query: " prompt used to be computed here for e5 but was never
    # passed to encode(), so all models are encoded without a prompt. The dead
    # variable is removed; confirm whether a prompt was actually intended.

    # Process texts in chunks for memory efficiency
    results = []
    total_chunks = (len(texts) + chunk_size - 1) // chunk_size  # Ceiling division
    width = len(str(total_chunks))  # Width for zero-padded chunk numbers

    for current_chunk, offset in enumerate(range(0, len(texts), chunk_size), start=1):
        torch.cuda.empty_cache()  # Release cached GPU memory between chunks
        print(f"\rChunk: {current_chunk:0{width}}/{total_chunks:0{width}} ", end='', flush=True)
        chunk = texts[offset:offset + chunk_size]
        results.append(
            st.encode(chunk, batch_size=batch_size, normalize_embeddings=normalize_embeddings, show_progress_bar=True)
        )

    # Clean up GPU memory
    torch.cuda.empty_cache()

    # Concatenate results and format as DataFrame
    X = np.concatenate(results, axis=0)
    df_embeddings = pd.DataFrame(X)
    df_embeddings.columns = [f'X{i+1}' for i in range(df_embeddings.shape[1])]

    return df_embeddings

## Load & vectorize data

In [None]:
%%time

# Load text comments dataset.
# Every branch leaves `df` with column 0 = label and column 1 = text,
# and `y` with the binary target used for modelling.

def _read_label_text_csv(name, usecols=None):
    """Read `<datasetFolder><name>.csv`, reverse its two columns and rename them to 0 and 1."""
    frame = pd.read_csv(datasetFolder + name + '.csv', engine='python', usecols=usecols)
    frame = frame[frame.columns[::-1]]
    frame.columns = [0, 1]
    return frame


def _read_semantika(path):
    """Read the Semantika file and split its third '__' field into columns 0 and 1 at the first space."""
    frame = pd.read_csv(path, sep='__', header=None, usecols=[2], engine='python')
    return frame[2].str.split(" ", n=1, expand=True)


if dataset == 'DynaHate':
    df = _read_label_text_csv(dataset, usecols=['text', 'label'])
    di = {"nothate": 0, "hate": 1}
    y = df[0].map(di)
elif dataset == 'EnBerkeley':
    df = _read_label_text_csv(dataset, usecols=['text', 'contains_hate'])
    # NOTE(review): the target is the inverted `contains_hate` column - confirm its encoding
    y = 1 - df[0]
elif dataset == 'EnSuperset':
    df = _read_label_text_csv(dataset, usecols=['text', 'labels'])
    y = df[0]
elif dataset == 'EnToxiGen':
    df = _read_label_text_csv(dataset, usecols=['generation', 'prompt_label'])
    y = df[0]
elif dataset == 'LtHate':
    # Two sources are stacked: the LtHate CSV and the Semantika file
    dfA = _read_label_text_csv(dataset)
    di = {"No": 0, "Yes": 1}
    yA = dfA[0].map(di)
    dfB = _read_semantika(datasetFolder + 'Semantika_2.txt')
    di = {"neutral": 0, "offensive": 1}
    yB = dfB[0].map(di)
    df = pd.concat([dfA, dfB], ignore_index=True)
    y = pd.concat([yA, yB], ignore_index=True)
elif dataset == 'LtHate_v1':
    df = _read_label_text_csv(dataset)
    di = {"No": 0, "Yes": 1}
    y = df[0].map(di)
elif dataset == 'LtEmocionalumas':
    df = _read_label_text_csv(dataset)
    di = {"No": 0, "Low": 0, "Medium": 1, "High": 1, "Critical": 1}
    y = df[0].map(di)
elif dataset == 'LtManipuliacijos':
    all_comments = []
    all_labels = []
    # Open the workbook once, reuse the handle for every sheet, and close it afterwards
    with pd.ExcelFile(datasetFolder + dataset + '.xlsx') as excel_file:
        for sheet_name in excel_file.sheet_names:
            sheet_df = pd.read_excel(excel_file, sheet_name=sheet_name, usecols=[1])
            # Drop empty cells before the str cast; otherwise NaN becomes the literal comment "nan"
            sheet_comments = sheet_df['Komentaras'].dropna().astype(str).str.strip()
            sheet_comments = sheet_comments[sheet_comments != ''].tolist()
            all_comments.extend(sheet_comments)
            # Only the 'Manipuliaciniai' sheet holds the positive class
            sheet_label = 1 if sheet_name == 'Manipuliaciniai' else 0
            all_labels.extend([sheet_label] * len(sheet_comments))
    df = pd.DataFrame({0: all_labels, 1: all_comments})
    y = df[0]
elif dataset == 'RuToxic':
    df = _read_label_text_csv(dataset)
    y = df[0]
else:
    # Fallback: Semantika file, read from the data folder like in the LtHate branch
    df = _read_semantika(datasetFolder + 'Semantika_2.txt')
    di = {"neutral": 0, "offensive": 1}
    y = df[0].map(di)

In [None]:
%%time

# Vectorize text data (one parquet cache file per vectorizer; existing files are reused)

def _vectorize_to_parquet(sentvec, target_path):
    """Embed the loaded dataset `df` with one vectorizer and store the embeddings as parquet."""
    df_post = clean_and_vectorize(df, fix_punctuation, sentvec, compDevice, False, chunk_size_setting, batch_size_setting)
    df_post.to_parquet(target_path, engine="pyarrow")


for sentvec in vectors:
    xVarFilename = datasetFolder + dataset + "-X-" + sentvec + ".parquet"
    files_exist_condition = os.path.exists(xVarFilename)
    print(xVarFilename)

    if files_exist_condition:
        continue

    if compDevice == "cpu":
        print("Vectorizing text on CPU...")
        _vectorize_to_parquet(sentvec, xVarFilename)
        continue

    try:
        print("Vectorizing text on GPU 0...")
        torch.cuda.set_device(0)
        torch.cuda.empty_cache()
        _vectorize_to_parquet(sentvec, xVarFilename)
    except RuntimeError as e:
        # Retry on a second GPU only for out-of-memory errors, and only if that GPU exists;
        # otherwise the real error would be masked by an invalid-device failure.
        if "CUDA out of memory" not in str(e) or torch.cuda.device_count() < 2:
            raise
        print("Vectorizing text on GPU 1...")
        torch.cuda.set_device(1)
        torch.cuda.empty_cache()
        _vectorize_to_parquet(sentvec, xVarFilename)
    torch.cuda.empty_cache()

# Read vectorized data: XX[j] is the embedding matrix for vectors[j]
XX = []
for sentvec in vectors:
    xVarFilename = datasetFolder + dataset + "-X-" + sentvec + ".parquet"
    df_post = dd.read_parquet(xVarFilename, engine='pyarrow')
    XX.append(df_post.compute().to_numpy())

## Machine learning (CV using FOR loop)

In [None]:
def autoflip_score(y_true, y_scores):
    """Return `y_scores` or `1 - y_scores`, whichever has the higher ROC AUC against `y_true`.

    The original scores are kept only when their AUC is strictly higher;
    on a tie the flipped scores are returned.
    """
    flipped_scores = 1 - y_scores
    auc_as_given = roc_auc_score(y_true, y_scores)
    auc_flipped = roc_auc_score(y_true, flipped_scores)
    return y_scores if auc_as_given > auc_flipped else flipped_scores

In [None]:
%%time
%%capture_text --path $cvLogFilename

# Only the labels drive the stratified split, so X is just a row-position placeholder
X = range(len(y))
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
cb_task_type = 'GPU' if compDevice == 'cuda' else 'CPU'

# Predicted scores to concatenate results across validation folds
tst_idx, ground_truth = [], []
os_scores = [[] for _ in vectors]  # One list per vectorizer
cb_scores = [[] for _ in vectors]  # One list per vectorizer

# Perform the k-fold cross-validation
for i, (train_index, test_index) in enumerate(skf.split(X, y)):

    # The splitter yields row positions, so select by position rather than by index label
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # Negative/positive ratio of the training fold, passed to CatBoost as scale_pos_weight
    imbalance_scalar = (y_train == 0).sum() / (y_train == 1).sum()
    tst_idx.extend(test_index)
    ground_truth.extend(y_test.tolist())

    print("\nCV fold %d/%d" % (i + 1, k), flush=True)

    # One-class (1c) classification: Histogram-based outlier score (HBOS)
    for j, sentvec in enumerate(vectors):
        start = pd.Timestamp.now()
        X_train, X_test = XX[j][train_index], XX[j][test_index]
        if useDimensionalityReduction:
            pca = PCA(n_components=num_vars)
            X_train = pca.fit_transform(X_train)
            X_test = pca.transform(X_test)
        clf = HBOS(contamination=0.01)
        clf.fit(X_train[y_train==1,:])  # fitted on the positive class only
        score = -1 * clf.decision_function(X_test) / 10000 # make score more aesthetic (calibration)
        score = autoflip_score(y_test, score) # HBOS is 1c but ROC/PRC are for 2c so some sanity check
        print("pyodHBOS %s AUC=%5.3f" % (vectors[j], roc_auc_score(y_test, score)))
        os_scores[j].extend(score.tolist())
        print(str(pd.Timestamp.now()-start))

    # Two-class (2c) classification: CatBoost classifier (detection task)
    for j, sentvec in enumerate(vectors):
        start = pd.Timestamp.now()
        X_train, X_test = XX[j][train_index], XX[j][test_index]
        if useDimensionalityReduction:
            pca = PCA(n_components=num_vars)
            X_train = pca.fit_transform(X_train)
            X_test = pca.transform(X_test)
        model = CatBoostClassifier(iterations=num_tree, learning_rate=0.05, task_type=cb_task_type, allow_writing_files=False,
                                   eval_metric=eval_kpi, scale_pos_weight=imbalance_scalar)
        # Hold out 20% of the training fold as the eval set for early stopping (and calibration)
        X_trn, X_val, y_trn, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=0.2, random_state=42)
        model.fit(X_trn, y_trn, eval_set=(X_val, y_val), use_best_model=True, early_stopping_rounds=10, metric_period=5, verbose=False)
        score = model.predict_proba(X_test)[:, 1]
        if useCatBoostOutputCallibration:
            prob_pos_val = model.predict_proba(X_val)[:, 1]
            beta_calibrator = BetaCalibration(parameters="abm")
            beta_calibrator.fit(prob_pos_val, y_val)
            score = beta_calibrator.predict(score)

        print("catBoost %s AUC=%5.3f" % (vectors[j], roc_auc_score(y_test, score)))
        cb_scores[j].extend(score.tolist())
        print(str(pd.Timestamp.now()-start))

torch.cuda.empty_cache()
print("\n")

## Detection task results

In [None]:
# plot ROC
roc_results = []
fig, ax3 = plt.subplots()
colors = sns.color_palette("pastel", len(os_scores))

# 1c curves are drawn in pastel colours, 2c curves in the default matplotlib cycle (C0, C1, ...).
# roc_results is filled in the same order: all 1c entries first, then all 2c entries.
roc_groups = (
    ("1c", os_scores, list(colors)),
    ("2c", cb_scores, [f"C{n}" for n in range(len(cb_scores))]),
)
for tag, score_lists, palette in roc_groups:
    for i, vec_scores in enumerate(score_lists):
        fpr, tpr, thresholds = roc_curve(ground_truth, vec_scores, pos_label=1)
        roc_auc = roc_auc_score(ground_truth, vec_scores)
        roc_results.append({'fpr': fpr, 'tpr': tpr, 'thresholds': thresholds, 'auc': roc_auc})
        ax3.plot(fpr, tpr, label=f"{tag} {vectors[i]} (AUC={roc_auc:.3f})", color=palette[i])

# Plot diagonal reference line
ax3.plot([0, 1], [0, 1], 'k--', lw=1)

# Axes cosmetics
ax3.set_xlabel('False Positive Rate')
ax3.set_ylabel('True Positive Rate')
ax3.set_title(f"{dataset} (n={len(y)}, target={100*np.sum(y==1)/len(y):.2f}%) → ROC")
for set_limits in (ax3.set_xlim, ax3.set_ylim):
    set_limits([-0.02, 1.02])
ax3.set_aspect('equal', adjustable='box')
ax3.grid(color='darkgrey', linestyle='-', linewidth=0.5)
ax3.legend(loc='lower right')

plt.show()
fig.savefig(rocPlotFilename)

In [None]:
# plot PRC
prc_results = []
fig, ax3 = plt.subplots()
colors = sns.color_palette("pastel", len(os_scores))

# 1c (HBOS) curves are drawn in pastel colours, 2c (CatBoost) curves in the default cycle.
# prc_results is filled in the same order: all 1c entries first, then all 2c entries.
prc_groups = (
    ("1c", os_scores, list(colors)),
    # The 2c curves used to be labelled "1c" as well, which made the legend ambiguous
    ("2c", cb_scores, [f"C{n}" for n in range(len(cb_scores))]),
)
for tag, score_lists, palette in prc_groups:
    for i, vec_scores in enumerate(score_lists):
        precision, recall, _ = precision_recall_curve(ground_truth, vec_scores, pos_label=1)
        # Area under the PR curve is summarised by average precision
        ap = average_precision_score(ground_truth, vec_scores, pos_label=1)
        prc_results.append(ap)
        label = f"{tag} {vectors[i]} (AUC={ap:.3f})"
        ax3.plot(recall, precision, label=label, color=palette[i])

ax3.set_title(f"{dataset} (n={len(y)}, target={100*np.sum(y==1)/len(y):.2f}%) → PRC")
ax3.set_xlabel("Recall")
ax3.set_ylabel("Precision")
ax3.set_xlim([-0.02, 1.02])
ax3.set_ylim([-0.02, 1.02])
ax3.set_aspect('equal', adjustable='box')
ax3.grid(color='darkgrey', linestyle='-', linewidth=0.5)
ax3.legend(loc='lower left')
plt.show()
fig.savefig(prcPlotFilename)

In [None]:
%%capture_text --path $resultsTableFile

# Constants
kappa_line = "cohens kappa %4.2f\n"
title_line = '\n-----------------------------------------------------\n'

# Score collections in the same order in which roc_results / prc_results were filled
models = [('1c', os_scores), ('2c', cb_scores)]

# Initialize list to store CSV data
csv_data = []

# Process all scores
result_idx = 0
for prefix, scores_list in models:
    for i, vec_scores in enumerate(scores_list):
        roc, ap = roc_results[result_idx], prc_results[result_idx]
        variant_name = vectors[i]

        # EER threshold: the operating point where sensitivity and specificity are closest
        fpr, tpr, thresholds = roc['fpr'], roc['tpr'], roc['thresholds']
        specificity = 1 - fpr
        eer_idx = np.argmin(np.abs(tpr - specificity))
        optimal_threshold = thresholds[eer_idx]

        # Predictions and metrics.
        # roc_curve counts score >= threshold as positive, so use >= here to
        # reproduce exactly the operating point that was selected above.
        predictions = np.array(vec_scores) >= optimal_threshold
        accuracy = accuracy_score(ground_truth, predictions)
        kappa = cohen_kappa_score(ground_truth, predictions)

        # Print stylized results (original format)
        print(
            f'\n'
            f'{prefix} {variant_name} : {dataset} AUC-ROC = {roc["auc"]:.3f}\n'
            f'{prefix} {variant_name} : {dataset} AUC-PRC = {ap:.3f}\n'
            f'Threshold (EER) = {optimal_threshold:.6f}\n'
            f'{title_line}'
            f'{classification_report(ground_truth, predictions)}\n'
            f'{kappa_line % kappa}'
            f'{title_line}'
        )

        # Collect data for CSV
        csv_data.append({
            'Model': f'{prefix} {variant_name}',
            'Threshold': float(optimal_threshold),
            'Accuracy': accuracy * 100.0,
            'Kappa': kappa,
            'AUC-ROC': roc["auc"],
            'AUC-PRC': ap,
        })

        result_idx += 1

# Save results to CSV with exact column order.
# Kept in its own variable so the loaded dataset frame `df` is not overwritten
# (the vectorization cell still reads `df` when it is re-run).
results_df = pd.DataFrame(
    csv_data,
    columns=['Model', 'Threshold', 'Accuracy', 'Kappa', 'AUC-ROC', 'AUC-PRC']
)
csv_filename = resultsTableSummary
results_df.to_csv(csv_filename, index=False, float_format='%.3f')
print(f'\nResults saved to CSV: {csv_filename}')


## Model for production

In [None]:
%%time

# Retrain the best 2c CatBoost model on the full dataset and save in -model.pkl format.
# NOTE: the fit below uses a stratified 80/20 split of the full data - 80% for training and
# 20% as the eval set for early stopping (and for calibration, if enabled).

cb_task_type = "GPU" if compDevice == "cuda" else "CPU"

# 1. Load summary table and select best 2c model by AUC-ROC
summary_df = pd.read_csv(resultsTableSummary)
summary_df_2c = summary_df[summary_df["Model"].str.startswith("2c ")].copy()
if summary_df_2c.empty:
    raise ValueError("No 2c models found in results table to select best model from.")
best_row = summary_df_2c.loc[summary_df_2c["AUC-ROC"].idxmax()]
best_model_name = best_row["Model"]          # e.g. "2c e5"
best_vector_name = best_model_name.split()[1]  # second token is the short vectorizer name
best_vector_link = get_vectorizer_link(best_vector_name)
# EER threshold from CSV. NOTE(review): the CSV is written with float_format='%.3f',
# so this value is rounded to 3 decimals; it also comes from the cross-validation
# scores, not from the model fitted below.
best_threshold = float(best_row["Threshold"])

print(f"Best model: {best_model_name} with AUC-ROC={best_row['AUC-ROC']:.3f}")
print(f"Vectorizer link: {best_vector_link}")

# Load full dataset (in vectorized format); XX is ordered like `vectors`
X_full = XX[vectors.index(best_vector_name)]
y_full = y.to_numpy() if hasattr(y, "to_numpy") else np.asarray(y)

# PCA transform for dimensionality reduction (kept in the package so it can be re-applied later)
if useDimensionalityReduction:
    pca_full = PCA(n_components=num_vars)
    X_full = pca_full.fit_transform(X_full)
else:
    pca_full = None

# Build CatBoost model on the full dataset
pos_count = (y_full == 1).sum()
neg_count = (y_full == 0).sum()
if pos_count == 0:
    raise ValueError("No positive samples in y_full; cannot compute scale_pos_weight.")
# Negative/positive ratio, passed to CatBoost as scale_pos_weight
imbalance_scalar_full = neg_count / pos_count
X_trn, X_val, y_trn, y_val = train_test_split(X_full, y_full, stratify=y_full, test_size=0.2, random_state=42)
final_model = CatBoostClassifier(iterations=num_tree, learning_rate=0.05, task_type=cb_task_type,
    allow_writing_files=False, eval_metric=eval_kpi, scale_pos_weight=imbalance_scalar_full)
final_model.fit(X_trn, y_trn, eval_set=(X_val, y_val), use_best_model=True,
                early_stopping_rounds=10, metric_period=5, verbose=False)

# Reproduce calibration using validation scores (if enabled)
if useCatBoostOutputCallibration:
    prob_pos_val = final_model.predict_proba(X_val)[:, 1]
    beta_calibrator = BetaCalibration(parameters="abm")
    beta_calibrator.fit(prob_pos_val, y_val)
else:
    beta_calibrator = None

# Package everything for later inference (including EER threshold);
# pca_transform and beta_calibrator are None when the respective option is disabled
model_package = {
    "dataset": dataset,
    "vectorizer": best_vector_name,
    "vectorizer_link": best_vector_link,
    "pca_transform": pca_full,
    "catboost_model": final_model,
    "beta_calibrator": beta_calibrator,
    "threshold": best_threshold
}

# Save model to results folder with '-model' suffix
model_filename = os.path.join(resultsFolder, f"{dataset}-model.pkl")
joblib.dump(model_package, model_filename)

print(f"Final best model ({best_model_name}) saved to: {model_filename}")