# Optional: replace the LogisticRegression head, plus a quick/partial fix for out-of-memory (OOM) errors

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

from datasets import Dataset, DatasetDict, load_dataset
from setfit import SetFitModel, Trainer, TrainingArguments, sample_dataset

from sentence_transformers.losses import CosineSimilarityLoss

import torch
import gc

from optuna import Trial

## Load data

#### Load from disk

In [None]:
# Relative path to the annotated dataset (parquet file) loaded in the next cell.
filepath = "data/lmd_ukraine_annotated.parquet"

In [None]:
# Load the parquet file into a DataFrame, then preview the first 3 rows and the column dtypes.
data = pd.read_parquet(filepath)
display(data.head(3))
print(data.dtypes)

In [None]:
# Cast article_type to str now, so that the column complies with the Hugging Face Dataset format at a later stage.
data['article_type'] = data['article_type'].astype(str)

#### Classes overview / % annotated labels

In [None]:
# Dataset size, per-class counts, then number of annotated vs. non-annotated rows.
classe_column = data["classe"]
print(len(data))
print(classe_column.value_counts())
print(sum(classe_column.notnull()))
print(sum(classe_column.isnull()))

## Prepare Dataset (labels, optional sample, split)

#### Split, convert to Huggingface DatasetDict

In [None]:
# Rows that carry an annotation feed the train/eval split; rows without one form the test set.
is_annotated = data["classe"].notnull()
with_labels = data[is_annotated]
test_df = data[~is_annotated]
print(len(with_labels), len(test_df))

In [None]:
# Labeled data is split between train (60%) and eval (40%) sets.
# stratify= is optional, but it keeps the class proportions the same in both sets;
# random_state makes the split reproducible.

train_df, eval_df = train_test_split(with_labels, test_size=0.4, stratify=with_labels['classe'], random_state=40)

In [None]:
# Make sure the smallest class has enough labels (e.g. 8, 20, 50, or at most 100).
# This dataset will later be sampled again with setfit.sample_dataset, so every class ends up with the same number of rows (8, 10, 60...).
print(len(train_df))
print(train_df.classe.value_counts())

In [None]:
# Same check on the eval split: size, then per-class counts.
print(len(eval_df))
eval_df.classe.value_counts()

In [None]:
# For labeled data, add a 'label' column mapping the 'classe' strings to ints.
# We do it now because SetFit wants integer labels (not floats) for training.
# `assign` returns new DataFrames instead of writing a column into the frames
# produced by train_test_split, which can otherwise trigger pandas'
# SettingWithCopyWarning.
label_mapping = {'pro_ukraine': 0, 'pro_russia': 1, 'other': 2}
train_df = train_df.assign(label=train_df['classe'].map(label_mapping))
eval_df = eval_df.assign(label=eval_df['classe'].map(label_mapping))

In [None]:
# Wrap each pandas split in a Hugging Face Dataset.
train_dataset, eval_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, eval_df, test_df)
)

In [None]:
# Bundle the three splits into the commonly used Hugging Face DatasetDict format.
dataset = DatasetDict(
    train=train_dataset,
    validation=eval_dataset,
    test=test_dataset,
)

In [None]:
# Keep the number of distinct labels; it is needed later when loading the model.
distinct_labels = train_dataset.unique("label")
num_classes = len(distinct_labels)
num_classes

## Modeling/HPO : replace LogisticRegression head by GradientBoosting (sklearn)

The SetFit docs recommend the sklearn logistic regression head, though (see option B); it also performs a bit better in our use case.  
Here, by specifying use_differentiable_head=True, `SetFitHead`, a custom torch classification head is used.  
To use your own custom classification head see [here](https://huggingface.co/docs/setfit/how_to/classification_heads).  

#### Load Model

In [None]:
# Load the pretrained sentence-transformer body with the differentiable head enabled;
# out_features is set to the number of classes computed above.
model = SetFitModel.from_pretrained(
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    use_differentiable_head=True, head_params={"out_features": num_classes})
# Display the head that was attached to the model.
model.model_head

In [None]:
# Class names, listed in the same order as the integer ids of label_mapping (0, 1, 2).
model.labels = ["pro_ukraine", "pro_russia", "other"]

#### Set Trainer args

We might try adjusting [sampling_strategy](https://huggingface.co/docs/setfit/v1.0.0/en/reference/trainer#setfit.TrainingArguments) (i.e. undersampling or unique) for the minority class "pro_russia".  
From SetFit doc, num_epochs, max_steps and body_learning_rate are the most important regarding phase 1.  

In [None]:
# Customize the training arguments here (setfit.TrainingArguments class).
# Tuples correspond to the two steps: 1. fine-tuning the embeddings, 2. training the classification head.
args = TrainingArguments(
    batch_size=(32, 16), # default is (16, 2); the second value is for the classification head (SetFitHead)
    num_epochs=(1, 16), # default (1, 16)
    end_to_end=True, # if False (default), freezes the body and trains the head only. If True, trains the entire model during the classification phase.
    body_learning_rate=(2e-5, 1e-5), # (2e-5, 1e-5) by default. Only used if end_to_end is True (else the body is frozen)
    head_learning_rate=2e-3, # default 1e-2
    l2_weight=0.01, # optional weight decay for model body & head, passed to the AdamW optimizer in classification training
    sampling_strategy='oversampling', # default is oversampling. More or less replaces the num_iterations argument (which still exists)
    max_steps=-1 # default -1. Can also override num_epochs and reduce the number of steps that would otherwise be needed.
)

In [None]:
# Instantiate the trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    metric='accuracy', # default
    column_mapping={"comment": "text", "label": "label"}, # dataset columns -> columns expected by the model
)

#### Fine-tune (embeddings, classifier)

In Setfit version >= 1, no need to freeze/unfreeze the head, the two steps 1.fine tune embeddings 2. classifier are done automatically

In [None]:
# Train: runs both steps (embedding fine-tuning, then classification head).
trainer.train()

#### Evaluate

In [None]:
# Compute the metric configured on the trainer ('accuracy') on eval_dataset.
trainer.evaluate()

In [None]:
# mpnet
# NO sample_dataset, samp_strat='oversampling', batch_size=(32, 16), num_epochs=(1, 16), body_learning_rate=(2e-5, 1e-5), l2_weight=0.01
# result : 66%

# NO sample_dataset, samp_strat='undersampling', batch_size=(16, 2), num_epochs=(*2*, 16), body_learning_rate=(2e-5, 1e-5), l2_weight=None
# result : 64.7

This is the recommended method over SetFitHead (SetFit's custom torch classification head).  
The previous differentiable-head params were removed.  
Additional params can be specified using `head_params`, or a customized head can be implemented manually (cf. doc).  
Check the sklearn [LogisticRegression module](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) for more params (solver, max_iter, class_weight etc.)

#### Load model

In [None]:
# Just to speed up our tests: 16 examples per class -> 48 rows -> x generated examples for contrastive learning.
train_dataset = sample_dataset(dataset["train"], label_column="label", num_samples=16, seed=40)
train_dataset

In [None]:
# No use_differentiable_head here: head_params are the settings for the sklearn LogisticRegression head.
# NOTE(review): recent scikit-learn versions default to solver="lbfgs", not "liblinear" — confirm against the installed version.
model = SetFitModel.from_pretrained(
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    head_params={
        "solver": "liblinear", # other choices: lbfgs, saga, newton-cg etc.
        "max_iter": 250, # default is 100
        "class_weight": None  # default None, try 'balanced'
    }
)

#### Trainer

In [None]:
# Training arguments for the logistic-head run; single values are used here instead of tuples.
args = TrainingArguments(
    batch_size=16,
    num_epochs=1,
    body_learning_rate = 2e-5, #2e-5,
    # Evaluation / checkpointing options, currently disabled:
    #evaluation_strategy="steps",
    #eval_steps = 250,
    #save_strategy="steps",
    #save_steps=250,
    #load_best_model_at_end=True,
    sampling_strategy='oversampling', # default is oversampling. More or less replaces the num_iterations argument (which still exists)
    max_steps=-1 # default -1 (all). Can also override num_epochs and reduce the number of steps that would otherwise be needed.
)

In [None]:
# Trainer for the logistic-head model: trains on the sampled train_dataset, evaluates accuracy on eval_dataset.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    metric="accuracy",
    column_mapping={"comment": "text", "label": "label"},
)

#### Fine-tune

In [None]:
# Fine-tune with the trainer configured above.
trainer.train()

#### Evaluate

In [None]:
# Evaluate on eval_dataset and display the resulting metrics.
metrics = trainer.evaluate()
metrics

In [None]:
# ----- mpnet
# liblinear, max_iter 300, batch sz 16, epoch 1, max_steps = -1 (4k)
# result = 65,2%

# mpnet
# liblinear, max_iter 300, batch sz 16, epoch 1, max_steps = 1000
# result = 68%

# mpnet
# liblinear, max_iter 300, batch sz 16, epoch 1, max_steps = 1500
# result = 64 %

# ----- dang camembert large
# liblinear, max_iter 300, batch sz 2 (37k steps!), epoch 1, max_steps = -1
# result = 44,8%

# liblinear, max_iter 300, batch sz 3 , epoch 1, max_steps = 1000
# result = 59%

# ----- dang camembert BASE
# liblinear, max_iter 300, batch sz 8, epoch 1, max_steps = 2500, body_learning_rate = 2e-5
# result = 65,2%

# liblinear, max_iter 200, batch sz 16, epoch 1, max_steps = 1000, body_learning_rate = 2e-5
# result = 60%

# liblinear, max_iter 300, batch sz 16, epoch 1, max_steps = 3000, body_learning_rate = 2e-5
# result = 64,8%

# *lbfgs*, max_iter 300, batch sz 16, epoch 1, max_steps = 3000, body_learning_rate = 2e-5
# result = 64,8%

# *lbfgs*, max_iter 500, batch sz 16, epoch 1, max_steps = 1000, body_learning_rate = 1e-5
# result = 64,8%

# *lbfgs*, max_iter 300, batch sz 16, epoch 1, max_steps = 1000, body_learning_rate = 1e-5, "class_weight": 'balanced'
# result = 61,2%

# ----- mpnet (again)
# *newton-cg*, max_iter 300, batch sz 16, epoch 1, max_steps = 1000, body_learning_rate = 1e-5
# result : 63,9%

# *liblinear*, max_iter 300, batch sz 16, epoch 1, max_steps = 1000, body_learning_rate = 1e-5
# result : 65,2%

# *liblinear*, max_iter 300, batch sz 16, epoch 1, max_steps = 750, body_learning_rate = 2e-5
# result :65,2 %

In [None]:
from optuna import Trial

# Optional: for test purposes, keep only 8 examples per class.
train_dataset = sample_dataset(dataset["train"], label_column="label", num_samples=8, seed=40)

def model_init(params):
    """Build a fresh SetFit model whose head settings come from ``params``.

    ``params`` may be ``None`` or lack keys: ``max_iter`` then falls back to
    100 and ``solver`` to "liblinear". Only these two keys are forwarded,
    as ``head_params``.
    """
    trial_params = params if params else {}
    head_params = {
        "max_iter": trial_params.get("max_iter", 100),
        "solver": trial_params.get("solver", "liblinear"),
    }
    return SetFitModel.from_pretrained(
        "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
        head_params=head_params,
    )

def hp_space(trial):
    """Return the Optuna search space: one suggested value per hyperparameter name."""
    space = {}

    # Embeddings fine-tuning phase params
    space["body_learning_rate"] = trial.suggest_float("body_learning_rate", 1e-6, 1e-3, log=True)
    space["num_epochs"] = trial.suggest_int("num_epochs", 1, 3)
    space["batch_size"] = trial.suggest_categorical("batch_size", [16, 32])
    space["seed"] = trial.suggest_int("seed", 1, 40)

    # LogisticRegression head params
    space["max_iter"] = trial.suggest_int("max_iter", 50, 300)
    space["solver"] = trial.suggest_categorical("solver", ["newton-cg", "liblinear", "lbfgs"])

    return space

# model_init (instead of model=) lets the trainer build a new model for each trial.
trainer = Trainer(
    model_init=model_init,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    metric="accuracy",
    column_mapping={"comment": "text", "label": "label"},
)
# Run 4 trials over hp_space, maximizing the metric.
best_run = trainer.hyperparameter_search(direction="maximize", hp_space=hp_space, n_trials=4)

See [issue 1](https://github.com/huggingface/setfit/issues/311) and [issue 2](https://github.com/huggingface/transformers/issues/13019)

In [None]:
# Optional: for test purposes, keep only 70 examples per class.
train_dataset = sample_dataset(dataset["train"], label_column="label", num_samples=70, seed=40)

In [None]:
import gc
import torch
from optuna import Trial
from setfit import Trainer, SetFitModel, sample_dataset
import time

# Model initialization function
def model_init(params):
    """Free leftover memory, then load a fresh SetFit model for a trial.

    ``params`` may be ``None`` or lack keys: ``max_iter`` then falls back to
    100 and ``solver`` to "liblinear". Only these two keys are forwarded,
    as ``head_params``.
    """
    trial_params = params if params else {}
    head_params = {
        "max_iter": trial_params.get("max_iter", 100),
        "solver": trial_params.get("solver", "liblinear"),
    }

    # memory management: collect garbage, release cached CUDA memory, then pause briefly
    gc.collect()
    torch.cuda.empty_cache()
    time.sleep(2)

    return SetFitModel.from_pretrained(
        "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
        head_params=head_params,
    )

In [None]:
# Hyperparameter space definition
def hp_space(trial):
    """Return the (narrowed) Optuna search space used for the refinement runs."""
    return dict(
        # Embeddings fine-tuning phase params
        # (learning rate ranges tried before: 1e-7..1e-5, oldest: 1e-6..1e-3)
        body_learning_rate=trial.suggest_float("body_learning_rate", 1e-07, 3e-06, log=True),
        # num_epochs is not searched here: trial.suggest_int("num_epochs", 1, 2)
        max_steps=trial.suggest_int("max_steps", 650, 900),  # previously 200, 900
        batch_size=trial.suggest_categorical("batch_size", [16]),
        seed=trial.suggest_int("seed", 1, 40),
        # LogisticRegression head params
        max_iter=trial.suggest_int("max_iter", 120, 130),  # previously 100, 200
        solver=trial.suggest_categorical("solver", ["liblinear"]),  # also tried: "newton-cg", 'lbfgs'
    )

In [None]:
# Customized run_hp_search_optuna function
def run_hp_search_optuna_modified(trainer, n_trials, direction, **kwargs):
    """Run an Optuna study over ``trainer``, releasing memory after every trial.

    Args:
        trainer: object exposing ``train(trial=...)``, ``evaluate()``,
            ``compute_objective(metrics)`` and a ``model`` attribute.
        n_trials: number of trials passed to ``study.optimize``.
        direction: optimization direction passed to ``optuna.create_study``.
        **kwargs: ``timeout`` and ``n_jobs`` are popped and forwarded to
            ``study.optimize``; everything else goes to ``optuna.create_study``.

    Returns:
        A ``BestRun`` built from the study's best trial
        (trial number as str, value, params, study).
    """
    import optuna
    # BestRun is not imported anywhere else in this notebook.
    from setfit.utils import BestRun

    def _objective(trial):
        trainer.objective = None
        trainer.train(trial=trial)

        # Evaluate if needed. This must happen BEFORE the model is deleted:
        # evaluate() is expected to use trainer.model.
        if getattr(trainer, "objective", None) is None:
            metrics = trainer.evaluate()
            trainer.objective = trainer.compute_objective(metrics)

        # memory management: drop the trained model, collect garbage,
        # release cached CUDA memory, then pause briefly.
        del trainer.model
        gc.collect()
        torch.cuda.empty_cache()
        time.sleep(2)

        return trainer.objective

    timeout = kwargs.pop("timeout", None)
    n_jobs = kwargs.pop("n_jobs", 1)
    study = optuna.create_study(direction=direction, **kwargs)

    # memory management : overkill, but also adding gc_after_trial=True in study.optimize()
    study.optimize(_objective, n_trials=n_trials, timeout=timeout, n_jobs=n_jobs, gc_after_trial=True)
    best_trial = study.best_trial
    return BestRun(str(best_trial.number), best_trial.value, best_trial.params, study)

In [None]:
# Initialize Trainer; model_init (instead of model=) lets it build a new model for each trial.
trainer = Trainer(
    model_init=model_init,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    metric="accuracy",
    column_mapping={"comment": "text", "label": "label"},
)

In [None]:
# Replace the run_hp_search_optuna method with the modified one.
# NOTE(review): this only sets an attribute on this Trainer instance, and a plain function
# stored that way is not bound (no implicit `trainer` argument). Confirm that
# trainer.hyperparameter_search() really looks this attribute up — if it calls setfit's
# module-level run_hp_search_optuna instead, the modified function is never used.
trainer.run_hp_search_optuna = run_hp_search_optuna_modified

In [None]:
# Run hyperparameter search: 5 trials over hp_space, maximizing the metric.
best_run = trainer.hyperparameter_search(direction="maximize", hp_space=hp_space, n_trials=5)

In [None]:
# Show the result returned by the hyperparameter search.
print(best_run)

#### results

In [None]:
# MPNET

# samples =  70 (total : 1838 examples, max steps range : 200, 900)

# Trial 1 finished with value: 0.6347826086956522 and parameters: {'body_learning_rate': 3.303673112561539e-06, 'max_steps': 753, 'batch_size': 16, 'seed': 9, 'max_iter': 183, 'solver': 'liblinear'}
# Trial 0 finished with value: ---> 0.6739130434782609 and parameters: {'body_learning_rate': 2.5030305744282964e-06, 'max_steps': 690, 'batch_size': 16, 'seed': 13, 'max_iter': 125, 'solver': 'liblinear'}
# and after refinements/runs around trial 0 with 67% 
# Trial 2 finished with value: ---> 0.6869565217391305 and parameters: {'body_learning_rate': 1.845176533146184e-07, 'max_steps': 653, 'batch_size': 16, 'seed': 16, 'max_iter': 121, 'solver': 'liblinear'}

# samples =  NO SAMPLE (total : 2625 examples, max steps range : 650, 800, learning rate 1e-7 , 1e-5, max_iter : 120, 126)

# Trial 2 finished with value: 0.6826086956521739 and parameters: {'body_learning_rate': 1.6752640093810652e-06, 'max_steps': 791, 'batch_size': 16, 'seed': 36, 'max_iter': 123, 'solver': 'liblinear'}.




In [None]:
# Optional: for test purposes, keep only 70 examples per class.
train_dataset = sample_dataset(dataset["train"], label_column="label", num_samples=70, seed=40)

In [None]:
import gc
import torch
from optuna import Trial
from setfit import Trainer, SetFitModel, sample_dataset
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
import time


# Model initialization function for RandomForestClassifier
def model_init(params):
    """Free leftover memory, then build a SetFit model with a RandomForest head.

    ``params`` may be ``None`` or lack keys. Defaults: ``n_estimators`` 100,
    ``max_depth`` 2, ``random_state`` 0. All three are passed to
    ``RandomForestClassifier``; ``n_estimators`` used to be read but never
    forwarded, so the searched value had no effect.
    """
    params = params or {}
    head_params = {
        "n_estimators": params.get("n_estimators", 100),
        "max_depth": params.get("max_depth", 2),
        "random_state": params.get("random_state", 0),
    }

    # memory management: collect garbage, release cached CUDA memory, then pause briefly
    gc.collect()
    torch.cuda.empty_cache()
    time.sleep(2)

    model_body = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
    model_head = RandomForestClassifier(**head_params)

    return SetFitModel(
        model_body=model_body,
        model_head=model_head,
        # multi_target_strategy=None,
        # l2_weight=1e-2,
    )

In [None]:
# Hyperparameter space definition
def hp_space(trial):
    """Return the Optuna search space for the RandomForest-head experiment."""
    search_space = {}

    # Embeddings fine-tuning phase params (num_epochs is not searched; max_steps is used instead)
    search_space["body_learning_rate"] = trial.suggest_float("body_learning_rate", 1e-7, 1e-5, log=True)
    search_space["max_steps"] = trial.suggest_int("max_steps", 650, 900)  # previously 200, 900
    search_space["batch_size"] = trial.suggest_categorical("batch_size", [16])
    search_space["seed"] = trial.suggest_int("seed", 1, 40)

    # RandomForest head params
    search_space["n_estimators"] = trial.suggest_int("n_estimators", 100, 1000)
    search_space["max_depth"] = trial.suggest_int("max_depth", 2, 30)

    return search_space

In [None]:
# Customized run_hp_search_optuna function
def run_hp_search_optuna_modified(trainer, n_trials, direction, **kwargs):
    """Run an Optuna study over ``trainer``, releasing memory after every trial.

    Args:
        trainer: object exposing ``train(trial=...)``, ``evaluate()``,
            ``compute_objective(metrics)`` and a ``model`` attribute.
        n_trials: number of trials passed to ``study.optimize``.
        direction: optimization direction passed to ``optuna.create_study``.
        **kwargs: ``timeout`` and ``n_jobs`` are popped and forwarded to
            ``study.optimize``; everything else goes to ``optuna.create_study``.

    Returns:
        A ``BestRun`` built from the study's best trial
        (trial number as str, value, params, study).
    """
    import optuna
    # BestRun is not imported anywhere else in this notebook.
    from setfit.utils import BestRun

    def _objective(trial):
        trainer.objective = None
        trainer.train(trial=trial)

        # Evaluate if needed. This must happen BEFORE the model is deleted:
        # evaluate() is expected to use trainer.model.
        if getattr(trainer, "objective", None) is None:
            metrics = trainer.evaluate()
            trainer.objective = trainer.compute_objective(metrics)

        # memory management: drop the trained model, collect garbage,
        # release cached CUDA memory, then pause briefly.
        del trainer.model
        gc.collect()
        torch.cuda.empty_cache()
        time.sleep(1)

        return trainer.objective

    timeout = kwargs.pop("timeout", None)
    n_jobs = kwargs.pop("n_jobs", 1)
    study = optuna.create_study(direction=direction, **kwargs)

    # memory management : overkill, but also adding gc_after_trial=True in study.optimize()
    study.optimize(_objective, n_trials=n_trials, timeout=timeout, n_jobs=n_jobs, gc_after_trial=True)
    best_trial = study.best_trial
    return BestRun(str(best_trial.number), best_trial.value, best_trial.params, study)

In [None]:
# Initialize Trainer; model_init (instead of model=) lets it build a new model for each trial.
trainer = Trainer(
    model_init=model_init,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    metric="accuracy",
    column_mapping={"comment": "text", "label": "label"},
)

In [None]:
# Replace the run_hp_search_optuna method with the modified one.
# NOTE(review): this only sets an attribute on this Trainer instance, and a plain function
# stored that way is not bound (no implicit `trainer` argument). Confirm that
# trainer.hyperparameter_search() really looks this attribute up — if it calls setfit's
# module-level run_hp_search_optuna instead, the modified function is never used.
trainer.run_hp_search_optuna = run_hp_search_optuna_modified

In [None]:
# Run hyperparameter search: 5 trials over hp_space, maximizing the metric.
best_run = trainer.hyperparameter_search(direction="maximize", hp_space=hp_space, n_trials=5)

In [None]:
# MPNET, randomforest

# samples =  70 (total : 1838 examples, max steps range : 200, 900
# Trial 0 finished with value: 0.6521739130434783 and parameters: {'body_learning_rate': 6.708138590154178e-07, 'max_steps': 705, # 'batch_size': 16, 'seed': 22, 'n_estimators': 325, 'max_depth': 19}.



This section also includes some memory management to prevent out-of-memory errors.

In [None]:
# Optional: for test purposes, keep only 70 examples per class.
train_dataset = sample_dataset(dataset["train"], label_column="label", num_samples=70, seed=40)

In [None]:
import gc
import torch
from optuna import Trial
from setfit import Trainer, SetFitModel, sample_dataset
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import HistGradientBoostingClassifier, GradientBoostingClassifier
import time


# Model initialization function for sklearn "lightgbm" (histGrad) or classic GradientBoostingClassifier
def model_init(params):
    """Free leftover memory, then build a SetFit model with a GradientBoosting head.

    ``params`` may be ``None`` or lack keys. Defaults: ``n_estimators`` 100,
    ``max_depth`` 2, ``learning_rate`` 0.1, ``random_state`` 0. All four are
    passed to ``GradientBoostingClassifier``; ``n_estimators`` used to be
    read but never forwarded, so the searched value had no effect.
    """
    params = params or {}
    head_params = {
        "n_estimators": params.get("n_estimators", 100),
        "max_depth": params.get("max_depth", 2),
        "learning_rate": params.get("learning_rate", 0.1),
        "random_state": params.get("random_state", 0),
    }

    # memory management: collect garbage, release cached CUDA memory, then pause briefly
    gc.collect()
    torch.cuda.empty_cache()
    time.sleep(2)

    model_body = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
    model_head = GradientBoostingClassifier(**head_params)

    return SetFitModel(
        model_body=model_body,
        model_head=model_head,
    )

In [None]:
# Hyperparameter space definition
def hp_space(trial):
    """Return the Optuna search space for the gradient-boosting-head experiment."""
    # Embeddings fine-tuning phase params.
    # Not searched here (kept for reference):
    #   "body_learning_rate": trial.suggest_float("body_learning_rate", 1e-7, 1e-5, log=True)
    #   "num_epochs": trial.suggest_int("num_epochs", 1, 2)
    #   "batch_size": trial.suggest_categorical("batch_size", [16])
    max_steps = trial.suggest_int("max_steps", 650, 1100)  # previously 200, 900
    seed = trial.suggest_int("seed", 1, 40)

    # classifier head params
    n_estimators = trial.suggest_int("n_estimators", 100, 1000)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    learning_rate = trial.suggest_categorical("learning_rate", [0.01, 0.1, 0.2])

    return {
        "max_steps": max_steps,
        "seed": seed,
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "learning_rate": learning_rate,
    }

In [None]:
# Customized run_hp_search_optuna function
def run_hp_search_optuna_modified(trainer, n_trials, direction, **kwargs):
    """Run an Optuna study over ``trainer``, releasing memory after every trial.

    Args:
        trainer: object exposing ``train(trial=...)``, ``evaluate()``,
            ``compute_objective(metrics)`` and a ``model`` attribute.
        n_trials: number of trials passed to ``study.optimize``.
        direction: optimization direction passed to ``optuna.create_study``.
        **kwargs: ``timeout`` and ``n_jobs`` are popped and forwarded to
            ``study.optimize``; everything else goes to ``optuna.create_study``.

    Returns:
        A ``BestRun`` built from the study's best trial
        (trial number as str, value, params, study).
    """
    import optuna
    # BestRun is not imported anywhere else in this notebook.
    from setfit.utils import BestRun

    def _objective(trial):
        trainer.objective = None
        trainer.train(trial=trial)

        # Evaluate if needed. This must happen BEFORE the model is deleted:
        # evaluate() is expected to use trainer.model.
        if getattr(trainer, "objective", None) is None:
            metrics = trainer.evaluate()
            trainer.objective = trainer.compute_objective(metrics)

        # memory management: drop the trained model, collect garbage,
        # release cached CUDA memory, then pause briefly.
        del trainer.model
        gc.collect()
        torch.cuda.empty_cache()
        time.sleep(1)

        return trainer.objective

    timeout = kwargs.pop("timeout", None)
    n_jobs = kwargs.pop("n_jobs", 1)
    study = optuna.create_study(direction=direction, **kwargs)

    # memory management : overkill, but also adding gc_after_trial=True in study.optimize()
    study.optimize(_objective, n_trials=n_trials, timeout=timeout, n_jobs=n_jobs, gc_after_trial=True)
    best_trial = study.best_trial
    return BestRun(str(best_trial.number), best_trial.value, best_trial.params, study)

In [None]:
# Fixed training arguments for this search: body_learning_rate is pinned to a value
# found in an earlier trial, and max_steps is left out because hp_space searches it.
args = TrainingArguments(
    batch_size=16,
    num_epochs=1,
    body_learning_rate = 1.845176533146184e-07,
    # Evaluation / checkpointing options, currently disabled:
    #evaluation_strategy="steps",
    #eval_steps = 250,
    #save_strategy="steps",
    #save_steps=250,
    #load_best_model_at_end=True,
    sampling_strategy='oversampling', # default is oversampling. More or less replaces the num_iterations argument (which still exists)
    # max_steps=-1 # default -1 (all). Can also override num_epochs and reduce the number of steps that would otherwise be needed.
)

In [None]:
# Initialize Trainer with both model_init (a new model per trial) and the fixed args defined above.
trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    metric="accuracy",
    column_mapping={"comment": "text", "label": "label"},
)

In [None]:
# Replace the run_hp_search_optuna method with the modified one.
# NOTE(review): this only sets an attribute on this Trainer instance, and a plain function
# stored that way is not bound (no implicit `trainer` argument). Confirm that
# trainer.hyperparameter_search() really looks this attribute up — if it calls setfit's
# module-level run_hp_search_optuna instead, the modified function is never used.
trainer.run_hp_search_optuna = run_hp_search_optuna_modified

In [None]:
# Run hyperparameter search: 7 trials over hp_space, maximizing the metric.
best_run = trainer.hyperparameter_search(direction="maximize", hp_space=hp_space, n_trials=7)