# Few-shot classification using SetFit

In [1]:
!pip install setfit
# !pip install huggingface_hub

Collecting setfit
  Obtaining dependency information for setfit from https://files.pythonhosted.org/packages/a4/b0/0afe7c5e0901fece8677746a70f9658c8c7c55dc46c9c947e473c7ed9d77/setfit-1.0.1-py3-none-any.whl.metadata
  Downloading setfit-1.0.1-py3-none-any.whl.metadata (11 kB)
Collecting datasets>=2.3.0 (from setfit)
  Obtaining dependency information for datasets>=2.3.0 from https://files.pythonhosted.org/packages/ec/93/454ada0d1b289a0f4a86ac88dbdeab54921becabac45da3da787d136628f/datasets-2.16.1-py3-none-any.whl.metadata
  Downloading datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Collecting sentence-transformers>=2.2.1 (from setfit)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ done
[?25hCollecting evaluate>=0.3.0 (from setfit)
  Obtaining dependency information for evaluate>=0.3.0 from 

In [2]:
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=DeprecationWarning)

import torch
import pandas as pd
from sklearn.model_selection import train_test_split

from datasets import Dataset, DatasetDict, load_dataset
from setfit import SetFitModel, Trainer, TrainingArguments, sample_dataset
from sentence_transformers.losses import CosineSimilarityLoss


In [3]:
# wandb login enabled by default in SetFit, if installed
import wandb
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
wandb_token = user_secrets.get_secret("wandb_key") 
wandb.login(key=wandb_token)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## Load data

In [4]:
# Read the annotated comments dataset from the Kaggle input directory.
# Local alternative: filepath = "data/lmd_ukraine_annotated.parquet"
filepath = "/kaggle/input/lmd-annotated/lmd_ukraine_annotated.parquet"
data = pd.read_parquet(filepath)
# Column dtypes as plain text, then a rich preview of the first three rows
print(data.dtypes)
display(data.head(3))

article_id           int64
url                 object
title               object
desc                object
content             object
date                object
keywords            object
article_type      category
allow_comments        bool
premium               bool
author              object
comment             object
comment_id          object
classe              object
dtype: object


Unnamed: 0,article_id,url,title,desc,content,date,keywords,article_type,allow_comments,premium,author,comment,comment_id,classe
0,3259703,https://www.lemonde.fr/actualite-medias/articl...,"Le conflit russo-ukrainien, qui mobilise les m...",Au Festival de journalisme de Couthures : la g...,Parce qu’elle est revenue frapper à nos porte...,2022-07-16,"[international, europe, ukraine, crise-ukraini...",Factuel,True,False,Ricardo Uztarroz,La question qui vaille et qui n'est pas posée...,e7206b56918f694f,pro_russia
1,3259703,https://www.lemonde.fr/actualite-medias/articl...,"Le conflit russo-ukrainien, qui mobilise les m...",Au Festival de journalisme de Couthures : la g...,Parce qu’elle est revenue frapper à nos porte...,2022-07-16,"[international, europe, ukraine, crise-ukraini...",Factuel,True,False,Ricardo Uztarroz,Salandre : les documents dont vous faîtes ét...,d904e44906dfb957,other
2,3259703,https://www.lemonde.fr/actualite-medias/articl...,"Le conflit russo-ukrainien, qui mobilise les m...",Au Festival de journalisme de Couthures : la g...,Parce qu’elle est revenue frapper à nos porte...,2022-07-16,"[international, europe, ukraine, crise-ukraini...",Factuel,True,False,Correcteur,« C’est l’affaire des russes »? C’est donc vot...,1c03f54daeffd1ca,pro_ukraine


In [5]:
# Classes overview: total rows, per-class counts, then annotated vs. unannotated row counts
is_annotated = data.classe.notnull()

print(len(data))
print(data.classe.value_counts())
# Vectorised Series.sum() instead of the builtin sum(), which iterates the Series row by row
print(is_annotated.sum())
print((~is_annotated).sum())

175353
classe
other          256
pro_ukraine    196
pro_russia     122
Name: count, dtype: int64
574
174779


## Prepare Dataset (labels, optional sample, split)

In [6]:
# Two adjustments needed by the later stages, done as one chain that yields a new frame:
#   - article_type is cast to plain str to comply with the huggingface Dataset format
#   - the "classe" column is renamed to "label_text"
data = (
    data
    .assign(article_type=lambda frame: frame['article_type'].astype(str))
    .rename(columns={'classe': 'label_text'})
)

In [7]:
# Annotated rows are divided between train and eval.
# The unannotated remainder is the test set, kept for the later distillation stage.
has_label = data["label_text"].notnull()
with_labels = data[has_label]
test_df = data[~has_label]
print(len(with_labels), len(test_df))

# Stratify on the label so both parts keep the same class proportions
train_df, eval_df = train_test_split(
    with_labels,
    test_size=0.4,
    stratify=with_labels["label_text"],
    random_state=40,
)

# Size and class distribution of each part (train first, then eval)
for split_df in (train_df, eval_df):
    print(len(split_df))
    print(split_df.label_text.value_counts())

574 174779
344
label_text
other          153
pro_ukraine    118
pro_russia      73
Name: count, dtype: int64
230
label_text
other          103
pro_ukraine     78
pro_russia      49
Name: count, dtype: int64


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [8]:
# For labeled data, add a 'label' column where 'label_text' str -> int.
# We do it now because SetFit wants integers, not floats, for training.
label_mapping = {'pro_ukraine': 0, 'pro_russia': 1, 'other': 2}


def _encode_labels(frame, mapping):
    """Return a new frame with an integer 'label' column derived from 'label_text'.

    Raises:
        ValueError: if some 'label_text' value has no entry in `mapping`. Such a value
            would otherwise map to NaN and silently turn the column into floats.
    """
    encoded = frame['label_text'].map(mapping)
    missing = encoded.isna()
    if missing.any():
        unknown = sorted(frame.loc[missing, 'label_text'].astype(str).unique())
        raise ValueError(f"label_text values without an integer id: {unknown}")
    # assign() builds a new frame instead of writing into the frames returned by the split
    return frame.assign(label=encoded.astype('int64'))


train_df = _encode_labels(train_df, label_mapping)
eval_df = _encode_labels(eval_df, label_mapping)

In [9]:
# Turn each pandas split into a huggingface Dataset
train_dataset, eval_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, eval_df, test_df)
)

# Bundle the three splits in the commonly used DatasetDict format
dataset = DatasetDict({
    'train': train_dataset,
    'validation': eval_dataset,
    'test': test_dataset,
})

# Number of distinct classes in the training split, kept for later when loading the model
num_classes = len(train_dataset.unique("label"))
num_classes

  if _pandas_api.is_sparse(col):


3

## Modeling, using *Sklearn LogisticRegression* head

Note: in our own tests, consistent with what the SetFit authors report, a LogisticRegression head gives better results than a differentiable torch head.  
The model, the classification head type (random forest, GBM, ...) and its parameters, and the training hyperparameters were chosen after multiple experiments.  
See the hyperparameter optimization notebook.

In [10]:
# Optional: sample the training split down to a fixed number of examples per class.
# Always sampled from dataset["train"], so re-running the cell gives the same result.
train_dataset = sample_dataset(
    dataset["train"],
    label_column="label",
    num_samples=72,
    seed=40,
)

  if _pandas_api.is_sparse(col):


In [11]:
# Label names ordered by their integer id, derived from label_mapping so the names
# given to the model cannot drift from the ids used to encode the training data.
label_names = sorted(label_mapping, key=label_mapping.get)

# Multilingual sentence-transformer body; the classification head is configured via head_params
model = SetFitModel.from_pretrained(
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    labels=label_names,
    head_params={
        "solver": "liblinear",
        "max_iter": 136
    }
)

  self.comm = Comm(**args)


config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.10k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [12]:
# Training configuration (values come from the hyperparameter search)
args = TrainingArguments(
    # optimisation
    batch_size=32,
    body_learning_rate=4.378056750692589e-07,
    num_epochs=1,
    max_steps=379,
    sampling_strategy='oversampling',
    # evaluation every 25 steps, checkpoint every 50 steps
    evaluation_strategy="steps",
    eval_steps=25,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    # experiment tracking
    report_to='wandb',
    run_name='setfit_optimized',
)

In [13]:
# Map dataset columns to the text/label names expected by the trainer
column_mapping = {"comment": "text", "label": "label"}

trainer = Trainer(
    model=model,
    args=args,
    metric="accuracy",
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    column_mapping=column_mapping,
)

Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset


Map:   0%|          | 0/216 [00:00<?, ? examples/s]

In [14]:
# Launch training with the Trainer configured above
trainer.train()

***** Running training *****
  Num examples = 972
  Num epochs = 1
  Total optimization steps = 379
  Total train batch size = 32
[34m[1mwandb[0m: Currently logged in as: [33mvionmatthieu[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240108_150040-sj6e25t1[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33msetfit_optimized[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/vionmatthieu/setfit[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/vionmatthieu/setfit/runs/sj6e25t1[0m


Step,Training Loss,Validation Loss,Embedding Loss,Rate
25,No log,No log,0.2796,0.0
50,No log,No log,0.2761,0.0
75,No log,No log,0.2735,0.0
100,No log,No log,0.2711,0.0
125,No log,No log,0.2695,0.0
150,No log,No log,0.2687,0.0
175,No log,No log,0.2682,0.0
200,No log,No log,0.2673,0.0
225,No log,No log,0.2667,0.0
250,No log,No log,0.2665,0.0


  self.comm = Comm(**args)


  0%|          | 0/1057 [00:00<?, ?it/s]

  0%|          | 0/1057 [00:00<?, ?it/s]

  0%|          | 0/1057 [00:00<?, ?it/s]

  0%|          | 0/1057 [00:00<?, ?it/s]

  0%|          | 0/1057 [00:00<?, ?it/s]

  0%|          | 0/1057 [00:00<?, ?it/s]

  0%|          | 0/1057 [00:00<?, ?it/s]

  0%|          | 0/1057 [00:00<?, ?it/s]

  0%|          | 0/1057 [00:00<?, ?it/s]

  0%|          | 0/1057 [00:00<?, ?it/s]

  0%|          | 0/1057 [00:00<?, ?it/s]

  0%|          | 0/1057 [00:00<?, ?it/s]

  0%|          | 0/1057 [00:00<?, ?it/s]

  0%|          | 0/1057 [00:00<?, ?it/s]

  0%|          | 0/1057 [00:00<?, ?it/s]

Loading best SentenceTransformer model from step 350.


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

## Evaluate

In [15]:
# Score the trained model on the validation split with the trainer's metric.
# NOTE(review): eval_dataset is also the Trainer's eval_dataset and load_best_model_at_end=True,
# so this split presumably drove checkpoint selection too; the score may be optimistic. Confirm.
metrics = trainer.evaluate(eval_dataset)
print(metrics)

Applying column mapping to the evaluation dataset
***** Running evaluation *****
  self.comm = Comm(**args)


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

  self.comm = Comm(**args)


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

{'accuracy': 0.691304347826087}


## Export model to huggingface hub

In [16]:
# Hub repository id, used below for both the upload and the later download
filepath_model = "gentilrenard/setfit-paraphrase-multi-mpnet-base-v2-lemonde"

In [17]:
# Optional: push the trained model to the Hugging Face Hub

from kaggle_secrets import UserSecretsClient

# The Hub write token is read from the Kaggle secrets, never hard-coded
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("hf_key")

# `token` is the keyword documented for push_to_hub; `use_auth_token` is its deprecated alias
trainer.push_to_hub(filepath_model, token=hf_token)

  self.comm = Comm(**args)


Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model_head.pkl:   0%|          | 0.00/19.3k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

'https://huggingface.co/gentilrenard/setfit-paraphrase-multi-mpnet-base-v2-lemonde/tree/main/'

## Load from hub / inference

In [18]:
# Download the published model from the Hub.
# Note: this rebinds `model`, replacing the in-memory model created earlier in the notebook.
model = SetFitModel.from_pretrained(filepath_model)

config.json:   0%|          | 0.00/741 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/17.4k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/741 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

config_setfit.json:   0%|          | 0.00/103 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

model_head.pkl:   0%|          | 0.00/19.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_setfit.json:   0%|          | 0.00/103 [00:00<?, ?B/s]

model_head.pkl:   0%|          | 0.00/19.3k [00:00<?, ?B/s]

In [19]:
# Run inference on a few hand-written French sentences
sample_comments = [
    "La Russie va gagner",
    "les journalistes sont corrompus",
    "il faut soutenir l'Ukraine",
]
preds = model.predict(sample_comments)
print(preds)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

['pro_russia', 'other', 'pro_russia']
