### Setup the imports

In [1]:
from datasets import ClassLabel, load_dataset
from sentence_transformers.losses import CosineSimilarityLoss

from setfit import SetFitModel, SetFitTrainer, sample_dataset

### Load the data

In [2]:
dataset = load_dataset("csv", data_files="../data/new_punc_data_tr.csv").shuffle(seed=42)

dataset = dataset['train'].remove_columns(["Unnamed: 0", "title", "src"])

Found cached dataset csv (C:/Users/nashe/.cache/huggingface/datasets/csv/default-36e050e3f0116b44/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at C:\Users\nashe\.cache\huggingface\datasets\csv\default-36e050e3f0116b44\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-c644d8fc8a42620e.arrow


### Preprocess

In [3]:
ALGS = ['ctrl', 'fair', 'gpt', 'gpt2', 'grover', 'human', 'pplm', 'xlm', 'xlnet', 'instructgpt', 'gpt3']

In [4]:
# Mapping labels to ids
new_features = dataset.features.copy()
new_features['alg'] = ClassLabel(11, names=ALGS)
dataset = dataset.cast(new_features)

dataset = dataset.train_test_split(test_size=0.85, stratify_by_column='alg')

Loading cached processed dataset at C:\Users\nashe\.cache\huggingface\datasets\csv\default-36e050e3f0116b44\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-b0af12c104737818.arrow


In [5]:
dataset['train'].features

{'generation': Value(dtype='string', id=None),
 'alg': ClassLabel(names=['ctrl', 'fair', 'gpt', 'gpt2', 'grover', 'human', 'pplm', 'xlm', 'xlnet', 'instructgpt', 'gpt3'], id=None)}

In [6]:
# Sample a few samples for few shot training

train_dataset = sample_dataset(dataset['train'], label_column="alg", num_samples=40)
eval_dataset = dataset['test']

In [7]:
len(train_dataset)

440

### Load the model

In [8]:
model = SetFitModel.from_pretrained(
    "sentence-transformers/paraphrase-mpnet-base-v2",
    use_differentiable_head=True,
    head_params={"out_features": 11},
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)f39ef/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)0182ff39ef/README.md:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

Downloading (…)82ff39ef/config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)f39ef/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading (…)0182ff39ef/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)2ff39ef/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [9]:
model.model_head

SetFitHead({'in_features': 768, 'out_features': 11, 'temperature': 1.0, 'bias': True, 'device': 'cuda'})

### Train the model

In [10]:
metric_kwargs = {
    'average': 'macro'
}

trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss_class=CosineSimilarityLoss,
    metric="f1",
    metric_kwargs=metric_kwargs,
    batch_size=4,
    num_iterations=20, # The number of text pairs to generate for contrastive learning
    num_epochs=1, # The number of epochs to use for contrastive learning
    column_mapping={"generation": "text", "alg": "label"} # Map dataset columns to text/label expected by trainer
)

In [11]:
# Train and evaluate!
trainer.train()

Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 17600
  Num epochs = 1
  Total optimization steps = 4400
  Total train batch size = 4


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4400 [00:00<?, ?it/s]

In [12]:
metrics = trainer.evaluate()
metrics

Applying column mapping to evaluation dataset
***** Running evaluation *****


{'f1': 0.060858842316022725}