In [1]:
import pandas as pd
import torch
import numpy as np
from sentence_transformers.losses import CosineSimilarityLoss
from datasets import Dataset
from setfit import SetFitModel, SetFitTrainer
from transformers import AutoTokenizer

In [2]:
import re
# Load the hand-labelled documents; every column whose name starts with "4 -"
# is treated as one target.
raw_df = pd.read_csv('../data/0_labelled_documents.csv')
targets = [x for x in raw_df.columns if re.match("^4 -", x)]

# Keep only included documents that have a value in every target column.
df = raw_df[raw_df["INCLUDE"] == 1].dropna(subset=targets).copy()
# Fold the value 2 into 1, but only in the target columns: a frame-wide
# replace(2, 1) would also rewrite any 2 found in unrelated columns.
df[targets] = df[targets].replace(2, 1)
# One integer label vector per document, ordered like `targets`.
df['labels'] = list(df[targets].values.astype(int))

def datasetify(x, tokenizer, y=None, truncation=True):
    """Wrap texts (and optionally labels) in a `Dataset`.

    Args:
        x: The input texts; stored in the "text" column.
        tokenizer: Callable applied to batches of the "text" column. When it
            is falsy (e.g. None) the dataset is returned without tokenizing.
        y: Optional labels; stored in the "label" column when given.
        truncation: Forwarded to the tokenizer call. Defaults to True because
            the texts are padded with padding="max_length"; without
            truncation, longer texts would exceed that length and the rows
            would no longer be uniform. Pass False to disable truncation.

    Returns:
        The dataset built from `x` (and `y`), with the tokenizer's output
        added by `Dataset.map` when a tokenizer is supplied.
    """
    data_dict = {"text": x}
    if y is not None:
        data_dict["label"] = y
    dataset = Dataset.from_dict(data_dict)
    if not tokenizer:
        return dataset

    def tokenize_function(examples):
        # Pad every text to the maximum length and (by default) cut longer ones.
        return tokenizer(examples["text"], padding="max_length", truncation=truncation)

    return dataset.map(tokenize_function, batched=True)



In [4]:
# Few-shot sample: take the first two positive documents of every target.
# A document that is positive for several targets is picked once per target.
sampled_rows = []
for t in targets:
    positives = df[df[t] == 1]
    sampled_rows.extend(row for _, row in positives.head(2).iterrows())

new_df = pd.DataFrame.from_dict(sampled_rows).reset_index(drop=True)
new_df.labels

0    [1, 0, 0, 1, 0]
1    [1, 0, 0, 0, 0]
2    [0, 1, 0, 0, 0]
3    [0, 1, 0, 0, 0]
4    [0, 0, 1, 0, 0]
5    [0, 0, 1, 0, 0]
6    [1, 0, 0, 1, 0]
7    [1, 0, 0, 1, 1]
8    [1, 0, 0, 1, 1]
9    [1, 0, 0, 0, 1]
Name: labels, dtype: object

In [5]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the sampled documents; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    new_df.content, new_df.labels, test_size=0.2, random_state=42)

# With tokenizer=None, datasetify returns the untokenized text/label dataset.
tokenizer = None
train_ds = datasetify(X_train, tokenizer, y_train)
# The test set must come from the held-out split, not from the training split.
test_ds = datasetify(X_test, tokenizer, y_test)

In [6]:
model_name = "sentence-transformers/paraphrase-mpnet-base-v2"

# Multi-label setup with a differentiable head. The head needs one output per
# target column; without head_params it keeps its default output size, which
# does not match the label vectors built from `targets`.
model = SetFitModel.from_pretrained(
    model_name,
    use_differentiable_head=True,
    head_params={"out_features": len(targets)},
    multi_target_strategy="one-vs-rest",
)


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [7]:
# Settings for the contrastive-learning phase.
contrastive_settings = {
    "loss_class": CosineSimilarityLoss,
    "batch_size": 8,
    "num_iterations": 20,  # Number of text pairs to generate for contrastive learning
    "num_epochs": 1,       # Number of epochs to use for contrastive learning
}

# No eval_dataset is supplied, so the trainer only knows the training data.
trainer = SetFitTrainer(model=model, train_dataset=train_ds, **contrastive_settings)

In [None]:
# Run training. Evaluation stays disabled below: the trainer was created
# without an eval_dataset.
trainer.train()
#metrics = trainer.evaluate()

***** Running training *****
  Num examples = 420
  Num epochs = 1
  Total optimization steps = 53
  Total train batch size = 8


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/53 [00:00<?, ?it/s]

In [8]:
# Predict labels for the "text" column of test_ds; as the last expression of
# the cell, the result is displayed as the cell output.
trainer.model.predict(test_ds["text"])

KeyboardInterrupt: 

In [None]:
# Pull the "label" column out of the training dataset.
labels = train_ds["label"]