In [None]:
# Install SetFit into the kernel's environment.
# NOTE(review): the version is not pinned, and the cells below use the SetFitTrainer /
# freeze() / unfreeze() API — confirm which setfit release this notebook was written
# against and pin it so a fresh run behaves the same.
%pip install setfit

In [None]:
import os
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import classification_report
from setfit import sample_dataset, SetFitModel, SetFitTrainer
from sentence_transformers.losses import CosineSimilarityLoss, ContrastiveLoss
from datasets import Dataset

### Data Loading And Data Analysis

In [None]:
# Location of the tweet-emotion CSV (presumably a mounted Google Drive folder —
# adjust the path when running outside that environment).
csv_path = "/content/drive/MyDrive/Natural_Language_Processing/data/tweet_emotions.csv"
tweets_df = pd.read_csv(csv_path)

In [None]:
# Peek at the first rows to check which columns were loaded.
tweets_df.head()

Unnamed: 0,tweet_id,sentiment,content,Unnamed: 4
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...,
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...,
2,1956967696,sadness,Funeral ceremony...gloomy friday...,
3,1956967789,enthusiasm,wants to hang out with friends SOON!,
4,1956968416,neutral,@dannycastillo We want to trade with someone w...,


In [None]:
def replace_user_names(text):
    """Remove Twitter-style ``@mentions`` from a tweet.

    Args:
        text: The raw tweet text. Non-string values (for example the NaN
            pandas uses for a missing CSV cell) are returned unchanged
            instead of raising ``TypeError`` inside ``re.sub``.

    Returns:
        The text with every ``@handle`` removed together with the whitespace
        that preceded it, and with leading/trailing whitespace stripped, so
        dropping a mention does not leave a leading or doubled space behind.
    """
    if not isinstance(text, str):
        return text
    # "\s*" swallows the gap in front of the handle ("hi @bob how" -> "hi how");
    # strip() handles a mention at the very start ("@bob hello" -> "hello").
    return re.sub(r"\s*@\w+", "", text).strip()
# Strip the @mentions from every tweet, element by element.
tweets_df["content"] = tweets_df["content"].map(replace_user_names)

In [None]:
# Same preview after cleaning: the @mentions should no longer appear in 'content'.
tweets_df.head()

Unnamed: 0,tweet_id,sentiment,content,Unnamed: 4
0,1956967341,empty,i know i was listenin to bad habit earlier a...,
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...,
2,1956967696,sadness,Funeral ceremony...gloomy friday...,
3,1956967789,enthusiasm,wants to hang out with friends SOON!,
4,1956968416,neutral,We want to trade with someone who has Houston...,


In [None]:
# Count the tweets per sentiment; the counts show that the dataset is heavily imbalanced.
tweets_df["sentiment"].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

### Encoding the labels

In [None]:
# Learn the mapping from sentiment name to integer id.
le = LabelEncoder().fit(tweets_df["sentiment"])
le

In [None]:
# Store each tweet's sentiment as its integer id in a new column.
tweets_df["encoded_sentiment"] = le.transform(tweets_df["sentiment"])

In [None]:
# Decoding the labels: take the distinct encoded ids in ascending order and map them
# back to their sentiment names, so decoded_values[i] is the name for encoded_data[i].
encoded_data = sorted(tweets_df["encoded_sentiment"].unique())
decoded_values = le.inverse_transform(encoded_data)


### Creating Train, Val, Test datasets

Stratified sampling: This technique is particularly useful for imbalanced datasets, where one class has significantly fewer data points than others. Stratified sampling ensures that each split maintains the original class distribution, preventing bias towards the majority class.

In [None]:
# Hold out 30% of the tweets as the test set while preserving the class proportions
# of the imbalanced labels. Only one split is used, so n_splits=1 and the single
# (train, test) index pair is taken directly instead of looping over several splits
# and keeping whichever came last.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_index, test_index = next(
    sss.split(tweets_df["content"], tweets_df["encoded_sentiment"])
)

# split() yields positional indices, so select rows with .iloc; plain [] is
# label-based and would pick the wrong rows on anything but a default RangeIndex.
X_train, X_test = tweets_df["content"].iloc[train_index], tweets_df["content"].iloc[test_index]
y_train, y_test = (
    tweets_df["encoded_sentiment"].iloc[train_index],
    tweets_df["encoded_sentiment"].iloc[test_index],
)

In [None]:
# Creating the validation dataset: carve 30% of the training tweets off for validation.
# stratify=y_train keeps the class proportions in both parts, so the rare sentiments
# are represented in the validation set just as they are in the stratified test split.
# NOTE: this rebinds X_train / y_train, so re-running the cell on its own shrinks the
# training set again — re-run the split cell above first.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.3, random_state=42, stratify=y_train
)

In [None]:
# The column names 'text' and 'label' are important: keep them exactly as written.
def _to_columns(texts, labels):
    """Pack a text Series and a label Series into the column dict for Dataset.from_dict."""
    return {"text": texts.to_list(), "label": labels.to_list()}


train_data = _to_columns(X_train, y_train)
val_data = _to_columns(X_val, y_val)
test_data = _to_columns(X_test, y_test)

train_dataset = Dataset.from_dict(train_data, split='train')
val_dataset = Dataset.from_dict(val_data, split='validation')
test_dataset = Dataset.from_dict(test_data, split='test')

The `num_rows` attribute in the output is the number of rows (samples) in `sampled_train_dataset`. With `num_samples=32`, 32 examples are drawn for each of the 13 sentiment classes, so the dataset contains 32 × 13 = 416 samples.

In [None]:
# Draw a small few-shot training subset. The displayed dataset has 416 rows for
# num_samples=32, i.e. 32 examples for each of the 13 sentiment classes.
sampled_train_dataset = sample_dataset(train_dataset, num_samples = 32)
sampled_train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 416
})

### Train the SetFit Model

In [None]:
# Sentence-transformer checkpoint used as the SetFit model body.
# Alternative checkpoint: "sentence-transformers/paraphrase-mpnet-base-v2"
model_id = "sentence-transformers/all-mpnet-base-v2"
# NOTE(review): `model` is reassigned in a later cell that reloads the same checkpoint
# with a differentiable head, so this load looks redundant — confirm before removing.
model = SetFitModel.from_pretrained(model_id)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [None]:
# Reload the checkpoint with a differentiable classification head whose output size
# equals the number of distinct labels in the sampled training set.
label_values = sampled_train_dataset.unique("label")
num_classes = len(label_values)
model = SetFitModel.from_pretrained(
    model_id,
    use_differentiable_head=True,
    head_params={"out_features": num_classes},
)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [None]:
# Build the trainer: fit on the few-shot sample and evaluate on the validation split.
trainer = SetFitTrainer(
    model=model,
    train_dataset=sampled_train_dataset,
    eval_dataset=val_dataset,
    loss_class=CosineSimilarityLoss,
    # The training log reports 20 pair-generation iterations for this setting.
    num_iterations=20,
    # Both train() calls further down pass their own num_epochs (the first log shows
    # "Num epochs = 1"), so this value presumably acts only as a default — TODO confirm.
    num_epochs=5
)


In [None]:
# Stage 1: fine-tune the sentence-transformer body for one epoch at a small learning rate.
# NOTE(review): freeze() with no argument presumably freezes the classification head so
# that only the body is updated here — confirm against the setfit documentation.
trainer.freeze()
trainer.train(body_learning_rate=1e-5, num_epochs=1)

Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 16640
  Num epochs = 1
  Total optimization steps = 1040
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1040 [00:00<?, ?it/s]

Run this notebook on a GPU for faster training.

In [None]:
# Stage 2: unfreeze for head training while the body stays frozen
# (keep_body_frozen=True), then train for 50 epochs at a larger learning rate.
trainer.unfreeze(keep_body_frozen=True)
trainer.train(learning_rate=1e-2, num_epochs=50)

The `max_length` is `None`. Using the maximum acceptable length according to the current model body: 384.


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
# Score the trained model on the eval_dataset passed to the trainer (the validation split).
metrics = trainer.evaluate()
metrics

***** Running evaluation *****


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

{'accuracy': 0.20547619047619048}

In [None]:
# Predict a label for every held-out test tweet by calling the model on the raw texts.
output = model(X_test.to_list())

In [None]:
# Convert the predictions to a plain Python list for scikit-learn.
y_pred = output.tolist()

# Report per-class metrics with the sentiment names instead of bare integer ids.
# LabelEncoder assigns id i to le.classes_[i]; passing `labels` pins the row order to
# that same order, so the names stay aligned even if a class is absent from
# y_test or y_pred.
class_ids = list(range(len(le.classes_)))
print(
    classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=le.classes_,
    )
)

              precision    recall  f1-score   support

           0       0.00      0.06      0.01        33
           1       0.04      0.33      0.07        54
           2       0.04      0.17      0.06       248
           3       0.04      0.18      0.06       228
           4       0.13      0.14      0.14       533
           5       0.33      0.16      0.21      1563
           6       0.21      0.23      0.22       397
           7       0.43      0.26      0.33      1152
           8       0.46      0.15      0.23      2591
           9       0.07      0.26      0.11       458
          10       0.31      0.28      0.29      1549
          11       0.08      0.13      0.10       656
          12       0.37      0.21      0.27      2538

    accuracy                           0.20     12000
   macro avg       0.19      0.20      0.16     12000
weighted avg       0.32      0.20      0.23     12000



### Resource

https://github.com/huggingface/setfit/blob/main/notebooks/text-classification.ipynb