In [None]:
!pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Data Preprocessing

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [None]:
import pandas as pd
import torch 
import numpy as np
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.optim import SGD



In [None]:
from google.colab import drive
import pandas as pd

# Mount Google Drive and address the workbook through the absolute mount
# point, so the read does not depend on the kernel's working directory.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)
data_path = f"{DRIVE_MOUNT_POINT}/MyDrive/WhereOrderBot/data/raw-dataset.xlsx"
dataset = pd.read_excel(data_path)
dataset.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,category,email
0,where_is_my_order,"Hello, just checking in on the status of my or..."
1,where_is_my_order,It's getting frustrating not knowing where my ...
2,other,"Hi there, I am organizing a charity event for ..."
3,where_is_my_order,"Hey, I recently ordered a custom mechanical ke..."
4,where_is_my_order,"Hello, I haven't received any updates regardin..."


In [None]:
# Intent categories; each one's integer id is its position in this list.
labels = {
    name: index
    for index, name in enumerate([
        'feedback',
        'general_enquiry',
        'other',
        'refund_question',
        'spam',
        'where_is_my_order',
    ])
}


def label_encoder(category:str) -> int :
    """Return the integer id stored in ``labels`` for ``category``.

    Raises:
        KeyError: if ``category`` is not one of the keys of ``labels``.
    """
    label_id = labels[category]
    return label_id

# Attach each row's numeric category id as a new 'label' column.
dataset['label'] = dataset['category'].apply(label_encoder)

# From here on the free-text column is called 'question' instead of 'email'.
dataset.rename(columns={'email': 'question'}, inplace=True)
dataset.head()

Unnamed: 0,category,question,label
0,where_is_my_order,"Hello, just checking in on the status of my or...",5
1,where_is_my_order,It's getting frustrating not knowing where my ...,5
2,other,"Hi there, I am organizing a charity event for ...",2
3,where_is_my_order,"Hey, I recently ordered a custom mechanical ke...",5
4,where_is_my_order,"Hello, I haven't received any updates regardin...",5


In [None]:
import numpy as np
seed_number=1
np.random.seed(seed_number) # set same seed for keeping same result

# Shuffle once with a fixed seed, then cut the rows 80% / 10% / 10% by
# position. Plain iloc slices are used instead of np.split, because np.split
# on a DataFrame relies on DataFrame.swapaxes, which is deprecated in recent
# pandas releases. The resulting three frames are the same.
shuffled = dataset.sample(frac=1, random_state=seed_number)
train_end = int(.8*len(dataset))
val_end = int(.9*len(dataset))
df_train = shuffled.iloc[:train_end]
df_val = shuffled.iloc[train_end:val_end]
df_test = shuffled.iloc[val_end:]

print(len(df_train),len(df_val), len(df_test))

266 33 34


In [None]:
from transformers import BertTokenizer
import torch

# Cased BERT tokenizer, loaded once at module level; Dataset.__init__ below
# calls it for every text it is given.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

class Dataset(torch.utils.data.Dataset):
    """Pairs pre-tokenized questions with their integer labels.

    Every text in ``df['question']`` is tokenized at construction time with
    the module-level ``tokenizer``, padded or truncated to 512 tokens and
    returned as PyTorch tensors. Labels are taken from ``df['label']``.
    """

    def __init__(self, df):
        def encode(text):
            # One tokenizer call per text, always using the same settings.
            return tokenizer(text,
                             padding='max_length',
                             max_length=512,
                             truncation=True,
                             return_tensors="pt")

        self.labels = list(df['label'])
        self.texts = [encode(text) for text in df['question']]

    def classes(self):
        """Return the full list of labels."""
        return self.labels

    def __len__(self):
        """Number of samples, i.e. the number of labels."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) at ``idx`` wrapped in a NumPy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the ``(tokenized_text, label_array)`` pair at ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    ``forward`` returns raw, unnormalized class scores (logits), one column
    per entry of the module-level ``labels`` mapping.

    Args:
        dropout: dropout probability applied to the pooled BERT output.
    """

    def __init__(self, dropout=0.5):
        super().__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's own configuration rather than a
        # hard-coded 768, so it always matches the loaded checkpoint.
        self.linear = nn.Linear(self.bert.config.hidden_size, len(labels))

    def forward(self, input_id, mask):
        """Return the classification logits for a batch.

        Args:
            input_id: token ids, passed to BERT as ``input_ids``.
            mask: attention mask, passed to BERT as ``attention_mask``.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output) is used here.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        # No activation on the head: nn.CrossEntropyLoss expects raw logits,
        # and a ReLU here would clamp negative scores to zero and block
        # their gradient.
        return self.linear(dropout_output)

In [None]:
from torch.optim import Adam

def _run_batch(model, criterion, batch_input, batch_label, device):
    """Forward one batch; return (loss tensor, number correct, batch size)."""
    batch_label = batch_label.to(device)
    # input_ids carries a singleton dimension 1 that is squeezed away; the
    # mask comes from the same tokenizer output, so squeeze it identically
    # to hand the model two tensors of matching shape.
    mask = batch_input['attention_mask'].squeeze(1).to(device)
    input_id = batch_input['input_ids'].squeeze(1).to(device)

    output = model(input_id, mask)

    batch_loss = criterion(output, batch_label.long())
    n_correct = (output.argmax(dim=1) == batch_label).sum().item()
    return batch_loss, n_correct, batch_label.size(0)


def train(model, train_data, val_data, learning_rate, epochs):
    """Fine-tune ``model`` and print per-epoch train/validation metrics.

    Args:
        model: classifier called as ``model(input_id, mask)`` that returns
            one score per class.
        train_data: frame wrapped in ``Dataset`` for training.
        val_data: frame wrapped in ``Dataset`` for validation.
        learning_rate: learning rate for the Adam optimizer.
        epochs: number of passes over the training data.

    The printed losses are per-sample means. The model is moved to the GPU
    when one is available and is left in evaluation mode on return.
    """
    # Local names deliberately differ from the function name `train`.
    train_set, val_set = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=2)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0

        # Enable dropout for the optimisation phase.
        model.train()
        for train_input, train_label in tqdm(train_dataloader):

            batch_loss, n_correct, batch_size = _run_batch(
                model, criterion, train_input, train_label, device)

            # criterion returns the batch mean; weight it by the batch size
            # so that dividing by the sample count gives a true mean.
            total_loss_train += batch_loss.item() * batch_size
            total_acc_train += n_correct

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0

        # Disable dropout so validation metrics are deterministic.
        model.eval()
        with torch.no_grad():

            for val_input, val_label in val_dataloader:

                batch_loss, n_correct, batch_size = _run_batch(
                    model, criterion, val_input, val_label, device)

                total_loss_val += batch_loss.item() * batch_size
                total_acc_val += n_correct

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')

In [None]:
# Hyper-parameters for the BERT fine-tuning run.
LR = 1e-6
EPOCHS = 5

# Build a fresh classifier and fine-tune it on the train/validation frames.
model = BertClassifier()
train(model, df_train, df_val, LR, EPOCHS)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 133/133 [25:57<00:00, 11.71s/it]


Epochs: 1 | Train Loss:  0.842                 | Train Accuracy:  0.346                 | Val Loss:  0.791                 | Val Accuracy:  0.455


100%|██████████| 133/133 [25:44<00:00, 11.61s/it]


Epochs: 2 | Train Loss:  0.760                 | Train Accuracy:  0.444                 | Val Loss:  0.708                 | Val Accuracy:  0.545


100%|██████████| 133/133 [25:46<00:00, 11.62s/it]


Epochs: 3 | Train Loss:  0.684                 | Train Accuracy:  0.515                 | Val Loss:  0.687                 | Val Accuracy:  0.545


100%|██████████| 133/133 [25:43<00:00, 11.60s/it]


Epochs: 4 | Train Loss:  0.624                 | Train Accuracy:  0.583                 | Val Loss:  0.520                 | Val Accuracy:  0.636


100%|██████████| 133/133 [25:07<00:00, 11.34s/it]


Epochs: 5 | Train Loss:  0.557                 | Train Accuracy:  0.617                 | Val Loss:  0.431                 | Val Accuracy:  0.758


# Using Setfit

In [None]:
# SetFit is pinned to 0.3.0; `datasets` is left unpinned, so its version is
# whatever pip's resolver selects at install time.
%pip install setfit[optuna]==0.3.0 datasets -U

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting setfit[optuna]==0.3.0
  Downloading setfit-0.3.0-py3-none-any.whl (21 kB)
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers==2.2.2
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.3/362.3 kB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate==0.2.2
  Downloading evaluate-0.2.2-py3-none-any.whl (69 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Convert the pandas DataFrames to Hugging Face Datasets

In [None]:
# Import under an alias: the bare name `Dataset` is already bound to the torch
# dataset class that train() instantiates, and rebinding it here would break
# any later re-run of the BERT training cells.
from datasets import Dataset as HFDataset

# Convert each pandas split into a Hugging Face dataset.
train_dataset = HFDataset.from_pandas(df_train)
val_dataset = HFDataset.from_pandas(df_val)
test_dataset = HFDataset.from_pandas(df_test)
test_dataset

Dataset({
    features: ['category', 'question', 'label', '__index_level_0__'],
    num_rows: 34
})

## Training Setfit

In [None]:
from setfit import SetFitModel, SetFitTrainer
from sentence_transformers.losses import CosineSimilarityLoss

# Sentence-transformer checkpoint from the Hub used as the SetFit body
model_id = "sentence-transformers/all-mpnet-base-v2"
model = SetFitModel.from_pretrained(model_id)

# Configure the trainer: train on the training split, evaluate on validation
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    column_mapping={"question":"text", "label" : "label"},
    loss_class=CosineSimilarityLoss,
    metric="accuracy",
    batch_size=4,
    num_epochs=1, # The number of epochs to use for contrastive learning
    num_iterations=5, # The number of text pairs to generate for contrastive learning
)

# Fit first, then score on the evaluation dataset
trainer.train()
metrics = trainer.evaluate()

# Three report lines, emitted one per line
print(
    f"model used: {model_id}",
    f"train dataset: {len(train_dataset)} samples",
    f"accuracy: {metrics['accuracy']}",
    sep="\n",
)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset
***** Running training *****
  Num examples = 2660
  Num epochs = 1
  Total optimization steps = 665
  Total train batch size = 4


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/665 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

***** Running evaluation *****


model used: sentence-transformers/all-mpnet-base-v2
train dataset: 266 samples
accuracy: 1.0


In [None]:
# Evaluate again on the trainer's eval_dataset and keep the resulting metrics dict.
metrices = trainer.evaluate()

Applying column mapping to evaluation dataset
***** Running evaluation *****


In [None]:
# Display the metrics dict produced by trainer.evaluate().
metrices

{'accuracy': 1.0}

In [None]:
# save model
import pickle
trained_path="drive/MyDrive/WhereOrderBot/model/setfit_model_finetuned.pickle"
# The context manager closes the file even if pickling raises.
with open(trained_path, "wb") as filehandler:
    pickle.dump(model, filehandler)

# Test Model

In [None]:
# load model
import pickle
trained_path="drive/MyDrive/WhereOrderBot/model/setfit_model_finetuned.pickle"
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that were written by this notebook or another trusted source.
# The context manager closes the file even if unpickling raises.
with open(trained_path,'rb') as file:
    intent_model = pickle.load(file)

In [None]:
# Maps a predicted integer label to its intent name; the id of each intent
# is its position in this list.
LABEL_TO_INTENT_CLASS = dict(enumerate([
    'feedback',
    'general_enquiry',
    'other',
    'refund_question',
    'spam',
    'where_is_my_order',
]))

In [None]:
# Sample customer message used to try out the saved intent model.
# The text keeps its leading and trailing newline.
prompt = (
    "\n"
    "I was excited to receive my new keyboard, but it arrived with scratches and dents. This is unacceptable.\n"
)

In [None]:
# Classify the single prompt and translate the predicted label id into its intent name.
# NOTE(review): .item() presumes the model returns exactly one element for a one-item batch — confirm.
LABEL_TO_INTENT_CLASS[intent_model([prompt]).item()]

'feedback'