### citation dataset: Banking77

    @inproceedings{Casanueva2020,
        author      = {I{\~{n}}igo Casanueva and Tadas Temcinas and Daniela Gerz and Matthew Henderson and Ivan Vulic},
        title       = {Efficient Intent Detection with Dual Sentence Encoders},
        year        = {2020},
        month       = {mar},
        note        = {Data available at https://github.com/PolyAI-LDN/task-specific-datasets},
        url         = {https://arxiv.org/abs/2003.04807},
        booktitle   = {Proceedings of the 2nd Workshop on NLP for ConvAI - ACL 2020}
    }

In [1]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaConfig, RobertaModel, RobertaTokenizer, RobertaForSequenceClassification
from transformers import AdamW
import random
from IPython.display import clear_output
from utils import create_supervised_pair, supervised_contrasive_loss, Similarity

In [2]:
from datasets import load_dataset

In [3]:
dataset = load_dataset('banking77')

Using custom data configuration default
Reusing dataset banking77 (C:\Users\damia\.cache\huggingface\datasets\banking77\default\1.1.0\aec0289529599d4572d76ab00c8944cb84f88410ad0c9e7da26189d31f62a55b)


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10003
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3080
    })
})


In [5]:
train_dataset = dataset["train"]
print(train_dataset)
print(len(train_dataset))

Dataset({
    features: ['text', 'label'],
    num_rows: 10003
})
10003


#### Split Train and validation 

In [6]:
from sklearn.model_selection import train_test_split
train, val = train_test_split(train_dataset, train_size = 0.8)

In [7]:
train_text = train["text"]
train_label = train["label"]
print(train_text[:3])
print(sorted(set(train_label)))

["What happened to my top-up? It was all done and now it's gone! Are you having problems with your system?", "Why haven't I seen the cash from the cheque I deposited yet?", 'I will not be able to verify my identity.']
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76]


In [8]:
val_text = val["text"]
val_label = val["label"]
print(val_text[:3])
print(sorted(set(val_label)))

['Could I open an account for children?', "I'd like to open an account for my children. How can I do that?", 'I think there is a mistake.  I am being charged twice.']
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76]


In [9]:
test_dataset = dataset["test"]
print(train_dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 10003
})


In [10]:
test_text = test_dataset["text"]
test_label = test_dataset["label"]
print(test_text[:3])
print(sorted(set(test_label)))

['How do I locate my card?', 'I still have not received my new card, I ordered over a week ago.', 'I ordered a card but it has not arrived. Help please!']
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76]


#### Dataloader

In [11]:
class CustomTextDataset(Dataset):
    def __init__(self,labels,text,batch_size,repeated_label:bool=False):
        self.labels = labels
        self.text = text
        self.batch_size = batch_size 
        self.count = 0 
        self.repeated_label = repeated_label        
        # to use when training with supervise contrastive loss
        if self.repeated_label == True:
           # write the code here
            self.exist_classes = [] 
            self.label_maps = None 
            self.ids_maps = []
            self.len_data = len(self.labels)
            self.count_batch = 0 
            self.is_left_batch = False
            #print("self.len_data ",self.len_data)
            #print("self.len data",self.batch_size)
            self.max_count = self.len_data // self.batch_size 
            if self.len_data % self.batch_size !=0:
                self.max_count += 1 
            print("the number of maximum of batching :",self.max_count)
            pass
          
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # write code here for 1)
        if self.repeated_label == True:
            self.count +=1  
            # it would be clear after call til batch_size  
            self.exist_classes.append(self.labels[idx])
            self.ids_maps.append(idx)
            if self.count_batch == self.max_count - 1:
                self.count_batch = +1 
                #print("self.count_batch :",self.count_batch)
                self.count_batch = 0 

                if self.len_data % self.batch_size !=0: 
                    self.batch_size = self.len_data % self.batch_size
                    self.is_left_batch = True
                #print("change batch size !",self.batch_size)
                #print("LAST batching !")
            if self.count == self.batch_size:
                unique_labels_keys = list(set(self.exist_classes))
                table = [0] * len(unique_labels_keys)
                unique_labels = dict(zip(unique_labels_keys,table))
                if self.is_left_batch == True:
                    self.is_left_batch = False
                    self.batch_size = 16  
                else: 
                    self.count_batch += 1
                    #print("count_batch :",self.count_batch)          
                for class_key in self.exist_classes:
                    unique_labels[class_key] = +1 
                #print("tables of each labels :",unique_labels)
                for index, key  in enumerate(unique_labels):
                    if unique_labels[key] > 1:
                        print("v>1 :",unique_labels[key])
                        break
                    if index == len(unique_labels.keys()) - 1:
                        while True:
                            pos_idx = random.randint(0,self.len_data-1) 
                            if self.labels[pos_idx] in unique_labels.keys():
                                if self.labels[pos_idx] == self.labels[idx]:
                                    pass
                                else:
                                   #print("old idx :",idx,self.labels[idx])
                                    idx = pos_idx
                                   #print("new idx :",idx,self.labels[idx])
                                    unique_labels[self.labels[idx]] +=1  
                                   #print("statistics tables :",unique_labels)
                                   # replace last token
                                    self.exist_classes[-1] = self.labels[idx]
                                    if len(set(self.exist_classes)) ==  len(self.exist_classes):
                                        print("unique_labels:")
                                        #print(unique_labels)
                                    self.count = 0  
                                    self.exist_classes = [] 
                                    self.ids_maps = []
                                    break 

        label = self.labels[idx]
        data = self.text[idx]
        sample = {"Class": label,"Text": data}

        return sample

In [12]:
N = 5
data = []
labels = []
train_samples = []
train_labels = []
embed_dim = 768
batch_size = 4
lr= 1e-5  # you can adjust 
temp = 0.3  # you can adjust 
lamda = 0.01  # you can adjust  
skip_time = 0 # the number of time that yi not equal to yj in supervised contrastive loss equation 
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [13]:
train_data = CustomTextDataset(train_label, train_text, batch_size = batch_size, repeated_label = True)
train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True)

val_data = CustomTextDataset(val_label,val_text, batch_size = batch_size, repeated_label = True)
val_loader = DataLoader(val_data, batch_size = batch_size, shuffle = True)

test_data = CustomTextDataset(test_label, test_text, batch_size = batch_size, repeated_label = True)
test_loader = DataLoader(test_data, batch_size = batch_size, shuffle = True)

the number of maximum of batching : 2001
the number of maximum of batching : 501
the number of maximum of batching : 770


In [14]:
num_class = len(np.unique(np.array(train_label)))
unique_label = np.unique(np.array(train_label))

label_maps = {unique_label[i]: i for i in range(len(unique_label))}

print(label_maps)

{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 67, 68: 68, 69: 69, 70: 70, 71: 71, 72: 72, 73: 73, 74: 74, 75: 75, 76: 76}


In [15]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [16]:
# download config of Roberta config 
config = RobertaConfig.from_pretrained("roberta-base",output_hidden_states=True)

#chnage modifying the number of classes
config.num_labels = num_class
# Download pretrain models weight 
model = RobertaForSequenceClassification.from_pretrained('roberta-base')
# change from binary classification to muli-classification and loss automatically change to cross entropy loss
model.num_labels = config.num_labels
# change the output of last layer to num_class that we want to predict
model.classifier.out_proj = nn.Linear(in_features=embed_dim,out_features=num_class)
# move to model to device that we set
model = model.to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [17]:
# Using adam optimizer 
optimizer= AdamW(model.parameters(), lr=lr)



In [18]:
# this code training models on Cross entropy loss
train_loss = []
train_accuracy = []
best_valid_loss = float('inf')
total_acc_train = 0
n_correct = 0 
n_wrong = 0


for epoch in range(1):  # loop over the dataset multiple times
    running_loss = 0.0
    
    for (idx, batch) in enumerate(train_loader):
        sentence = batch["Text"]
        inputs = tokenizer(sentence,padding=True,truncation=True,return_tensors="pt")
        
        # move parameter to device
        inputs = {k:v.to(device) for k,v in inputs.items()}

        # map string labels to class index
        labels = [label_maps[stringtoId.item()] for stringtoId in (batch['Class'])]
                    
        #print("show out: ",np.unique(labels, return_counts=True))
        # convert list to tensor
        labels = torch.tensor(labels).unsqueeze(0)
        labels = labels.to(device)
#         print(labels)

        #(batch_size, seq_len)
        #print(inputs["input_ids"].shape)

         # zero the parameter gradients
        optimizer.zero_grad()

        outputs = model(**inputs,labels=labels)
        # you can check 
        loss, logits = outputs[:2]
        
        loss.backward()
        optimizer.step()

        # write code here 
        pred_class = torch.argmax(logits)
#         print(pred_class)
        if pred_class in labels:
            n_correct += 1
        else:
            n_wrong += 1
        total_acc_train = (n_correct * 1.0) / (n_correct + n_wrong)
        
        train_loss.append(loss.item())
        train_accuracy.append(total_acc_train)
        
        # to save model eg. model.pth look at pytorch document how to save model
#         if valid_loss < best_valid_loss:
#             best_valid_loss = valid_loss
#             torch.save(model.state_dict(), 'A6-model.pt')
        
        print(f'[Epoch: {epoch + 1}, {idx}] \nTrain loss: {loss.item():.2f} \nTrain Acc: {total_acc_train*100:.2f} %')
        clear_output(wait=True)

[Epoch: 1, 2000] 
Train loss: 0.42 
Train Acc: 18.99 %


In [None]:
# this code training models on Cross entropy loss

valid_loss = []
valid_accuracy = []
total_acc_valid = 0
best_valid_loss = float('inf')
n_correct = 0 
n_wrong = 0

for epoch in range(1):# loop over the dataset multiple times
    
    running_loss = 0.0
    with torch.no_grad():
        for (idx, batch) in enumerate(val_loader):
            sentence = batch["Text"]
            inputs = tokenizer(sentence,padding=True,truncation=True,return_tensors="pt")

            # move parameter to device
            inputs = {k:v.to(device) for k,v in inputs.items()}

            # map string labels to class idex
            labels = [label_maps[stringtoId.item()] for stringtoId in (batch['Class'])]

            #print("show out: ",np.unique(labels, return_counts=True))
            # convert list to tensor
            labels = torch.tensor(labels).unsqueeze(0)
            labels = labels.to(device)


            #(batch_size, seq_len)
            #print(inputs["input_ids"].shape)

             # zero the parameter gradients
            # optimizer.zero_grad()

            outputs = model(**inputs,labels=labels)
            # you can check 
            loss, logits = outputs[:2]

            # loss.backward()
            # optimizer.step()
            pred_class = torch.argmax(logits)
#         print(pred_class)
            if pred_class in labels:
                n_correct += 1
            else:
                n_wrong += 1
            total_acc_valid = (n_correct * 1.0) / (n_correct + n_wrong)

            valid_loss.append(loss.item())
            valid_accuracy.append(total_acc_valid)

            # write code here 
            # to save model eg. model.pth look at pytorch document how to save model

#             if loss.item() < best_valid_loss:
#                 best_valid_loss = loss.item()
#                 torch.save(model.state_dict(), 'A6-model.pt')


            print(f'[Epoch: {epoch + 1}, {idx}] \nValid loss: {loss.item():.2f} \nValid Acc: {total_acc_valid*100:.2f} %')
            
            print(running_loss)  
            clear_output(wait=True)

[Epoch: 1, 499] 
Valid loss: 1.23 
Valid Acc: 24.20 %
0.0
