### citation dataset: Banking77

    @inproceedings{Casanueva2020,
        author      = {I{\~{n}}igo Casanueva and Tadas Temcinas and Daniela Gerz and Matthew Henderson and Ivan Vulic},
        title       = {Efficient Intent Detection with Dual Sentence Encoders},
        year        = {2020},
        month       = {mar},
        note        = {Data available at https://github.com/PolyAI-LDN/task-specific-datasets},
        url         = {https://arxiv.org/abs/2003.04807},
        booktitle   = {Proceedings of the 2nd Workshop on NLP for ConvAI - ACL 2020}
    }

In [1]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaConfig, RobertaModel, RobertaTokenizer, RobertaForSequenceClassification
from transformers import AdamW
import random
from IPython.display import clear_output
from utils import create_supervised_pair, supervised_contrasive_loss, Similarity

In [2]:
from datasets import load_dataset

In [3]:
dataset = load_dataset('banking77')

Using custom data configuration default
Reusing dataset banking77 (C:\Users\minkh\.cache\huggingface\datasets\banking77\default\1.1.0\aec0289529599d4572d76ab00c8944cb84f88410ad0c9e7da26189d31f62a55b)


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10003
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3080
    })
})


In [5]:
train_dataset = dataset["train"]
print(train_dataset)
print(len(train_dataset))

Dataset({
    features: ['text', 'label'],
    num_rows: 10003
})
10003


#### Split Train and validation 

In [7]:
from sklearn.model_selection import train_test_split
train, val = train_test_split(train_dataset, train_size = 0.8)

In [8]:
train_text = train["text"]
train_label = train["label"]
print(train_text[:3])
print(sorted(set(train_label)))

["I couldn't make a transfer because it was declined", 'I have checked my statements multiple times, but my refund that I requested a while back is not showing on my account. Is there a reason for this? Can I get some help getting my money back from this transaction?', 'It seems my card payment was completed twice. I paid at the store previously and the first one did not seem to go through. After a second attempt, it did go through. I now see within the app that I have been charged twice with one of them pending. Can you please remove the pending amount because it is clearly something that was declined?']
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76]


In [9]:
val_text = val["text"]
val_label = val["label"]
print(val_text[:3])
print(sorted(set(val_label)))

['I was not able to use the ATM to get cash', 'Do you support fiat currencies?', 'How do I get a PIN?']
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76]


In [10]:
test_dataset = dataset["test"]
print(train_dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 10003
})


In [11]:
test_text = test_dataset["text"]
test_label = test_dataset["label"]
print(test_text[:3])
print(sorted(set(test_label)))

['How do I locate my card?', 'I still have not received my new card, I ordered over a week ago.', 'I ordered a card but it has not arrived. Help please!']
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76]


#### Dataloader

In [14]:
class CustomTextDataset(Dataset):
    def __init__(self,labels,text,batch_size,repeated_label:bool=False):
        self.labels = labels
        self.text = text
        self.batch_size = batch_size 
        self.count = 0 
        self.repeated_label = repeated_label        
        # to use when training with supervise contrastive loss
        if self.repeated_label == True:
           # write the code here
            self.exist_classes = [] 
            self.label_maps = None 
            self.ids_maps = []
            self.len_data = len(self.labels)
            self.count_batch = 0 
            self.is_left_batch = False
            #print("self.len_data ",self.len_data)
            #print("self.len data",self.batch_size)
            self.max_count = self.len_data // self.batch_size 
            if self.len_data % self.batch_size !=0:
                self.max_count += 1 
            print("the number of maximum of batching :",self.max_count)
            pass
          
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # write code here for 1)
        if self.repeated_label == True:
            self.count +=1  
            # it would be clear after call til batch_size  
            self.exist_classes.append(self.labels[idx])
            self.ids_maps.append(idx)
            if self.count_batch == self.max_count - 1:
                self.count_batch = +1 
                #print("self.count_batch :",self.count_batch)
                self.count_batch = 0 

                if self.len_data % self.batch_size !=0: 
                    self.batch_size = self.len_data % self.batch_size
                    self.is_left_batch = True
                #print("change batch size !",self.batch_size)
                #print("LAST batching !")
            if self.count == self.batch_size:
                unique_labels_keys = list(set(self.exist_classes))
                table = [0] * len(unique_labels_keys)
                unique_labels = dict(zip(unique_labels_keys,table))
                if self.is_left_batch == True:
                    self.is_left_batch = False
                    self.batch_size = 16  
                else: 
                    self.count_batch += 1
                    #print("count_batch :",self.count_batch)          
                for class_key in self.exist_classes:
                    unique_labels[class_key] = +1 
                #print("tables of each labels :",unique_labels)
                for index, key  in enumerate(unique_labels):
                    if unique_labels[key] > 1:
                        print("v>1 :",unique_labels[key])
                        break
                    if index == len(unique_labels.keys()) - 1:
                        while True:
                            pos_idx = random.randint(0,self.len_data-1) 
                            if self.labels[pos_idx] in unique_labels.keys():
                                if self.labels[pos_idx] == self.labels[idx]:
                                    pass
                                else:
                                   #print("old idx :",idx,self.labels[idx])
                                    idx = pos_idx
                                   #print("new idx :",idx,self.labels[idx])
                                    unique_labels[self.labels[idx]] +=1  
                                   #print("statistics tables :",unique_labels)
                                   # replace last token
                                    self.exist_classes[-1] = self.labels[idx]
                                    if len(set(self.exist_classes)) ==  len(self.exist_classes):
                                        print("unique_labels:")
                                        #print(unique_labels)
                                    self.count = 0  
                                    self.exist_classes = [] 
                                    self.ids_maps = []
                                    break 

        label = self.labels[idx]
        data = self.text[idx]
        sample = {"Class": label,"Text": data}

        return sample

In [12]:
batch_size = 4

In [15]:
train_data = CustomTextDataset(train_label, train_text, batch_size = batch_size, repeated_label = True)
train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True)

val_data = CustomTextDataset(val_label,val_text, batch_size = batch_size, repeated_label = True)
val_loader = DataLoader(val_data, batch_size = batch_size, shuffle = True)

test_data = CustomTextDataset(test_label, test_text, batch_size = batch_size, repeated_label = True)
test_loader = DataLoader(test_data, batch_size = batch_size, shuffle = True)

the number of maximum of batching : 2001
the number of maximum of batching : 501
the number of maximum of batching : 770
