### Dataset citation: Banking77

    @inproceedings{Casanueva2020,
        author      = {I{\~{n}}igo Casanueva and Tadas Temcinas and Daniela Gerz and Matthew Henderson and Ivan Vulic},
        title       = {Efficient Intent Detection with Dual Sentence Encoders},
        year        = {2020},
        month       = {mar},
        note        = {Data available at https://github.com/PolyAI-LDN/task-specific-datasets},
        url         = {https://arxiv.org/abs/2003.04807},
        booktitle   = {Proceedings of the 2nd Workshop on NLP for ConvAI - ACL 2020}
    }

In [None]:
!pip install transformers



In [None]:
!pip install datasets



In [None]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaConfig, RobertaModel, RobertaTokenizer, RobertaForSequenceClassification, RobertaForQuestionAnswering
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import AdamW
import random
from IPython.display import clear_output
from utils import create_supervised_pair, supervised_contrasive_loss, Similarity

In [None]:
from datasets import load_dataset

In [None]:
# Load the Banking77 dataset through the Hugging Face `datasets` library
# (downloaded on first use, then served from the local cache).
dataset = load_dataset('banking77')

Downloading builder script:   0%|          | 0.00/2.34k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.75k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset banking77/default (download: 1.03 MiB, generated: 897.51 KiB, post-processed: Unknown size, total: 1.91 MiB) to /root/.cache/huggingface/datasets/banking77/default/1.1.0/aec0289529599d4572d76ab00c8944cb84f88410ad0c9e7da26189d31f62a55b...


Downloading data:   0%|          | 0.00/158k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10003 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3080 [00:00<?, ? examples/s]

Dataset banking77 downloaded and prepared to /root/.cache/huggingface/datasets/banking77/default/1.1.0/aec0289529599d4572d76ab00c8944cb84f88410ad0c9e7da26189d31f62a55b. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Show the available splits together with their columns and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10003
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3080
    })
})


In [None]:
# Take the official training split, then show its summary followed by its row count.
train_dataset = dataset["train"]
print(train_dataset, len(train_dataset), sep="\n")

Dataset({
    features: ['text', 'label'],
    num_rows: 10003
})
10003


#### Split the training data into train and validation sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the official training split for validation.
# - random_state pins the shuffle, so the same rows land in each part after a kernel restart.
# - stratify keeps the per-intent proportions equal in both parts, so no intent can end up
#   under-represented (or missing) in the smaller validation part.
train, val = train_test_split(
    train_dataset,
    train_size=0.8,
    random_state=42,
    stratify=train_dataset["label"],
)

In [None]:
# Unpack the training part into parallel sequences of utterances and intent ids,
# then preview the first utterances and the distinct intent ids that occur.
train_text, train_label = train["text"], train["label"]
print(train_text[:3])
print(sorted(set(train_label)))

['Physical cards are what I need more of', 'Weird Direct Debit payment', 'I was able to find my card. How to I go about putting it into my app?']
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76]


In [None]:
# Unpack the validation part into parallel sequences of utterances and intent ids,
# then preview the first utterances and the distinct intent ids that occur.
val_text, val_label = val["text"], val["label"]
print(val_text[:3])
print(sorted(set(val_label)))

['I would like to know where my source of funds came from.', 'Need a new passcode.', 'How do I verify my identity online']
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76]


In [None]:
# Take the official test split and show *its* summary (the original printed the training
# split again, which hid the size of the test split).
test_dataset = dataset["test"]
print(test_dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 10003
})


In [None]:
# Unpack the test split into parallel sequences of utterances and intent ids,
# then preview the first utterances and the distinct intent ids that occur.
test_text, test_label = test_dataset["text"], test_dataset["label"]
print(test_text[:3])
print(sorted(set(test_label)))

['How do I locate my card?', 'I still have not received my new card, I ordered over a week ago.', 'I ordered a card but it has not arrived. Help please!']
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76]


#### Dataloader

In [None]:
class CustomTextDataset(Dataset):
    """Dataset of (label, text) pairs that can guarantee a repeated label in every batch.

    Each item is a dict ``{"Class": label, "Text": text}``.

    With ``repeated_label=True`` the dataset keeps track of the labels handed out for the
    batch that is currently being assembled. When the last sample of a batch is requested
    and no label occurs twice in that batch, the requested sample is swapped for a random
    sample whose label matches one of the earlier samples of the batch. This is what a
    supervised contrastive loss needs: at least one positive pair per batch.

    The bookkeeping relies on how the samples are requested:

    * items must be requested batch by batch, in order (``DataLoader`` with
      ``num_workers=0``);
    * the ``DataLoader`` must use the same ``batch_size`` and ``drop_last=False``;
    * every epoch must be consumed completely, otherwise the batch counters drift.

    Args:
        labels: sequence of hashable labels (plain ints/strs, NumPy scalars or 0-d tensors).
        text: sequence of texts, aligned with ``labels``.
        batch_size: batch size used by the ``DataLoader`` that wraps this dataset.
        repeated_label: enable the "at least one repeated label per batch" behaviour.
    """

    def __init__(self, labels, text, batch_size, repeated_label: bool = False):
        self.labels = labels
        self.text = text
        self.batch_size = batch_size
        self.repeated_label = repeated_label
        self.count = 0  # samples already served for the batch being assembled

        # State below is only needed when training with the supervised contrastive loss.
        if self.repeated_label:
            self.len_data = len(self.labels)
            self.exist_classes = []  # label keys served so far in the current batch
            self.ids_maps = []       # dataset indices served so far in the current batch
            self.label_maps = None   # unused placeholder, kept for backward compatibility
            self.count_batch = 0     # batches completed in the current epoch
            # Number of batches per epoch (ceiling division: a short last batch counts too).
            self.max_count = -(-self.len_data // self.batch_size)
            # label key -> all dataset indices carrying that label; lets a replacement be
            # drawn directly instead of by rejection sampling (which could loop forever).
            self._indices_by_label = {}
            for position, label in enumerate(self.labels):
                self._indices_by_label.setdefault(self._label_key(label), []).append(position)
            print("the number of maximum of batching :", self.max_count)

    @staticmethod
    def _label_key(label):
        """Return a hashable, value-comparable key (unwraps NumPy scalars / 0-d tensors)."""
        return label.item() if hasattr(label, "item") else label

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        if self.repeated_label:
            idx = self._with_repeated_label(idx)
        return {"Class": self.labels[idx], "Text": self.text[idx]}

    def _with_repeated_label(self, idx):
        """Record ``idx`` for the current batch; return the index that should be served.

        The returned index differs from ``idx`` only for the last sample of a batch in
        which no label occurs twice.
        """
        current = self._label_key(self.labels[idx])

        # The last batch of an epoch is shorter when the data size is not a multiple of
        # batch_size; derive the size from the position in the epoch instead of mutating
        # self.batch_size (the original restored it to a hard-coded 16).
        remaining = self.len_data - self.count_batch * self.batch_size
        expected = min(self.batch_size, remaining)

        self.count += 1
        if self.count < expected:
            self.exist_classes.append(current)
            self.ids_maps.append(idx)
            return idx

        # Last sample of the batch: make sure some label occurs at least twice.
        earlier = self.exist_classes
        already_paired = current in earlier or len(set(earlier)) < len(earlier)
        if earlier and not already_paired:
            target = random.choice(earlier)
            pool = self._indices_by_label[target]
            # Prefer a sample that is not already part of this batch.
            fresh = [candidate for candidate in pool if candidate not in self.ids_maps]
            idx = random.choice(fresh or pool)
        # A batch consisting of a single sample cannot contain a pair; it is served as is.

        # Always start the next batch from a clean state and wrap around at epoch end.
        self.count = 0
        self.exist_classes = []
        self.ids_maps = []
        self.count_batch = (self.count_batch + 1) % self.max_count
        return idx

In [None]:
# ---- Reproducibility ----
# The pipeline shuffles batches and draws random replacement samples; seed every random
# number generator involved so that a fresh kernel reproduces the same run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# ---- Scratch containers / counters ----
N = 5
data = []
labels = []
train_samples = []
train_labels = []
skip_time = 0 # number of times y_i != y_j in the supervised contrastive loss equation

# ---- Hyper-parameters ----
embed_dim = 768   # matches the encoder's hidden size
batch_size = 4
lr= 1e-5  # learning rate; tunable
temp = 0.3  # tunable (presumably the contrastive-loss temperature — confirm against utils)
lamda = 0.01  # tunable (presumably the weight of the contrastive term — confirm against utils)

# ---- Device ----
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [None]:
# Training: shuffled batches; repeated_label=True lets the dataset swap the last sample of a
# batch so that every batch contains at least two samples of the same intent.
train_data = CustomTextDataset(train_label, train_text, batch_size = batch_size, repeated_label = True)
train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True)

# Evaluation: every sample must be scored exactly once, so the sample-swapping mode is switched
# off (it would replace some validation/test samples with randomly drawn ones), and the order
# is kept fixed so results are comparable between runs.
val_data = CustomTextDataset(val_label,val_text, batch_size = batch_size, repeated_label = False)
val_loader = DataLoader(val_data, batch_size = batch_size, shuffle = False)

test_data = CustomTextDataset(test_label, test_text, batch_size = batch_size, repeated_label = False)
test_loader = DataLoader(test_data, batch_size = batch_size, shuffle = False)

the number of maximum of batching : 2001
the number of maximum of batching : 501
the number of maximum of batching : 770


In [None]:
# Distinct label values that occur in the training labels, and how many there are.
unique_label = np.unique(np.array(train_label))
num_class = len(unique_label)

# Map each original label value to its 0-based position among the distinct values.
label_maps = {label: position for position, label in enumerate(unique_label)}

print(label_maps)

{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 67, 68: 68, 69: 69, 70: 70, 71: 71, 72: 72, 73: 73, 74: 74, 75: 75, 76: 76}


In [None]:
# Load the tokenizer published with the 'deepset/roberta-base-squad2' checkpoint, the same
# checkpoint the model weights are loaded from.
tokenizer = RobertaTokenizer.from_pretrained('deepset/roberta-base-squad2')
# tokenizer = AutoTokenizer.from_pretrained('roberta-base')
# model = AutoModelForQuestionAnswering.from_pretrained('roberta-base')


Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

In [None]:
# Configuration of the checkpoint, with hidden-state output enabled and the label count set
# to the number of intent classes.
# NOTE(review): this config object is never passed to from_pretrained below, so neither
# setting reaches the loaded model — confirm whether `config=config` was intended.
config = RobertaConfig.from_pretrained("deepset/roberta-base-squad2",output_hidden_states=True)
config.num_labels = num_class

# Pretrained weights with the question-answering head, placed on the selected device.
# NOTE(review): for intent classification RobertaForSequenceClassification (already imported)
# looks like the intended class; switching also requires adapting the training/validation
# cells, which unpack the outputs of the QA model — verify before changing.
model = RobertaForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2").to(device)

# Only the attribute is overwritten; presumably the head built while loading keeps its size.
model.num_labels = config.num_labels
# model.classifier.out_proj = nn.Linear(in_features=embed_dim,out_features=num_class)

Downloading:   0%|          | 0.00/473M [00:00<?, ?B/s]

In [None]:
# from transformers import RobertaTokenizer, RobertaForQuestionAnswering
# import torch

# tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# model = RobertaForQuestionAnswering.from_pretrained("roberta-base")

# question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
# inputs = tokenizer(question, text, return_tensors="pt")
# start_positions = torch.tensor([1])
# end_positions = torch.tensor([3])

# outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
# loss = outputs.loss
# start_scores = outputs.start_logits
# end_scores = outputs.end_logits

In [None]:
# Inspect the configuration attached to the loaded encoder (shown as the cell's output).
model.base_model.config

RobertaConfig {
  "_name_or_path": "deepset/roberta-base-squad2",
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "name": "Roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [None]:
# AdamW optimizer over all model parameters.
# torch.optim.AdamW is used instead of the AdamW class from transformers, which is deprecated.
# eps and weight_decay are passed explicitly with the defaults of the transformers class
# (1e-6 and 0.0), so the update rule is the same as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)



In [None]:
# Fine-tuning loop: one pass over train_loader, logging loss and running accuracy per batch.
#
# NOTE(review): the model loaded above has a question-answering head, so `outputs[:2]` are the
# start/end span logits. `torch.sum(S)` is therefore not a classification loss (the logged
# values are negative and unbounded) and `labels` never enters it. A real cross-entropy
# objective needs a classification head in the model-loading cell; the objective is left
# unchanged here so that this cell keeps running with the model as loaded.
from torch import autograd  # kept: other cells may reference `autograd`
train_loss = []
train_accuracy = []
best_valid_loss = float('inf')
total_acc_train = 0
n_correct = 0
n_wrong = 0

# from_pretrained returns the model in evaluation mode (dropout disabled); switch to training
# mode before updating the weights.
model.train()

for epoch in range(1):  # loop over the dataset multiple times
    running_loss = 0.0

    for (idx, batch) in enumerate(train_loader):
        sentence = batch["Text"]
        inputs = tokenizer(sentence,padding=True,truncation=True,return_tensors="pt")

        # move the tokenized batch to the selected device
        inputs = {k:v.to(device) for k,v in inputs.items()}

        # map the batch's label values to class indices
        labels = [label_maps[stringtoId.item()] for stringtoId in (batch['Class'])]

        # convert list to a tensor with a leading axis of size 1
        labels = torch.tensor(labels).unsqueeze(0)
        labels = labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()
        # gradients are enabled by default, so no explicit grad context is needed here
        outputs = model(**inputs)

        S, logits = outputs[:2]
        loss = torch.sum(S)
        loss.backward()
        optimizer.step()

        # NOTE(review): argmax without `dim` indexes the flattened tensor, and `in labels`
        # only tests membership among this batch's labels — this is not per-sample accuracy.
        pred_class = torch.argmax(logits)
        if pred_class in labels:
            n_correct += 1
        else:
            n_wrong += 1
        total_acc_train = (n_correct * 1.0) / (n_correct + n_wrong)

        train_loss.append(loss.item())
        train_accuracy.append(total_acc_train)

        # to save model eg. model.pth look at pytorch document how to save model
#         if valid_loss < best_valid_loss:
#             best_valid_loss = valid_loss
#             torch.save(model.state_dict(), 'A6-model.pt')

        print(f'[Epoch: {epoch + 1}, {idx}] \nTrain loss: {loss.item():.2f} \nTrain Acc: {total_acc_train*100:.2f} %')
        clear_output(wait=True)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Train loss: -3115.86 
Train Acc: 3.28 %
[Epoch: 1, 335] 
Train loss: -1443.06 
Train Acc: 3.27 %
[Epoch: 1, 336] 
Train loss: -981.73 
Train Acc: 3.26 %
[Epoch: 1, 337] 
Train loss: -924.45 
Train Acc: 3.25 %
[Epoch: 1, 338] 
Train loss: -1040.36 
Train Acc: 3.24 %
[Epoch: 1, 339] 
Train loss: -867.22 
Train Acc: 3.24 %
[Epoch: 1, 340] 
Train loss: -983.31 
Train Acc: 3.23 %
[Epoch: 1, 341] 
Train loss: -925.82 
Train Acc: 3.22 %
[Epoch: 1, 342] 
Train loss: -2315.34 
Train Acc: 3.21 %
[Epoch: 1, 343] 
Train loss: -2953.46 
Train Acc: 3.20 %
[Epoch: 1, 344] 
Train loss: -1216.68 
Train Acc: 3.19 %
[Epoch: 1, 345] 
Train loss: -869.28 
Train Acc: 3.18 %
[Epoch: 1, 346] 
Train loss: -1855.40 
Train Acc: 3.17 %
[Epoch: 1, 347] 
Train loss: -1972.14 
Train Acc: 3.16 %
[Epoch: 1, 348] 
Train loss: -2147.11 
Train Acc: 3.15 %
[Epoch: 1, 349] 
Train loss: -1335.09 
Train Acc: 3.14 %
[Epoch: 1, 350] 
Train loss: -1277.44 
Train A

In [None]:
# Validation pass: score the model on val_loader WITHOUT updating its weights.
# (The original cell called loss.backward() and optimizer.step() here, i.e. it kept training
# on the validation data.)
#
# NOTE(review): loss and accuracy are computed exactly as in the training cell so the numbers
# stay comparable; with the question-answering head they are not a real classification loss /
# accuracy — revisit together with the model head.

valid_loss = []
valid_accuracy = []
total_acc_valid = 0
best_valid_loss = float('inf')
n_correct = 0
n_wrong = 0

# evaluation mode: disables dropout
model.eval()

for epoch in range(1):  # a single pass over the validation data

    running_loss = 0.0  # sum of the per-batch losses seen so far
    for (idx, batch) in enumerate(val_loader):
        sentence = batch["Text"]
        inputs = tokenizer(sentence,padding=True,truncation=True,return_tensors="pt")

        # move the tokenized batch to the selected device
        inputs = {k:v.to(device) for k,v in inputs.items()}

        # map the batch's label values to class indices
        labels = [label_maps[stringtoId.item()] for stringtoId in (batch['Class'])]

        # convert list to a tensor with a leading axis of size 1
        labels = torch.tensor(labels).unsqueeze(0)
        labels = labels.to(device)

        # no gradients are needed for evaluation; this also saves memory
        with torch.no_grad():
            outputs = model(**inputs)

        S, logits = outputs[:2]
        loss = torch.sum(S)

        pred_class = torch.argmax(logits)
        if pred_class in labels:
            n_correct += 1
        else:
            n_wrong += 1
        total_acc_valid = (n_correct * 1.0) / (n_correct + n_wrong)

        valid_loss.append(loss.item())
        valid_accuracy.append(total_acc_valid)
        running_loss += loss.item()

        # to save model eg. model.pth look at pytorch document how to save model

#             if loss.item() < best_valid_loss:
#                 best_valid_loss = loss.item()
#                 torch.save(model.state_dict(), 'A6-model.pt')

        print(f'[Epoch: {epoch + 1}, {idx}] \nValid loss: {loss.item():.2f} \nValid Acc: {total_acc_valid*100:.2f} %')

        print(running_loss)
        clear_output(wait=True)

In [None]:
# Sample context passage (several banking utterances joined into one string); within this
# notebook section it is only referenced by the commented-out question-answering demo cells.
text = r"""The ATM keeps rejecting my Payment. I tried two different ATMs but did not work .Can you please check if everything is okay with my account?, Can you lookup my password?, Why is there an unknown card payment?
"""
# Japan is the eleventh-most populous country in the world, as well as one of the most densely populated and urbanized.
#  About three-fourths of the country's terrain is mountainous, concentrating its population of 125.57 million on narrow coastal plains. 
#  Japan is divided into 47 administrative prefectures and eight traditional regions.
#  Osaka has a big population of 16 million. 
#  The Greater Tokyo Area is the most populous metropolitan area in the world, with more than 37.4 million residents. 
# 'The ATM keeps rejecting my Payment. I tried two different ATMs but did not work .Can you please check if everything is okay with my account?', 'Can you lookup my password?', 'Why is there an unknown card payment?'

In [None]:
# import numpy as np
# def get_top_answers(possible_starts,possible_ends,input_ids):
#     answers = []
#     for start,end in zip(possible_starts,possible_ends):
#     #+1 for end
#         answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[start:end+1]))
#         answers.append( answer )
#     return answers  

# def answer_question(question,context,topN):

#     inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt").to(device)
    
#     input_ids = inputs["input_ids"].tolist()[0]

#     text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
#     model_out = model(**inputs)
    
#     answer_start_scores = model_out['start_logits']
#     answer_end_scores = model_out['end_logits']

#     possible_starts = np.argsort(answer_start_scores.cpu().detach().numpy()).flatten()[::-1][:topN]
#     possible_ends = np.argsort(answer_end_scores.cpu().detach().numpy()).flatten()[::-1][:topN]
    
#     #get best answer
#     answer_start = torch.argmax(answer_start_scores)  
#     answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score

#     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

#     answers = get_top_answers(possible_starts,possible_ends,input_ids )  #to extract only top related answers

#     return { "answer":answer,"answer_start":answer_start,"answer_end":answer_end,"input_ids":input_ids,
#             "answer_start_scores":answer_start_scores,"answer_end_scores":answer_end_scores,"inputs":inputs,"answers":answers,
#             "possible_starts":possible_starts,"possible_ends":possible_ends}

In [None]:
# questions = [
#     "Can you please check if everything is okay with my account?",
#     "What do I do if my phone is stolen?",
#     "How do I speed up identity verification?",
#     "What is the topic here?",
#     "What are we talking about?",
#     "What is the main idea here?"
# ]

In [None]:


# for q in questions:
#     answer_map = answer_question(q,text,5)    
#     print("Question:",q)
#     print("Answers:")
#     [print((index+1)," ) ",ans) for index,ans in  enumerate(answer_map["answers"]) if len(ans) > 0 ]

  

In [None]:
# text = r"""I am still waiting on my card?, What can I do if my card still hasn't arrived after 2 weeks?, I have been waiting over a week. Is the card still coming?, Can I track my card while it is in the process of delivery?"""