# **Installing the dependencies**

In [1]:
!pip install transformers torch

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m60.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m78.3 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
!pip install -U SentencePiece

Collecting SentencePiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SentencePiece
Successfully installed SentencePiece-0.1.99


In [3]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/258.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/258.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.23.0


In [4]:
CUDA_LAUNCH_BLOCKING=1 # to stop cuda blocking

# **Connecting google colab to drive**

In [5]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


# **Importing the dependencies**

In [6]:
import json
import random
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification, T5Tokenizer, T5ForSequenceClassification,TrainingArguments, Trainer

## Data format

2 json files - `surprise.data` contains utterances, `surprise.solution` contains corresponding intents

Format of `surprise.data`

```json
{"indoml_id": "surprise|11109", "id": "11109", "utt": "Can I make a reservation at Buffalo Wild Wings?"}
{"indoml_id": "surprise|11051", "id": "11051", "utt": "Can I book a table for tonight at Bella Vita?"}
....
```

Format of `surprise.solution` (contains 150 labels)

```json
{"indoml_id": "surprise|11109", "intent": "accept reservations"}
{"indoml_id": "surprise|11051", "intent": "accept reservations"}
....
```


# **Loading the data and solution of the surprise dataset**

In [14]:
# Load the data from the JSON files
with open('/content/drive/MyDrive/intent_classification_It_bombay/surprise_data/surprise.data', 'r') as data_file:
    data = [json.loads(line) for line in data_file] # converting string to object using json.loads

with open('/content/drive/MyDrive/intent_classification_It_bombay/surprise_data/surprise.solution', 'r') as solution_file:
    solutions = [json.loads(line) for line in solution_file] # converting string to object using json.loads

# with open('/content/drive/MyDrive/Intent_classification_IIT_BOMBAY/Dataset/massive_test_phase2_data/massive_test.data', 'r') as test_file:
#     test = [json.loads(line) for line in test_file]



In [15]:
print(data[0]) # data of the surpirse dataset having id and the utterances

{'indoml_id': 'surprise|11109', 'id': '11109', 'utt': 'Can I make a reservation at Buffalo Wild Wings?'}


In [16]:

print(solutions[0]) # Solutions of the surpise dataset having id and intent for the previous teerances

{'indoml_id': 'surprise|11109', 'intent': 'accept reservations'}


In [17]:
# Create a dictionary to map indoml_id to intents
intent_map = {item['indoml_id']: item['intent'] for item in solutions}

# Split data into train and test sets (2:1) stratified by intent
indoml_ids = [item['indoml_id'] for item in data]
intents = [intent_map[indoml_id] for indoml_id in indoml_ids]
"""utterances(features) for the dataset"""
utt = [item['utt'] for item in data]

num_classes = len(set(intents))
print("Number of classes")
print(num_classes)

# """Splitting the dataset into train and test set"""
# train_data, test_data, train_labels, test_labels = train_test_split(
#     utt, intents, test_size=0.25, random_state=42, stratify=intents
# )

Number of classes
150


In [18]:
intents
intents_set=set(intents) # basically represent the total no. of disntics intents(labels) in our surprise dataset(150)
print(intents_set)
len(intents_set)

{'international visa', 'tire change', 'application status', 'apr', 'tell joke', 'gas', 'rewards balance', 'account blocked', 'travel notification', 'flight status', 'current location', 'pto request status', 'credit limit', 'reminder update', 'change language', 'transfer', 'change volume', 'tire pressure', 'text', 'sync device', 'calendar update', 'roll dice', 'timer', 'plug type', 'food last', 'make call', 'meeting schedule', 'calculator', 'recipe', 'card declined', 'time', 'freeze account', 'change accent', 'rollover 401k', 'thank you', 'smart home', 'spending history', 'change user name', 'who made you', 'last maintenance', 'book flight', 'report lost card', 'restaurant reviews', 'date', 'calendar', 'schedule maintenance', 'change speed', 'user name', 'traffic', 'expiration date', 'nutrition info', 'min payment', 'reminder', 'restaurant suggestion', 'ingredient substitution', 'next holiday', 'pto request', 'ingredients list', 'do you have pets', 'what can i ask you', 'exchange rate',

150

In [19]:
labels_list=[]
for label in solutions:
    labels_list.append(label['intent'])
unique_labels_list=[]
for x in labels_list:
    if x not in unique_labels_list:
        unique_labels_list.append(x)
# unique_labels_list, len(unique_labels_list)

label2id={}
id2label={}
for i, label in enumerate(unique_labels_list):
    label2id[label]=i
    id2label[i]=label

In [20]:
intents_integar = [label2id[label] for label in intents]


In [22]:
len(intents_integar), intents_integar[0:5],intents_integar[-5:]

(2248, [0, 0, 0, 0, 0], [149, 149, 149, 149, 149])

In [None]:
# utt

#**Applying kfold cross validation on the our dataset**

In [None]:
"""we have utt has our features and labels(intent) converted into integar using label2id is our labels.
then we have converted our features(intent) or X into numpy array and also converted intents_integar into numpy array because we c
can apply kfold  only on the numpy arrays.

After then we have made 4 nump arrays train_data_kfold,train_labels_fold and other same 2 for test set splitted from training set we have taken 5 fold,
it means our fior every fold our training dataset will be first 80% dataset splitting randomly  by applying shuffle=True during kfold and remaining 20% of the dataset will be test set/

Then for 2nd(another) fold we have another 80% of the total dataset will bve training dataset and remaining 30% will be test set."""
# import numpy as np
# from sklearn.model_selection import KFold

# X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])

# y = np.array([1, 2, 3, 4])
# kf = KFold(n_splits=2)

# for train_index, test_index in kf.split(X):
#     print("TRAIN:", train_index, "TEST:", test_index)
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]

# X_train, y_train, X_test, y_test

import numpy as np
from sklearn.model_selection import KFold
train_data_kfold=[]
test_data_kfold=[]
train_labels_kfold=[]
test_labels_kfold=[]
X=np.array(utt)
y=np.array(intents_integar)
# y = intents_integar
kf = KFold(n_splits=5, shuffle=True)
print(kf)
print(type(kf.split(X)))
print("**")
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    train_data, test_data = X[train_index], X[test_index]
    train_data_kfold.append(train_data)
    test_data_kfold.append(test_data)
    print('**')
    train_labels, test_labels = y[train_index], y[test_index]
    train_labels_kfold.append(train_labels)
    test_labels_kfold.append(test_labels)



KFold(n_splits=5, random_state=None, shuffle=True)
<class 'generator'>
**
TRAIN: [   0    2    3 ... 2244 2245 2247] TEST: [   1    6    9   11   17   23   28   29   33   35   42   45   52   67
   72   73   90   93   94   95  108  112  129  134  140  146  148  160
  165  169  182  185  186  192  193  195  203  207  208  212  216  230
  237  241  259  261  269  283  285  295  302  309  313  314  315  322
  326  328  340  353  354  359  360  361  363  364  365  367  379  383
  384  390  400  409  411  413  425  431  435  436  444  446  450  452
  454  458  459  467  469  475  484  490  507  509  517  518  521  525
  527  536  540  554  565  572  576  579  586  587  589  594  597  604
  609  612  614  621  622  623  634  640  641  648  654  656  657  661
  665  667  685  687  694  697  698  699  700  703  709  710  717  726
  730  736  740  743  744  748  750  754  757  760  761  771  774  775
  779  783  787  788  795  799  806  818  823  837  843  846  854  856
  862  864  865  866  871

In [None]:
train_data_kfold, train_labels_kfold

([array(['Can I make a reservation at Buffalo Wild Wings?',
         'Does the Cheesecake Factory take reservations?',
         'Do they take reservations at Outback Steakhouse in Las Vegas?',
         ..., "Oh, without a doubt, that's absolutely true",
         "Absolutely, I couldn't agree more!", "That's a superb solution!"],
        dtype='<U226'),
  array(['Can I make a reservation at Buffalo Wild Wings?',
         'Can I book a table for tonight at Bella Vita?',
         'Does the Cheesecake Factory take reservations?', ...,
         "Absolutely, I couldn't agree more!", "It's a wonderful notion!",
         "That's a superb solution!"], dtype='<U226'),
  array(['Can I book a table for tonight at Bella Vita?',
         'Does the Cheesecake Factory take reservations?',
         'Do they take reservations at Outback Steakhouse in Las Vegas?',
         ..., "Absolutely, I couldn't agree more!",
         "It's a wonderful notion!", "That's a superb solution!"],
        dtype='<U226'),

In [None]:
test_data_kfold, test_labels_kfold

([array(['Can I book a table for tonight at Bella Vita?',
         'Can I check availability and make a reservation online?',
         'Does Buffalo Wild Wings take reservations for sports games',
         'Does Olive Garden in San Francisco take reservations?',
         'what caused the sudden suspension of my business account without prior notice?',
         "Can't believe I got locked out of my own account without any notice!",
         'Can someone explain why my investment account has been frozen?',
         "I can't seem to access my funds due to an unexpected restriction on my account.",
         'Set an alarm on your watch for 12 pm next week.',
         "Set two alarms for me, one at 8 am and another at 12 pm, so I don't miss my important deadlines today.",
         'Set two alarms for 7 am and 12 pm so I can have enough time to prepare for work and lunch respectively.',
         'Could you please tell me whether my visa application has been approved or denied?',
         'I w

#**label encoding- converting labels to numeric values**

In [None]:
# from sklearn.preprocessing import LabelEncoder
# label_encoder=LabelEncoder()
# intents_int=label_encoder.fit_transform(intents)
# intents_int, intents_int.shape

In [None]:
# intents
# intents_set=set(intents) # basically represent the total no. of disntics intents(labels) in our surprise dataset(150)
# print(intents_set)
# len(intents_set)

In [None]:
# id2label={} # id2label mapping
# label2id={} # label2id mapping
# for i, intent in enumerate(intents_set):
#   id2label[i] = intent
#   label2id[intent]=i



In [None]:
print(list(id2label.items())[:5])
print('\n')
print(list(label2id.items())[:5])


[(0, 'meeting schedule'), (1, 'rollover 401k'), (2, 'user name'), (3, 'what can i ask you'), (4, 'bill due')]


[('meeting schedule', 0), ('rollover 401k', 1), ('user name', 2), ('what can i ask you', 3), ('bill due', 4)]


# **Loading the pretrained intent classification Alexa XLMRoberta model and the tokenizer**

In [None]:
# Define the BERT model and tokenizer
# keep `ignore_mismatched_sizes=True` so that the classification layer is randomly initialized
# model_name = "cartesinus/bert-base-uncased-amazon-massive-intent"  # Example: You can replace this with the specific RoBERTa variant you want to use
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

# model_name = 'qanastek/XLMRoberta-Alexa-Intents-Classification'
# model_name="philschmid/habana-xlm-r-large-amazon-massive" # 91% accuracy on Intent classification base amazon massive
model_name="ibm/roberta-large-vira-intents" # by the IBM on 180 labels
# model_name='neurae/bert-dnd-intents' # trained on 15 intent with achieving an accuracy of 98%
# Initialize the tokenizer
# tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
tokenizer=RobertaTokenizer.from_pretrained(model_name)
# tokenizer=BertTokenizer.from_pretrained(model_name)

In [None]:
# Initialize the model
num_classes=150
# model = XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_classes, ignore_mismatched_sizes=True) # ,output_hidden_states=True
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_classes, ignore_mismatched_sizes=True)
# model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_classes, ignore_mismatched_sizes=True)

"""since model have its own id2label mapping and vice versa so, converting them from our id2label and label2id mapping as defined earlier"""
model.label2id=label2id
model.id2label=id2label


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ibm/roberta-large-vira-intents and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([181, 1024]) in the checkpoint and torch.Size([150, 1024]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([181]) in the checkpoint and torch.Size([150]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.utils.data import DataLoader
batch_size = 32  # Adjust the batch size as needed
# Create PyTorch datasets
class IntentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
train_data_kfold[1]

array(['Can I make a reservation at Buffalo Wild Wings?',
       'Can I book a table for tonight at Bella Vita?',
       'Does the Cheesecake Factory take reservations?', ...,
       "Absolutely, I couldn't agree more!", "It's a wonderful notion!",
       "That's a superb solution!"], dtype='<U226')

In [None]:
train_labels_kfold[1]

array([101, 101, 101, ..., 121, 121, 121])

# **making pytorch datasets for all the foldes by iteraing in range of 5**

# **Also applying dataloader for all the folds**

In [None]:
train_dataloader_kfold=[]
test_dataloader_kfold=[]
for i in range(5):
  # Tokenize the input training data
  train_encodings = tokenizer(
      list(train_data_kfold[i]),
      truncation=True,
      padding=True,
      max_length=54,
      return_tensors='pt'    #return type is pytorch tensor
  )
  #Tokenizing the input testing data
  test_encodings = tokenizer(
      list(test_data_kfold[i]),
      truncation=True,
      padding=True,
      max_length=54,
      return_tensors='pt'
  )



  train_dataset = IntentDataset(train_encodings, train_labels_kfold[i])
  test_dataset = IntentDataset(test_encodings, test_labels_kfold[i])


  train_dataloader = DataLoader(
      train_dataset,
      batch_size=batch_size,
      shuffle=True  # You can shuffle your data for randomness during training
  )
  test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False  # You can shuffle your data for randomness during training
    )
  train_dataloader_kfold.append(train_dataloader)
  test_dataloader_kfold.append(test_dataloader)

In [None]:
train_dataloader_kfold

[<torch.utils.data.dataloader.DataLoader at 0x7bfb1c77ea40>,
 <torch.utils.data.dataloader.DataLoader at 0x7bfb1c07c790>,
 <torch.utils.data.dataloader.DataLoader at 0x7bfb19fbad70>,
 <torch.utils.data.dataloader.DataLoader at 0x7bfb1c07d540>,
 <torch.utils.data.dataloader.DataLoader at 0x7bfb19fbaa10>]

In [None]:
# """Convert labels to numeric values from label2id mapping"""
# train_int_labels = [label2id[label] for label in train_labels]
# test_int_labels = [label2id[label] for label in test_labels]
# test_int_labels[0:5]

# **initializing training arguments and training the model**

In [None]:
# # Create a trainer and train the model
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset,
# )

# trainer.train()

# **Traning the model by iterating over training dataloader**

*Here we will trainn the model not using Training Arguments and Trainer because we have to do some changes in our training.*

**Here we will train the model by iterting over training dataloader and calculating the training loss at the same time and at the same time we will evaluate the same model using model.eval and also calculating the validation loss at the same time**

In [None]:
# Training arguments
# previous_saved_dir='./intent_classification__'
saved_dir='/content/drive/MyDrive/intent_classification_It_bombay/trained_model_kfold_checkpoints/'
training_args = TrainingArguments(
    output_dir=saved_dir,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    seed=42,
    # weight-decay=0.01
    # gradient_accumulation_steps=40,
    evaluation_strategy="steps", # use 'epoch' for evaluating every epoch
    logging_steps=10,
    eval_steps=10,
    save_total_limit=5,
    save_steps=15,
    learning_rate=7e-6,
    # warmup_steps=400,
    # weight_decay=0.10,
    # adam_epsilon=1e-7,
    # warmup_steps=400,
    num_train_epochs=2,
    logging_dir='./logs',
)

In [None]:
import os
import torch
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import TrainingArguments
from tqdm import tqdm

# Define your model, tokenizer, and other necessary components here
# Make sure your model is moved to the GPU if available

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the learning rate as a float (e.g., 7e-6)
learning_rate = 7e-6

# Define the number of training epochs for each fold as an integer (e.g., 8)
num_epochs_per_fold = 8

# # Define the number of training epochs as an integer (e.g., 40)
# num_train_epochs = 40

# Create the optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

# # Define the number of training steps (you may need to adjust this based on your dataset)
# Calculate the total number of training steps
total_train_steps = len(train_dataloader_kfold[0]) * num_epochs_per_fold

# Define the number of warmup steps (e.g., 10% of the total training steps)
num_warmup_steps = int(0.1 * total_train_steps)

# Create a learning rate scheduler
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=total_train_steps,
)

# Define the output directory for saving the model
saved_dir = '/content/drive/MyDrive/intent_classification_It_bombay/trained_model_10_checkpoints/'

# Training loop with tqdm progress bar
"looping over epochs and fold"
for epoch in range(num_epochs_per_fold):
  for k in range(5):
    train_data_loader=train_dataloader_kfold[k]
    for i, batch in enumerate(train_data_loader):
      model.train()
      total_loss=0.0
      # progress_bar=tqdm(enumerate(train_data), total=len(train_data), desc=f"Epoch {epoch} - Fold {fold_idx}")
      # for step, batch in progress_bar:
        # move data to GPU
      batch={k:v.to(device) for k, v in batch.items()}

      # forward pass
      outputs=model(**batch)
      loss=outputs.loss

      # backward pass and optimization
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()

      # update the learning rate
      lr_scheduler.step()

      # calculate the total loss
      total_loss+=loss.item()
      if i%20==0:
        print(f"training loss at epoch {epoch} batch {i}, fold{k} is : {total_loss}")
    print(f"at the end training loss at epoch {epoch} batch {i}, fold{k} is : {total_loss}")



  # Save the model checkpoint at the end of each epoch
    checkpoint_dir = os.path.join(saved_dir, f"epoch_{epoch}_fold_{k}")
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Save model weights
    model.save_pretrained(checkpoint_dir)

    # Save optimizer state
    # torch.save(optimizer.state_dict(), os.path.join(checkpoint_dir, "optimizer.pt"))

    # Save learning rate scheduler state
    # torch.save(lr_scheduler.state_dict(), os.path.join(checkpoint_dir, "scheduler.pt"))

    # Save RNG state (if needed)
    torch.save(torch.get_rng_state(), os.path.join(checkpoint_dir, "rng_state.pth"))

    # Save config.json (if needed)
    model.config.save_pretrained(checkpoint_dir)

    # Save tokenizer (if needed)
    tokenizer.save_pretrained(checkpoint_dir)

      # # Save the model checkpoint every epoch or after a specific number of steps
      # if epoch % training_args.save_steps == 0:
      #     checkpoint_dir = os.path.join(saved_dir, f"epoch_{epoch}")
      #     os.makedirs(checkpoint_dir, exist_ok=True)
      #     model.save_pretrained(checkpoint_dir)

    test_data_loader=test_dataloader_kfold[k]
    for i, batch in enumerate(test_data_loader):
      model.eval()
      eval_loss=0.0
      # progress_bar_eval=tqdm(enumerate(test_data), total=len(test_data),desc=f"Epoch {epoch} - Fold {fold_idx}")
      # for step, batch in progress_bar_eval:
      batch={k:v.to(device) for k,v in batch.items()}
      with torch.no_grad():
        outputs=model(**batch)
        loss=outputs.loss
        eval_loss+=loss.item()
      if i%2==0:
        print(f"evaluation loss at epoch {epoch} batch {i}, fold{k} is : {eval_loss}")
    print(f" At the end evaluation loss at epoch {epoch} batch {i}, fold{k} is : {eval_loss}")
