# **Installing the dependencies**

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
Col

# **Connecting Google Colab to Google Drive**

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The model and data paths used later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Importing the necessary modules**

In [None]:
import json
import pandas as pd
import os
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Presumably the number of intent classes; test() uses its own num_classes value,
# so this name is kept only in case other cells refer to it.
num_classe = 150

In [None]:
def _read_json_lines(path):
    """Parse a JSON-lines file into a list with one decoded object per non-blank line."""
    # JSON text is UTF-8; an explicit encoding keeps non-ASCII utterances intact
    # on platforms whose default encoding is something else.
    with open(path, 'r', encoding='utf-8') as handle:
        return [json.loads(line) for line in handle if line.strip()]


def test(model_path, data_path, num_classes=150, batch_size=32, output_path='output.predict'):
    """Predict an intent for every test utterance and write the predictions to a file.

    Reads ``surprise.solution`` and ``massive_test.data`` (both JSON-lines) from
    ``data_path``, derives the id -> label mapping from the ``intent`` values of the
    solution file (ids follow the order of first appearance), classifies each ``utt``
    of the test file with the RoBERTa checkpoint at ``model_path``, prints the
    resulting DataFrame and writes one prediction per line to ``output_path``.

    Args:
        model_path: Directory (or model id) from which both the tokenizer and the
            sequence-classification model are loaded.
        data_path: Directory containing ``surprise.solution`` and ``massive_test.data``.
        num_classes: Number of labels requested from the classification head.
        batch_size: Number of utterances per forward pass.
        output_path: File the predictions are written to; a relative path is
            resolved against the current working directory.

    Returns:
        None.

    Raises:
        KeyError: If a record lacks the ``intent`` / ``utt`` key, or the model
            predicts a class id that has no entry in the label mapping.
    """
    solutions = _read_json_lines(os.path.join(data_path, 'surprise.solution'))
    test_data = _read_json_lines(os.path.join(data_path, 'massive_test.data'))

    # dict.fromkeys de-duplicates while preserving first-seen order, so the ids are
    # the same as with a manual "append if not seen yet" loop.
    id2label = dict(enumerate(dict.fromkeys(item['intent'] for item in solutions)))

    # Tokenize every utterance at once; padding=True pads to the longest sequence.
    tokenizer = RobertaTokenizer.from_pretrained(model_path)
    test_utt = [item['utt'] for item in test_data]
    test_data_encodings = tokenizer(test_utt, padding=True, truncation=True, return_tensors="pt")

    # The DataLoader is created without shuffling, so predictions stay aligned
    # with the order of test_data.
    test_dataset = TensorDataset(test_data_encodings['input_ids'], test_data_encodings['attention_mask'])
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

    # NOTE(review): with ignore_mismatched_sizes=True a checkpoint whose head size
    # differs from num_classes appears to load with a freshly initialised classifier
    # instead of failing -- confirm num_classes matches the checkpoint.
    model = RobertaForSequenceClassification.from_pretrained(model_path, num_labels=num_classes, ignore_mismatched_sizes=True)
    model.to(device)
    model.eval()

    predicted_ids = []
    with torch.no_grad():
        for input_ids, attention_mask in test_dataloader:
            outputs = model(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
            # Softmax is monotonic, so the argmax of the logits is already the
            # most probable class.
            predicted_ids.extend(torch.argmax(outputs.logits, dim=1).tolist())

    predicted_labels = [id2label[class_id] for class_id in predicted_ids]

    # Build both columns in one go: 'id' holds the per-row record that is written
    # to the output file (indoml_id is 1-based), 'intent' the bare label.
    my_dict1_pd = pd.DataFrame({
        'id': [{'indoml_id': position, 'intent': label}
               for position, label in enumerate(predicted_labels, start=1)],
        'intent': predicted_labels,
    })
    print(my_dict1_pd)

    # Each line is the str() of the record dict (Python repr with single quotes),
    # which is the format this notebook has always produced.
    with open(output_path, 'w', encoding='utf-8') as out_file:
        for entry in my_dict1_pd['id']:
            out_file.write(str(entry))
            out_file.write('\n')


In [None]:
# Earlier local (Windows) locations, kept for reference:
# model_path="C:/Users/panka/Downloads/epoch_16-20231021T144106Z-001/epoch_16/"
# data_path="C:/Users/panka/Desktop/IndoML/input_data_latest/indoml_phase2_data/"
# Directory on the mounted Drive from which test() loads the tokenizer and the model.
model_path="/content/drive/MyDrive/massive_accuracy_files_in_descending_order/intent_classification_It_bombay/trained_model_11_0.25_data_split_lr_4e_5_checkpoints/epoch_16"
# test() reads 'surprise.solution' and 'massive_test.data' from this directory.
data_path="/content/drive/MyDrive/massive_accuracy_files_in_descending_order/intent_classification_It_bombay (1)/indoml_iit_bombay/surprise_data/" # Directory or folder containing paths of all the files related to surprise data and massive testing data.
# test() prints the predictions and writes them to 'output.predict'; it has no
# return statement, so out_file is None after this call.
out_file=test(model_path, data_path)



                                                     id  \
0               {'indoml_id': 1, 'intent': 'what song'}   
1           {'indoml_id': 2, 'intent': 'change volume'}   
2                    {'indoml_id': 3, 'intent': 'time'}   
3              {'indoml_id': 4, 'intent': 'smart home'}   
4                {'indoml_id': 5, 'intent': 'carry on'}   
...                                                 ...   
5995            {'indoml_id': 5996, 'intent': 'cancel'}   
5996          {'indoml_id': 5997, 'intent': 'timezone'}   
5997         {'indoml_id': 5998, 'intent': 'roll dice'}   
5998          {'indoml_id': 5999, 'intent': 'carry on'}   
5999  {'indoml_id': 6000, 'intent': 'restaurant rese...   

                      intent  
0                  what song  
1              change volume  
2                       time  
3                 smart home  
4                   carry on  
...                      ...  
5995                  cancel  
5996                timezone  
5997        