This file contains the semantic search function that takes in a query, a list of documents, and a model,
and returns the document with the highest dot product similarity to the query.
The purpose is to match the user's free-text input (the query) to the most similar of the pre-set button options (the documents).
This allows the chatbot to accept open-text user input and map it to one of the pre-set button options.

In this notebook, we are testing the performance of various semantic search models for a semantic button mapping task in chatbots.

Author: @olivcha

# Install prerequisites

In [1]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m81.9/86.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m85.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[

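Mount Google Drive (Google Colab only): uncomment the lines below when running in Colab, because the dataset is read from a path under `drive/MyDrive/`.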

In [2]:
#from google.colab import drive
#drive.mount('/content/drive/')

Mounted at /content/drive/


# Test code

In [3]:
# Smoke test: check that sentence-transformers is installed and a model can be loaded.
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

# Embed one query and two candidate passages.
query_embedding = model.encode('How big is London')
passage_embedding = model.encode(['London has 9,787,426 inhabitants at the 2011 census',
                                  'London is known for its finacial district'])

# Print the dot-product score of the query against each passage.
print("Similarity:", util.dot_score(query_embedding, passage_embedding))

Downloading (…)5fedf/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)2cb455fedf/README.md:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

Downloading (…)b455fedf/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)edf/data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)5fedf/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading (…)fedf/train_script.py:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

Downloading (…)2cb455fedf/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)455fedf/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Similarity: tensor([[0.5472, 0.6330]])


Import prerequisite libraries

In [4]:
from pandas import *
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer, util


# Read the dataset
Split into input, class names, and label

In [5]:
# Load the labelled evaluation spreadsheet (read_excel is provided by the `from pandas import *` above).
# NOTE(review): the relative path presumably requires Google Drive to be mounted under ./drive - confirm.
data = read_excel("drive/MyDrive/Semantic_search/SEMANTIC_MAPPING.xlsx")


In [None]:
# Preview the first rows only instead of rendering the whole spreadsheet.
data.head()

In [7]:
# Pull the three columns used by the evaluation, dropping missing entries from each one independently:
#   responses - the free-text user inputs (queries)
#   docs      - the candidate class names for each input
#   labels    - the expected class for each input
responses, docs, labels = (
    data[column].dropna() for column in ('Response', 'Docs', 'Label')
)

In [8]:
# Sanity check: before splitting, each entry of `docs` is a single raw string.
print(type(docs[0]))

<class 'str'>


Split the class names into separate strings

In [None]:
# Split every semicolon-separated class string into a list of class names.
# A new Series is built (instead of assigning into `docs` while iterating over it), and
# entries that are already lists are left untouched, so this cell can be re-run safely.
docs = docs.map(lambda line: line.split(';') if isinstance(line, str) else line)

In [10]:
# Sanity check: after splitting, each entry of `docs` is a list whose elements are strings.
type(docs[0][0])

str

Classes for the instance

In [11]:
# Inspect the per-row lists of candidate class names.
docs

0      [sad, angry, anxious, happy, OOD]
1      [sad, angry, anxious, happy, OOD]
2      [sad, angry, anxious, happy, OOD]
3      [sad, angry, anxious, happy, OOD]
4      [sad, angry, anxious, happy, OOD]
                     ...                
203     [Yes, No, Choose AUC again, OOD]
204     [Yes, No, Choose AUC again, OOD]
205     [Yes, No, Choose AUC again, OOD]
206     [Yes, No, Choose AUC again, OOD]
207     [Yes, No, Choose AUC again, OOD]
Name: Docs, Length: 179, dtype: object

Test if GPU available

In [12]:
# Report the GPU devices visible to TensorFlow.
# NOTE(review): the embedding code in this notebook runs on PyTorch, so this check does not
# show whether torch can use the GPU - consider torch.cuda.is_available() instead; confirm.
import tensorflow as tf
print("GPU Available:", tf.config.list_physical_devices('GPU'))



GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# Finding the doc that best matches the query semantically

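The function semantic_search() takes the query, the candidate docs, a model name, and an optional score threshold. It returns the top similarity score together with the best-matching doc, or "OOD" if that score is below the threshold.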

In [13]:
# Mean Pooling - Take average of all tokens
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked-out positions.

    Args:
        model_output: object exposing a ``last_hidden_state`` tensor of token embeddings
            (assumed layout: batch x tokens x hidden - TODO confirm).
        attention_mask: tensor that, once given a trailing axis, is expanded to the size
            of ``last_hidden_state``; positions with value 0 do not contribute.

    Returns:
        Tensor with the masked sum over dimension 1 divided by the mask sum over dimension 1.
    """
    hidden_states = model_output.last_hidden_state
    # Broadcast the mask to the shape of the embeddings so it can be applied element-wise.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = torch.sum(hidden_states * mask, 1)
    # Clamp the denominator so a fully masked row cannot cause a division by zero.
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / mask_total


# Encode text
def encode(texts, model, tokenizer):
    """Turn text into L2-normalised, mean-pooled embeddings.

    Args:
        texts: a string or list of strings handed to ``tokenizer``.
        model: callable invoked as ``model(**encoded, return_dict=True)``.
        tokenizer: callable returning the model inputs, including ``'attention_mask'``.

    Returns:
        Tensor of embeddings, normalised to unit L2 norm along dimension 1.
    """
    # Tokenise with padding/truncation and request PyTorch tensors.
    batch = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

    # Forward pass without gradient tracking (inference only).
    with torch.no_grad():
        output = model(**batch, return_dict=True)

    # Collapse the token embeddings into one vector per input text.
    pooled = mean_pooling(output, batch['attention_mask'])

    # Unit-normalise so that a dot product between embeddings equals cosine similarity.
    return F.normalize(pooled, p=2, dim=1)

# Cache of (tokenizer, model) pairs keyed by model name, so repeated calls reuse
# already-loaded weights instead of re-loading them for every single query.
_MODEL_CACHE = {}


def _load_model(select_model):
    """Return the (tokenizer, model) pair for `select_model`, loading it at most once."""
    if select_model not in _MODEL_CACHE:
        _MODEL_CACHE[select_model] = (
            AutoTokenizer.from_pretrained(select_model),
            AutoModel.from_pretrained(select_model),
        )
    return _MODEL_CACHE[select_model]


def semantic_search(query, docs, select_model, threshold=0.2):
    '''
    Find the document that is semantically closest to the query.

    Args:
        query: the user input; converted to a string before encoding.
        docs: iterable of candidate strings. Entries equal to "OOD" are ignored
            as candidates. The iterable itself is not modified.
        select_model: model name passed to AutoTokenizer / AutoModel.from_pretrained.
        threshold: minimum score the best candidate must reach; below it the
            returned label is "OOD".

    Returns:
        A tuple (top_score, top_label): the highest similarity score and the
        candidate that achieved it, or "OOD" if the score is below `threshold`.
        Note the order: score first, label second.

    Raises:
        ValueError: if no candidate remains after dropping "OOD".
    '''

    # Load model from HuggingFace Hub (only on the first call for each model name)
    tokenizer, model = _load_model(select_model)

    # Preprocess: build a filtered copy rather than removing from `docs` while iterating
    # over it, which would skip elements and silently change the caller's list.
    candidates = [doc for doc in docs if doc != "OOD"]
    if not candidates:
        raise ValueError("semantic_search needs at least one candidate other than 'OOD'")

    query = str(query)

    # Encode query and docs
    query_emb = encode(query, model, tokenizer)
    doc_emb = encode(candidates, model, tokenizer)

    # Compute dot score between the query and all candidate embeddings
    scores = torch.mm(query_emb, doc_emb.transpose(0, 1))[0].cpu().tolist()

    # Pick the best-scoring candidate; on ties the earliest candidate wins,
    # matching a stable descending sort.
    top_label, top_score = max(zip(candidates, scores), key=lambda pair: pair[1])

    if top_score < threshold:
        top_label = "OOD"

    return top_score, top_label

In [14]:
# Model names to compare; each is passed to semantic_search(), which hands it to
# AutoTokenizer.from_pretrained / AutoModel.from_pretrained.
models = [
    "sentence-transformers/msmarco-bert-base-dot-v5",
    "sentence-transformers/multi-qa-mpnet-base-cos-v1",
    "sentence-transformers/multi-qa-distilbert-cos-v1",
    "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
]

# Score cut-offs to evaluate: a top score below the threshold is predicted as "OOD".
thresholds = [0.25, 0.30, 0.35]

In [15]:
import time
import csv

In [None]:
# will store all predictions of all models
all_predictions = []
all_times = []
total_predictions = len(labels)
all_accuracies = []
data = []

for model in models:

  # will store all the predictions for the model for all threshold values
  model_predictions = []
  # will store all the time measurements for the model for all threshold values
  model_times = []

  for threshold in thresholds:

    # Will store all the predictions for the model for this threshold
    threshold_predictions = []
    # Will store all the time measurements for the model for this threshold
    threshold_times = []

    for index, item in responses.items():
      print(index)
      start_time = time.time()  # Start measuring time
      top_score, top_doc = semantic_search(responses[index], docs[index], model, threshold)
      end_time = time.time()    # End measuring time
      time_taken = end_time - start_time
      threshold_predictions.append(top_doc)
      threshold_times.append(time_taken)
      print("Prediction: " + str(top_doc) + " Label: " + str(labels[index]) + " Time: " + str(time_taken) + " seconds")

    # Calculate accuracy for this model for this threshold
    correct_predictions = sum(1 for true, pred in zip(labels, threshold_predictions) if true == pred)
    accuracy = correct_predictions / total_predictions

    # Calculate average time for this model for this threshold
    avg_time = sum(threshold_times) / len(threshold_times)

    all_accuracies.append(accuracy)
    print("Model: " + str(model) + " Threshold: " + str(threshold) + " Accuracy: " + str(accuracy) + " Average Time: " + str(avg_time) + " seconds")
    data.append([model, threshold, accuracy, avg_time])

    # append all the predictions from diff thresholds to the model pred
    model_predictions.append(threshold_predictions)
    # Append all the time measurements from different thresholds to the model times
    model_times.append(threshold_times)

  # append all the predictions from all the models to the preds
  all_predictions.append(model_predictions)
  # Append all the time measurements from all the models to the all_times
  all_times.append(model_times)



# Save data into file

Write into 4 columns with model name, threshold value, accuracy score, and average time to compute the prediction.



In [17]:
# Define the path where you want to save the CSV file
csv_file_path = 'model_threshold_accuracy_time_1.csv'

# Write the data to the CSV file
with open(csv_file_path, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Model', 'Threshold', 'Accuracy', 'Average Time'])
    csv_writer.writerows(data)

print("Data saved to:", csv_file_path)

Data saved to: drive/MyDrive/Semantic_search/model_threshold_accuracy_time_1.csv


# Computing general accuracy

Using the following equation:
Accuracy = (number of correctly assigned labels) / (number of instances)

# Split labels into categories
Categories include: YES/NO, CONTINUE, and OTHERS