# Model Number Testing using Semantic Similarity and Levenshtein Distance

## Setup

In [None]:
# Notebook setup: install dependencies, choose the FAISS build that matches
# the available hardware, and expose the Hugging Face token to the process.
import torch

# IPython shell escapes: install the Python packages imported by later cells.
!pip install -q sentence-transformers datasets python-Levenshtein
# System OpenMP development package -- presumably a FAISS runtime
# dependency; confirm before removing.
!sudo apt-get install libomp-dev

# Install the GPU build of FAISS only when torch reports a usable CUDA
# device; otherwise fall back to the CPU build.
if torch.cuda.is_available():
  !pip install -q faiss-gpu
  !nvidia-smi
else:
  !pip install -q faiss-cpu

import os
from google.colab import userdata

# Copy the 'HF_TOKEN' value stored in Colab userdata into the environment.
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

In [None]:
# clone repo
# Fetch the data files from GitHub unless a local "data/" directory exists.
import os
from pathlib import Path

# Relative path: resolved against the notebook's current working directory.
data_path = Path("data/")

# An existing directory is treated as "data already present"; its contents
# are not checked.
if data_path.is_dir():
  print("No need to clone repo")
else:
  !git clone https://github.com/nicholassolomon/ModelNumberSearch.git
  data_path.mkdir(parents=True, exist_ok=True)
  # Move the files matching *.* out of the repo's Data folder, then delete
  # the clone so only the data directory remains.
  !mv ModelNumberSearch/Data/*.* data
  !rm -rf ModelNumberSearch


In [None]:
# Notebook-wide configuration constants.
# Model name handed to SentenceTransformer.
MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Dataset identifier handed to load_dataset.
DATASET = "blade57/ModelNumbers4Searching_Full"
# Not referenced in the visible cells (query_df uses the literal key instead).
SEARCH_FIELD = 'model_search'
# Index name used for load_faiss_index and get_nearest_examples.
EMBED_FIELD  = 'embeddings'
# Not referenced in the visible cells.
CSV_FILE_NAME = '/content/data/ModelNumbers4Searching_Full.csv'
# Absolute path of the FAISS index file loaded into the dataset.
# NOTE(review): the clone cell writes to a relative "data/" directory, so
# this assumes the working directory is /content -- confirm.
FAISS_INDEX = "/content/data/ModelSearch_Full.faiss"


In [None]:
# Load Model and Create Embedding Function
%%capture
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from Levenshtein import distance
import pandas as pd

model = SentenceTransformer(MODEL)


In [None]:
# functions

# embeddings
def create_embeddings(text):
  """
  Encode one piece of text with the module-level sentence-transformer.

  The text is wrapped in a one-element list before it is passed to
  ``model.encode``, so the result is a batch holding a single embedding.

  Args:
    text: The text to embed.

  Returns:
    Whatever ``model.encode`` yields for a one-item batch (presumably a
    2-D array with one row -- confirm against the model in use).
  """
  batch = [text]
  return model.encode(batch)

def query(search_text, return_no=10):
  """
  Find the dataset rows nearest to a search string.

  The search text is embedded with ``create_embeddings`` and the resulting
  vector is looked up in the index registered under ``EMBED_FIELD`` on the
  global dataset ``ds``.

  Args:
    search_text: The text to search for.
    return_no: How many nearest rows to request.

  Returns:
    A ``(scores, search_results)`` tuple as produced by
    ``ds.get_nearest_examples``.
  """
  nearest = ds.get_nearest_examples(
    EMBED_FIELD,
    create_embeddings(search_text),
    k=return_no,
  )
  # Unpack and re-pack so callers receive a plain two-element tuple.
  scores, search_results = nearest
  return scores, search_results

def query_df(search_text, return_no=10):
  """
  Run a nearest-neighbour search and tabulate the hits in a pandas DataFrame.

  The search itself is delegated to ``query`` so both helpers share one code
  path; this function only adds the tabular view of the results.

  Args:
    search_text: The text to be used for the search.
    return_no: The number of results to request.

  Returns:
    A ``(results, scores, search_results)`` tuple, in that order:
      results: DataFrame with the columns ``scores``, ``model_search``,
        ``model_number``, ``model_name``, ``brand`` and ``search_for``
        (the search text repeated on every row).
      scores: The scores returned by ``query``.
      search_results: The matching rows returned by ``query``.
  """
  scores, search_results = query(search_text, return_no)
  results = pd.DataFrame({
    'scores': scores,
    'model_search': search_results['model_search'],
    'model_number': search_results['model_number'],
    'model_name': search_results['model_name'],
    'brand': search_results['brand'],
    # Scalar value: pandas broadcasts it to every row.
    'search_for': search_text
  })
  return results, scores, search_results

def get_ls_rank(search1, search2):
  """
  Case-insensitive Levenshtein distance between two values.

  Both inputs are converted with ``str()`` and lower-cased before they are
  compared, so non-string values are accepted and letter case is ignored.

  Args:
    search1: The first value.
    search2: The second value.

  Returns:
    The result of ``distance`` for the two normalised strings.
  """
  left = str(search1).lower()
  right = str(search2).lower()
  return distance(s1=left, s2=right)


In [None]:
# load dataset
# Load the 'train' split of DATASET; `ds` is the global that query() and
# query_df() search.
ds = load_dataset(DATASET, split='train')

# load FAISS index for dataset
# Attach the index stored at FAISS_INDEX under the name EMBED_FIELD, the same
# name the query helpers pass to get_nearest_examples.
ds.load_faiss_index(EMBED_FIELD, FAISS_INDEX)

# Report how many rows were loaded.
print(f"Records: {len(ds)}")

## Testing

In [None]:
# process results into df and result sets

search_for = 'X9570RT'  # the actual model is 9570RT
rows = 100
result_df, scores, results = query_df(search_for, rows)

# get LS rank and resort by LS rank (ascending)
result_df['LS_rank'] = result_df['model_search'].apply(lambda x: get_ls_rank(search_for, x))
# Use a stable sort so rows with the same LS_rank keep the order in which the
# search returned them; the default quicksort gives no such guarantee.
result_df = result_df.sort_values(by='LS_rank', ascending=True, kind='mergesort')

# Last expression of the cell: displays the re-ranked rows in the notebook.
result_df.head(rows)