# Test FAISS Indexing

## Setup

In [None]:
# Mount Google Drive at /content/drive inside the Colab runtime.
# The FAISS index and CSV paths configured in this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch

!pip install -quiet sentence-transformers datasets python-Levenshtein
!sudo apt-get install libomp-dev

if torch.cuda.is_available():
  !pip install -q faiss-gpu
else:
  !pip install -q faiss-cpu

import os
from google.colab import userdata

os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

In [None]:
# Notebook configuration: everything a reader might want to tune lives here.

# Model id handed to SentenceTransformer.
MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Dataset id handed to load_dataset; swap in the commented line for the small variant.
DATASET = "blade57/ModelNumbers4Searching_Full"
#DATASET = "blade57/ModelNumber_small"
SEARCH_FIELD = 'model_search'
# Name under which the FAISS index is attached to, and queried from, the dataset.
EMBED_FIELD  = 'embeddings'
CSV_FILE_NAME = 'ModelSearchWithEmbeddings_Full.csv'

# Single place to change when the project folder moves; every artifact path
# below is derived from it instead of repeating the full directory.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/Projects/Semantic Searching Model Identification/Full_Dataset"
FAISS_INDEX = f"{DATA_DIR}/ModelSearch_Full.faiss"
DB_NO_EMBEDDINGS = f"{DATA_DIR}/ModelNumbers4Searching_Full.csv"
DB_W_EMBEDDINGS = f"{DATA_DIR}/{CSV_FILE_NAME}"


In [None]:
# Load Model and Create Embedding Function
%%capture
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from Levenshtein import distance
import pandas as pd

model = SentenceTransformer(MODEL)


In [None]:
# functions

# embeddings
def create_embeddings(text):
  """Encode a single piece of text with the notebook-level ``model``.

  Args:
    text: The text to embed. It is wrapped in a one-element list before
      being handed to ``model.encode``.

  Returns:
    Whatever ``model.encode`` returns for that one-element batch.
  """
  return model.encode([text])

def query(search_text, return_no=10):
  """Look up the nearest dataset examples for a piece of search text.

  The text is embedded with ``create_embeddings`` and the resulting vector is
  searched against the index registered on ``ds`` under ``EMBED_FIELD``.

  Args:
    search_text: Text to search for.
    return_no: Number of neighbours to request (passed as ``k``).

  Returns:
    A ``(scores, search_results)`` tuple as produced by
    ``ds.get_nearest_examples``.
  """
  query_vector = create_embeddings(search_text)
  nearest = ds.get_nearest_examples(EMBED_FIELD, query_vector, k=return_no)
  hit_scores, hits = nearest
  return hit_scores, hits

def query_df(search_text, return_no=10):
  """Run a nearest-neighbour search and tabulate the hits.

  Args:
    search_text: Text to search for; also copied into every row of the
      ``search_for`` column.
    return_no: Number of neighbours to request.

  Returns:
    A ``(results, scores, search_results)`` tuple: the hits as a DataFrame
    with the columns ``scores``, ``model_search``, ``model_number``,
    ``model_name``, ``brand`` and ``search_for``, followed by the raw scores
    and raw result set returned by the search.
  """
  # Same embed-and-search step as query(); reuse it instead of repeating it.
  scores, search_results = query(search_text, return_no)

  # Build the columns in display order: scores first, the copied result
  # fields next, the search text last.
  columns = {'scores': scores}
  for field in ('model_search', 'model_number', 'model_name', 'brand'):
    columns[field] = search_results[field]
  columns['search_for'] = search_text

  results = pd.DataFrame(columns)
  return results, scores, search_results

def get_ls_rank(search1, search2):
  """Return the case-insensitive Levenshtein distance between two values.

  Both arguments are converted with ``str()`` and lower-cased before they are
  compared, so non-string inputs are accepted and differences in letter case
  do not count as edits.

  Args:
    search1: First value to compare.
    search2: Second value to compare.

  Returns:
    The result of ``Levenshtein.distance`` for the two normalised strings.
  """
  # Python strings have no `to_lower` method (it raises AttributeError);
  # `lower()` is the correct call. Arguments are passed positionally so the
  # call does not depend on the parameter names of `distance`.
  return distance(str(search1).lower(), str(search2).lower())


In [None]:
# Load the 'train' split of the dataset named by DATASET.
ds = load_dataset(DATASET, split='train')

# Attach the FAISS index stored at FAISS_INDEX to the dataset under the name
# EMBED_FIELD; the query helpers look the index up by that same name.
ds.load_faiss_index(EMBED_FIELD, FAISS_INDEX)

# Report how many records the dataset holds.
print(f"Records: {len(ds)}")

## Testing

In [None]:
# Run one search and keep the hits as a DataFrame plus the raw result sets.
search_for = 'A436BHT'
rows = 10
result_df, scores, results = query_df(search_for, rows)

# Re-rank the hits by Levenshtein distance between the search text and each
# hit's `model_search` value, smallest distance first.
result_df = (
  result_df
  .assign(LS_rank=result_df['model_search'].apply(
    lambda candidate: get_ls_rank(search_for, candidate)))
  .sort_values(by=['LS_rank'], ascending=True)
)

result_df.head(rows)

In [None]:
# Recompute the Levenshtein rank and re-sort the current hits.
# Requires `result_df`, `search_for` and `rows` from an earlier cell.
result_df = (
  result_df
  .assign(LS_rank=result_df['model_search'].apply(
    lambda candidate: get_ls_rank(search_for, candidate)))
  .sort_values(by=['LS_rank'], ascending=True)
)

result_df.head(rows)