# FAISS Index Creation

In [1]:
import torch
from timeit import default_timer as timer
start = timer()


In [2]:
%%capture
!pip install sentence-transformers datasets
!sudo apt-get install libomp-dev

if torch.cuda.is_available():
  !pip install faiss-gpu
else:
  !pip install faiss-cpu

In [3]:
MODEL = "sentence-transformers/all-MiniLM-L6-v2"
DATASET = "blade57/ModelNumbers4Searching_Full"
#DATASET = "blade57/ModelNumber_small"
SEARCH_FIELD = 'model_search'
EMBED_FIELD  = 'embeddings'
EMBED_FILE_NAME = 'ModelSearch_Full.faiss'
CSV_FILE_NAME = 'ModelSearchWithEmbeddings_Full.csv'

In [4]:
import os
from google.colab import userdata

os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

In [5]:
%%capture
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(MODEL)

def create_embeddings(text):
  embeddings = model.encode([text])
  return embeddings

In [6]:
# load dataset and create embeddings
from datasets import load_dataset

%timeit
#ds = load_dataset(DATASET, split='train[:100]')
ds = load_dataset(DATASET, split='train')
ds_with_embeddings = ds.map(lambda example: {EMBED_FIELD: create_embeddings(example[SEARCH_FIELD])[0]})

Downloading readme:   0%|          | 0.00/629 [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/2.69M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [7]:
# save dataset
import pandas as pd

df = pd.DataFrame(ds_with_embeddings)
df.to_csv(CSV_FILE_NAME, index=False)
df.head()


Unnamed: 0,brand,model_number,model_name,year,randomdata,model_search,embeddings
0,Landini,L4240HSTC,Hydraulic Excavator,2017,1439,L4240HSTC,"[-0.019833004102110863, 0.03396640717983246, -..."
1,John Deere,LS1401203,4WD Tractor,2007,1203,LS1401203,"[-0.10737817734479904, -0.012116434052586555, ..."
2,Volvo,R40441789,Wheel Loader,2017,1789,R40441789,"[-0.1115589290857315, -0.01163890678435564, -0..."
3,Volvo,Lexion 520,4WD Tractor,2012,1415,Lexion520,"[-0.1187746599316597, -0.07215884327888489, -0..."
4,Caterpillar,9570RT,2WD Tractor,2005,1531,9570RT,"[-0.015632275491952896, 0.04326120764017105, -..."


In [8]:
# create index on embedding column

%%timeit
ds_with_embeddings.add_faiss_index(column=EMBED_FIELD)

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

236 ms ± 13.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
# save index

ds_with_embeddings.save_faiss_index(EMBED_FIELD, EMBED_FILE_NAME)

## End

In [10]:
end = timer()
print('FAISS index created')
print(f"Time to create (seconds): {end - start}")

FAISS index created
Time to create (seconds): 527.7821558420001
