# **Using Encoder-only LLM to do Similarity Recommendation/Duplicate Detection**
Using Siamese Network/Sentence Transformer and FAISS (for VectorDB)

**Author: Partha Seetala**

Video Tutorial: [https://youtu.be/UJZ4HGLnSMU](https://youtu.be/UJZ4HGLnSMU)

In [None]:
!pip install pandas scikit-learn torch sentence-transformers faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-m

In [None]:
import os
import pickle
from itertools import combinations
import random
import torch

import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import faiss
from sentence_transformers import (
    SentenceTransformer,
    InputExample,
    losses,
    evaluation
)

In [None]:
from google.colab import drive
import os
from pathlib import Path

# Mount Google Drive; every path configured below lives under this mount point.
drive.mount('/content/drive')

USECASE_NAME = "s3e2-similarity-recommendation"

# File name of each supported dataset (joined with DATASET_DIR when loaded).
qabugs_data = "similarity-qabugs.csv"
gitbugs_data = "similarity-gitbugs.csv"
quora_data = "similarity-quora.parquet"

# Dataset used by this run: "qabugs", "gitbugs" or "quora".
DATASET_TYPE = "quora"

# Dataset type -> file name; anything outside this table is rejected.
_DATASET_FILES = {
    "qabugs": qabugs_data,
    "gitbugs": gitbugs_data,
    "quora": quora_data,
}
if DATASET_TYPE not in _DATASET_FILES:
    raise ValueError(f"Unknown dataset type: {DATASET_TYPE}")
selected_dataset = _DATASET_FILES[DATASET_TYPE]

# File name without extension; used as a path component for the finetuned artifacts.
DATASET_NAME = Path(selected_dataset).stem

MODEL_NAME = "all-MiniLM-L6-v2"
# NOTE(review): max_seq_len is not referenced by any cell visible here.
max_seq_len = 128

ROOTDIR = '/content/drive/MyDrive/cidl'

HF_TOKEN_FILEPATH = os.path.join(ROOTDIR, "hf.token")
DATASET_DIR = os.path.join(ROOTDIR, 'datasets')  # <ROOTDIR>/datasets
PRETRAINED_MODEL_DIR = os.path.join(ROOTDIR, 'models', 'pretrained', 'bert')  # <ROOTDIR>/models/pretrained/bert

# Both finetuned artifacts share <ROOTDIR>/models/finetuned/bert/<usecase-name>/<dataset-name>.
_FINETUNED_ROOT = os.path.join(ROOTDIR, 'models', 'finetuned', 'bert', USECASE_NAME, DATASET_NAME)
FINETUNED_MODEL_DIR = os.path.join(_FINETUNED_ROOT, "model")
VECTORDB_DIR = os.path.join(_FINETUNED_ROOT, "vectordb")
# Base path; the vector-DB save/load helpers append their own file suffixes to it.
VECTORDB_FILE = os.path.join(VECTORDB_DIR, "vdb")

def load_hugging_face_token(hf_token_filepath=HF_TOKEN_FILEPATH):
    """Export the Hugging Face token stored in a file as the HF_TOKEN environment variable.

    Args:
        hf_token_filepath: Path of a text file containing the token.

    The token is exported only when the file exists and its stripped content
    starts with "hf_". Otherwise a message is printed and the environment is
    left untouched. Nothing is returned in either case.
    """
    if not os.path.exists(hf_token_filepath):
        print(f"Hugging Face token file not found at: {hf_token_filepath}")
        return

    with open(hf_token_filepath, "r") as token_file:
        token = token_file.read().strip()

    if not token.startswith("hf_"):
        print(f"Malformed Hugging Face token file at: {hf_token_filepath}")
        return

    # Only the environment variable is set; no explicit hub login is performed.
    os.environ["HF_TOKEN"] = token

# Make sure every working directory exists before later cells read from or write to it.
for dirpath in [DATASET_DIR, PRETRAINED_MODEL_DIR, FINETUNED_MODEL_DIR, VECTORDB_DIR]:
    os.makedirs(dirpath, exist_ok=True)

# Export the Hugging Face token as HF_TOKEN when a well-formed token file is present.
load_hugging_face_token(HF_TOKEN_FILEPATH)

# Echo the resolved configuration so the run documents which paths and model it used.
print("DATASET DIR ............. ", DATASET_DIR)
print("MODEL NAME .............. ", MODEL_NAME)
print("PRETRAINED MODEL DIR .... ", PRETRAINED_MODEL_DIR)
print("FINETUNED MODEL DIR ..... ", FINETUNED_MODEL_DIR)
print("VECTORDB DIR ............ ", VECTORDB_DIR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
DATASET DIR .............  /content/drive/MyDrive/cidl/datasets
MODEL NAME ..............  all-MiniLM-L6-v2
PRETRAINED MODEL DIR ....  /content/drive/MyDrive/cidl/models/pretrained/bert
FINETUNED MODEL DIR .....  /content/drive/MyDrive/cidl/models/finetuned/bert/s3e2-similarity-recommendation/similarity-quora/model
VECTORDB DIR ............  /content/drive/MyDrive/cidl/models/finetuned/bert/s3e2-similarity-recommendation/similarity-quora/vectordb


# **Prepare Training, Validation and Testing Dataset**

In [None]:
def prepare_training_and_test_data(dataset_file, dataset_type, val_size=0.1, test_size=0.1, random_state=42):
    """Load a sentence-pair dataset and split it into train/validation/test examples.

    Args:
        dataset_file: Path of the dataset file. "qabugs" and "gitbugs" are read as
            CSV. "quora" is read as parquet; if the file does not exist yet it is
            first downloaded from the Hugging Face location below and saved here.
        dataset_type: One of "qabugs", "gitbugs" or "quora".
        val_size: Fraction of the full dataset to use for validation.
        test_size: Fraction of the full dataset to use for testing.
        random_state: Seed passed to both splits, so repeated calls on the same
            data produce the same partition.

    Returns:
        Tuple (train, val, test) of lists of
        InputExample(texts=[text_a, text_b], label=float(score)).

    Raises:
        ValueError: If dataset_type is not one of the supported types.
    """
    # Columns every dataset must provide once loaded (quora is renamed to match).
    required_columns = ["text_a", "text_b", "score"]

    if dataset_type in ("qabugs", "gitbugs"):
        print(f"Loading dataset from file: {dataset_file}")
        df = pd.read_csv(dataset_file)
    elif dataset_type == "quora":
        # Check if the file already exists
        if not os.path.exists(dataset_file):
            # Download from HF and save to the dataset_file
            hf_path = "hf://datasets/AlekseyKorshuk/quora-question-pairs/data/train-00000-of-00001.parquet"
            print(f"Downloading dataset from Hugging Face location {hf_path}...")
            df = pd.read_parquet(hf_path)
            print(f"Saving dataset to local cache: {dataset_file}")
            df.to_parquet(dataset_file, index=False)
        else:
            # Load from local cache
            print(f"Loading dataset from file: {dataset_file}")
            df = pd.read_parquet(dataset_file)

        # Rename columns to the common schema
        df = df.rename(columns={
            "question1":    "text_a",
            "question2":    "text_b",
            "is_duplicate": "score"
        })
    else:
        # The problem is the dataset *type*, so report that (not the file name).
        raise ValueError(f"Unknown dataset type: {dataset_type}")

    df_train_val, df_test = train_test_split(df, test_size=test_size, random_state=random_state)
    # val_size is a fraction of the full dataset; rescale it to the remaining train+val part.
    val_rel = val_size / (1 - test_size)
    df_train, df_val = train_test_split(df_train_val, test_size=val_rel, random_state=random_state)

    def to_examples(sub_df):
        # A pair with a missing text or score is not a usable example. Dropping
        # after the split keeps the partition of all other rows unchanged.
        sub_df = sub_df.dropna(subset=required_columns)
        a = sub_df['text_a'].values
        b = sub_df['text_b'].values
        s = sub_df['score'].astype(float).values
        return [
            InputExample(texts=[ta, tb], label=sc)
            for ta, tb, sc in zip(a, b, s)
        ]

    return to_examples(df_train), to_examples(df_val), to_examples(df_test)

# **Main Functions to Finetune and do Inference on Sentence Transformer**

In [None]:
def load_pretrained_similarity_model(model_dir=PRETRAINED_MODEL_DIR, model_name=MODEL_NAME):
    """Load a pre-trained SentenceTransformer, caching its files under `model_dir`.

    Args:
        model_dir: Directory used as the cache folder for the model files.
        model_name: Name of the SentenceTransformer model to load.

    Returns:
        The loaded SentenceTransformer, placed on CUDA when available, else CPU.
    """
    print("Loading SentenceTransformer('{}')".format(model_name))

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Pass the cache folder explicitly instead of exporting SENTENCE_TRANSFORMERS_HOME:
    # this keeps the function free of process-wide side effects.
    return SentenceTransformer(model_name, cache_folder=model_dir, device=device)

def finetune_similarity_model(model, model_dir, train, val, epochs=1, batch_size=16, lr=2e-5, warmup_steps=100):
    """Finetune `model` in place on sentence pairs with a cosine-similarity loss.

    Args:
        model: SentenceTransformer to finetune.
        model_dir: Passed to `model.fit` as `output_path`.
        train: Training examples (shuffled and batched by a DataLoader).
        val: Validation examples used to build the "val-eval" similarity evaluator.
        epochs: Number of training epochs.
        batch_size: Training batch size.
        lr: Learning rate handed to the optimizer.
        warmup_steps: Number of learning-rate warm-up steps. Defaults to 100, the
            value that used to be hard-coded; lower it for small datasets.

    Returns:
        None. The model object is updated by `model.fit`.
    """
    train_dataloader = DataLoader(train, shuffle=True, batch_size=batch_size)
    train_loss = losses.CosineSimilarityLoss(model)
    evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(val, name="val-eval")

    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        evaluator=evaluator,
        epochs=epochs,
        warmup_steps=warmup_steps,
        optimizer_params={"lr": lr},
        output_path=model_dir,
        show_progress_bar=True
    )

def save_finetuned_similarity_model(model, finetuned_path):
    """Write `model` to `finetuned_path` by delegating to the model's own `save` method.

    Args:
        model: Object exposing `save(path)`.
        finetuned_path: Destination handed unchanged to `model.save`.
    """
    model.save(finetuned_path)

def load_finetuned_similarity_model(model_dir=FINETUNED_MODEL_DIR):
    """Load a finetuned SentenceTransformer from `model_dir`.

    Args:
        model_dir: Directory holding the saved model.

    Returns:
        The loaded model, moved to CUDA when available and to CPU otherwise.
    """
    print(f"Loading finetuned model from {model_dir}")

    target = "cuda" if torch.cuda.is_available() else "cpu"
    return SentenceTransformer(model_dir).to(torch.device(target))

def find_similar(model, vdb, sentence, topk=10, threshold=0.6):
    """Return the stored texts most similar to `sentence`, best match first.

    Args:
        model: Object whose `encode([sentence], convert_to_numpy=True)` returns a
            (1, dim) array; the array is L2-normalised in place before searching.
        vdb: Dict with "index" (queried through `.search(query, topk)`) and
            "texts" (list whose positions match the ids returned by the index).
        sentence: Query text.
        topk: Number of neighbours requested from the index.
        threshold: Minimum score a neighbour needs to be kept; None disables
            the cut-off.

    Returns:
        List of (text, score) tuples. Stored texts equal to the query itself are
        skipped, so fewer than `topk` entries may be returned. With the
        inner-product index built by build_similarity_database on normalised
        vectors, the score is the cosine similarity.
    """
    query = model.encode([sentence], convert_to_numpy=True)
    faiss.normalize_L2(query)

    scores, indices = vdb["index"].search(query, topk)

    results = []
    for idx, score in zip(indices[0], scores[0]):
        if idx < 0:
            # FAISS pads the result with id -1 when the index holds fewer than
            # topk vectors; using it would silently pick texts[-1].
            break
        if threshold is not None and score < threshold:
            # Neighbours are expected best-first, so no later entry can pass.
            break
        text = vdb["texts"][idx]
        if text == sentence:
            # Do not report the query as its own match.
            continue
        results.append((text, float(score)))
    return results

# **Main code to Build, Save and Query VectorDB**

In [None]:
def build_similarity_database(model, sentences):
    """Embed every unique text in `sentences` and index the vectors for similarity search.

    Args:
        model: Object whose `encode(texts, convert_to_numpy=True, show_progress_bar=True)`
            returns a 2-D array with one row per text.
        sentences: Iterable of items exposing a `.texts` sequence (e.g. sentence pairs).

    Returns:
        Dict {"index": inner-product FAISS index, "texts": list of unique texts},
        where position i of "texts" corresponds to vector i in the index.

    Raises:
        ValueError: If `sentences` yields no texts at all.
    """
    # Flatten all pairs and drop duplicates while keeping first-seen order.
    texts = list(dict.fromkeys(text for example in sentences for text in example.texts))
    if not texts:
        # Fail early with a clear message instead of an obscure shape/FAISS error.
        raise ValueError("Cannot build a similarity database from an empty set of sentences")

    # encode & normalize (in place)
    embeddings = model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
    faiss.normalize_L2(embeddings)

    # cosine = inner product on normalized vectors
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)

    return {"index": index, "texts": texts}

def save_similarity_database(vdb, vdbfilepath):
    """Persist a vector database as two files that share the `vdbfilepath` prefix.

    Args:
        vdb: Dict with "index" (FAISS index) and "texts" (the indexed texts).
        vdbfilepath: Path prefix. The index goes to "<prefix>.index" and the
            pickled texts go to "<prefix>.texts.pkl".
    """
    index_path = vdbfilepath + ".index"
    texts_path = vdbfilepath + ".texts.pkl"

    faiss.write_index(vdb["index"], index_path)
    with open(texts_path, "wb") as texts_file:
        pickle.dump(vdb["texts"], texts_file)


def load_similarity_database(vdbfilepath):
    """Load a vector database previously written with the same path prefix.

    Args:
        vdbfilepath: Path prefix; reads "<prefix>.index" and "<prefix>.texts.pkl".

    Returns:
        Dict {"index": FAISS index, "texts": unpickled texts}.
    """
    print("Loading Similarity VectorDB from ", vdbfilepath)

    index_path = vdbfilepath + ".index"
    texts_path = vdbfilepath + ".texts.pkl"

    loaded_index = faiss.read_index(index_path)
    # pickle can execute arbitrary code on load: only open files this notebook wrote.
    with open(texts_path, "rb") as texts_file:
        loaded_texts = pickle.load(texts_file)

    return {"index": loaded_index, "texts": loaded_texts}


## **Finetuning an already pre-trained SentenceTransformer**

**Prepare finetuning dataset**

In [None]:
# Resolve the selected dataset file inside the datasets directory.
DATASET_FILE_PATH = os.path.join(DATASET_DIR, selected_dataset)
# Only the train and validation splits are used for finetuning; the test split is discarded here.
train_ds, val_ds, _ = prepare_training_and_test_data(dataset_file=DATASET_FILE_PATH, dataset_type=DATASET_TYPE)

# Report the split sizes as a quick sanity check.
print("Dataset type ......... ", DATASET_TYPE)
print("Training dataset ..... ", len(train_ds))
print("Validation dataset ... ", len(val_ds))

**Load the pre-trained SentenceTransformer, finetune it, and save the result**

In [None]:
# Disable the Weights & Biases logging integration for this run. The recorded
# output notes this variable is deprecated in favour of a report_to setting.
os.environ["WANDB_DISABLED"] = "true"

print("Loading pre-trained Sentence Transformer")
model = load_pretrained_similarity_model(model_dir=PRETRAINED_MODEL_DIR, model_name=MODEL_NAME)

print("Finetuning Sentence Transformer")
# The second argument becomes fit()'s output path, so it must be the finetuned-model
# directory; pointing it at PRETRAINED_MODEL_DIR would write finetuned weights into
# the pre-trained model cache.
finetune_similarity_model(model, FINETUNED_MODEL_DIR, train_ds, val_ds, epochs=1)

print("Saving finetuned Sentence Transfomer under ", FINETUNED_MODEL_DIR)
save_finetuned_similarity_model(model, FINETUNED_MODEL_DIR)

Loading pre-trained Sentence Transformer
Loading SentenceTransformer('all-MiniLM-L6-v2')
Finetuning Sentence Transformer


  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Val-eval Pearson Cosine,Val-eval Spearman Cosine
500,0.1648,,,
1000,0.1453,,,
1500,0.1367,,,
2000,0.1348,,,
2500,0.1317,,,
3000,0.1285,,,
3500,0.1268,,,
4000,0.1245,,,
4500,0.1193,,,
5000,0.1237,,,


Saving finetuned Sentence Transfomer under  /content/drive/MyDrive/cidl/models/finetuned/bert/s3e2-similarity-recommendation/similarity-quora/model


**Build VectorDB for future similarity searches and save it**

In [None]:
print("Building Similarity Vector Database")
# Index every unique text from the train and validation pairs, embedded with the
# `model` currently in the kernel (the finetuned one when the cells run in order).
vdb = build_similarity_database(model, train_ds + val_ds)

print("Saving Similarity Vector Database to ", VECTORDB_FILE)
# VECTORDB_FILE is a path prefix; the helper writes the index and the texts next to each other.
save_similarity_database(vdb, VECTORDB_FILE)

Building Similarity Vector Database


Batches:   0%|          | 0/15433 [00:00<?, ?it/s]

Saving Similarity Vector Database to  /content/drive/MyDrive/cidl/models/finetuned/bert/s3e2-similarity-recommendation/similarity-quora/vectordb/vdb


# **Running Inference**
**Using our Finetuned SentenceTransformer LLM and Similarity VectorDB**

In [None]:
# Reload both artifacts from disk, replacing the in-memory `model` and `vdb`,
# so inference uses exactly what was saved.
model = load_finetuned_similarity_model(FINETUNED_MODEL_DIR)
vdb = load_similarity_database(VECTORDB_FILE)

Loading finetuned model from /content/drive/MyDrive/cidl/models/finetuned/bert/s3e2-similarity-recommendation/similarity-quora/model
Loading Similarity VectorDB from  /content/drive/MyDrive/cidl/models/finetuned/bert/s3e2-similarity-recommendation/similarity-quora/vectordb/vdb


In [None]:
# Recompute the dataset path so this cell does not depend on the finetuning cells.
DATASET_FILE_PATH = os.path.join(DATASET_DIR, selected_dataset)
# Called with the same default split parameters as the finetuning cell, so on the same
# dataset file the same partition is expected; only the test split is kept here.
_, _, test_ds = prepare_training_and_test_data(dataset_file=DATASET_FILE_PATH, dataset_type=DATASET_TYPE)

Loading dataset from file: /content/drive/MyDrive/cidl/datasets/similarity-quora.parquet


**Find similar texts**

In [None]:
# Query the vector DB with the first text of each of the first 10 test pairs.
for row in test_ds[:10]:
    sentence = row.texts[0]
    # Uses find_similar's default score threshold, so fewer than topk hits may be printed.
    sims = find_similar(model, vdb, sentence, topk=10)
    print(f"\nQuery: {sentence}")
    for text, score in sims:
        # Show the similarity score as a percentage; long texts are cut to 80 characters.
        pscore = score * 100
        print(f"  → {text[:80]}  [{pscore:.1f}%]")


Query: How do I play Pokémon GO in Korea?
  → How should I celebrate my birthday in Korea?  [87.1%]
  → How can I self-publish a book in Korea?  [83.0%]
  → Are there any good incubators or accelerators in Korea?  [80.6%]
  → What factors are stifling the startup entrepreneurship ecosystem in Korea?  [77.1%]
  → Are there community colleges in Korea?  [76.5%]
  → What are the best mobile ad networks in Korea?  [75.7%]
  → What is the longest roller coaster in the United Kingdom and how does it compare  [73.7%]
  → What is a good company to invest in in Korea?  [73.6%]
  → Is there any Socket Server made in Korea?  [72.9%]

Query: What are some of the best side dishes for crab cakes?
  → What are the best side dishes to serve with crab cakes for dinner?  [96.4%]
  → How long do crab cakes last in the fridge? Is there something I could do to make  [90.9%]
  → What are some great side dishes to serve with crab salad?  [88.7%]
  → What are some good side dishes to serve with crab legs?  [