In [None]:
# ! wget -P ~/ https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
# ! chmod +x ~/Miniconda3-latest-Linux-x86_64.sh
# ! ~/Miniconda3-latest-Linux-x86_64.sh -b
# ! export PATH=~/miniconda3/bin:$PATH
# ! conda init & conda config --set auto_activate_base false
# # close and start a new session
# ! conda activate base
# ! conda install cudatoolkit=11.0 -y
# !pip install sentence-transformers   transformers datasets peft accelerate bitsandbytes faiss-cpu faiss-gpu

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
from sentence_transformers import SentenceTransformer, util
import faiss
import pandas as pd
import torch

if torch.backends.mps.is_available():  # Check for Apple Silicon GPU availability (requires PyTorch 1.12 or later)
    device = torch.device("mps")
elif torch.cuda.is_available():  # Check for NVIDIA GPU availability
    device = torch.device("cuda")
else:
    device = torch.device("cpu")  # Fall back to CPU

print(f"Using device: {device}")

In [None]:
dataset = load_dataset('ms_marco', 'v1.1')
print(dataset)
train_dataset = dataset['train']
test_dataset = dataset['test']

In [None]:
# Assuming dataframe is your DataFrame containing passages
print(train_dataset['passages'][:1])
# passages_list = train_dataset['passages']['passage_text'].tolist()

unique_passages = set()
for row in train_dataset:
    unique_passages.update(row['passages']['passage_text'])
print(len(unique_passages))
documents = list(unique_passages)

In [None]:
from huggingface_hub import login
login(token="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")

In [None]:
# SentenceTransformer("all-MiniLM-L6-v2")
SentenceTranformer = SentenceTransformer(
    'sentence-transformers/msmarco-bert-base-dot-v5',
    device = device,
    )

In [None]:
query_embedding = SentenceTranformer.encode('How big is London')
print(len(query_embedding))
document_embedding = SentenceTranformer.encode(['London has 9,787,426 inhabitants at the 2011 census',
                                  'London is known for its finacial district'])

print("Similarity:", util.dot_score(query_embedding, document_embedding))

# Implement a Sentence Transformer


In [30]:
# Encode documents
document_embeddings = SentenceTranformer.encode(
    documents, 
    show_progress_bar=True, 
    device = device,
    batch_size=100
)

Batches: 100%|██████████| 6270/6270 [3:07:58<00:00,  1.80s/it]  


In [31]:

# Build Faiss index
index = faiss.IndexFlatL2(document_embeddings.shape[1])  # L2 distance
index.add(document_embeddings)

In [35]:
# Save the index to a file
faiss.write_index(index, "index_docs.index")
# Load the index from a file
# index = faiss.read_index("index_docs.index")

In [53]:

# Query
query = "This is a query document."
query_embedding = SentenceTranformer.encode([query])

# Perform document similarity search
k = 10  # Number of similar documents to retrieve
D, I = index.search(query_embedding, k)

# Print similar documents
print("Most similar documents to the query:")
for i, idx in enumerate(I[0]):
    print(f"Rank {i+1}: {documents[idx]}")

Most similar documents to the query:
Rank 1: A query language is a language in which a user requests information from the database. These languages are usually on a level higher than that of a standard programming lang … uage. Query languages can be categorized as either procedural or non procedural.
Rank 2: Overview. Structured Query Language (SQL) is a specialized language for updating, deleting, and requesting information from databases. SQL is an ANSI and ISO standard, and is the de facto standard database query language. 
Rank 3: A system specification document is used to present the functions, performance and limitations of a software product or system.
Rank 4: The PUB Document file format. PUB is the file extension which is generally used by the Microsoft Publisher application which is a part of the Microsoft Office product set. This file format can comprise various objects such as graphics, images, formatted text, or any other kind of object. 
Rank 5: Query Definition: Stored P

In [None]:

# Set Up Approximate Nearest Neighbors (ANN) Using Faiss
# Select and Prepare a Pre-trained Seq2Seq Model
# Generate the Response
# Evaluation and Iteration
