# I made this notebook to start in the middle because I don't want to keep regenerating the test data

In [2]:
# PIP installs
!pip install -q langchain_openai langchain_huggingface<0.1.0 
!pip install -q langchain_core==0.2.40 langchain==0.2.4 langchain_community langchain-text-splitters==0.2.4
!pip install -q qdrant_client pymupdf tiktoken ragas pandas
!pip install -q python-pptx==1.0.2 nltk==3.9.1


zsh:1: no such file or directory: 0.1.0


In [3]:
import os
import getpass

# Ask for the key interactively so it is never written into the notebook,
# then expose it to this process through the environment.
os.environ.update(OPENAI_API_KEY=getpass.getpass("Enter Your OpenAI API Key: "))

### Get the data sets that I already created

In [5]:
# Read embedding model datasets that have already been created and stored.
# readDataSets is a project-local module (not shown in this notebook); the
# result of loadDataSets() is unpacked here as the test, train and validation
# splits, in that order.
import readDataSets
test_dataset, train_dataset, val_dataset = readDataSets.loadDataSets()


In [6]:
from sentence_transformers import SentenceTransformer

# Hugging Face id of the base embedding model; `model` is the object that is
# fine-tuned by model.fit(...) later in this notebook.
model_id = "Snowflake/snowflake-arctic-embed-m"
model = SentenceTransformer(model_id)

  from tqdm.autonotebook import tqdm, trange


In [7]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sentence_transformers import InputExample

In [8]:
# Views onto the training split: doc id -> text, question id -> question,
# and question id -> sequence of relevant doc ids.
corpus = train_dataset['corpus']
queries = train_dataset['questions']
relevant_docs = train_dataset['relevant_contexts']

# One (question, context) pair per training question; only the first
# relevant document listed for each question is used.
examples = [
    InputExample(texts=[question, corpus[relevant_docs[question_id][0]]])
    for question_id, question in queries.items()
]

In [9]:
# Number of (question, context) pairs served per batch.
BATCH_SIZE = 20
loader = DataLoader(dataset=examples, batch_size=BATCH_SIZE)

### Loss function

In [10]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

# Dimensions handed to MatryoshkaLoss as `matryoshka_dims`.
matryoshka_dimensions = [768, 512, 256, 128, 64]

# MatryoshkaLoss wraps the ranking loss; both are bound to the same model.
inner_train_loss = MultipleNegativesRankingLoss(model=model)
train_loss = MatryoshkaLoss(
    model=model,
    loss=inner_train_loss,
    matryoshka_dims=matryoshka_dimensions,
)

### Evaluator

In [11]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

# Rebind the three dataset views to the validation split. These names held
# the training split up to this point, so re-run the training-pair cell
# before reusing them for training data.
corpus, queries, relevant_docs = (
    val_dataset[key] for key in ('corpus', 'questions', 'relevant_contexts')
)

# Retrieval evaluator that model.fit(...) runs periodically during training.
evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
)

### Training Time!

In [12]:
EPOCHS = 5

# Warm up over the first 10% of all training steps
# (batches per epoch x number of epochs).
total_training_steps = len(loader) * EPOCHS
warmup_steps = int(total_training_steps * 0.1)

# Fine-tune, evaluating on the validation set every 50 steps and writing the
# resulting model to the `finetuned_arctic` directory.
model.fit(
    train_objectives=[(loader, train_loss)],
    evaluator=evaluator,
    epochs=EPOCHS,
    evaluation_steps=50,
    warmup_steps=warmup_steps,
    output_path='finetuned_arctic',
    show_progress_bar=True,
)

  0%|          | 0/130 [00:00<?, ?it/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

{'eval_cosine_accuracy@1': 0.74, 'eval_cosine_accuracy@3': 0.87, 'eval_cosine_accuracy@5': 0.96, 'eval_cosine_accuracy@10': 0.98, 'eval_cosine_precision@1': 0.74, 'eval_cosine_precision@3': 0.29, 'eval_cosine_precision@5': 0.19199999999999995, 'eval_cosine_precision@10': 0.09799999999999998, 'eval_cosine_recall@1': 0.74, 'eval_cosine_recall@3': 0.87, 'eval_cosine_recall@5': 0.96, 'eval_cosine_recall@10': 0.98, 'eval_cosine_ndcg@10': 0.8615543379824142, 'eval_cosine_mrr@10': 0.8233888888888888, 'eval_cosine_map@100': 0.8249365079365079, 'eval_dot_accuracy@1': 0.74, 'eval_dot_accuracy@3': 0.87, 'eval_dot_accuracy@5': 0.96, 'eval_dot_accuracy@10': 0.98, 'eval_dot_precision@1': 0.74, 'eval_dot_precision@3': 0.29, 'eval_dot_precision@5': 0.19199999999999995, 'eval_dot_precision@10': 0.09799999999999998, 'eval_dot_recall@1': 0.74, 'eval_dot_recall@3': 0.87, 'eval_dot_recall@5': 0.96, 'eval_dot_recall@10': 0.98, 'eval_dot_ndcg@10': 0.8615543379824142, 'eval_dot_mrr@10': 0.8233888888888888, 'e

In [13]:
import pandas as pd

from langchain_community.vectorstores import FAISS
# from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.documents import Document

In [14]:
import tqdm

def evaluate_openai(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
    """Measure top-k retrieval hits for an embedding model on a dataset.

    Despite the name, nothing here is OpenAI-specific: the notebook also
    calls it with Hugging Face embedding objects.

    Args:
        dataset: Mapping with the keys 'corpus' (doc id -> text),
            'questions' (question id -> question text) and
            'relevant_contexts' (question id -> sequence of doc ids).
        embed_model: Embedding object passed to ``FAISS.from_documents``.
        top_k: Number of documents retrieved per question.
        verbose: Accepted for interface compatibility; currently unused.

    Returns:
        A list with one dict per question, holding the keys "id",
        "question", "expected_id" and "is_hit". "is_hit" is True when the
        first relevant doc id for the question is among the retrieved ids.
    """
    doc_texts = dataset['corpus']
    question_texts = dataset['questions']
    relevant_by_question = dataset['relevant_contexts']

    # Index the whole corpus, keeping each doc id in the metadata so that
    # retrieved documents can be matched against the expected id.
    vectorstore = FAISS.from_documents(
        [
            Document(page_content=text, metadata={"id": doc_id})
            for doc_id, text in doc_texts.items()
        ],
        embed_model,
    )
    retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})

    def score_question(question_id, question):
        # Retrieve first, then look up the expected id (same order as before).
        retrieved_ids = [hit.metadata["id"] for hit in retriever.invoke(question)]
        expected_id = relevant_by_question[question_id][0]
        return {
            "id": question_id,
            "question": question,
            "expected_id": expected_id,
            "is_hit": expected_id in retrieved_ids,
        }

    return [
        score_question(question_id, question)
        for question_id, question in tqdm.tqdm(question_texts.items())
    ]

### Evaluate default embedder (text-embedding-3-small)

In [15]:
# `defaults` is a project-local module (not shown in this notebook).
# NOTE(review): per this section's heading the default embedding model is
# text-embedding-3-small -- confirm against defaults.py.
import defaults
te3_openai = defaults.default_embedding_model


In [17]:
!pip install faiss-cpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp311-cp311-macosx_11_0_arm64.whl.metadata (3.7 kB)
Downloading faiss_cpu-1.8.0.post1-cp311-cp311-macosx_11_0_arm64.whl (6.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0.post1


In [18]:
# Score the default embedder on the held-out test split (top-5 retrieval,
# the default top_k of evaluate_openai).
te3_results = evaluate_openai(test_dataset, te3_openai)

  2%|▏         | 2/94 [00:00<00:15,  5.98it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 94/94 [00:17<00:00,  5.50it/s]


In [19]:
# Tabulate the per-question outcomes; the mean of the boolean "is_hit"
# column is the fraction of questions whose expected document was retrieved.
te3_results_df = pd.DataFrame(data=te3_results)
te3_hit_rate = te3_results_df.loc[:, "is_hit"].mean()
te3_hit_rate

0.9361702127659575

### Evaluate base Snowflake (snowflake-arctic-embed-m)

In [20]:
# HuggingFaceEmbeddings now lives in the dedicated langchain_huggingface
# package; the langchain.embeddings.huggingface path is deprecated.
# Fall back to the legacy path only when the new package is not installed.
try:
    from langchain_huggingface import HuggingFaceEmbeddings
except ImportError:
    from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Baseline: the off-the-shelf Snowflake model, before any fine-tuning,
# scored on the same test split as the other embedders.
huggingface_embeddings = HuggingFaceEmbeddings(model_name="Snowflake/snowflake-arctic-embed-m")
arctic_embed_m_results = evaluate_openai(test_dataset, huggingface_embeddings)

  huggingface_embeddings = HuggingFaceEmbeddings(model_name="Snowflake/snowflake-arctic-embed-m")
100%|██████████| 94/94 [00:06<00:00, 15.02it/s]


In [21]:
# Tabulate the per-question outcomes; the mean of the boolean "is_hit"
# column is the fraction of questions whose expected document was retrieved.
arctic_embed_m_results_df = pd.DataFrame(data=arctic_embed_m_results)
arctic_embed_m_hit_rate = arctic_embed_m_results_df.loc[:, "is_hit"].mean()
arctic_embed_m_hit_rate

0.6702127659574468

### Evaluate fine-tuned Snowflake

In [22]:
# "finetuned_arctic" is the local directory given as output_path to
# model.fit(...) earlier in this notebook, so this loads the fine-tuned model
# from disk rather than from the Hugging Face hub.
finetune_embeddings = HuggingFaceEmbeddings(model_name="finetuned_arctic")
finetune_results = evaluate_openai(test_dataset, finetune_embeddings)

Some weights of BertModel were not initialized from the model checkpoint at finetuned_arctic and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 94/94 [00:02<00:00, 33.12it/s]


In [23]:
# Tabulate the per-question outcomes; the mean of the boolean "is_hit"
# column is the fraction of questions whose expected document was retrieved.
finetune_results_df = pd.DataFrame(data=finetune_results)
finetune_hit_rate = finetune_results_df.loc[:, "is_hit"].mean()
finetune_hit_rate

0.9893617021276596