<a href="https://colab.research.google.com/github/melrahmtz/purple-box/blob/main/hands-on-practice/vectordb_supabase/2102_vector_embedding_supabase.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Do not run this in Colab, run this in SQL Editor on Supabase. Different model requires adjustment to this SQL.

In [None]:
-- Enable the pgvector extension to work with embedding vectors
create extension if not exists vector;

-- Create a table to store your documents
create table
  documents (
    id uuid primary key,
    content text, -- corresponds to Document.pageContent
    metadata jsonb, -- corresponds to Document.metadata
    embedding vector (384) -- 1536 works for OpenAI embeddings, change if needed
  );

-- Create a function to search for documents, returning the embedding vector
CREATE OR REPLACE FUNCTION match_documents (
  query_embedding vector (384),
  match_count int DEFAULT null,  -- Allows specifying the number of matches
  filter jsonb DEFAULT '{}'     -- Allows filtering by metadata
)
RETURNS TABLE (
  id uuid,
  content text,
  metadata jsonb,
  similarity float,
  embedding vector(384)        -- **Crucial: Returns the embedding vector**
)
LANGUAGE plpgsql
AS $$
#variable_conflict use_column
begin
  return query
  select
    id,
    content,
    metadata,
    1 - (documents.embedding <=> query_embedding) as similarity,
    embedding                  -- **Crucial: Selects the embedding vector**
  from documents
  where metadata @> filter
  order by documents.embedding <=> query_embedding
  LIMIT match_count;          -- Applies the match count limit
end;
$$;

# **Python Code**
Source: [Supabase (Postgres) LangChain](https://python.langchain.com/docs/integrations/vectorstores/supabase/)


In [1]:
!pip install python-dotenv --quiet
!pip install langchain --quiet
!pip install supabase --quiet
!pip install tiktoken --quiet
!pip install unstructured --quiet
!pip install numpy --quiet
!pip install transformers --quiet
!pip install -U langchain-community  --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.1/41.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m167.6/167.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.8/164.8 kB[0m [31m10.4 MB/s[0m eta [36

In [33]:
import getpass
import os

if "HUGGINGFACEHUB_API_KEY" not in os.environ:
    os.environ["HUGGINGFACEHUB_API_KEY"] = getpass.getpass("HuggingFace API Key:")

if "SUPABASE_URL" not in os.environ:
    os.environ["SUPABASE_URL"] = getpass.getpass("Supabase URL:")

if "SUPABASE_SERVICE_KEY" not in os.environ:
    os.environ["SUPABASE_SERVICE_KEY"] = getpass.getpass("Supabase Service Key:")

HuggingFace API Key:··········


In [3]:
# If you're storing your Supabase and OpenAI API keys in a .env file, you can load them with dotenv
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())

# **`all-MiniLm-L6-v2` by Sentence Transformers**

In [4]:
import os
import numpy
from supabase.client import Client, create_client
from langchain.vectorstores import SupabaseVectorStore
from langchain_community.embeddings import HuggingFaceEmbeddings

# openai_key  = os.environ['OPENAI_API_KEY']
supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import MarkdownTextSplitter

# Load the Markdown file
markdown_path = "PDF1(updated).md"  # Replace with your file path
loader = TextLoader(markdown_path)
documents = loader.load()

# Split the document into chunks using MarkdownTextSplitter
text_splitter = MarkdownTextSplitter(chunk_size=2048, chunk_overlap=600) # Experience with the chunk_size and chunk_overlap
docs = text_splitter.split_documents(documents)

print(f"Loaded {len(docs)} chunks")


Loaded 40 chunks


In [6]:
vector_store = SupabaseVectorStore.from_documents(
    docs,
    embeddings,
    client=supabase,
    table_name="documents_minilm",
    query_name="match_documents_minilm",
    chunk_size=500,
)

In [7]:
query = "What is Sentiment Analysis"
#query = "What is the accuracy of the LDA model when using the \"daily weighted average\" sentiment score?"
#query = "What is the characteristic of the sentiment score distribution?"

matched_docs = vector_store.similarity_search(query)
print(matched_docs[0].page_content)

After a review of the literature on NLP and Changes of Measure in section 2, a detailed overview of data and methodology is presented in section 3, which includes the developments from previous sentiment equation to more sophisticated ones along with enhanced NLP forecasting methods. Building upon these foundations, we introduce in section 4, a novel probability measure, that we name ' Hype-Adjusted Probability Measure", designed to capture the occurrence of market 'hype." The results and future work are discussed in section 5.

## 2 Literature Review

Sentiment analysis, a subfield of Natural Language Processing (NLP), focuses on quantifying the emotional tone and intent conveyed in textual data. NLP itself is a rapidly growing area of machine learning (ML) that enables computers to process and understand human language through algorithms and statistical models such as the founding work by Jurafsky and Martin (2000) [14]. Within NLP, sentiment analysis applies techniques to assess pos

In [8]:
# Similarity Search with Scores

matched_docs = vector_store.similarity_search_with_relevance_scores(query)
matched_docs[0]

(Document(metadata={'source': 'PDF1(updated).md'}, page_content='After a review of the literature on NLP and Changes of Measure in section 2, a detailed overview of data and methodology is presented in section 3, which includes the developments from previous sentiment equation to more sophisticated ones along with enhanced NLP forecasting methods. Building upon these foundations, we introduce in section 4, a novel probability measure, that we name \' Hype-Adjusted Probability Measure", designed to capture the occurrence of market \'hype." The results and future work are discussed in section 5.\n\n## 2 Literature Review\n\nSentiment analysis, a subfield of Natural Language Processing (NLP), focuses on quantifying the emotional tone and intent conveyed in textual data. NLP itself is a rapidly growing area of machine learning (ML) that enables computers to process and understand human language through algorithms and statistical models such as the founding work by Jurafsky and Martin (2000

# Advanced search and retrieval techniques

1. **Maximal Marginal Relevance** is a re-ranking algorithm used to diversify search results, which is applied after the initial similarity search to ensure a more diverse set of results.
2. Some vector stores offer built-in **hybrid-search** to combine keyword and semantic similarity search, which marries the benefits of both approaches. At the moment, there is no unified way to perform hybrid search using LangChain vectorstores, but it is generally exposed as a keyword argument that is passed in with similarity_search.

Source: [Vector Stores LangChain](https://python.langchain.com/docs/concepts/vectorstores/)

In [9]:
query_embedding = embeddings.embed_query("Test query")
print(f"Query embedding size: {len(query_embedding)}")


Query embedding size: 384


In [10]:
# SupabaseVectorStore as a retriever using Maximal Marginal Relevance Searches

retriever = vector_store.as_retriever(search_type="mmr")
matched_docs = retriever.invoke(query)

for i, d in enumerate(matched_docs):
    print(f"\n## Document {i}\n")
    print(d.page_content)


## Document 0

After a review of the literature on NLP and Changes of Measure in section 2, a detailed overview of data and methodology is presented in section 3, which includes the developments from previous sentiment equation to more sophisticated ones along with enhanced NLP forecasting methods. Building upon these foundations, we introduce in section 4, a novel probability measure, that we name ' Hype-Adjusted Probability Measure", designed to capture the occurrence of market 'hype." The results and future work are discussed in section 5.

## 2 Literature Review

Sentiment analysis, a subfield of Natural Language Processing (NLP), focuses on quantifying the emotional tone and intent conveyed in textual data. NLP itself is a rapidly growing area of machine learning (ML) that enables computers to process and understand human language through algorithms and statistical models such as the founding work by Jurafsky and Martin (2000) [14]. Within NLP, sentiment analysis applies techniqu

In [11]:
# Display all similarity search with scores
from tabulate import tabulate

# Perform similarity search with scores
matched_docs = vector_store.similarity_search_with_relevance_scores(query)

# Prepare data for the table
table_data = []
for i, (doc, score) in enumerate(matched_docs):
    table_data.append([i + 1, f"{score:.4f}", doc.page_content])

# Print as a table
headers = ["Rank", "Similarity Score", "Page Content (Preview)"]
print(tabulate(table_data, headers=headers, tablefmt="grid"))


+--------+--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|   Rank |   Similarity Score | Page Content (Preview)                                                                                                                                                                                                                                                                                                                                               

We've used **HuggingFace Sentence Transformer `all-MiniLM-L6-v2`**. Now let's try using other embedding models.


# **`all-mpnet-base-v2` by Sentence Transformer**

Both `all-MiniLM-L6-v2` and `all-mpnet-base-v2` are sentence embedding models from Hugging Face's Sentence Transformers library, but they differ in terms of model architecture, size, and performance.
* `all-MiniLM-L6-v2` is a lightweight model using a 6-layer MiniLM architecture, making it faster and more suitable for low-latency applications.
* `all-mpnet-base-v2` is based on MPNet, a larger and more powerful model that captures better semantic relationships. It achieves a higher **STS** (Semantic Text Similarity) score, meaning it produces more accurate and nuanced embeddings.




In [12]:
import os
import numpy
from supabase.client import Client, create_client
from langchain.vectorstores import SupabaseVectorStore
from langchain_community.embeddings import HuggingFaceEmbeddings

supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

# Change the embedding model to all-mpnet-base-v2
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import MarkdownTextSplitter

# Load the Markdown file
markdown_path = "PDF1(updated).md"  # Replace with your file path
loader = TextLoader(markdown_path)
documents = loader.load()

# Split the document into chunks using MarkdownTextSplitter
text_splitter = MarkdownTextSplitter(chunk_size=2048, chunk_overlap=600)  # Experiment with chunk_size and chunk_overlap
docs = text_splitter.split_documents(documents)

print(f"Loaded {len(docs)} chunks")


Loaded 40 chunks


In [14]:
vector_store = SupabaseVectorStore.from_documents(
    docs,
    embeddings,
    client=supabase,
    table_name="documents_mpnet",
    query_name="match_documents_mpnet",
    chunk_size=500,
)


In [15]:
query = "What is Sentiment Analysis"
#query = "What is the accuracy of the LDA model when using the \"daily weighted average\" sentiment score?"
#query = "What is the characteristic of the sentiment score distribution?"

matched_docs = vector_store.similarity_search(query)
print(matched_docs[0].page_content)

After a review of the literature on NLP and Changes of Measure in section 2, a detailed overview of data and methodology is presented in section 3, which includes the developments from previous sentiment equation to more sophisticated ones along with enhanced NLP forecasting methods. Building upon these foundations, we introduce in section 4, a novel probability measure, that we name ' Hype-Adjusted Probability Measure", designed to capture the occurrence of market 'hype." The results and future work are discussed in section 5.

## 2 Literature Review

Sentiment analysis, a subfield of Natural Language Processing (NLP), focuses on quantifying the emotional tone and intent conveyed in textual data. NLP itself is a rapidly growing area of machine learning (ML) that enables computers to process and understand human language through algorithms and statistical models such as the founding work by Jurafsky and Martin (2000) [14]. Within NLP, sentiment analysis applies techniques to assess pos

In [16]:
# Check query embedding size
query_embedding = embeddings.embed_query("Test query")
print(f"Query embedding size: {len(query_embedding)}")

Query embedding size: 768


In [17]:
# Display all similarity search with scores
from tabulate import tabulate

# Perform similarity search with scores
matched_docs = vector_store.similarity_search_with_relevance_scores(query)

# Prepare data for the table
table_data = []
for i, (doc, score) in enumerate(matched_docs):
    table_data.append([i + 1, f"{score:.4f}", doc.page_content])

# Print as a table
headers = ["Rank", "Similarity Score", "Page Content (Preview)"]
print(tabulate(table_data, headers=headers, tablefmt="grid"))


+--------+--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|   Rank |   Similarity Score | Page Content (Preview)                                                                                                                                                                                                                                                                                                                                               

In [None]:
# SupabaseVectorStore as a retriever using Maximal Marginal Relevance Searches
retriever = vector_store.as_retriever(search_type="mmr")
matched_docs = retriever.invoke(query)

for i, d in enumerate(matched_docs):
    print(f"\n## Document {i}\n")
    print(d.page_content)

# **E5 (Embedding-based Retrieval & Representation Learning) series by Intfloat**

* Optimized for search & retrieval (trained on large-scale datasets)
* Performs well in semantic search & ranking

\\
**Comparison of `e5-base` vs. `e5-large`**
* `e5-base` is faster and uses 768-dimensional embeddings.
* `e5-large` has better accuracy but is heavier with 1024 dimensions.
* Both require query prefixes like **"query: ..."**

In [24]:
import os
import numpy
from supabase.client import Client, create_client
from langchain.vectorstores import SupabaseVectorStore
from langchain_community.embeddings import HuggingFaceEmbeddings

supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

# Use the e5-large embedding model (1024 dimensions) or e5-base (768 dimensions)
embeddings = HuggingFaceEmbeddings(model_name="intfloat/e5-large")
#embeddings = HuggingFaceEmbeddings(model_name="intfloat/e5-base") # if you want to use this, you have to change the table dimension on SQL Editor


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

In [27]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import MarkdownTextSplitter

# Load the Markdown file
markdown_path = "PDF1(updated).md"  # Replace with your file path
loader = TextLoader(markdown_path)
documents = loader.load()

# Split the document into chunks using MarkdownTextSplitter
text_splitter = MarkdownTextSplitter(chunk_size=2048, chunk_overlap=600)  # Experiment with chunk_size and chunk_overlap
docs = text_splitter.split_documents(documents)

print(f"Loaded {len(docs)} chunks")


Loaded 40 chunks


In [28]:
vector_store = SupabaseVectorStore.from_documents(
    docs,
    embeddings,
    client=supabase,
    table_name="documents_e5large",  # for e5-base use documents_e5base
    query_name="match_documents_e5large",  # for e5-base use match_documents_e5base
    chunk_size=500,
)

In [29]:
query = "query: What is Sentiment Analysis"
#query = "query: What is the accuracy of the LDA model when using the \"daily weighted average\" sentiment score?"
#query = "query: What is the characteristic of the sentiment score distribution?"

matched_docs = vector_store.similarity_search(query)
print(matched_docs[0].page_content)

After a review of the literature on NLP and Changes of Measure in section 2, a detailed overview of data and methodology is presented in section 3, which includes the developments from previous sentiment equation to more sophisticated ones along with enhanced NLP forecasting methods. Building upon these foundations, we introduce in section 4, a novel probability measure, that we name ' Hype-Adjusted Probability Measure", designed to capture the occurrence of market 'hype." The results and future work are discussed in section 5.

## 2 Literature Review

Sentiment analysis, a subfield of Natural Language Processing (NLP), focuses on quantifying the emotional tone and intent conveyed in textual data. NLP itself is a rapidly growing area of machine learning (ML) that enables computers to process and understand human language through algorithms and statistical models such as the founding work by Jurafsky and Martin (2000) [14]. Within NLP, sentiment analysis applies techniques to assess pos

In [30]:
# Check query embedding size
query_embedding = embeddings.embed_query("Test query")
print(f"Query embedding size: {len(query_embedding)}")

Query embedding size: 1024


In [31]:
# Display all similarity search with scores
from tabulate import tabulate

# Perform similarity search with scores
matched_docs = vector_store.similarity_search_with_relevance_scores(query)

# Prepare data for the table
table_data = []
for i, (doc, score) in enumerate(matched_docs):
    table_data.append([i + 1, f"{score:.4f}", doc.page_content])

# Print as a table
headers = ["Rank", "Similarity Score", "Page Content (Preview)"]
print("e5-large")
print(tabulate(table_data, headers=headers, tablefmt="grid"))


e5-large
+--------+--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|   Rank |   Similarity Score | Page Content (Preview)                                                                                                                                                                                                                                                                                                                                      

In [22]:
# Check query embedding size
query_embedding = embeddings.embed_query("Test query")
print(f"Query embedding size: {len(query_embedding)}")

Query embedding size: 768


In [23]:
# Display all similarity search with scores
from tabulate import tabulate

# Perform similarity search with scores
matched_docs = vector_store.similarity_search_with_relevance_scores(query)

# Prepare data for the table
table_data = []
for i, (doc, score) in enumerate(matched_docs):
    table_data.append([i + 1, f"{score:.4f}", doc.page_content])

# Print as a table
headers = ["Rank", "Similarity Score", "Page Content (Preview)"]
print("e5-base")
print(tabulate(table_data, headers=headers, tablefmt="grid"))


e5-base
+--------+--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|   Rank |   Similarity Score | Page Content (Preview)                                                   

In [None]:
# SupabaseVectorStore as a retriever using Maximal Marginal Relevance Searches
retriever = vector_store.as_retriever(search_type="mmr")
matched_docs = retriever.invoke(query)

# **BAAI General Embedding (BGE) series developed by BAAI (Beijing Academy of Artificial Intelligence)**
* Optimized for retrieval & ranking tasks
* Can be used for RAG, search, and clustering
* Supports **multilingual** embeddings (some versions), which means:

    * It can process and understand text in different languages.
    * It generates language-agnostic embeddings, meaning similar sentences in English, Chinese, French, etc. will have similar vector representations.
    * It allows cross-lingual retrieval—you can search in one language and retrieve documents in another!
* For multilingual support, use `bge-m3` with 1024-dimensional embedding size.

In [37]:
import os
import numpy
from supabase.client import Client, create_client
from langchain.vectorstores import SupabaseVectorStore
from langchain_community.embeddings import HuggingFaceEmbeddings

huggingfacehub_key = os.environ.get("HUGGINGFACEHUB_API_KEY")
supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

# Use the bge-m3 embedding model (1024 dimensions)
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")


In [45]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import MarkdownTextSplitter

# Load the Markdown file
markdown_path = "Manuale-IRIS_SLIM_IN_TEC_IT(Updated).md"  # Replace with your file path
loader = TextLoader(markdown_path)
documents = loader.load()

# Split the document into chunks using MarkdownTextSplitter
text_splitter = MarkdownTextSplitter(chunk_size=1024, chunk_overlap=300)  # Experiment with chunk_size and chunk_overlap
docs = text_splitter.split_documents(documents)

print(f"Loaded {len(docs)} chunks")


Loaded 133 chunks


In [46]:
vector_store = SupabaseVectorStore.from_documents(
    docs,
    embeddings,
    client=supabase,
    table_name="documents_bgem3",  # for e5-base use documents_e5base
    query_name="match_documents_bgem3",  # for e5-base use match_documents_e5base
    chunk_size=500,
)

In [47]:
query = "What are the key considerations for using and maintaining the Iris Slim units?"  # this question could be answer from different paragraphs (Section 2 and Section 6.1)

matched_docs = vector_store.similarity_search(query)
print(matched_docs[0].page_content)

Il Servizio assistenza clienti è comunque a disposizione per fornire, dietro richiesta, informazioni sugli aggiornamenti che Rossato Group Srl ha apportato alle macchine.

## 2. Presentazione

## 2.1. Uso Previsto

Le  unità  Iris  Slim  sono  progettate  per  la  funzione  di  riscaldamento,  raffrescamento,  deumidificazione  e  filtrazione  di ambienti residenziali e terziario (uffici, locali pubblici, o simili).

## 2.2. Usi Non Previsti E Controindicazioni

Non sono ammesse le seguenti applicazioni:

- · Funzionamento all'aperto
- · Funzionamento in ambienti umidi o esplosivi o polverosi
- · Funzionamento in ambienti corrosivi, in particolare per le alette d'alluminio della batteria
- · Funzionamento in ambienti sottoposti a disturbi elettromagnetici


In [48]:
# Check query embedding size
query_embedding = embeddings.embed_query("Test query")
print(f"Query embedding size: {len(query_embedding)}")

Query embedding size: 1024


In [49]:
# Display all similarity search with scores
from tabulate import tabulate

# Perform similarity search with scores
matched_docs = vector_store.similarity_search_with_relevance_scores(query)

# Prepare data for the table
table_data = []
for i, (doc, score) in enumerate(matched_docs):
    table_data.append([i + 1, f"{score:.4f}", doc.page_content])

# Print as a table
headers = ["Rank", "Similarity Score", "Page Content (Preview)"]
print("bge-m3")
print(tabulate(table_data, headers=headers, tablefmt="grid"))


bge-m3
+--------+--------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|   Rank |   Similarity Score | Page Content (Preview)                                                                                                                                                                                                                                                                                                                                                                                                                 |
|      1 |             0.6685 | Il Servizio assistenza clienti è comunque a dis

In [51]:
# SupabaseVectorStore as a retriever using Maximal Marginal Relevance Searches

retriever = vector_store.as_retriever(search_type="mmr")
matched_docs = retriever.invoke(query)

for i, d in enumerate(matched_docs):
    print(f"\n## Document {i}\n")
    print(d.page_content)


## Document 0

Il Servizio assistenza clienti è comunque a disposizione per fornire, dietro richiesta, informazioni sugli aggiornamenti che Rossato Group Srl ha apportato alle macchine.

## 2. Presentazione

## 2.1. Uso Previsto

Le  unità  Iris  Slim  sono  progettate  per  la  funzione  di  riscaldamento,  raffrescamento,  deumidificazione  e  filtrazione  di ambienti residenziali e terziario (uffici, locali pubblici, o simili).

## 2.2. Usi Non Previsti E Controindicazioni

Non sono ammesse le seguenti applicazioni:

- · Funzionamento all'aperto
- · Funzionamento in ambienti umidi o esplosivi o polverosi
- · Funzionamento in ambienti corrosivi, in particolare per le alette d'alluminio della batteria
- · Funzionamento in ambienti sottoposti a disturbi elettromagnetici

## Document 1

Al  termine  dell'installazione,  sarà  necessario  rimontare  tutti  i  componenti  seguendo  l'ordine  inverso  con  cui  sono  stati smontati. E' assolutamente vietato (gravi rischi di danni a person