<a href="https://colab.research.google.com/github/melrahmtz/purple-box/blob/main/hands-on-practice/2802_hybrid_search_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries
!pip install langchain --quiet
!pip install supabase --quiet
!pip install tiktoken --quiet
!pip install unstructured --quiet
!pip install numpy --quiet
!pip install transformers --quiet
!pip install -U langchain-community  --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.1/41.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m167.6/167.6 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m74.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.5/166.5 kB[0m [31m10.8 MB/s[0m eta [3

In [2]:
import getpass
import os

# Ask (without echoing the input) for every credential that is not already
# present in the process environment; existing values are left untouched.
for _env_name, _prompt_text in (
    ("HUGGINGFACEHUB_API_KEY", "HuggingFace API Key:"),
    ("SUPABASE_URL", "Supabase URL:"),
    ("SUPABASE_SERVICE_KEY", "Supabase Service Key:"),
):
    if _env_name not in os.environ:
        os.environ[_env_name] = getpass.getpass(_prompt_text)


HuggingFace API Key:··········
Supabase URL:··········
Supabase Service Key:··········


# **Trial 1**
Embedding model: `all-MiniLM-L6-v2` (384 dimensions)


In [22]:
import os
import json
import numpy as np
import pandas as pd
from supabase.client import Client, create_client
from langchain_community.embeddings import HuggingFaceEmbeddings

# Initialize Supabase Client
# The URL and service key are read from the environment variables populated by
# the credentials cell. os.environ.get returns None for a missing variable, so
# an unset credential is passed to create_client as None.
supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

# Use MiniLM embedding model (384 dimensions)
# `embeddings` is a module-level object that generate_embedding() looks up at
# call time, so rebinding it later changes which model is used.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

def generate_embedding(text):
    """Embed `text` with the module-level `embeddings` model.

    In this trial the model is all-MiniLM-L6-v2 (384 dimensions).

    Args:
        text: The string to embed.

    Returns:
        Whatever `embeddings.embed_query` returns for `text`.
    """
    vector = embeddings.embed_query(text)
    return vector

def load_json_documents(json_path):
    """Read the UTF-8 JSON file at `json_path` and return its parsed contents.

    Args:
        json_path: Path of the JSON file holding the pre-chunked documents.

    Returns:
        The deserialized JSON value (whatever top-level structure the file holds).
    """
    with open(json_path, mode='r', encoding='utf-8') as json_file:
        raw_text = json_file.read()
    return json.loads(raw_text)

def store_documents_in_supabase(json_path, table_name="documents_hybrid_search_v2", batch_size=100):
    """Embed pre-chunked documents from a JSON file and insert them into Supabase.

    Each entry of the JSON file is read as a dict with optional "content",
    "table" and "metadata" keys. Entries that have neither content nor table
    data are skipped. No additional chunking is performed.

    Args:
        json_path: Path of the JSON file produced by the chunking step.
        table_name: Supabase table that receives the rows. Defaults to the
            table used by this trial.
        batch_size: Maximum number of rows sent per insert request.

    Returns:
        None. A confirmation line is printed when the function completes.
    """
    documents = load_json_documents(json_path)

    # Build (and embed) every row before touching the database so that a
    # failure while embedding cannot leave a half-ingested document behind.
    rows = []
    for node in documents:
        # `or` also covers keys that are present but explicitly null in the JSON,
        # which would otherwise raise AttributeError on .strip() / .get().
        content = (node.get("content") or "").strip()
        table_data = node.get("table")  # Table data if available
        metadata = node.get("metadata") or {}

        if not content and not table_data:
            continue  # Skip empty chunks

        # Table-only chunks have no text to embed, so their embedding stays NULL.
        embedding = (generate_embedding(content) or None) if content else None

        rows.append({
            "content": content or None,
            "embedding": embedding,
            "table_data": json.dumps(table_data) if table_data else None,  # Store tables as JSON
            "source": metadata.get("source", "unknown"),
            "section": metadata.get("section", "unknown"),
            "chunk_position": metadata.get("position", -1),
        })

    # Bulk-insert in slices instead of one HTTP round-trip per chunk.
    step = max(1, batch_size)
    for start in range(0, len(rows), step):
        supabase.table(table_name).insert(rows[start:start + step]).execute()

    print(f"Documents from {json_path} successfully stored in Supabase.")


In [28]:
# Example: Store JSON data
# NOTE(review): the store function issues plain inserts, so re-running this cell
# presumably adds the same chunks again unless the table enforces uniqueness — confirm.
store_documents_in_supabase("17_chunks_v2.json")


Documents from 17_chunks_v2.json successfully stored in Supabase.


In [29]:
def hybrid_search(query, match_count=10, rpc_name="hybrid_search_v2"):
    """Run a hybrid (full-text + semantic) search through a Supabase RPC function.

    The query is embedded with `generate_embedding` (MiniLM in this trial) and
    sent to the database function together with the raw query text.

    Args:
        query: Natural-language search string.
        match_count: Value forwarded to the RPC as "match_count".
        rpc_name: Name of the Supabase RPC function to call. Defaults to the
            function used by this trial.

    Returns:
        The `data` attribute of the RPC response.
    """
    rpc_params = {
        "query_text": query,
        "query_embedding": generate_embedding(query),
        "match_count": match_count,
    }

    # Call Supabase RPC function
    response = supabase.rpc(rpc_name, rpc_params).execute()
    return response.data

# Example queries for PDF1 (alternatives; keep exactly one `query` assignment uncommented)
#query = "What is sentiment analysis?"  # Answer in Section 2
#query = "What is the accuracy of the LDA model when using the \"daily weighted average\" sentiment score?"  # Answer in Section 3.1
#query = "What is the characteristic of the sentiment score distribution?"  # Answer in Figure 1

# Example queries for document 17 (presumably the chunks stored from 17_chunks_v2.json)
query = "What were the objectives of the workshop on family medicine training in Africa?" # Answer in Introduction (page 1)
#query = "Challenges faced in family medicine training in Africa"  # Answer in Introduction (page 2)
#query = "Which countries participated in the workshop on family medicine training in Africa?"  # Answer in Workshop Participant (page 3)
#query = "What factors influence the effectiveness of family medicine training in Africa?" # Answer can be found in several sections

# `query` and `results` are notebook-level names; later cells reassign `query`.
results = hybrid_search(query, match_count=10)

# Display results: one raw result record per line
for doc in results:
    print(doc)


{'id': 86, 'fts_score': 0.0163934426229508, 'semantic_score': 0.0153846153846154, 'final_score': 0.0317780580075662, 'content': '## Corresponding author:  \nLouis Jenkins, louis.jenkins@westerncape. gov.za\n## Dates:  \nReceived: 28 Sept. 2017  \nAccepted: 09 Nov. 2017  \nPublished: 12 Apr. 2018  \nHow to cite this article: Jenkins LS, Von Pressentin K. Family medicine training in Africa: Views of clinical trainers and trainees. Afr J Prm Health Care Fam Med. 2018;10(1), a1638. https:// doi.org/10.4102/phcfm. v10i1.1638\n## Copyright:  \n© 2018. The Authors. Licensee: AOSIS. This work is licensed under the Creative Commons Attribution License.  \n![Image](/content/markdown/17_artifacts/image_000005_e2cece3be96aa05931eea2488c7312b12d82969056fd50762e3f32ae19090fd2.png)  \n*Image Description:* The image shows a QR code with instructions instructing viewers to "Scan this QR code with their smartphone or mobile device to read online."  \nObjectives :  The  aim  of  the  workshop  was  to  u

In [35]:
def hybrid_search_display_as_table(query, match_count=10):
    """Run the hybrid search and return the matches as a pandas DataFrame.

    Delegates the embedding and RPC call to `hybrid_search` so the request
    logic lives in one place.

    Args:
        query: Natural-language search string.
        match_count: Value forwarded to the search as "match_count".

    Returns:
        A DataFrame with one row per match and the columns listed in
        `result_columns`, or None (after printing a notice) when the search
        returns nothing.
    """
    rows = hybrid_search(query, match_count=match_count)

    if not rows:
        print("No results found.")
        return None

    # Fixes the column order; keys absent from the records show up as NaN.
    result_columns = [
        "id", "fts_score", "semantic_score", "final_score",
        "content", "table_data", "source", "section", "chunk_position",
    ]
    return pd.DataFrame(rows, columns=result_columns)

# Example queries for PDF1 (alternatives; keep exactly one `query` assignment uncommented)
#query = "What is sentiment analysis?"  # Answer in Section 2
#query = "What is the accuracy of the LDA model when using the \"daily weighted average\" sentiment score?"  # Answer in Section 3.1
#query = "What is the characteristic of the sentiment score distribution?"  # Answer in Figure 1

# Example queries for document 17 (presumably the chunks stored from 17_chunks_v2.json)
#query = "What were the objectives of the workshop on family medicine training in Africa?" # Answer in Introduction (page 1)
#query = "Challenges faced in family medicine training in Africa"  # Answer in Introduction (page 2) or Conclusion
#query = "Which countries participated in the workshop on family medicine training in Africa?"  # Answer in Table Workshop Participant (page 3)
query = "What factors influence the effectiveness of family medicine training in Africa?" # Answer can be found in several sections


# Returns None (and prints a notice) when the search yields no rows.
df_results = hybrid_search_display_as_table(query, match_count=10)

# Display Table
if df_results is not None:
    # Imported here, next to its only use in this cell.
    from IPython.display import display
    display(df_results)


Unnamed: 0,id,fts_score,semantic_score,final_score,content,table_data,source,section,chunk_position
0,99,0,0.016393,0.016393,## Competing interests \nThe authors declare ...,,17.md,Competing interests,16
1,98,0,0.016129,0.016129,## Conclusion \nIt was clear from this worksh...,,17.md,Conclusion,15
2,87,0,0.015873,0.015873,7 \nThe World Organisation of Family Doctors ...,,17.md,Unknown,4
3,97,0,0.015625,0.015625,## Reflection on workshop \nThe group appreci...,,17.md,Reflection on workshop,14
4,90,0,0.015385,0.015385,## Context is critical \nThe physical place w...,,17.md,Context is critical,7
5,86,0,0.015152,0.015152,"## Corresponding author: \nLouis Jenkins, lou...",,17.md,Corresponding author:,3
6,91,0,0.014925,0.014925,## Learning style of the registrar and (teachi...,,17.md,Learning style of the registrar and (teaching ...,8
7,92,0,0.014706,0.014706,## Learning portfolio is utilised \nThe grou...,,17.md,Learning portfolio is utilised,9
8,85,0,0.014493,0.014493,## Family medicine training in Africa: Views o...,,17.md,Family medicine training in Africa: Views of c...,2
9,88,0,0.014286,0.014286,## Participants and process \nThirty-five peo...,,17.md,Participants and process,5


# **Trial 2**
Embedding model: `bge-m3` (1024 dimensions)

In [36]:
import os
import json
import numpy as np
import pandas as pd
from supabase.client import Client, create_client
from langchain_community.embeddings import HuggingFaceEmbeddings

# Initialize Supabase Client
# The URL and service key are read from the environment variables populated by
# the credentials cell. os.environ.get returns None for a missing variable, so
# an unset credential is passed to create_client as None.
supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

# Use bge-m3 multilingual embedding model (1024 dimensions)
# This rebinds the module-level `embeddings` created in Trial 1; functions that
# read `embeddings` at call time use bge-m3 from here on.
# NOTE(review): the target table's embedding column presumably has to match the
# 1024-dimension output — confirm against the database schema.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")

def generate_embedding(text):
    """Generates a 1024-dimensional bge-m3 embedding for the input text.

    Uses the module-level `embeddings` model, which this trial sets to
    BAAI/bge-m3.

    Args:
        text: The string to embed.

    Returns:
        Whatever `embeddings.embed_query` returns for `text`.
    """
    return embeddings.embed_query(text)

def load_json_documents(json_path):
    """Parse the UTF-8 encoded JSON file at `json_path`.

    Args:
        json_path: Location of the JSON file containing the chunked data.

    Returns:
        The decoded JSON value, exactly as stored in the file.
    """
    with open(json_path, mode='r', encoding='utf-8') as source_file:
        file_text = source_file.read()
    parsed_chunks = json.loads(file_text)
    return parsed_chunks

def store_documents_in_supabase(json_path, table_name="documents_hybrid_search_v2_bge", batch_size=100):
    """Embed pre-chunked documents from a JSON file and insert them into Supabase.

    Each entry of the JSON file is read as a dict with optional "content",
    "table" and "metadata" keys. Entries that have neither content nor table
    data are skipped. No additional chunking is performed.

    Args:
        json_path: Path of the JSON file produced by the chunking step.
        table_name: Supabase table that receives the rows. Defaults to the
            bge-m3 table used by this trial.
        batch_size: Maximum number of rows sent per insert request.

    Returns:
        None. A confirmation line is printed when the function completes.
    """
    documents = load_json_documents(json_path)

    # Build (and embed) every row before touching the database so that a
    # failure while embedding cannot leave a half-ingested document behind.
    rows = []
    for node in documents:
        # `or` also covers keys that are present but explicitly null in the JSON,
        # which would otherwise raise AttributeError on .strip() / .get().
        content = (node.get("content") or "").strip()
        table_data = node.get("table")  # Table data if available
        metadata = node.get("metadata") or {}

        if not content and not table_data:
            continue  # Skip empty chunks

        # Table-only chunks have no text to embed, so their embedding stays NULL.
        embedding = (generate_embedding(content) or None) if content else None

        rows.append({
            "content": content or None,
            "embedding": embedding,
            "table_data": json.dumps(table_data) if table_data else None,  # Store tables as JSON
            "source": metadata.get("source", "unknown"),
            "section": metadata.get("section", "unknown"),
            "chunk_position": metadata.get("position", -1),
        })

    # Bulk-insert in slices instead of one HTTP round-trip per chunk.
    step = max(1, batch_size)
    for start in range(0, len(rows), step):
        supabase.table(table_name).insert(rows[start:start + step]).execute()

    print(f"Documents from {json_path} successfully stored in Supabase.")


In [37]:
# Example: Store JSON data
# NOTE(review): the store function issues plain inserts, so re-running this cell
# presumably adds the same chunks again unless the table enforces uniqueness — confirm.
store_documents_in_supabase("17_chunks_v2.json")

Documents from 17_chunks_v2.json successfully stored in Supabase.


In [38]:
def hybrid_search(query, match_count=10, rpc_name="hybrid_search_v2_bge"):
    """Run a hybrid (full-text + semantic) search through a Supabase RPC function.

    The query is embedded with `generate_embedding` (bge-m3 in this trial) and
    sent to the database function together with the raw query text.

    Args:
        query: Natural-language search string.
        match_count: Value forwarded to the RPC as "match_count".
        rpc_name: Name of the Supabase RPC function to call. Defaults to the
            bge-m3 function used by this trial.

    Returns:
        The `data` attribute of the RPC response.
    """
    rpc_params = {
        "query_text": query,
        "query_embedding": generate_embedding(query),
        "match_count": match_count,
    }

    # Call Supabase RPC function
    response = supabase.rpc(rpc_name, rpc_params).execute()
    return response.data

# Example queries for PDF1 (alternatives; keep exactly one `query` assignment uncommented)
#query = "What is sentiment analysis?"  # Answer in Section 2
#query = "What is the accuracy of the LDA model when using the \"daily weighted average\" sentiment score?"  # Answer in Section 3.1
#query = "What is the characteristic of the sentiment score distribution?"  # Answer in Figure 1

# Example queries for document 17 (presumably the chunks stored from 17_chunks_v2.json)
query = "What were the objectives of the workshop on family medicine training in Africa?" # Answer in Introduction (page 1)
#query = "Challenges faced in family medicine training in Africa"  # Answer in Introduction (page 2)
#query = "Which countries participated in the workshop on family medicine training in Africa?"  # Answer in Workshop Participant (page 3)
#query = "What factors influence the effectiveness of family medicine training in Africa?" # Answer can be found in several sections

# `query` and `results` are notebook-level names; later cells reassign `query`.
results = hybrid_search(query, match_count=10)

# Display results: one raw result record per line
for doc in results:
    print(doc)


{'id': 26, 'fts_score': 0.0163934426229508, 'semantic_score': 0.0158730158730159, 'final_score': 0.0322664584959667, 'content': '## Corresponding author:  \nLouis Jenkins, louis.jenkins@westerncape. gov.za\n## Dates:  \nReceived: 28 Sept. 2017  \nAccepted: 09 Nov. 2017  \nPublished: 12 Apr. 2018  \nHow to cite this article: Jenkins LS, Von Pressentin K. Family medicine training in Africa: Views of clinical trainers and trainees. Afr J Prm Health Care Fam Med. 2018;10(1), a1638. https:// doi.org/10.4102/phcfm. v10i1.1638\n## Copyright:  \n© 2018. The Authors. Licensee: AOSIS. This work is licensed under the Creative Commons Attribution License.  \n![Image](/content/markdown/17_artifacts/image_000005_e2cece3be96aa05931eea2488c7312b12d82969056fd50762e3f32ae19090fd2.png)  \n*Image Description:* The image shows a QR code with instructions instructing viewers to "Scan this QR code with their smartphone or mobile device to read online."  \nObjectives :  The  aim  of  the  workshop  was  to  u

In [42]:
def hybrid_search_display_as_table(query, match_count=10):
    """Run the bge-m3 hybrid search and return the matches as a pandas DataFrame.

    Delegates the embedding and RPC call to `hybrid_search` so the request
    logic lives in one place.

    Args:
        query: Natural-language search string.
        match_count: Value forwarded to the search as "match_count".

    Returns:
        A DataFrame with one row per match and the columns listed in
        `result_columns`, or None (after printing a notice) when the search
        returns nothing.
    """
    rows = hybrid_search(query, match_count=match_count)

    if not rows:
        print("No results found.")
        return None

    # Fixes the column order; keys absent from the records show up as NaN.
    result_columns = [
        "id", "fts_score", "semantic_score", "final_score",
        "content", "table_data", "source", "section", "chunk_position",
    ]
    return pd.DataFrame(rows, columns=result_columns)

# Example queries for PDF1 (alternatives; keep exactly one `query` assignment uncommented)
#query = "What is sentiment analysis?"  # Answer in Section 2
#query = "What is the accuracy of the LDA model when using the \"daily weighted average\" sentiment score?"  # Answer in Section 3.1
#query = "What is the characteristic of the sentiment score distribution?"  # Answer in Figure 1

# Example queries for document 17 (presumably the chunks stored from 17_chunks_v2.json)
#query = "What were the objectives of the workshop on family medicine training in Africa?" # Answer in Introduction (page 1)
#query = "Challenges faced in family medicine training in Africa"  # Answer in Introduction (page 2)
#query = "Which countries participated in the workshop on family medicine training in Africa?"  # Answer in Workshop Participant (page 3)
query = "What factors influence the effectiveness of family medicine training in Africa?" # Answer can be found in several sections

# Returns None (and prints a notice) when the search yields no rows.
df_results = hybrid_search_display_as_table(query, match_count=10)

# Display Table
if df_results is not None:
    # Imported here, next to its only use in this cell.
    from IPython.display import display
    display(df_results)


Unnamed: 0,id,fts_score,semantic_score,final_score,content,table_data,source,section,chunk_position
0,38,0,0.016393,0.016393,## Conclusion \nIt was clear from this worksh...,,17.md,Conclusion,15
1,26,0,0.016129,0.016129,"## Corresponding author: \nLouis Jenkins, lou...",,17.md,Corresponding author:,3
2,25,0,0.015873,0.015873,## Family medicine training in Africa: Views o...,,17.md,Family medicine training in Africa: Views of c...,2
3,39,0,0.015625,0.015625,## Competing interests \nThe authors declare ...,,17.md,Competing interests,16
4,27,0,0.015385,0.015385,7 \nThe World Organisation of Family Doctors ...,,17.md,Unknown,4
5,30,0,0.015152,0.015152,## Context is critical \nThe physical place w...,,17.md,Context is critical,7
6,31,0,0.014925,0.014925,## Learning style of the registrar and (teachi...,,17.md,Learning style of the registrar and (teaching ...,8
7,37,0,0.014706,0.014706,## Reflection on workshop \nThe group appreci...,,17.md,Reflection on workshop,14
8,28,0,0.014493,0.014493,## Participants and process \nThirty-five peo...,,17.md,Participants and process,5
9,36,0,0.014286,0.014286,## The competence of the supervisor \nThe cl...,,17.md,The competence of the supervisor,13
