# Imports

In [2]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from supabase import create_client
from langchain.document_loaders import PyPDFLoader

# Adds Embeddings in the DB
only a certain number of pages are entered

In [3]:
class InsertEmbeddings:
    def __init__(self, data, model, client):
        self.data = data
        self.loader = PyPDFLoader
        self.model = model
        self.client = client
        self.page_limit = 20
    
    def clean_db(self):
        try:
            self.client.table('documents').delete().neq('id', 0).execute()
            print("Database cleaned")
            return True
        except Exception as e:
            print(f"Database not cleaned: {str(e)}")
            return False
    
    def read_data(self):
        pages = []
        if 'doc_link' not in self.data.columns:
            return pages
            
        for links in self.data.doc_link.to_list():
            if not isinstance(links, str):
                continue
                
            try:
                loader = self.loader(links)
                loaded_pages = loader.load()
                pages_to_add = loaded_pages[:min(5, len(loaded_pages))]
                pages.extend(pages_to_add)
                
                if len(pages) >= self.page_limit:
                    break
                    
            except Exception as e:
                print(f"Error loading document: {str(e)}")
        
        return pages[:self.page_limit]
    
    def add_data_in_db(self):
        self.clean_db()
        
        pages = self.read_data()
        if not pages:
            print("No pages to add to the database")
            return False
            
        page_texts = [page.page_content for page in pages]
        encodings = self.model.encode(page_texts)
        
        batch_data = []
        for text, embedding in zip(page_texts, encodings):
            data = {
                'content': text,
                'embedding': embedding.tolist()
            }
            batch_data.append(data)
            
        try:
            self.client.table('documents').insert(batch_data).execute()
            print(f"Successfully added {len(batch_data)} documents to the database")
            return True
        except Exception as e:
            print(f"Error inserting batch data: {str(e)}")
            success_count = 0
            for data in batch_data:
                try:
                    self.client.table('documents').insert(data).execute()
                    success_count += 1
                except Exception as e:
                    print(f"Error inserting individual document: {str(e)}")
            
            print(f"Successfully added {success_count} out of {len(batch_data)} documents")
            return success_count > 0


# Normal RAG
based on this https://supabase.com/docs/guides/ai/semantic-search

In [8]:
class NormalRag:
    def __init__(self, question, model, client, answer):
        self.model = model
        self.question = question
        self.client = client
        self.answer = answer
    
    def do_rag(self):
        q_em = self.model.encode(self.question).tolist()
        a_em = self.model.encode(self.answer)

        match_threshold = 0.7
        match_count = 1

        response = self.client.rpc('match_documents', {
            'query_embedding': q_em,
            'match_threshold': match_threshold,
            'match_count': match_count
        }).execute()

        if not response.data:
            print("No matching documents found.")
            return []

        matched_results = []
        
        for result in response.data:
            content = result["content"]
            content_em = np.array(result["embedding"])
            cosine_similarity = 1 - result["cosine_distance"]
            matched_results.append({"content": content, "cosine_similarity": cosine_similarity})

            print(f"Cosine Similarity with Answer: {cosine_similarity}")
            print(f"Content: {content}")
            print("-" * 50)

        return matched_results


# Hybrid approch
based on https://supabase.com/docs/guides/ai/hybrid-search

In [9]:
class HybridRag:
    def __init__(self, question, model, client, answer):
        self.model = model
        self.client = client
        self.question = question
        self.answer = answer

    def hybrid_rag(self):
        q_em = self.model.encode(self.question).tolist()
        a_em = self.model.encode(self.answer)

        match_count = 1

        response = self.client.rpc('hybrid_search', {
            'query_text': self.question,
            'query_embedding': q_em,
            'match_count': match_count
        }).execute()

        if not response.data:
            print("No matching documents found.")
            return []

        matched_results = []

        for result in response.data:
            content = result["content"]
            content_em = self.model.encode(content)
            cosine_distance = 1 - np.dot(content_em, a_em) / (np.linalg.norm(content_em) * np.linalg.norm(a_em))
            cosine_similarity = 1 - cosine_distance
            matched_results.append({"content": content, "cosine_similarity": cosine_similarity})

        matched_results = sorted(matched_results, key=lambda x: x["cosine_similarity"], reverse=True)

        for result in matched_results:
            print(f"Cosine Similarity with Answer: {result['cosine_similarity']}")
            print(f"Content: {result['content']}")
            print("-" * 50)

        return matched_results


# Final run
ran it for a sample size of 5

In [10]:
df = pd.read_json("hf://datasets/PatronusAI/financebench/financebench_merged.jsonl", lines=True)
sample_size = 2
col_to_work_with = ["question", "answer", "doc_link"]
working_data = df.head(sample_size)[col_to_work_with]

supabaseUrl = your_api 
supabaseKey = key
supabase = create_client(supabaseUrl, supabaseKey)
print('Supabase connection established')
model = SentenceTransformer('all-MiniLM-L6-v2')

insert = InsertEmbeddings(working_data, model, supabase)
insert.add_data_in_db()
normal_rag_results = []
hybrid_rag_results = []
normal_cosine_sims = [] 
hybrid_cosine_sims = []  

for i in range(len(working_data)):
    sample_question = working_data.iloc[i]["question"]
    sample_answer = working_data.iloc[i]["answer"]
    
    # Normal RAG
    normal_rag = NormalRag(sample_question, model, supabase, sample_answer)
    normal_result = normal_rag.do_rag()
    normal_rag_results.append(normal_result)
    if normal_result:
        avg_cosine = sum(r["cosine_similarity"] for r in normal_result) / len(normal_result)
        normal_cosine_sims.append(avg_cosine)
        print(f"Sample {i+1} - Normal RAG returned {len(normal_result)} results with avg cosine similarity: {avg_cosine:.4f}")
    else:
        print(f"Sample {i+1} - Normal RAG returned 0 results")
    
    # Hybrid RAG
    hybrid_rag = HybridRag(sample_question, model, supabase, sample_answer)
    hybrid_result = hybrid_rag.hybrid_rag()
    hybrid_rag_results.append(hybrid_result)
    
    # Calculate average cosine similarity for this sample's hybrid RAG results
    if hybrid_result:
        avg_cosine = sum(r["cosine_similarity"] for r in hybrid_result) / len(hybrid_result)
        hybrid_cosine_sims.append(avg_cosine)
        print(f"Sample {i+1} - Hybrid RAG returned {len(hybrid_result)} results with avg cosine similarity: {avg_cosine:.4f}")
    else:
        print(f"Sample {i+1} - Hybrid RAG returned 0 results")

# Summary
print("\n===== SUMMARY =====")
print(f"Processed {len(working_data)} samples")

# Document count statistics
print("\n== Document Retrieval Statistics ==")
print(f"Average Normal RAG documents retrieved: {sum(len(r) for r in normal_rag_results) / len(normal_rag_results):.2f}")
print(f"Average Hybrid RAG documents retrieved: {sum(len(r) for r in hybrid_rag_results) / len(hybrid_rag_results):.2f}")

print("\n== Cosine Similarity Statistics ==")
if normal_cosine_sims:
    print(f"Average Normal RAG cosine similarity: {sum(normal_cosine_sims) / len(normal_cosine_sims):.4f}")
else:
    print("Normal RAG did not retrieve any documents")

if hybrid_cosine_sims:
    print(f"Average Hybrid RAG cosine similarity: {sum(hybrid_cosine_sims) / len(hybrid_cosine_sims):.4f}")
else:
    print("Hybrid RAG did not retrieve any documents")

if normal_cosine_sims:
    print(f"Max Normal RAG cosine similarity: {max(normal_cosine_sims):.4f}")
if hybrid_cosine_sims:
    print(f"Max Hybrid RAG cosine similarity: {max(hybrid_cosine_sims):.4f}")


Supabase connection established
Database cleaned


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Successfully added 10 documents to the database


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

No matching documents found.
Sample 1 - Normal RAG returned 0 results


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with Answer: 0.1215265616774559
Content: Table of Contents
 
3M COMPANY
ANNUAL REPORT ON FORM 10-K
For the Year Ended December 31, 2018
PART I
 
 
Item 1. 
Busines
 
s.
 
3M Company was incorporated in 1929 under the laws of the State of Delaware to continue operations begun in 1902. The Company’s ticker symbol is
MMM. As used herein, the term “3M” or “Company” includes 3M Company and its subsidiaries unless the context indicates otherwise. In this document,
for any references to Note 1 through Note 20, refer to the Notes to Consolidated Financial Statements in Item 8.
 
Available Information
 
The SEC maintains a website that contains reports, proxy and information statements, and other information regarding issuers, including the Company, that
file electronically with the SEC. The public can obtain any documents that the Company files with the SEC at http://www.sec.gov. The Company files
annual reports, quarterly reports, proxy statements and other documents with th

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

No matching documents found.
Sample 2 - Normal RAG returned 0 results


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with Answer: 0.040300097316503525
Content: Table of Contents
 
3M COMPANY
ANNUAL REPORT ON FORM 10-K
For the Year Ended December 31, 2018
PART I
 
 
Item 1. 
Busines
 
s.
 
3M Company was incorporated in 1929 under the laws of the State of Delaware to continue operations begun in 1902. The Company’s ticker symbol is
MMM. As used herein, the term “3M” or “Company” includes 3M Company and its subsidiaries unless the context indicates otherwise. In this document,
for any references to Note 1 through Note 20, refer to the Notes to Consolidated Financial Statements in Item 8.
 
Available Information
 
The SEC maintains a website that contains reports, proxy and information statements, and other information regarding issuers, including the Company, that
file electronically with the SEC. The public can obtain any documents that the Company files with the SEC at http://www.sec.gov. The Company files
annual reports, quarterly reports, proxy statements and other documents with 