In [1]:
# Loading environment variables and initializing Supabase client and SentenceTransformer model
import os
import json
from dotenv import load_dotenv
from supabase.client import Client, create_client
from sentence_transformers import SentenceTransformer
from utils import load_config

# Pull variables from a local .env file into the process environment before
# anything below reads them.
load_dotenv()

# Project settings; config["data"] is the path opened as a JSONL file further down.
config = load_config()
data = config["data"]

# Supabase connection settings come from the environment rather than the config.
supabase_url, supabase_key = (
    os.getenv("SUPABASE_URL"),
    os.getenv("SUPABASE_SERVICE_KEY"),
)
supabase: Client = create_client(supabase_url, supabase_key)

# Sentence-embedding model; its name and cache directory are read from the config.
embeddings = SentenceTransformer(
    model_name_or_path=config["vector_store"]["embedding_model_name"],
    cache_folder=config["models"]["cache_folder"],
)

In [2]:
# Read the JSONL file and build one document (content, metadata, embedding) per record.
# UTF-8 is requested explicitly so decoding does not depend on the platform's
# default locale encoding.
with open(data, "r", encoding="utf-8") as jsonl_file:
    # Blank lines (for example a trailing empty line at the end of the file) are
    # not JSON records; drop them so json.loads never receives an empty string.
    json_list = [line for line in jsonl_file if line.strip()]

documents = []
for json_str in json_list:
    json_data = json.loads(json_str)
    # Only the question text is embedded and stored as the document content.
    content = str(json_data["Question"])
    # encode() is asked for normalized vectors; .tolist() converts the result to a
    # plain Python list so the document dict contains only built-in types.
    embedding = embeddings.encode(content, normalize_embeddings=True).tolist()
    document = {
        "content": content,
        "metadata": {
            "source": "vector_search",
            "task_id": json_data["task_id"],
        },
        "embedding": embedding,
    }
    documents.append(document)



In [3]:
# Insert the documents into Supabase.
#
# Prerequisites:
#   1. The pgvector extension must be enabled so the database can act as a vector store.
#   2. The "gaia_documents" table must already exist in Supabase with suitable column types.
#
# Start from a known value: if the insert fails, `response` is None instead of
# being undefined (fresh kernel) or left over from an earlier run of this cell.
response = None
try:
    response = (
        supabase.table("gaia_documents")
        .insert(documents)
        .execute()
    )
except Exception as exception:
    # Best-effort: report the failure and leave `response` as None.
    print("Error inserting data into Supabase:", exception)