In [None]:
import os
import pdfplumber
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import psycopg2
from psycopg2.extras import execute_values

In [None]:


# Sentence-embedding model used further down to turn each page's text into a vector.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

# Connect to PostgreSQL.
# Settings are read from the standard libpq environment variables (PGDATABASE,
# PGUSER, PGPASSWORD, PGHOST, PGPORT) so that credentials do not have to be
# written into this file. The original placeholder values are kept as
# fallbacks, so behaviour is unchanged when none of the variables is set.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "your_dbname"),
    user=os.environ.get("PGUSER", "your_username"),
    password=os.environ.get("PGPASSWORD", "your_password"),
    host=os.environ.get("PGHOST", "your_host"),
    port=os.environ.get("PGPORT", "your_port"),
)

# Create a table to store embeddings with metadata (one row per PDF page).
with conn.cursor() as cursor:
    # The `vector` column type below is provided by the pgvector extension.
    # Enable it first so the CREATE TABLE does not fail on a fresh database;
    # IF NOT EXISTS makes this a no-op when the extension is already enabled.
    cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")

    # The vector dimension must equal the embedding size produced by the
    # model that fills this table (384 for all-MiniLM-L6-v2).
    cursor.execute("""
    CREATE TABLE IF NOT EXISTS embeddings (
        id SERIAL PRIMARY KEY,
        document_name TEXT,
        page_number INTEGER,
        text TEXT,
        embedding vector(384)  -- Adjust the dimension based on your model
    )
    """)

# Persist the DDL; psycopg2 opens a transaction implicitly on first execute.
conn.commit()

# PATH_NOTES, chemin_notes, nom_notes and numero_notes are expected to exist
# already; they are not defined in this part of the file.
# Sub-directory names used when building the per-note output paths.
prepro_dir, texte_dir = 'preprocessing', 'texte'

# Make sure the preprocessing root exists under PATH_NOTES.
prepro_root = os.path.join(PATH_NOTES, prepro_dir)
os.makedirs(prepro_root, exist_ok=True)

# For every note PDF: extract and clean the text of each page, embed it, write
# the cleaned text to <PATH_NOTES>/<prepro_dir>/<note>/<texte_dir>/page_<n>.txt
# and store (document name, page number, text, embedding) in the `embeddings`
# table.
#
# The whole loop runs inside try/finally so the database connection is closed
# even when a PDF fails part-way through; closing discards any uncommitted rows.
try:
    # `numero` is unpacked only because the three sequences are zipped in
    # lockstep; it is not used in the loop body.
    for chemin, nom, numero in tqdm(zip(chemin_notes, nom_notes, numero_notes), desc="Processing Notes"):
        # NOTE(review): everything after the FIRST dot is dropped, so names
        # such as "a.b.pdf" map to directory "a" — confirm this is intended.
        note_dir = nom.split('.')[0]
        page_dir = os.path.join(PATH_NOTES, prepro_dir, note_dir, texte_dir)
        os.makedirs(page_dir, exist_ok=True)

        # Rows are collected per document and inserted in one batch, so a
        # document is either fully stored or not stored at all.
        rows = []

        # The context manager closes the PDF file handle once its pages are read.
        with pdfplumber.open(chemin) as pdf:
            pages = pdf.pages
            for page_number, page in tqdm(
                enumerate(pages, start=1),
                total=len(pages),
                desc="Processing Pages",
                leave=False,
            ):
                # Pages without a text layer may yield None on some
                # pdfplumber versions; normalise to an empty string so the
                # cleaning step always receives a str.
                text = page.extract_text() or ""
                text_clean = remove_occurences_3(text)

                # Generate the embedding for this page.
                embedding = model.encode(text_clean)
                rows.append((nom, page_number, text_clean, embedding.tolist()))

                # Optionally, save the cleaned text to a file.
                page_path = os.path.join(page_dir, f"page_{page_number}.txt")
                with open(page_path, "w", encoding='utf-8') as f:
                    f.write(text_clean)

        # Insert all pages of this document and commit once per document.
        if rows:
            with conn.cursor() as cursor:
                execute_values(
                    cursor,
                    "INSERT INTO embeddings (document_name, page_number, text, embedding) VALUES %s",
                    rows,
                )
            conn.commit()
finally:
    # Close the connection
    conn.close()
