the script to create the vector database using pinecone

In [60]:
from pinecone import Pinecone, ServerlessSpec
from transformers import AutoModel, AutoTokenizer
import torch
from tqdm import tqdm
import os
import re
from bs4 import BeautifulSoup

In [47]:
from dotenv import load_dotenv
load_dotenv()
# Configuration Pinecone
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', "")

In [52]:
index_name = "ml2"
pc = Pinecone(api_key= PINECONE_API_KEY)

pc.create_index(
    name=index_name,
    dimension=768, 
    metric="cosine", 
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)
print(f"Index {index_name} created successfully.")

Index ml2 created successfully.


In [50]:
def clean_text(md_content):
    """Clean Markdown content by removing HTML tags and irrelevant metadata."""
    # Remove HTML tags
    text = BeautifulSoup(md_content, "html.parser").get_text()
    
    # Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Optional: Remove metadata lines (e.g., "État: VIGUEUR")
    text = re.sub(r'---.*?---', '', text, flags=re.DOTALL)
    
    return text

def read_markdown_files(root_folder):
    md_files = {}

    for dirpath, _, filenames in os.walk(root_folder):
        for filename in filenames:
            if filename.endswith(".md") and filename.lower() != "readme.md":
                file_path = os.path.join(dirpath, filename)
                try:
                    with open(file_path, "r", encoding="utf-8") as file:
                        # Clean the file content before storing
                        md_files[file_path] = clean_text(file.read())
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")

    return md_files

# Usage example
root_directory = "./"
markdown_data = read_markdown_files(root_directory)

# Print summary
print(f"Found {len(markdown_data)} markdown files.")

Found 2891 markdown files.


In [53]:
# Function to print out the cleaned data from markdown files
def print_cleaned_markdown_data(cleaned_data, num_files=5):
    print(f"Displaying the first {num_files} cleaned markdown files:")
    
    count = 0
    for file_path, text in cleaned_data.items():
        if count >= num_files:
            break
        print(f"\nFile: {file_path}\n{'-'*40}")
        print(text[:1000])  # Print the first 1000 characters to avoid too much output
        print("\n" + "="*40)
        count += 1

# Usage example
print_cleaned_markdown_data(markdown_data)

Displaying the first 5 cleaned markdown files:

File: ./LICENCE.md
----------------------------------------
# Textes juridiques consolidés français sous Git **Avertissement** : Ce projet est en cours de développement. **Il peut contenir des erreurs** ! En cas de doute, nous vous invitons à vous référer au site [Légifrance](https://www.legifrance.gouv.fr/). ## Licence Ce dépôt est constitué d'éléments provenant du projet [Tricoteuses](https://git.tricoteuses.fr/) et de données ouvertes (Open Data) mises à disposition sur le site Légifrance. ### Conditions de réutilisation des données originales du site Légifrance Les données originales sont produites par la [Direction de l'information légale et administrative (Dila)](https://dila.premier-ministre.gouv.fr/). Elles sont réutilisables gratuitement sous [licence ouverte v2.0](https://www.etalab.gouv.fr/licence-ouverte-open-licence/). Les réutilisateurs s'obligent à mentionner : - la paternité des données (DILA) ; - les URL d'accès longues d

In [54]:
# Step 1: Load a French legal model
model_name = "camembert-base"  # Suitable model for French legal documents
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Pinecone setup
index = pc.Index("ml2")

# Step 2: Chunk text with overlap
def chunk_text(text, chunk_size=500, overlap_size=50):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap_size):
        chunks.append(" ".join(words[i:i + chunk_size]))
    return chunks

# Step 3: Get embeddings using the French model
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        output = model(**inputs)
    return output.last_hidden_state[:, 0, :].squeeze().numpy()  # Use [CLS] token

# Step 4: Insert documents into Pinecone with improved chunking and embeddings
def insert_documents():
    md_files = markdown_data  # Assuming markdown_data is already loaded
    vectors = []

    for file_path, content in tqdm(md_files.items(), desc="Processing Markdown Files"):
        chunks = chunk_text(content)  # Create chunks with overlap
        for i, chunk in enumerate(chunks):
            embedding = get_embedding(chunk)
            vector_id = f"{file_path}-{i}"  # Unique ID per chunk
            vectors.append({
                "id": vector_id, 
                "values": embedding.tolist(),
                "metadata": {"source": file_path}
            })

            # Upsert in batches (Pinecone recommends batching)
            if len(vectors) >= 100:
                index.upsert(vectors)
                vectors = []

    if vectors:
        index.upsert(vectors)  # Insert remaining vectors

# Call the function to insert documents
insert_documents()

Processing Markdown Files: 100%|██████████| 2891/2891 [42:53<00:00,  1.12it/s]  


In [57]:
def query_pinecone(query, top_k=5):
    query_embedding = get_embedding(query)  # Convert question to vector
    results = index.query(vector=query_embedding.tolist(), top_k=top_k, include_metadata=True)

    for match in results["matches"]:
        print(f"Score: {match['score']}")
        print(f"Source: {match['metadata']['source']}")
        print("Content Snippet:", match["metadata"].get("text", "No text available"))
        print("-" * 50)
    return results

# Run the test query
query_pinecone("""L'ordonnance de protection est délivrée par le juge, saisi par la personne en
danger, si besoin assistée, ou, avec l'accord de celle-ci, par le ministère
public.""")

Score: 0.954907954
Source: ./livre_ier\titre_v\chapitre_iv\article_200.md
Content Snippet: No text available
--------------------------------------------------
Score: 0.951700509
Source: ./livre_iii\titre_xi\chapitre_iii\section_3\article_1963.md
Content Snippet: No text available
--------------------------------------------------
Score: 0.950976193
Source: ./livre_iii\titre_viii\chapitre_ii\section_2\article_1760.md
Content Snippet: No text available
--------------------------------------------------
Score: 0.950892806
Source: ./livre_iii\titre_vi\chapitre_iv\section_2\article_1621.md
Content Snippet: No text available
--------------------------------------------------
Score: 0.950689256
Source: ./livre_iii\titre_xiii\chapitre_iv\article_2007.md
Content Snippet: No text available
--------------------------------------------------


{'matches': [{'id': './livre_ier\\titre_v\\chapitre_iv\\article_200.md-0',
              'metadata': {'source': './livre_ier\\titre_v\\chapitre_iv\\article_200.md'},
              'score': 0.954907954,
              'values': []},
             {'id': './livre_iii\\titre_xi\\chapitre_iii\\section_3\\article_1963.md-0',
              'metadata': {'source': './livre_iii\\titre_xi\\chapitre_iii\\section_3\\article_1963.md'},
              'score': 0.951700509,
              'values': []},
             {'id': './livre_iii\\titre_viii\\chapitre_ii\\section_2\\article_1760.md-0',
              'metadata': {'source': './livre_iii\\titre_viii\\chapitre_ii\\section_2\\article_1760.md'},
              'score': 0.950976193,
              'values': []},
             {'id': './livre_iii\\titre_vi\\chapitre_iv\\section_2\\article_1621.md-0',
              'metadata': {'source': './livre_iii\\titre_vi\\chapitre_iv\\section_2\\article_1621.md'},
              'score': 0.950892806,
              'values