Source of the legal texts (French codes currently in force, Légifrance): https://www.legifrance.gouv.fr/liste/code?etatTexte=VIGUEUR

# Marriage & Divorce: split documents by article

In [132]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
import numpy as np
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, PodSpec

In [153]:
from dotenv import load_dotenv
import os

# Pull variables from a local .env file into the process environment so the
# Pinecone credential never has to be written in the notebook itself.
load_dotenv()

# Read the API key and stop right here, with an actionable message, when it is
# absent; otherwise a None value would be handed to the Pinecone client and
# vector store further down and the failure would be harder to trace.
pinecone_api_key = os.getenv('PINECONE_API_KEY')
if not pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set: define it in the environment or in a .env file."
    )

In [140]:
# Location of the Code civil extract (marriage & divorce articles). It can be
# overridden with the CIVIL_CODE_PDF_PATH environment variable; the fallback is
# the path originally hard-coded here, so existing setups behave as before.
pdf_path = os.getenv(
    "CIVIL_CODE_PDF_PATH",
    "/Users/antoinebertin/Documents/jedha/full_stack/Extra/RAG/code_civil_mariage_divorce.pdf",
)
loader = PyPDFLoader(pdf_path)

# Cut the text in front of every " Article <number>" heading. The zero-width
# lookahead keeps the heading inside the chunk it introduces. The pattern is a
# raw string so that "\d" reaches the regex engine without triggering Python's
# invalid-escape-sequence warning.
#
# CharacterTextSplitter only cuts at the separator, so an article longer than
# chunk_size stays whole; that is what the "Created a chunk of size ..."
# warnings printed by the split report.
#
# NOTE(review): keep_separator is left at its default. On some
# langchain_text_splitters releases, small pieces that get merged back together
# are re-joined with the separator string itself -- inspect a few chunks to
# confirm the text is clean.
text_splitter = CharacterTextSplitter(
    separator=r'(?= Article \d+)',
    chunk_size=500,
    chunk_overlap=30,
    length_function=len,
    is_separator_regex=True,
)

In [141]:
# Read the PDF into Document objects, then cut them along the article headings
# configured on text_splitter.
pages_from_pdf = loader.load()
docs_from_pdf = text_splitter.split_documents(pages_from_pdf)

Created a chunk of size 810, which is longer than the specified 500
Created a chunk of size 760, which is longer than the specified 500
Created a chunk of size 773, which is longer than the specified 500
Created a chunk of size 745, which is longer than the specified 500
Created a chunk of size 593, which is longer than the specified 500
Created a chunk of size 777, which is longer than the specified 500
Created a chunk of size 1311, which is longer than the specified 500
Created a chunk of size 1182, which is longer than the specified 500
Created a chunk of size 1217, which is longer than the specified 500
Created a chunk of size 1331, which is longer than the specified 500
Created a chunk of size 1346, which is longer than the specified 500
Created a chunk of size 821, which is longer than the specified 500
Created a chunk of size 555, which is longer than the specified 500
Created a chunk of size 1134, which is longer than the specified 500
Created a chunk of size 802, which is long

# Embeddings & VDB

In [142]:
# Name the sentence-transformers model explicitly instead of relying on the
# library default: the Pinecone index used in this notebook is created with
# dimension=768, so the embedding model must keep producing 768-dimensional
# vectors even if the library's default model changes.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

In [157]:
# Pinecone client authenticated with the key read from the environment.
pc = Pinecone(
    api_key=pinecone_api_key,
)

In [137]:
# Create the "civil" index only when it does not exist yet: calling
# create_index for a name that is already taken raises, which would break a
# Restart & Run All of this notebook.
if "civil" not in pc.list_indexes().names():
    pc.create_index(
        name="civil",
        # Must equal the length of the vectors produced by `embeddings`
        # (768 for sentence-transformers/all-mpnet-base-v2 -- confirm if the
        # embedding model is ever changed).
        dimension=768,
        metric="cosine",
        spec=PodSpec(
            environment="gcp-starter"
        ),
    )

In [159]:
# LangChain wrapper around the "civil" index: texts handed to it are embedded
# with `embeddings` before being written to Pinecone.
vectorstore = PineconeVectorStore(
    pinecone_api_key=pinecone_api_key,
    index_name="civil",
    embedding=embeddings,
)

# Upsert

In [149]:
# Keep only the raw text of each chunk; the Document wrappers are dropped.
texts_to_index = [chunk.page_content for chunk in docs_from_pdf]

In [151]:
# Embed every chunk and upsert it into the index; the cell displays the ids
# returned by add_texts.
# NOTE(review): only the chunk text is passed, so Document metadata is not
# stored with the vectors -- pass metadatas= if source references are needed
# at query time.
upsert_options = dict(
    batch_size=32,   # batch size for upsert
    async_req=True,  # whether to make the request asynchronously
)
vectorstore.add_texts(texts_to_index, **upsert_options)

['0402fefd-a858-4725-bd4d-9e90685ee930',
 'bb1b2e1d-8f0f-40a8-b23b-05ffe34620ca',
 'd5c45e75-8a7c-438f-82a3-705f03cecdfc',
 'a9b05b11-354a-4290-9c70-824b42f332c3',
 '08576411-f356-424c-b8ae-f5d6496e5c92',
 '0f9cb73a-3643-487b-8974-d84861cb89bf',
 'e381ef5d-4f10-4aa3-bcb0-ebfbf2677100',
 '5fe4c515-eeab-4c0a-8ab8-746b547bd07d',
 '77e13e6b-4ddd-468a-9b71-a5755cdd2ecd',
 '50ee10e3-76fc-4c20-bf74-64c01daaef0a',
 '5a161ff4-fff7-46c8-8fd7-5f60be0c50f3',
 'e5adcf96-aac1-41c6-86e1-77dc951ac471',
 '0ee2fd50-aa7a-4cb0-b9eb-e912307551e0',
 '0a4d8c28-fe20-4c7a-810e-a8dd0491c3e7',
 '7b67dfd0-6f9b-4083-8d95-895d754785a4',
 'e1ab9131-ff52-4f48-a1fa-d73b93b2db1d',
 '455e375c-7ab4-4773-96b9-f4eeff60c531',
 '9bbac1b2-0d73-434f-b231-d3e3332a5e69',
 '2706ee4e-0a77-49de-8ee7-853906977f9e',
 '76d31d9e-04e1-4b3b-a5d2-ea2841c5cf16',
 '1dbaffe5-f099-41f5-b24f-49ad9723b8a3',
 '07b93168-b94e-44a4-8cc8-395e9a567066',
 '6dd3a523-c6b8-4ff0-ba2f-7d2916d0ad4c',
 '30ceb84f-0225-4442-bd53-0fe3038f077d',
 '5104d5dd-080a-

# Search the vector database