https://www.legifrance.gouv.fr/liste/code?etatTexte=VIGUEUR

In [243]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from pinecone import Pinecone, PodSpec
from tqdm.auto import tqdm
from uuid import uuid4
from dotenv import load_dotenv
import os

# Mariage & Divorce: split docs by articles

In [281]:
# Load the .env file
load_dotenv()

# Use the environment variables
pinecone_api_key = os.getenv('PINECONE_API_KEY')
bearer = os.getenv('Bearer')

In [228]:
loader = PyPDFLoader("/Users/antoinebertin/Documents/jedha/full_stack/Extra/RAG/code_civil_mariage_divorce.pdf")
text_splitter = CharacterTextSplitter(
    separator='(?= Article \d+)',
    chunk_size=500,
    chunk_overlap=30,
    length_function=len,
    is_separator_regex=True,
)

In [229]:
docs_from_pdf = loader.load_and_split(text_splitter=text_splitter)

Created a chunk of size 810, which is longer than the specified 500
Created a chunk of size 760, which is longer than the specified 500
Created a chunk of size 773, which is longer than the specified 500
Created a chunk of size 745, which is longer than the specified 500
Created a chunk of size 593, which is longer than the specified 500
Created a chunk of size 777, which is longer than the specified 500
Created a chunk of size 1311, which is longer than the specified 500
Created a chunk of size 1182, which is longer than the specified 500
Created a chunk of size 1217, which is longer than the specified 500
Created a chunk of size 1331, which is longer than the specified 500
Created a chunk of size 1346, which is longer than the specified 500
Created a chunk of size 821, which is longer than the specified 500
Created a chunk of size 555, which is longer than the specified 500
Created a chunk of size 1134, which is longer than the specified 500
Created a chunk of size 802, which is long

# Embeddings prep

In [230]:
embeddings = HuggingFaceEmbeddings()

In [234]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-mpnet-base-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [235]:
texts_to_index = [doc.page_content for doc in docs_from_pdf]

In [237]:
texts_to_index[0]

"Livre Ier : Des personnesTitre V : Du mariageChapitre Ier : Des qualités et conditions requises pour pouvoircontracter mariageArticle 143 Le mariage est contracté par deux personnes de sexe différent ou de même sexe. (?= Article \\d+) Article 144 Le mariage ne peut être contracté avant dix-huit ans révolus. (?= Article \\d+) Article 145 Néanmoins, il est loisible au procureur de la République du lieu de célébration du mariage d'accorder desdispenses d'âge pour des motifs graves."

# init pinecone index

In [239]:
pc = Pinecone(api_key=pinecone_api_key)

In [241]:
pc.create_index(
  name="civil",
  dimension=768,
  metric="cosine",
  spec=PodSpec(
    environment="gcp-starter"
  )
)

In [242]:
index = pc.Index("civil")

# upsert

In [244]:
batch_size = 64  # You can adjust the batch size based on your preference

for i in tqdm(range(0, len(texts_to_index), batch_size)):
    # Define the end of the current batch
    i_end = min(i + batch_size, len(texts_to_index))
    # Extract the batch of texts
    batch_texts = texts_to_index[i:i_end]

    # Generate embeddings for the batch of texts
    # Assuming `embed_documents` function exists and takes a list of texts, returning a list of embeddings
    batch_embeddings = embeddings.embed_documents(batch_texts)

    # Prepare metadata, including the original text for each embedding
    # This assumes you want to store the original text alongside the embedding for retrieval
    batch_meta = [{'text': text} for text in batch_texts]

    # Generate unique IDs for each embedding in the batch
    ids = [str(uuid4()) for _ in range(i, i_end)]

    # Prepare the data for upsert to Pinecone
    # Each entry in `to_upsert` is a tuple consisting of an ID, an embedding, and metadata
    to_upsert = [(id, emb, meta) for id, emb, meta in zip(ids, batch_embeddings, batch_meta)]

    # Upsert the batch of embeddings and metadata to Pinecone
    index.upsert(vectors=to_upsert)

  0%|          | 0/3 [00:00<?, ?it/s]

AttributeError: 'Index' object has no attribute 'describe_index'

# Query

In [266]:
query_text = "mineur"
query_embedding = embeddings.embed_documents([query_text])[0]

In [267]:
len(query_embedding)

768

In [268]:
query_results = index.query(
    vector=query_embedding,
    top_k=2,
    include_values=True,
    include_metadata=True
)

In [269]:
for result in query_results['matches']:
    print(f"ID: {result['id']}, Score: {result['score']}")
    if 'metadata' in result:
        print(f"Text: {result['metadata']['text']}\n")


ID: 06ef1385-0bb9-40d6-a3d6-6bf0041ac41d, Score: 0.277391106
Text: Article 148 Les mineurs ne peuvent contracter mariage sans le consentement de leurs père et mère ; en cas dedissentiment entre le père et la mère, ce partage emporte consentement.

ID: 1e127601-3edc-49b8-abfb-ee5ff84a1144, Score: 0.233539179
Text: S'il n'y a ni père, ni mère, ni aïeuls, ni aïeules, ou s'ils se trouvent tous dans l'impossibilité de manifesterleur volonté, les mineurs de dix-huit ans ne peuvent contracter mariage sans le consentement du conseil defamille.



# test mistral instruct

In [282]:
import requests

API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.1"
headers = {"Authorization": bearer}
def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()
	
output = query({
	"inputs": "<s>[INST] What is Paris? [/INST]",
})

In [279]:
output

[{'generated_text': "<s>[INST] What is Paris? [/INST] Paris is the capital and most populous city of France. It is located in the north-central region of the country along the Yonne River, where it meets the Seine River. The city is known for its rich history, art, culture, and cuisine, and has been one of the world's leading centers of commerce, fashion, and gastronomy for hundreds of years. Paris is also known as the City of Love, due to its reputation as a romantic destination and the"}]