In [None]:
# Master switch for this notebook: when True, the guarded cells below rebuild the
# chunk + embedding dataset and push it to the Hub; when False they are skipped and
# the published dataset is loaded from the Hub instead.
dataset_creation = False #GPU mode highly recommended

## install libs

In [None]:
!pip install datasets
!pip install langchain
!pip install langchain-groq
!pip install -U sentence-transformers #check that session is in GPU mode
!pip install faiss-cpu



## define hash function

In [None]:
import hashlib

def sha1_digest(input_string):
    # Convert the input string to bytes since hashlib works with bytes
    input_bytes = input_string.encode('utf-8')

    # Create a new SHA-1 hash object
    sha1 = hashlib.sha1()

    # Update the hash object with the input bytes
    sha1.update(input_bytes)

    # Get the hexadecimal digest of the hash
    sha1_hex_digest = sha1.hexdigest()

    return sha1_hex_digest

# Quick sanity check of the helper on a known input.
input_string = "Hello, World!"
print(f"SHA-1 digest: {sha1_digest(input_string)}")

SHA-1 digest: 0a0a9f2a6772942557ab5355d76af442f8f65e01


## load dataset

In [None]:
from datasets import load_dataset

# Only needed when rebuilding the chunk dataset: fetch the 'train' split of the
# source declarations dataset from the Hub.
if dataset_creation:
  declaration_ds = load_dataset("the-french-artist/hatvp_declarations_xml_plus_markdown", split='train')

In [None]:
from IPython.display import display

if dataset_creation:
  # Convert the Hub dataset to a pandas DataFrame for the chunking steps below.
  declaration_df = declaration_ds.to_pandas()
  # A bare expression nested inside an `if` is not echoed by Jupyter (only a
  # top-level trailing expression is), so the preview must be displayed explicitly.
  display(declaration_df.head())

## create chunk DF

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

if dataset_creation:
  text_splitter = RecursiveCharacterTextSplitter(
      # Alternative separator list, kept for reference:
      # separators = ["\n\n", "\n", " ", "", "\t"],
      # NOTE(review): with "" as the only separator the text is presumably cut at
      # arbitrary character positions, so chunks can start or end mid-word — confirm
      # this is intended.
      separators = [""],
      keep_separator = False,
      chunk_size=1000,  # chunk size limit, measured with length_function (len, i.e. characters)
      chunk_overlap=20,
      length_function=len,
      is_separator_regex=False,
  )

In [None]:
from tqdm.auto import tqdm

if dataset_creation:
  # One record per chunk: [parent document hash, chunk hash, chunk text].
  chunky_list = []
  doc_pairs = zip(
      declaration_df.declaration_markdown.to_list(),
      declaration_df.markdown_sha1.to_list(),
  )
  for declaration_markdown, markdown_sha1 in tqdm(doc_pairs, total=len(declaration_df)):
    # Keep only the raw text of each chunk document for long-term storage.
    chunk_texts = [doc.page_content for doc in text_splitter.create_documents([declaration_markdown])]
    chunky_list.extend([markdown_sha1, sha1_digest(text), text] for text in chunk_texts)

In [None]:
# len(chunky_list)

In [None]:
import pandas as pd
if dataset_creation:
  # Tabulate the chunk records: parent-document hash, chunk hash, chunk text.
  chunky_df = pd.DataFrame(
      data=chunky_list,
      columns=['markdown_sha1', 'markdown_chunk_sha1', 'markdown_chunk'],
  )

In [None]:
# chunky_df

### check largest document  

We find a document with a very large number of chunks, so we check whether it is really that long (it is!).

In [None]:
from IPython.display import display

if dataset_creation:
  # Number of chunks per source document, most-chunked first. The result must be
  # displayed explicitly: an expression nested inside an `if` is not echoed by Jupyter.
  display(chunky_df.markdown_sha1.value_counts().to_frame())

In [None]:
from IPython.display import display_markdown

if dataset_creation:
  # Select the declaration under inspection by its SHA-1 and render its markdown.
  is_sample = declaration_df.markdown_sha1 == "01e24412555ba4f884c603b7ca8845efc9bf8dd1"
  markdown_declaration_sample = declaration_df.loc[is_sample, 'declaration_markdown'].to_list()[0]

  display_markdown(markdown_declaration_sample, raw=True)

## Compute embeddings

Choose an appropriate model from here :  

https://www.sbert.net/docs/pretrained_models.html

In [None]:
from sentence_transformers import SentenceTransformer, util

if dataset_creation:
  # Load the embedding model, then encode every chunk text in one call.
  model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-dot-v1')
  doc_emb = model.encode(chunky_df['markdown_chunk'].tolist())

In [None]:
if dataset_creation:
  # Attach the embeddings as a new column. list() iterates doc_emb along its first
  # axis — presumably one vector per chunk, in chunky_df row order (confirm).
  chunky_df['markdown_embedding'] = list(doc_emb)

In [None]:
# chunky_df

## Save dataset to the HUB

In [None]:
from huggingface_hub import login
from google.colab import userdata

# Authenticate against the Hugging Face Hub with the HF_TOKEN secret read from Colab.
login(token=userdata.get('HF_TOKEN'))

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from datasets import Dataset

# Only when rebuilding: wrap the chunk DataFrame (text, hashes, embeddings) as a
# Dataset so it can be pushed to the Hub.
if dataset_creation:
  chunky_ds = Dataset.from_pandas(chunky_df)

In [None]:
# chunky_ds

In [None]:
# Only when rebuilding: publish the chunk + embedding dataset under the repo id
# that the loading cell below reads back.
if dataset_creation:
  chunky_ds.push_to_hub("the-french-artist/hatvp_declarations_markdown_chunks_embeds")

## Load dataset from HUB

In [None]:
from datasets import load_dataset
# Runs regardless of dataset_creation: load the 'train' split of the published
# chunk + embedding dataset from the Hub.
chunky_ds = load_dataset("the-french-artist/hatvp_declarations_markdown_chunks_embeds", split='train')

Downloading readme:   0%|          | 0.00/452 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/224M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/57598 [00:00<?, ? examples/s]

In [None]:
# Show the loaded dataset's features and row count.
chunky_ds

Dataset({
    features: ['markdown_sha1', 'markdown_chunk_sha1', 'markdown_chunk', 'markdown_embedding'],
    num_rows: 57598
})

In [None]:
import faiss

# The embeddings come from a model tuned for dot-product scoring
# (multi-qa-mpnet-base-dot-v1) and are not normalised, so the index must rank by
# inner product. Without an explicit metric_type the flat FAISS index uses L2
# distance, which ranks unnormalised dot-product embeddings incorrectly.
chunky_ds.add_faiss_index(column='markdown_embedding', metric_type=faiss.METRIC_INNER_PRODUCT)

  0%|          | 0/58 [00:00<?, ?it/s]

Dataset({
    features: ['markdown_sha1', 'markdown_chunk_sha1', 'markdown_chunk', 'markdown_embedding'],
    num_rows: 57598
})

In [None]:
# Re-create the encoder here so this section also works when dataset_creation is
# False (the guarded embedding cell above is skipped in that case). It must be the
# same checkpoint that produced the stored embeddings.
model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-dot-v1')

# The query text is fed to the encoder as-is, so it has to be spelled correctly.
query = 'Quel est le salaire de Damien Abad en qualité de Député en 2019?'
query_embed = model.encode([query])
query_embed.shape

(1, 768)

In [None]:
# Fetch the 10 chunks nearest to the query embedding from the FAISS index.
scores, retrieved_examples = chunky_ds.get_nearest_examples(
    index_name='markdown_embedding',
    query=query_embed,
    k=10,
)

In [None]:
# Show the text of the retrieved chunks.
retrieved_examples['markdown_chunk']

['e celle versée au titre du premier mandat juin 2012 à juin 2017.',
 "e de l Éducation Nationale à compter du 01 09 2022 à ce jour je ne connais pas encore le montant de la retraite qui me sera versée à partir du 01 09 2022, **Conserved:** None, **Description:** Coordonatrice mission de lutte contre le décrochage scolaire, **Employer Name:** Éducation Nationale fonctionnaire, **Start Date:** 01/2015, **End Date:** 06/2020\n  - **Remuneration Type:** Net, **Year:** [2015, 2016, 2017, 2018, 2019, 2020], **Amount:** [34800.0, 34800.0, 34800.0, 34800.0, 34800.0, 34800.0]\n\n## Consultant Activities\n- **ID:** CREATION, **Label:** None, **Comment:** Néant, **Employer:** Néant, **Description:** Néant, **Start Date:** 01/2015, **End Date:** 06/2020, **Remuneration Type:** Net, **Amount:** None, **Year:** None\n\n## Volunteer Functions\n\n## Elective Mandates\n\n## Spouse's Professional Activities\n\n## Directing Participation\n- **Participation ID:** CREATION, **Label:** None, **Comment:** N

In [None]:
# The 10 best chunks together are as long as an average complete declaration in Markdown format...
sum(len(chunk) for chunk in retrieved_examples['markdown_chunk'])
# ...and they still do not contain the information we are looking for!

6213

In [None]:
def perform_query(query, n_samples=3):
  """Return the text of the `n_samples` chunks nearest to `query`.

  The query is embedded with the notebook-level `model` and looked up in the
  index attached to the `markdown_embedding` column of `chunky_ds`. Retrieval
  scores are discarded; only the `markdown_chunk` texts are returned.
  """
  _, nearest_examples = chunky_ds.get_nearest_examples(
      'markdown_embedding', model.encode([query]), k=n_samples
  )
  return nearest_examples['markdown_chunk']

In [None]:
# Try the helper on a simple question, using the default of 3 retrieved chunks.
perform_query("Qui est Damien Abad?")

['eants au titre de la CIVIS.',
 'à compter de la date de ce même arrêté.',
 'e celle versée au titre du premier mandat juin 2012 à juin 2017.']

In [None]:
# We observe very poor retrieval performance due to chunking.
# We will build a second dataset that simply embeds each complete markdown file
# and perform retrieval on complete declarations to see if we get better results.

Part 2 is here:

https://colab.research.google.com/drive/1xjHAfW7IG5CO-yylv2a7Br1rMdmXa4It#scrollTo=ZeymZd6evIkB  