# HATVP RAG  

This version is the demo notebook: we use the datasets made in notebooks 1 to 6 to showcase the inference capabilities of our RAG system.  

## install libs

In [11]:
!pip install datasets
!pip install langchain
!pip install langchain-groq
!pip install openai
!pip install faiss-cpu
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.7.0


## define functions and load dataset

In [2]:
import os
from google.colab import userdata

# Copy the API keys stored as Colab secrets into environment variables.
# NOTE(review): presumably the OpenAI and Groq clients created further down
# read these variables when they are instantiated — confirm against their docs.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')
os.environ["GROQ_API_KEY"] = userdata.get('GROQ_API_KEY')


In [12]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens `string` produces under the named tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name handed to `tiktoken.get_encoding` (the callers in
            this notebook pass "cl100k_base").

    Returns:
        Length of the sequence returned by the encoding's `encode`.
    """
    return len(tiktoken.get_encoding(encoding_name).encode(string))

def truncate_text_to_stay_under_openai_embedding_limit(input_text, max_tokens=8192, encoding_name="cl100k_base"):
  """Trim the end of `input_text` until it fits within `max_tokens` tokens.

  Token counts come from `num_tokens_from_string`. While the text is over
  budget, characters are dropped from the end: two characters per excess
  token, so that the loop usually converges in a few passes.

  Behaviour change versus the previous version: it fell back to the first
  8000 characters whenever one of those cuts emptied the string, without
  re-checking the token count. For token-dense text (more than roughly one
  token per character) that fallback could itself exceed the limit. Here a
  cut that would consume the whole string is replaced by halving it, so the
  result is always a prefix of the input that fits the budget. Inputs that
  never hit the old fallback are truncated exactly as before.

  Args:
    input_text: Text to shorten.
    max_tokens: Token budget (non-negative); defaults to the 8192 limit this
      notebook uses for OpenAI embeddings.
    encoding_name: Encoding name forwarded to `num_tokens_from_string`.

  Returns:
    `input_text` unchanged when it already fits, otherwise a prefix of it
    whose token count is at most `max_tokens`.
  """
  excess = num_tokens_from_string(input_text, encoding_name) - max_tokens
  # `and input_text` guarantees termination even for a degenerate budget.
  while excess > 0 and input_text:
    cut = int(excess * 2)  # factor 2 overshoots on purpose to converge faster
    if cut >= len(input_text):
      # Removing `cut` characters would leave nothing: halve the text instead.
      cut = max(1, len(input_text) // 2)
    input_text = input_text[:-cut]
    excess = num_tokens_from_string(input_text, encoding_name) - max_tokens
  return input_text

In [13]:
from openai import OpenAI
from tqdm.auto import tqdm

# Shared OpenAI client; get_embedding sends its embedding requests through it.
# NOTE(review): presumably authenticates via the OPENAI_API_KEY environment
# variable set earlier in the notebook — confirm.
client = OpenAI()

# Register tqdm's pandas integration (not used by the cells shown here).
tqdm.pandas()


def get_embedding(text, model="text-embedding-3-large"):
    """Return the OpenAI embedding vector for `text`.

    Newlines are replaced by spaces and the text is shortened with
    `truncate_text_to_stay_under_openai_embedding_limit` before the request.

    Args:
        text: Text to embed.
        model: Embedding model name. The default is the large model; per the
            original author's note it is about 10x more expensive
            (presumably compared with the smaller model used before).

    Returns:
        The `embedding` of the first item in the API response.
    """
    single_line = text.replace("\n", " ")
    trimmed = truncate_text_to_stay_under_openai_embedding_limit(single_line)
    response = client.embeddings.create(input=[trimmed], model=model)
    return response.data[0].embedding

# get_embedding('this is a test')

In [14]:
from datasets import load_dataset
# Load the 'train' split of the declarations dataset; besides the raw
# declarations it carries a 'text_index_embedding' column that is indexed below.
embed_ds = load_dataset("the-french-artist/hatvp_declarations_text_index_embeds", split='train')

In [15]:
# Build a FAISS index over the 'text_index_embedding' column so that
# perform_query can run nearest-neighbour lookups against it.
# As the last expression of the cell, this also displays the dataset summary.
embed_ds.add_faiss_index(column='text_index_embedding')

  0%|          | 0/11 [00:00<?, ?it/s]

Dataset({
    features: ['xml_sha1', 'declaration_xml', 'declaration_json', 'text_index', 'text_index_embedding'],
    num_rows: 10944
})

In [16]:
import numpy as np

def perform_query(query, n_samples=1):
  """Fetch the declarations whose indexed embedding is closest to `query`.

  The query is lower-cased, embedded with `get_embedding`, and looked up in
  the 'text_index_embedding' index of `embed_ds`.

  Args:
    query: Free-text question or search string.
    n_samples: Number of nearest neighbours to retrieve.

  Returns:
    The 'declaration_json' entries of the retrieved examples; the
    similarity scores are discarded.
  """
  embedded_query = np.array(get_embedding(query.lower()))
  _scores, neighbours = embed_ds.get_nearest_examples(
      'text_index_embedding', embedded_query, k=n_samples
  )
  return neighbours['declaration_json']

In [17]:
import json

def get_name_surname_from_str_declaration(input_str_json):
  """Extract the declarant's 'nom' and 'prenom' from a declaration JSON string.

  Args:
    input_str_json: JSON text with the nesting
      declaration -> general -> declarant -> {nom, prenom}.

  Returns:
    A `(nom, prenom)` tuple, in that order.

  Raises:
    json.JSONDecodeError: if the string is not valid JSON.
    KeyError: if one of the expected keys is missing.
  """
  declarant = json.loads(input_str_json)['declaration']['general']['declarant']
  return declarant['nom'], declarant['prenom']

In [18]:
import os
from langchain.schema.output_parser import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq


def get_answer_to_question(question, llm_to_use, n_samples=1):
  """Answer `question` with `llm_to_use`, using retrieved declarations as context.

  The `n_samples` closest declarations are fetched with `perform_query`,
  placed in a Question/Context/Answer prompt, and sent through the chain
  `prompt | llm_to_use | StrOutputParser()`. The (nom, prenom) of every
  retrieved declaration is printed so the reader can see which documents
  were used.

  Args:
    question: The user's question.
    llm_to_use: Chat model placed in the middle of the chain.
    n_samples: Number of declarations to retrieve as context. Defaults to 1,
      which is what this function previously hard-coded.

  Returns:
    The model's answer, as returned by the chain's `invoke`.
  """
  system = """You are an assistant for question-answering tasks.
  Use the following pieces of retrieved context to answer the question.
  If you don't know the answer, just say that you don't know.
  Use three sentences maximum and keep the answer concise.
  """
  human = "{text}"
  prompt = ChatPromptTemplate.from_messages([("system", system), ("human", human)])

  results = perform_query(question, n_samples)
  # One declaration per line; with a single result this is just that document.
  context = '\n'.join(results)
  actual_prompt = f"""
  Question: {question}
  Context: {context}
  Answer:
  """

  # Show which declarants the retrieved documents belong to.
  for result in results:
    print(get_name_surname_from_str_declaration(result))

  chain = prompt | llm_to_use | StrOutputParser()
  # The filled-in prompt is passed as the value of the {text} placeholder.
  return chain.invoke({"text": actual_prompt})

## ask questions here

In [22]:
llm_llama3_70B = ChatGroq(temperature=0, model_name="llama3-70b-8192")
question = "Quel est le salaire de Damien Abad en 2019?"

print(get_answer_to_question(question, llm_llama3_70B))

('ABAD', 'DAMIEN')
The salary of Damien Abad in 2019 was 71,105 euros.


We have checked and the response:  
`The salary of Damien Abad in 2019 was 71,105 euros.`  
is indeed correct: he received a salary of €71,105 as a député (member of the French National Assembly) in 2019.  