In [None]:
pip install llama_index qdrant_client langchain pymupdf replicate

In [None]:
import json
import re
import pandas as pd
from qdrant_client import QdrantClient
import fitz
import os
from llama_index import Document
from tqdm import tqdm
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index import SimpleDirectoryReader, VectorStoreIndex, ServiceContext, StorageContext
from llama_index.node_parser import SimpleNodeParser
from llama_index.embeddings import LangchainEmbedding
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings import OpenAIEmbedding
from llama_index import load_index_from_storage
from llama_index.prompts import PromptTemplate
import random
import time

# Provider credentials. Values already present in the environment win; the
# literal placeholders are kept only as fallbacks so that the later
# `os.environ[...] = ...` assignments never overwrite a real key with a dummy.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "OPENAI_API_KEY")
REPLICATE_API_TOKEN = os.environ.get("REPLICATE_API_TOKEN", "REPLICATE_API_TOKEN")


from llama_index.llms import OpenAI
from llama_index.llms.palm import PaLM
from llama_index.llms import Replicate

# Publish both credentials through the process environment.
os.environ.update(
    {
        "OPENAI_API_KEY": OPENAI_API_KEY,
        "REPLICATE_API_TOKEN": REPLICATE_API_TOKEN,
    }
)

# Generation settings shared by the models under evaluation.
TEMPERATURE = 0.2
MAX_TOKENS = 50            # only passed to the OpenAI models below
MAX_CONTEXT_WINDOW = 4096  # only passed to the Replicate models below

# Llama 2 chat models accessed through the Replicate wrapper.
llama2_13b_llm = Replicate(
    temperature=TEMPERATURE,
    context_window=MAX_CONTEXT_WINDOW,
    model="meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d",
)
llama2_70b_llm = Replicate(
    temperature=TEMPERATURE,
    context_window=MAX_CONTEXT_WINDOW,
    model="meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3",
)

# OpenAI chat models.
gpt35_llm = OpenAI(
    temperature=TEMPERATURE,
    max_tokens=MAX_TOKENS,
    model="gpt-3.5-turbo-16k",
)
gpt4_llm = OpenAI(
    temperature=TEMPERATURE,
    max_tokens=MAX_TOKENS,
    model="gpt-4",
)

# Registry iterated by the evaluation loop: display name, license family and
# the LLM object itself.
llms_list = [
    dict(model_name="GPT 3.5", license="commercial", model_object=gpt35_llm),
    dict(model_name="GPT 4", license="commercial", model_object=gpt4_llm),
    dict(model_name="Llama2 13B", license="open-source", model_object=llama2_13b_llm),
    dict(model_name="Llama2 70B", license="open-source", model_object=llama2_70b_llm),
]


In [None]:
# Embedding models compared in the experiment, each tagged with its license
# family. The names double as lookup keys in get_embedding_model and as the
# basis for collection / directory names, so they must stay exactly as written.
embeddings_models = [
    dict(model_name=name, license=license_family)
    for name, license_family in (
        ("sentence-transformers/paraphrase-multilingual-mpnet-base-v2", "open-source"),
        ("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", "open-source"),
        ("text-embeddings-ada-002", "commercial"),
    )
]

# Loading Vade Mecum to Qdrant

In [None]:
# Load the pre-extracted Vade Mecum pages. The file is read explicitly as
# UTF-8 so the Portuguese text is decoded the same way on every platform
# (the platform default encoding is not UTF-8 everywhere).
with open('Vade_mecum_2023.pdf.json', encoding='utf-8') as f:
    vade_mecum_2023 = json.load(f)

In [None]:
# Qdrant connection settings. Both values can be supplied through the
# environment; the original placeholders remain as fallbacks and must be
# replaced (or overridden) before the client can reach a real cluster.
QDRANT_CLIENT_TOKEN = os.environ.get("QDRANT_CLIENT_TOKEN", "QDRANT_CLIENT_TOKEN")
QDRANT_URL = os.environ.get(
    "QDRANT_URL",
    "https://RAANDOM URL.us-east-1-0.aws.cloud.qdrant.io:6333",
)
qdrant_client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_CLIENT_TOKEN,
)

In [None]:
# Inspect which fields are stored for one page entry (page "17" is presumably
# just an arbitrary sample); later cells read "text" and "chapter_title".
vade_mecum_2023['17'].keys()

In [None]:
# Wrap every page of the Vade Mecum in a Document, carrying the page number
# (converted from its string key) and the chapter title as metadata.
documents_list = [
    Document(
        text=page_content["text"],
        metadata={
            'page_number': int(page_number),
            'chapter_title': page_content['chapter_title'],
        },
    )
    for page_number, page_content in vade_mecum_2023.items()
]

In [None]:

def get_embedding_model(embedding_model_name):
    """Build the embedding model object for the given model name.

    Args:
        embedding_model_name: Either the label "text-embeddings-ada-002",
            which selects the OpenAI embedding class, or any other string,
            which is passed as ``model_name`` to ``HuggingFaceEmbeddings``.

    Returns:
        An ``OpenAIEmbedding`` instance, or a ``LangchainEmbedding`` wrapping
        a CPU-bound ``HuggingFaceEmbeddings`` model with normalisation off.
    """
    if embedding_model_name == "text-embeddings-ada-002":
        # The label only selects this branch: no model name is forwarded, so
        # the class default model is used. The key is passed as `api_key`
        # (the previous `OPENAI_API_KEY=` keyword is not the parameter the
        # embedding class reads the key from).
        return OpenAIEmbedding(embed_batch_size=50, api_key=OPENAI_API_KEY)

    return LangchainEmbedding(
        HuggingFaceEmbeddings(
            model_name=embedding_model_name,
            model_kwargs={"device": "cpu"},
            encode_kwargs={"normalize_embeddings": False},
        )
    )

def generate_index(documents, embedding_model_name):
    """Embed ``documents`` into a Qdrant collection and persist index metadata.

    The collection is named ``rag_<model>`` with "-" and "/" replaced by "_";
    the storage context is persisted under ``indexes_metadata/rag_<model>``
    using the unmodified model name.

    Args:
        documents: Documents to chunk (512 tokens, 64 overlap) and embed.
        embedding_model_name: Name understood by ``get_embedding_model``.

    Returns:
        None.
    """
    context = ServiceContext.from_defaults(
        node_parser=SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=64),
        embed_model=get_embedding_model(embedding_model_name),
    )

    collection = "rag_" + embedding_model_name.replace("-", "_").replace("/", "_")
    storage = StorageContext.from_defaults(
        vector_store=QdrantVectorStore(client=qdrant_client, collection_name=collection),
    )

    built_index = VectorStoreIndex.from_documents(
        documents,
        service_context=context,
        storage_context=storage,
        show_progress=True,
    )

    target_dir = "indexes_metadata/rag_" + embedding_model_name
    built_index.storage_context.persist(persist_dir=target_dir)
    return None

In [None]:
# Build one index per embedding model, announcing each model name first.
for embedding_model in embeddings_models:
    current_model_name = embedding_model["model_name"]
    print(current_model_name)
    generate_index(documents_list, current_model_name)

## Questions

In [None]:
# Load the extracted elements of the 37th exam PDF. UTF-8 is requested
# explicitly so the Portuguese text decodes identically on every platform.
with open('1A FASE OAB/37º EXAME DE ORDEM UNIFICADO TIPO 1 - BRANCA.pdf.json', encoding='utf-8') as f:
    oab_37 = json.load(f)


In [None]:

def extract_text(oab_dict, first_page=3, last_page=22, column_split_x=300):
    """Split the exam's extracted text elements into left and right columns.

    Args:
        oab_dict: Iterable of element dicts (despite the name, it is iterated
            like a list). Each element must provide
            ``["metadata"]["page_number"]``, ``["coordinates"]`` (a sequence of
            points whose first component is the x position) and ``["text"]``.
        first_page: First page number (inclusive) to keep.
        last_page: Last page number (inclusive) to keep.
        column_split_x: An element is assigned to the right column when any of
            its first four coordinate points has x greater than this value.

    Returns:
        ``(left_text, right_text)``: the texts of each column concatenated in
        input order, every element followed by a newline.
    """
    left_parts = []
    right_parts = []

    # Iterate the argument itself; reading a notebook-level global here would
    # silently ignore whatever exam the caller passed in.
    for item in oab_dict:
        page_number = item["metadata"]["page_number"]
        if not first_page <= page_number <= last_page:
            continue

        is_right_side = any(
            point[0] > column_split_x for point in item["coordinates"][:4]
        )
        target = right_parts if is_right_side else left_parts
        target.append(item["text"] + "\n")

    return "".join(left_parts), "".join(right_parts)

def extract_strings(text):
    """Extract the four alternatives (A-D) from a question's text.

    Args:
        text: Text containing the markers "A)", "B)", "C)" and "D)".

    Returns:
        A 4-tuple ``(a, b, c, d)`` of stripped strings. A, B and C are the
        text between one marker and the next (which must be preceded by a
        whitespace character); D is everything after "D)". Any alternative
        whose markers cannot be found is returned as "".
    """
    def extract_between_markers(marker1, marker2, input_text):
        # Raw f-string so that "\s" reaches the regex engine verbatim instead
        # of being treated as an (invalid) string escape sequence.
        pattern = re.compile(rf'{re.escape(marker1)}(.*?)\s{re.escape(marker2)}', re.DOTALL)
        match = pattern.search(input_text)
        # Missing markers yield "" — the same convention used for "D)" below —
        # rather than an IndexError from indexing an empty match list.
        return match.group(1).strip() if match else ""

    a_to_b = extract_between_markers('A)', 'B)', text)
    b_to_c = extract_between_markers('B)', 'C)', text)
    c_to_d = extract_between_markers('C)', 'D)', text)

    pattern_d = re.compile(r'D\)(.*)', re.DOTALL)
    match_d = pattern_d.search(text)
    after_d = match_d.group(1).strip() if match_d else ""

    return a_to_b, b_to_c, c_to_d, after_d

def get_questions_from_text(
    left_side_str,
    right_side_str,
    page_header="XXXVII EXAME DE ORDEM UNIFICADO – TIPO 1 – BRANCA PROVA APLICADA EM 26/2/2023",
):
    """Parse numbered multiple-choice questions out of the two column texts.

    Args:
        left_side_str: Text of the left column.
        right_side_str: Text of the right column.
        page_header: Running header removed from both columns before parsing.

    Returns:
        Dict mapping the question number (int) to a dict with the keys
        "question_text", "a_answer", "b_answer", "c_answer" and "d_answer".
        The left column is processed first, so a number found in both columns
        ends up with the right column's content.
    """
    # A fragment starts at a run of digits and extends lazily up to the next
    # newline followed by digits, or to the end of the text.
    pattern = r'\n?(\d+)\n?(.*?)(?=\n\d+|$)'
    questions = {}

    for column_text in (left_side_str, right_side_str):
        fragments = re.findall(pattern, column_text.replace(page_header, ""), re.DOTALL)
        for number, text in fragments:
            question = text.split("A)")[0]
            # Drop the statement by slicing off the prefix: markers are then
            # searched only among the alternatives, and a statement that
            # happens to recur later in the text cannot truncate them.
            alternatives_text = text[len(question):]
            # Skip fragments with no statement or no alternatives (e.g. a
            # stray number); both columns are treated the same way.
            if question == "" or alternatives_text == "":
                continue
            a_answer, b_answer, c_answer, d_answer = extract_strings(alternatives_text)
            questions[int(number)] = {
                "question_text": question,
                "a_answer": a_answer,
                "b_answer": b_answer,
                "c_answer": c_answer,
                "d_answer": d_answer,
            }
    return questions
def get_oab_df(oab_dict):
    """Turn the extracted exam elements into a DataFrame with one row per question.

    The columns come from ``get_questions_from_text`` plus "question_number";
    rows are sorted by question number and re-indexed from zero.
    """
    left_column, right_column = extract_text(oab_dict)
    parsed_questions = get_questions_from_text(left_column, right_column)

    # Questions are keyed by number, so transpose to get one row per question.
    frame = pd.DataFrame(parsed_questions).T
    frame = frame.reset_index()
    frame = frame.rename(columns={"index": "question_number"})
    frame = frame.sort_values("question_number")
    return frame.reset_index(drop=True)

In [None]:
# Parse the 37th exam and store the questions as a ";"-separated CSV.
oab_37_df = get_oab_df(oab_37)
oab_37_df.to_csv("oab_37_df.csv", index=False, sep=";")

In [None]:
# Reload the parsed questions from disk and read the official answer key.
# The JSON is opened explicitly as UTF-8 so decoding does not depend on the
# platform's default encoding.
oab_37_df = pd.read_csv("oab_37_df.csv", sep=";")
with open('1A FASE OAB/XXXVII EXAME UNIFICADO – GABARITOS.json', encoding='utf-8') as f:
    answers_oab_37_dict = json.load(f)

In [None]:
# Display the parsed questions for a visual sanity check.
oab_37_df

# Evaluating LLM Performance

In [None]:
def load_index_from_context(embedding_model_name):
    """Reload the index previously built for ``embedding_model_name``.

    Uses the same chunking settings, Qdrant collection name
    (``rag_<model>`` with "-" and "/" replaced by "_") and persist directory
    (``indexes_metadata/rag_<model>``) that ``generate_index`` writes to.

    Args:
        embedding_model_name: Name understood by ``get_embedding_model``.

    Returns:
        The index returned by ``load_index_from_storage``.
    """
    context = ServiceContext.from_defaults(
        node_parser=SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=64),
        embed_model=get_embedding_model(embedding_model_name),
    )

    collection = "rag_" + embedding_model_name.replace("-", "_").replace("/", "_")
    metadata_dir = "indexes_metadata/rag_" + embedding_model_name
    storage = StorageContext.from_defaults(
        vector_store=QdrantVectorStore(client=qdrant_client, collection_name=collection),
        persist_dir=metadata_dir,
    )

    return load_index_from_storage(service_context=context, storage_context=storage)

In [None]:


def get_results(questions, answers, llm, retriever, n_chunks_to_use_rag=5,use_rag=True,
                n_questions=80, max_tries=5, retry_wait_seconds=5):
    """Ask an LLM every exam question and score it against the answer key.

    Args:
        questions: DataFrame with the columns "question_number",
            "question_text", "a_answer", "b_answer", "c_answer", "d_answer".
        answers: Mapping from the question number as a string ("1".."N") to
            the expected letter, or "*" for a question that counts as correct
            regardless of the reply.
        llm: Dict whose "model_object" entry exposes ``complete(prompt)``
            returning an object with a ``.text`` attribute.
        retriever: Object exposing ``retrieve(query)`` that returns nodes with
            a ``.text`` attribute, or None to run without retrieved context.
        n_chunks_to_use_rag: Accepted for interface compatibility; not read
            here (the number of chunks is decided by the retriever itself).
        use_rag: Accepted for interface compatibility; not read here (context
            is used whenever ``retriever`` is not None).
        n_questions: Number of questions in the exam; questions 1..n_questions
            are scored and the score is a percentage of this number.
        max_tries: Maximum number of completion attempts per question.
        retry_wait_seconds: Pause after a failed attempt before retrying.

    Returns:
        ``(score, response_with_answers_dict)`` where ``score`` is the
        percentage of correct answers and the dict maps each question number
        (as a string) to {"llm_response_text": ..., "answer": ...}. "FAIL"
        marks a question with no usable reply.

    Raises:
        KeyError: If ``answers`` lacks one of the keys "1".."n_questions".
    """
    # NOTE(review): the prompt text is kept byte-for-byte so that scores stay
    # comparable with earlier runs.
    qa_prompt_tmpl_str = """\
    You are an experienced brazilain lawyer and your job is to use the following brazilian laws and your current knowledge to answer multiple choice questions
    ---------------------
    BRAZILIAN LAWS:
    {vade_mecum_laws_str}
    ---------------------
    Given the brazilian laws mentioned above and your current knowledge, answer the multiple choice question below
    Question:
    {query_str}


    ONLY ANSWER THE CORRECT ALTERNATIVE BETWEEN A, B, C or D
    """

    def get_answer_from_llm_response(llm_response_text):
        """Return the letter chosen in a reply, or "FAIL" if none is found."""
        # Preferred form: the first letter written like an alternative, "B)".
        match = re.search(r'\b([A-D])\)', llm_response_text)
        if match is None:
            # The prompt asks for the alternative only, so a reply consisting
            # of just the letter (optionally followed by "." or ":") is also a
            # valid answer. A looser search is avoided on purpose: "A" is a
            # common word in both Portuguese and English.
            match = re.fullmatch(r'\s*([A-D])\s*[.:]?\s*', llm_response_text)
        return match.group(1) if match else "FAIL"

    prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)
    response_with_answers_dict = {
        str(question_number): {"llm_response_text": "", "answer": ""}
        for question_number in range(1, n_questions + 1)
    }

    for _, question in tqdm(questions.iterrows(), total=len(questions)):
        query_str = f"""
        Question: {question["question_text"]}

        Alternatives:
        - A) {question["a_answer"]}
        - B) {question["b_answer"]}
        - C) {question["c_answer"]}
        - D) {question["d_answer"]}
        """
        if retriever is not None:
            vade_mecum_laws_str = "".join(
                node.text + "\n" for node in retriever.retrieve(query_str)
            )
        else:
            vade_mecum_laws_str = "NO CONTEXT PROVIDED"
        fmt_prompt = prompt_tmpl.format(
            vade_mecum_laws_str=vade_mecum_laws_str,
            query_str=query_str
        )

        # setdefault keeps a question number outside 1..n_questions (e.g. a
        # parsing artefact) from raising KeyError; such entries are recorded
        # but never scored.
        entry = response_with_answers_dict.setdefault(
            str(question["question_number"]),
            {"llm_response_text": "", "answer": ""},
        )

        n_tries = 0
        while n_tries < max_tries and entry["llm_response_text"] == "":
            n_tries += 1
            try:
                entry["llm_response_text"] = llm["model_object"].complete(fmt_prompt).text
            except Exception as e:
                # Best-effort retry: report the error and try again.
                print(f"Error: {e}")
                if n_tries < max_tries:
                    # No point in waiting after the final attempt.
                    time.sleep(retry_wait_seconds)
        if entry["llm_response_text"] == "":
            entry["llm_response_text"] = "FAIL"

    # Parse the chosen letter for every recorded reply (unanswered -> "FAIL").
    for entry in response_with_answers_dict.values():
        entry["answer"] = get_answer_from_llm_response(entry["llm_response_text"])

    correct_answers = 0
    for question_number in range(1, n_questions + 1):
        expected_answer = answers[str(question_number)]
        llm_answer = response_with_answers_dict[str(question_number)]["answer"]
        # "*" in the answer key gives the point regardless of the reply.
        if expected_answer == "*" or expected_answer == llm_answer:
            correct_answers += 1

    score = (100 * correct_answers / n_questions)
    return score, response_with_answers_dict


In [None]:
# Experiment grid: every LLM is evaluated once without retrieved context
# (baseline) and once per (embedding model, top_k) combination.
number_of_relevant_chunks_to_retrieve = [5]
results_records = []
results_columns = [
    "oab_edition_number",
    "llm_name",
    "llm_license",
    "embed_model_name",
    "embed_model_license",
    "n_chunks_retrieved_per_question",
    "score",
    "answers_dict",
]
# An index depends only on the embedding model, so each one is loaded the
# first time it is needed and reused for every LLM afterwards.
loaded_indexes = {}

for oab_df, answers_df, oab_edition in zip([oab_37_df], [answers_oab_37_dict], ["37"]):
    for llm in llms_list:
        # Baseline: no retriever, so the prompt carries no legal context.
        llm_baseline_score, llm_baseline_response_with_answers_dict = get_results(
            oab_df, answers_df, llm, None, n_chunks_to_use_rag=None, use_rag=False
        )
        results_records.append([
            oab_edition,
            llm["model_name"],
            llm["license"],
            None,
            None,
            None,
            llm_baseline_score,
            llm_baseline_response_with_answers_dict
        ])
        print(
            oab_edition,
            llm["model_name"],
            llm["license"],
            "NONE",
            "NONE",
            0,
            llm_baseline_score
        )

        for embed_model in embeddings_models:
            embed_model_name = embed_model["model_name"]
            if embed_model_name not in loaded_indexes:
                loaded_indexes[embed_model_name] = load_index_from_context(embed_model_name)
            index = loaded_indexes[embed_model_name]

            for top_k in number_of_relevant_chunks_to_retrieve:
                retriever = index.as_retriever(similarity_top_k=top_k)
                score, response_with_answers_dict = get_results(
                    oab_df, answers_df, llm, retriever, n_chunks_to_use_rag=top_k, use_rag=True
                )
                print(
                    oab_edition,
                    llm["model_name"],
                    llm["license"],
                    embed_model["model_name"],
                    embed_model["license"],
                    top_k,
                    score
                )

                results_records.append([
                    oab_edition,
                    llm["model_name"],
                    llm["license"],
                    embed_model["model_name"],
                    embed_model["license"],
                    top_k,
                    score,
                    response_with_answers_dict
                ])

                # Rebuild, show and save the table after every configuration
                # so an interrupted run still leaves partial results on disk.
                results_df = pd.DataFrame(results_records, columns=results_columns)
                display(results_df)
                results_df.to_csv("partial_results.csv", sep=";", index=False)