# LlamaParse - Parsing Complex Documents

## Load and Parse PDFs


In [None]:
!pip install -qU llama-index llama-parse ragas

In [None]:
import os
import getpass

# Prompt for the LlamaParse key without echoing it and expose it through the environment.
os.environ.update(LLAMA_CLOUD_API_KEY=getpass.getpass("LLamaParse API Key:"))

LLamaParse API Key:··········


In [None]:
# Prompt for the OpenAI key without echoing it and expose it through the environment.
os.environ.update(OPENAI_API_KEY=getpass.getpass("OpenAI API Key:"))

OpenAI API Key:··········


In [None]:
import nest_asyncio

# Apply nest_asyncio before any parsing/querying cells run.
# NOTE(review): presumably required because the notebook kernel already runs an
# event loop and the parsing calls below use asyncio internally — confirm.
nest_asyncio.apply()

### LlamaParse Initialization


In [None]:
from llama_parse import LlamaParse

# LlamaParse client used for every PDF in this notebook: markdown output,
# English documents, two workers, verbose progress logging.
parser = LlamaParse(result_type="markdown", language="en", num_workers=2, verbose=True)

### Uploading Files

In [None]:
from google.colab import files

# Open the Colab upload widget; the result of the upload is kept in `ships_manual`.
# This cell only works inside Google Colab (it relies on `google.colab.files`).
ships_manual = files.upload()

Saving Ships_3m_manual_04790.008D.pdf to Ships_3m_manual_04790.008D.pdf


### Parsing Our Files



In [None]:
# Send the uploaded manual through LlamaParse and keep the returned documents.
# NOTE(review): the path is hard-coded to the Colab working directory and to the
# exact file name shown in the upload output — adjust it if a different file is uploaded.
documents = parser.load_data(["/content/Ships_3m_manual_04790.008D.pdf"])

Parsing files: 100%|██████████| 1/1 [00:06<00:00,  6.76s/it]


## LlamaIndex Recursive Query Engine

In [None]:
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import MarkdownElementNodeParser

# Global defaults used by every index / query engine created below.
Settings.llm = OpenAI(model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-large")

# Reuse the globally configured LLM instead of repeating the model name, so the
# element parser and the query engine cannot drift apart when the model is changed.
node_parser = MarkdownElementNodeParser(llm=Settings.llm, num_workers=8)

In [None]:
# Parse every document returned by LlamaParse, not just the first one: the parser
# may return several Document objects for one file (e.g. one per page), and
# indexing only documents[0] would silently drop the rest of the manual.
nodes = node_parser.get_nodes_from_documents(documents=documents)

In [None]:
# Split the parsed nodes into the two collections returned by the element parser;
# both are passed to the vector index in the next cell.
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

In [None]:
from llama_index.core import VectorStoreIndex

# Build a vector index over the base nodes plus the extracted objects.
# NOTE(review): this index is not queried anywhere in the visible notebook; the
# "Recursive Query Engine" section below builds its own `ships_recursive_index`.
recursive_index = VectorStoreIndex(nodes=base_nodes+objects)

### Recursive Query Engine

In [None]:
!pip install -qU llama-index-postprocessor-flag-embedding-reranker git+https://github.com/FlagOpen/FlagEmbedding.git

In [None]:
# Parse every document returned by LlamaParse rather than only documents[0], so
# the whole manual is indexed even when the parser returns one Document per page.
ships_manual_nodes = node_parser.get_nodes_from_documents(documents=documents)

In [None]:
# Separate the manual's parsed nodes into base nodes and objects for indexing.
ships_base_nodes, ships_objects = node_parser.get_nodes_and_objects(ships_manual_nodes)

In [None]:
# Vector index over the manual's base nodes and objects; this is the index the
# query engine below is created from.
# NOTE(review): confirm that VectorStoreIndex actually honors `include_content`;
# it may be accepted and ignored.
ships_recursive_index = VectorStoreIndex(nodes=ships_base_nodes + ships_objects, include_content=True)

In [None]:
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

# Reranker built on the BAAI/bge-reranker-large model, configured with top_n=5.
reranker = FlagEmbeddingReranker(model="BAAI/bge-reranker-large", top_n=5)

# Query engine over the manual index: similarity_top_k=15 candidates are passed
# through the reranker as a node post-processor; verbose logging is enabled.
ships_recursive_query_engine = ships_recursive_index.as_query_engine(
    node_postprocessors=[reranker],
    similarity_top_k=15,
    verbose=True,
)

In [None]:
# Spot check 1: ask about the alteration category for prefix TD.
query = "What is the alteration category for prefix TD?"
response = ships_recursive_query_engine.query(query)

[1;3;38;2;11;159;203mRetrieval entering id_de32b2bf-5e2c-4949-9a0b-9d8184012705_1224_table: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What is the alteration category for prefix TD?
[0m[1;3;38;2;11;159;203mRetrieval entering id_de32b2bf-5e2c-4949-9a0b-9d8184012705_1348_table: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What is the alteration category for prefix TD?
[0m[1;3;38;2;11;159;203mRetrieval entering id_de32b2bf-5e2c-4949-9a0b-9d8184012705_1228_table: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What is the alteration category for prefix TD?
[0m[1;3;38;2;11;159;203mRetrieval entering id_de32b2bf-5e2c-4949-9a0b-9d8184012705_1352_table: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What is the alteration category for prefix TD?
[0m[1;3;38;2;11;159;203mRetrieval entering id_de32b2bf-5e2c-4949-9a0b-9d8184012705_944_table: TextNode
[0m[1;3;3

In [None]:
# Print the response object returned for the TD-prefix question.
print(response)

The alteration category for prefix TD is Technical Directive.


In [None]:
# Spot check 2: ask for the NSDSA (0310) telephone number.
query = "What is the telephone number for NSDSA (0310)?"
response = ships_recursive_query_engine.query(query)

[1;3;38;2;11;159;203mRetrieval entering id_de32b2bf-5e2c-4949-9a0b-9d8184012705_1292_table: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What is the telephone number for NSDSA (0310)?
[0m[1;3;38;2;11;159;203mRetrieval entering id_de32b2bf-5e2c-4949-9a0b-9d8184012705_1296_table: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What is the telephone number for NSDSA (0310)?
[0m[1;3;38;2;11;159;203mRetrieval entering id_de32b2bf-5e2c-4949-9a0b-9d8184012705_1290_table: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What is the telephone number for NSDSA (0310)?
[0m[1;3;38;2;11;159;203mRetrieval entering id_de32b2bf-5e2c-4949-9a0b-9d8184012705_1284_table: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What is the telephone number for NSDSA (0310)?
[0m[1;3;38;2;11;159;203mRetrieval entering id_de32b2bf-5e2c-4949-9a0b-9d8184012705_1280_table: TextNode
[0m[1;3;

In [None]:
# Print the response object returned for the NSDSA telephone question.
print(response)

The telephone number for NSDSA (0310) is 805-228-0777.


In [None]:
# Spot check 3: ask for the planned action of job catalog entry DXCN D701.
query = "What is the planned action for job catalog DXCN D701?"
response = ships_recursive_query_engine.query(query)

[1;3;38;2;11;159;203mRetrieval entering id_de32b2bf-5e2c-4949-9a0b-9d8184012705_1236_table: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What is the planned action for job catalog DXCN D701?
[0m[1;3;38;2;11;159;203mRetrieval entering id_de32b2bf-5e2c-4949-9a0b-9d8184012705_1050_table: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What is the planned action for job catalog DXCN D701?
[0m[1;3;38;2;11;159;203mRetrieval entering id_de32b2bf-5e2c-4949-9a0b-9d8184012705_1156_table: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What is the planned action for job catalog DXCN D701?
[0m

In [None]:
# Print the response object returned for the job-catalog question.
print(response)

Repeat: The planned action for job catalog DXCN D701 is "ESM ALIGNMENT".


In [None]:
# Spot check 4: ask for the repair code of Non-Destructive Testing.
query = "What is the repair code for Non-Destructive Testing?"
response = ships_recursive_query_engine.query(query)

[1;3;38;2;11;159;203mRetrieval entering id_de32b2bf-5e2c-4949-9a0b-9d8184012705_1024_table: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What is the repair code for Non-Destructive Testing?
[0m[1;3;38;2;11;159;203mRetrieval entering id_de32b2bf-5e2c-4949-9a0b-9d8184012705_1036_table: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What is the repair code for Non-Destructive Testing?
[0m[1;3;38;2;11;159;203mRetrieval entering id_de32b2bf-5e2c-4949-9a0b-9d8184012705_1040_table: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What is the repair code for Non-Destructive Testing?
[0m[1;3;38;2;11;159;203mRetrieval entering id_de32b2bf-5e2c-4949-9a0b-9d8184012705_1038_table: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What is the repair code for Non-Destructive Testing?
[0m[1;3;38;2;11;159;203mRetrieval entering id_de32b2bf-5e2c-4949-9a0b-9d8184012705_1016_ta

In [None]:
# Print the response object returned for the repair-code question.
print(response)

The repair code for Non-Destructive Testing is 93A.


## RAGAS Evaluation

In [None]:
!pip install langchain pypdf

In [None]:
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

# Load the manual with a plain PDF loader (independent of LlamaParse); the result
# is used to generate the synthetic evaluation set.
loader = PyPDFLoader("./Ships_3m_manual_04790.008D.pdf")
docs = loader.load()

# 700-character chunks with a 50-character overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=700,
    chunk_overlap=50,
)

# NOTE: this rebinds `documents`, which previously held the LlamaParse output.
# Re-running the LlamaParse node-parsing cells after this cell requires
# re-running `parser.load_data(...)` first.
documents = text_splitter.split_documents(docs)

In [None]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

# Test-set generator created through its OpenAI convenience constructor.
generator = TestsetGenerator.with_openai()

# Ten synthetic questions over the LangChain chunks: half simple,
# a quarter reasoning, a quarter multi-context.
testset = generator.generate_with_langchain_docs(
    documents,
    test_size=10,
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
)


  generator = TestsetGenerator.with_openai()


embedding nodes:   0%|          | 0/2304 [00:00<?, ?it/s]



Generating:   0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Convert the generated test set to a DataFrame for inspection and column access.
test_df = testset.to_pandas()

In [None]:
# Display the generated questions, contexts and ground truths.
test_df

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,How are conditional questions used in the mate...,[LWC/AWC is to feedback the material condition...,Conditional questions are used in the material...,simple,[{'source': './Ships_3m_manual_04790.008D.pdf'...,True
1,How does RCM provide the maintenance engineeri...,"[maintaining ships, aircraft, and infrastructu...",RCM provides the maintenance engineering princ...,simple,[{'source': './Ships_3m_manual_04790.008D.pdf'...,True
2,What are some difficulties or deficiencies tha...,[5. Description of difficulties with or defici...,,simple,[{'source': './Ships_3m_manual_04790.008D.pdf'...,True
3,What is the purpose of work requests in the ex...,[M reporting. This program was established to...,The purpose of work requests in the execution ...,simple,[{'source': './Ships_3m_manual_04790.008D.pdf'...,True
4,What is the purpose of NAVSEAINST 4790.8D?,[NAVSEAINST 4790.8D \n ...,,simple,[{'source': './Ships_3m_manual_04790.008D.pdf'...,True
5,What feedback does the CSMP AWR provide for th...,[LWC/AWC is to feedback the material condition...,The CSMP AWR provides feedback for the materia...,reasoning,[{'source': './Ships_3m_manual_04790.008D.pdf'...,True
6,What methodology does CBM use to optimize life...,"[maintaining ships, aircraft, and infrastructu...",CBM uses RCM methodology to optimize life cycl...,reasoning,[{'source': './Ships_3m_manual_04790.008D.pdf'...,True
7,What is the maximum timeframe for distributing...,"[response if a message is not received, to all...",30,multi_context,[{'source': './Ships_3m_manual_04790.008D.pdf'...,True
8,How does CBM use RC M methodology to optimize ...,"[maintaining ships, aircraft, and infrastructu...",CBM uses RCM methodology to optimize life cycl...,multi_context,[{'source': './Ships_3m_manual_04790.008D.pdf'...,True
9,What is the purpose of the On-line Systems Tra...,[3. Uses of the Data ...........................,,multi_context,[{'source': './Ships_3m_manual_04790.008D.pdf'...,True


In [None]:
# Some generated samples come back without a ground truth (NaN). Scoring those
# rows against a missing reference distorts the ground-truth-based metrics, so
# keep only the rows that have one. Questions and ground truths are taken from the
# same filtered frame so the two lists stay aligned.
scored_test_df = test_df.dropna(subset=["ground_truth"])
test_questions = scored_test_df["question"].tolist()
test_groundtruths = scored_test_df["ground_truth"].tolist()

In [None]:
# Run every test question through the query engine and record, per question,
# the answer text and the text of the retrieved source nodes.
answers = []
contexts = []
for question in test_questions:
  response = ships_recursive_query_engine.query(question)
  answers.append(response.response)
  # Keep the text of ALL source nodes for this question (previously only the first
  # node was kept), and always append exactly one list per question so `contexts`
  # stays aligned with `answers` even when nothing was retrieved.
  contexts.append([node_with_score.node.text for node_with_score in response.source_nodes])

# Number of questions processed (kept for backward compatibility with the old counter).
count = len(answers)

[1;3;38;2;11;159;203mRetrieval entering id_de32b2bf-5e2c-4949-9a0b-9d8184012705_988_table: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What are some difficulties or deficiencies that can arise with maintenance documentation?
[0m[1;3;38;2;11;159;203mRetrieval entering id_de32b2bf-5e2c-4949-9a0b-9d8184012705_230_table: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What is the purpose of NAVSEAINST 4790.8D?
[0m[1;3;38;2;11;159;203mRetrieval entering id_de32b2bf-5e2c-4949-9a0b-9d8184012705_1132_table: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What is the purpose of NAVSEAINST 4790.8D?
[0m[1;3;38;2;11;159;203mRetrieval entering id_de32b2bf-5e2c-4949-9a0b-9d8184012705_228_table: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What is the maximum timeframe for distributing revised PMS documentation to affected users after receiving an Urgent FBR response m

In [None]:
from datasets import Dataset

# Assemble the four parallel lists into the dataset passed to the evaluation step.
response_dataset = Dataset.from_dict(
    dict(
        question=test_questions,
        answer=answers,
        contexts=contexts,
        ground_truth=test_groundtruths,
    )
)

In [None]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

# Metrics passed to the evaluation call, in the order they are handed over.
metrics = [faithfulness, answer_relevancy, context_recall, context_precision, answer_correctness]

In [None]:
# Score the collected answers/contexts with the selected metrics.
results = evaluate(response_dataset, metrics)

Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]



In [None]:
# Display the aggregate score per metric.
results

{'faithfulness': 0.8095, 'answer_relevancy': 0.9128, 'context_recall': 0.7111, 'context_precision': 0.7000, 'answer_correctness': 0.6355}

In [None]:
# Per-question breakdown of the evaluation, one row per test question.
results_df = results.to_pandas()
results_df

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,How are conditional questions used in the mate...,Conditional questions in the material conditio...,[---\n NAVSEAINST 4790.8D\n\nSECTION II ...,Conditional questions are used in the material...,1.0,0.992924,1.0,1.0,0.528552
1,How does RCM provide the maintenance engineeri...,RCM provides the maintenance engineering princ...,[PMS provides:\n\n2-1\n---\n NAVSEAINST 4790.8...,RCM provides the maintenance engineering princ...,0.285714,0.936937,1.0,1.0,0.424281
2,What are some difficulties or deficiencies tha...,Difficulties or deficiencies that can arise wi...,[2-18\n---\n NAVSEAINST 4790.8D\n\n17 Jun 2021...,,1.0,0.994747,0.4,0.0,0.176657
3,What is the purpose of work requests in the ex...,Work requests are essential in the execution o...,[All documentation will be validated prior to ...,The purpose of work requests in the execution ...,1.0,0.913868,1.0,1.0,0.73075
4,What is the purpose of NAVSEAINST 4790.8D?,The purpose of NAVSEAINST 4790.8D is to provid...,[---\n NAVSEAINST 4790.8D],,1.0,1.0,0.0,0.0,0.185184
5,What feedback does the CSMP AWR provide for th...,The CSMP AWR provides feedback on the material...,[1. Purpose\n\nThe purpose of this chapter is ...,The CSMP AWR provides feedback for the materia...,1.0,0.940376,1.0,1.0,0.729948
6,What methodology does CBM use to optimize life...,CBM methodology utilizes Reliability-Centered ...,[1. Purpose\nThis chapter provides an overview...,CBM uses RCM methodology to optimize life cycl...,1.0,0.958405,1.0,1.0,0.994905
7,What is the maximum timeframe for distributing...,30 calendar days,[(1) through 6.b.(4). (See Appendix J for addi...,30,0.0,0.831032,0.0,1.0,0.959
8,How does CBM use RC M methodology to optimize ...,Repeat.,[2. Training/Qualifications - 3-M Personnel Qu...,CBM uses RCM methodology to optimize life cycl...,,0.80805,1.0,1.0,0.686934
9,What is the purpose of the On-line Systems Tra...,Repeat the original answer.,[---\n NAVSEAINST 4790.8D\n\n(Force Level) equ...,,1.0,0.751274,,0.0,0.938397





Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

